784 files changed, 29869 insertions, 15673 deletions
diff --git a/.gitignore b/.gitignore
index 2e2713a..d7dcc54 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,8 @@
 *.orig
 # Byte compiled python modules.
 *.pyc
+# vim swap files
+.*.swp
 
 #==============================================================================#
 # Explicit files to ignore (only matches one).
diff --git a/Makefile b/Makefile
index 7dad07b..88e63e9 100644
--- a/Makefile
+++ b/Makefile
@@ -69,7 +69,7 @@ endif
 ifeq ($(MAKECMDGOALS),install-clang)
   DIRS := tools/clang/tools/driver tools/clang/lib/Headers \
           tools/clang/runtime tools/clang/docs \
-          tools/lto
+          tools/lto runtime
   OPTIONAL_DIRS :=
   NO_INSTALL = 1
 endif
@@ -83,7 +83,7 @@ ifeq ($(MAKECMDGOALS),install-clang-c)
 endif
 
 ifeq ($(MAKECMDGOALS),clang-only)
-  DIRS := $(filter-out tools runtime docs unittests, $(DIRS)) \
+  DIRS := $(filter-out tools docs unittests, $(DIRS)) \
           tools/clang tools/lto
   OPTIONAL_DIRS :=
 endif
diff --git a/autoconf/config.guess b/autoconf/config.guess
index 865fe53..9807c91 100755
--- a/autoconf/config.guess
+++ b/autoconf/config.guess
@@ -789,13 +789,12 @@ EOF
 	echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE}
 	exit ;;
     *:FreeBSD:*:*)
+        UNAME_PROCESSOR=`/usr/bin/uname -p`
 	case ${UNAME_MACHINE} in
-	    pc98)
-		echo i386-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
 	    amd64)
 		echo x86_64-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
 	    *)
-		echo ${UNAME_MACHINE}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
+		echo ${UNAME_PROCESSOR}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
 	esac
 	exit ;;
     i*:CYGWIN*:*)
diff --git a/autoconf/configure.ac b/autoconf/configure.ac
index b55f564..f0e7c42 100644
--- a/autoconf/configure.ac
+++ b/autoconf/configure.ac
@@ -1528,7 +1528,7 @@ dnl Set the flags needed to emit LLVM IR and to disable optimizations
 dnl in llvmgcc
 if test "$llvm_cv_llvmgcc_dragonegg" = "yes" ; then
   LLVMCC_EMITIR_FLAG="-fplugin-arg-dragonegg-emit-ir"
-  LLVMCC_DISABLEOPT_FLAGS="-fplugin-arg-dragonegg-disable-llvm-optzns"
+  LLVMCC_DISABLEOPT_FLAGS="-fplugin-arg-dragonegg-llvm-ir-optimize=0"
 else
   LLVMCC_EMITIR_FLAG="-emit-llvm"
   LLVMCC_DISABLEOPT_FLAGS="-mllvm -disable-llvm-optzns"
@@ -1723,6 +1723,12 @@ AC_CONFIG_FILES([Makefile.config])
 dnl Configure the RPM spec file for LLVM
 AC_CONFIG_FILES([llvm.spec])
 
+dnl Configure doxygen's configuration file
+AC_CONFIG_FILES([docs/doxygen.cfg])
+if test -f ${srcdir}/tools/clang/README.txt; then
+  AC_CONFIG_FILES([tools/clang/docs/doxygen.cfg])
+fi
+
 dnl Configure llvmc's Base plugin
 AC_CONFIG_FILES([tools/llvmc/src/Base.td])
 
diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake
index 0633ac9..d0427ac 100644
--- a/cmake/modules/HandleLLVMOptions.cmake
+++ b/cmake/modules/HandleLLVMOptions.cmake
@@ -1,5 +1,11 @@
 include(AddLLVMDefinitions)
 
+if( CMAKE_COMPILER_IS_GNUCXX )
+  set(LLVM_COMPILER_IS_GCC_COMPATIBLE ON)
+elseif( "${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" )
+  set(LLVM_COMPILER_IS_GCC_COMPATIBLE ON)
+endif()
+
 # Run-time build mode; It is used for unittests.
 if(MSVC_IDE)
   # Expect "$(Configuration)", "$(OutDir)", etc.
@@ -167,7 +173,7 @@ if( MSVC )
   if (LLVM_ENABLE_WERROR)
     add_llvm_definitions( /WX )
   endif (LLVM_ENABLE_WERROR)
-elseif( CMAKE_COMPILER_IS_GNUCXX )
+elseif( LLVM_COMPILER_IS_GCC_COMPATIBLE )
   if (LLVM_ENABLE_WARNINGS)
     add_llvm_definitions( -Wall -W -Wno-unused-parameter -Wwrite-strings )
     if (LLVM_ENABLE_PEDANTIC)
diff --git a/cmake/modules/LLVMProcessSources.cmake b/cmake/modules/LLVMProcessSources.cmake
index 270292a..641f1b3 100644
--- a/cmake/modules/LLVMProcessSources.cmake
+++ b/cmake/modules/LLVMProcessSources.cmake
@@ -56,7 +56,7 @@ function(llvm_process_sources OUT_VAR)
 
   # Set common compiler options:
   if( NOT LLVM_REQUIRES_EH )
-    if( CMAKE_COMPILER_IS_GNUCXX )
+    if( LLVM_COMPILER_IS_GCC_COMPATIBLE )
       add_definitions( -fno-exceptions )
     elseif( MSVC )
       llvm_replace_compiler_option(CMAKE_CXX_FLAGS "/EHsc" "/EHs-c-")
@@ -64,7 +64,7 @@ function(llvm_process_sources OUT_VAR)
     endif()
   endif()
   if( NOT LLVM_REQUIRES_RTTI )
-    if( CMAKE_COMPILER_IS_GNUCXX )
+    if( LLVM_COMPILER_IS_GCC_COMPATIBLE )
       llvm_replace_compiler_option(CMAKE_CXX_FLAGS "-frtti" "-fno-rtti")
     elseif( MSVC )
       llvm_replace_compiler_option(CMAKE_CXX_FLAGS "/GR" "/GR-")
diff --git a/configure b/configure
index f1f4dd3..0b10a90 100755
--- a/configure
+++ b/configure
@@ -22245,7 +22245,7 @@ echo "${ECHO_T}$llvm_cv_llvmgcc_dragonegg" >&6; }
 
 if test "$llvm_cv_llvmgcc_dragonegg" = "yes" ; then
   LLVMCC_EMITIR_FLAG="-fplugin-arg-dragonegg-emit-ir"
-  LLVMCC_DISABLEOPT_FLAGS="-fplugin-arg-dragonegg-disable-llvm-optzns"
+  LLVMCC_DISABLEOPT_FLAGS="-fplugin-arg-dragonegg-llvm-ir-optimize=0"
 else
   LLVMCC_EMITIR_FLAG="-emit-llvm"
   LLVMCC_DISABLEOPT_FLAGS="-mllvm -disable-llvm-optzns"
@@ -22574,6 +22574,13 @@ ac_config_files="$ac_config_files Makefile.config"
 ac_config_files="$ac_config_files llvm.spec"
 
 
+ac_config_files="$ac_config_files docs/doxygen.cfg"
+
+if test -f ${srcdir}/tools/clang/README.txt; then
+  ac_config_files="$ac_config_files tools/clang/docs/doxygen.cfg"
+
+fi
+
 ac_config_files="$ac_config_files tools/llvmc/src/Base.td"
 
 
@@ -23194,6 +23201,8 @@ do
     "include/llvm/Support/DataTypes.h") CONFIG_HEADERS="$CONFIG_HEADERS include/llvm/Support/DataTypes.h" ;;
     "Makefile.config") CONFIG_FILES="$CONFIG_FILES Makefile.config" ;;
     "llvm.spec") CONFIG_FILES="$CONFIG_FILES llvm.spec" ;;
+    "docs/doxygen.cfg") CONFIG_FILES="$CONFIG_FILES docs/doxygen.cfg" ;;
+    "tools/clang/docs/doxygen.cfg") CONFIG_FILES="$CONFIG_FILES tools/clang/docs/doxygen.cfg" ;;
     "tools/llvmc/src/Base.td") CONFIG_FILES="$CONFIG_FILES tools/llvmc/src/Base.td" ;;
     "tools/llvm-config/llvm-config.in") CONFIG_FILES="$CONFIG_FILES tools/llvm-config/llvm-config.in" ;;
     "setup") CONFIG_COMMANDS="$CONFIG_COMMANDS setup" ;;
diff --git a/docs/AliasAnalysis.html b/docs/AliasAnalysis.html
index b93cff0..770cfe2 100644
--- a/docs/AliasAnalysis.html
+++ b/docs/AliasAnalysis.html
@@ -745,9 +745,9 @@ for monitoring and evaluating different implementations.</p>
 <div>
 
 <p>This section lists the various implementations of the <tt>AliasAnalysis</tt>
-interface.  With the exception of the <a href="#no-aa"><tt>-no-aa</tt></a> and
-<a href="#basic-aa"><tt>-basicaa</tt></a> implementations, all of these <a
-href="#chaining">chain</a> to other alias analysis implementations.</p>
+interface.  With the exception of the <a href="#no-aa"><tt>-no-aa</tt></a>
+implementation, all of these <a href="#chaining">chain</a> to other alias
+analysis implementations.</p>
 
 <!-- _______________________________________________________________________ -->
 <h4>
@@ -1060,7 +1060,7 @@ analysis directly.</p>
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
   <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
-  Last modified: $Date: 2011-04-21 03:52:00 +0200 (Thu, 21 Apr 2011) $
+  Last modified: $Date: 2011-05-25 00:01:32 +0200 (Wed, 25 May 2011) $
 </address>
 
 </body>
diff --git a/docs/CodeGenerator.html b/docs/CodeGenerator.html
index d082acc..29a2cce 100644
--- a/docs/CodeGenerator.html
+++ b/docs/CodeGenerator.html
@@ -2019,6 +2019,11 @@ def : InstAlias&lt;"fcomi $reg", (COM_FIr RST:$reg)&gt;;
 <p>Instruction aliases can also have a Requires clause to make them
 subtarget specific.</p>
 
+<p>If the back-end supports it, the instruction printer can automatically emit
+   the alias rather than what's being aliased. It typically leads to better,
+   more readable code. If it's better to print out what's being aliased, then
+   pass a '0' as the third parameter to the InstAlias definition.</p>
+
 </div>
 
 </div>
@@ -2464,11 +2469,14 @@ entry:
 <p>The following target-specific calling conventions are known to backend:</p>
 
 <ul>
-  <li><b>x86_StdCall</b> &mdash; stdcall calling convention seen on Microsoft
-      Windows platform (CC ID = 64).</li>
-
-  <li><b>x86_FastCall</b> &mdash; fastcall calling convention seen on Microsoft
-      Windows platform (CC ID = 65).</li>
+<li><b>x86_StdCall</b> &mdash; stdcall calling convention seen on Microsoft
+    Windows platform (CC ID = 64).</li>
+<li><b>x86_FastCall</b> &mdash; fastcall calling convention seen on Microsoft
+    Windows platform (CC ID = 65).</li>
+<li><b>x86_ThisCall</b> &mdash; Similar to X86_StdCall. Passes first argument
+    in ECX,  others via stack. Callee is responsible for stack cleaning. This
+    convention is used by MSVC by default for methods in its ABI
+    (CC ID = 70).</li>
 </ul>
 
 </div>
@@ -2798,7 +2806,7 @@ MOVSX32rm16 -&gt; movsx, 32-bit register, 16-bit memory
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
   <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
-  Last modified: $Date: 2011-04-23 02:30:22 +0200 (Sat, 23 Apr 2011) $
+  Last modified: $Date: 2011-05-23 00:28:47 +0200 (Mon, 23 May 2011) $
 </address>
 
 </body>
diff --git a/docs/CompilerDriver.html b/docs/CompilerDriver.html
index 03db3a0..1b2f808 100644
--- a/docs/CompilerDriver.html
+++ b/docs/CompilerDriver.html
@@ -679,7 +679,7 @@ the <tt class="docutils literal">llvmc</tt> program behaves when it needs to cho
 <a href="mailto:foldr@codedgers.com">Mikhail Glushenkov</a><br />
 <a href="http://llvm.org">LLVM Compiler Infrastructure</a><br />
 
-Last modified: $Date: 2011-04-24 16:17:37 +0200 (Sun, 24 Apr 2011) $
+Last modified: $Date: 2011-05-07 00:11:29 +0200 (Sat, 07 May 2011) $
 </address></div>
 </div>
 </div>
diff --git a/docs/ExceptionHandling.html b/docs/ExceptionHandling.html
index 16820f3..c0f50e3 100644
--- a/docs/ExceptionHandling.html
+++ b/docs/ExceptionHandling.html
@@ -35,6 +35,7 @@
   <ol>
   	<li><a href="#llvm_eh_exception"><tt>llvm.eh.exception</tt></a></li>
   	<li><a href="#llvm_eh_selector"><tt>llvm.eh.selector</tt></a></li>
+  	<li><a href="#llvm_eh_resume"><tt>llvm.eh.resume</tt></a></li>
   	<li><a href="#llvm_eh_typeid_for"><tt>llvm.eh.typeid.for</tt></a></li>
   	<li><a href="#llvm_eh_sjlj_setjmp"><tt>llvm.eh.sjlj.setjmp</tt></a></li>
   	<li><a href="#llvm_eh_sjlj_longjmp"><tt>llvm.eh.sjlj.longjmp</tt></a></li>
@@ -317,15 +318,28 @@
 
 <div>
 
-<p>To handle destructors and cleanups in <tt>try</tt> code, control may not run
-   directly from a landing pad to the first catch.  Control may actually flow
-   from the landing pad to clean up code and then to the first catch.  Since the
-   required clean up for each <tt>invoke</tt> in a <tt>try</tt> may be different
-   (e.g. intervening constructor), there may be several landing pads for a given
-   try.  If cleanups need to be run, an <tt>i32 0</tt> should be passed as the
-   last <a href="#llvm_eh_selector"><tt>llvm.eh.selector</tt></a> argument.
-   However, when using DWARF exception handling with C++, a <tt>i8* null</tt>
-   <a href="#restrictions">must</a> be passed instead.</p>
+<p>A cleanup is extra code which needs to be run as part of unwinding
+   a scope.  C++ destructors are a prominent example, but other
+   languages and language extensions provide a variety of different
+   kinds of cleanup.  In general, a landing pad may need to run
+   arbitrary amounts of cleanup code before actually entering a catch
+   block.  To indicate the presence of cleanups, a landing pad's call
+   to <a href="#llvm_eh_selector"><tt>llvm.eh.selector</tt></a> should
+   end with the argument <tt>i32 0</tt>; otherwise, the unwinder will
+   not stop at the landing pad if there are no catches or filters that
+   require it to.</p>
+
+<p>Do not allow a new exception to propagate out of the execution of a
+   cleanup.  This can corrupt the internal state of the unwinder.
+   Different languages describe different high-level semantics for
+   these situations: for example, C++ requires that the process be
+   terminated, whereas Ada cancels both exceptions and throws a third.</p>
+
+<p>When all cleanups have completed, if the exception is not handled
+   by the current function, resume unwinding by calling the
+   <a href="#llvm_eh_resume"><tt>llvm.eh.resume</tt></a> intrinsic,
+   passing in the results of <tt>llvm.eh.exception</tt> and
+   <tt>llvm.eh.selector</tt> for the original landing pad.</p>
 
 </div>
 
@@ -363,22 +377,29 @@
 
 <div>
 
-<p>The semantics of the invoke instruction require that any exception that
-   unwinds through an invoke call should result in a branch to the invoke's
-   unwind label.  However such a branch will only happen if the
-   <a href="#llvm_eh_selector"><tt>llvm.eh.selector</tt></a> matches. Thus in
-   order to ensure correct operation, the front-end must only generate
-   <a href="#llvm_eh_selector"><tt>llvm.eh.selector</tt></a> calls that are
-   guaranteed to always match whatever exception unwinds through the invoke.
-   For most languages it is enough to pass zero, indicating the presence of
-   a <a href="#cleanups">cleanup</a>, as the
-   last <a href="#llvm_eh_selector"><tt>llvm.eh.selector</tt></a> argument.
-   However for C++ this is not sufficient, because the C++ personality function
-   will terminate the program if it detects that unwinding the exception only
-   results in matches with cleanups.  For C++ a <tt>null i8*</tt> should be
-   passed as the last <a href="#llvm_eh_selector"><tt>llvm.eh.selector</tt></a>
-   argument instead.  This is interpreted as a catch-all by the C++ personality
-   function, and will always match.</p>
+<p>The unwinder delegates the decision of whether to stop in a call
+   frame to that call frame's language-specific personality function.
+   Not all personalities functions guarantee that they will stop to
+   perform cleanups: for example, the GNU C++ personality doesn't do
+   so unless the exception is actually caught somewhere further up the
+   stack.  When using this personality to implement EH for a language
+   that guarantees that cleanups will always be run, be sure to
+   indicate a catch-all in the
+   <a href="#llvm_eh_selector"><tt>llvm.eh.selector</tt></a> call
+   rather than just cleanups.</p>
+
+<p>In order for inlining to behave correctly, landing pads must be
+   prepared to handle selector results that they did not originally
+   advertise.  Suppose that a function catches exceptions of
+   type <tt>A</tt>, and it's inlined into a function that catches
+   exceptions of type <tt>B</tt>.  The inliner will update the
+   selector for the inlined landing pad to include the fact
+   that <tt>B</tt> is caught.  If that landing pad assumes that it
+   will only be entered to catch an <tt>A</tt>, it's in for a rude
+   surprise.  Consequently, landing pads must test for the selector
+   results they understand and then resume exception propagation
+   with the <a href="#llvm_eh_resume"><tt>llvm.eh.resume</tt></a>
+   intrinsic if none of the conditions match.</p>
 
 </div>
 
@@ -424,22 +445,32 @@
 <p>This intrinsic is used to compare the exception with the given type infos,
    filters and cleanups.</p>
 
-<p><a href="#llvm_eh_selector"><tt>llvm.eh.selector</tt></a> takes a minimum of
-   three arguments.  The first argument is the reference to the exception
-   structure. The second argument is a reference to the personality function to
-   be used for this try catch sequence. Each of the remaining arguments is
-   either a reference to the type info for a catch statement,
-   a <a href="#throw_filters">filter</a> expression, or the number zero
-   representing a <a href="#cleanups">cleanup</a>.  The exception is tested
-   against the arguments sequentially from first to last.  The result of
-   the <a href="#llvm_eh_selector"><tt>llvm.eh.selector</tt></a> is a positive
-   number if the exception matched a type info, a negative number if it matched
-   a filter, and zero if it matched a cleanup.  If nothing is matched, the
-   behaviour of the program is <a href="#restrictions">undefined</a>.  If a type
-   info matched then the selector value is the index of the type info in the
-   exception table, which can be obtained using the
+<p><a href="#llvm_eh_selector"><tt>llvm.eh.selector</tt></a> takes a
+   minimum of three arguments.  The first argument is the reference to
+   the exception structure. The second argument is a reference to the
+   personality function to be used for this try catch sequence. Each
+   of the remaining arguments is either a reference to the type info
+   for a catch statement, a <a href="#throw_filters">filter</a>
+   expression, or the number zero representing
+   a <a href="#cleanups">cleanup</a>.  The exception is tested against
+   the arguments sequentially from first to last.  The result of
+   the <a href="#llvm_eh_selector"><tt>llvm.eh.selector</tt></a> is a
+   positive number if the exception matched a type info, a negative
+   number if it matched a filter, and zero if it matched a cleanup.
+   If nothing is matched, or if only a cleanup is matched, different
+   personality functions may or may not cause control to stop at the
+   landing pad; see <a href="#restrictions">the restrictions</a> for
+   more information.  If a type info matched then the selector value
+   is the index of the type info in the exception table, which can be
+   obtained using the
    <a href="#llvm_eh_typeid_for"><tt>llvm.eh.typeid.for</tt></a> intrinsic.</p>
 
+<p>If a landing pad containing a call to <tt>llvm.eh.selector</tt> is
+   inlined into an <tt>invoke</tt> instruction, the selector arguments
+   for the outer landing pad are appended to those of the inlined
+   landing pad.  Consequently, landing pads must be written to ignore
+   selector values that they did not originally advertise.</p>
+
 </div>
 
 <!-- ======================================================================= -->
@@ -462,6 +493,33 @@
 
 <!-- ======================================================================= -->
 <h4>
+  <a name="llvm_eh_resume">llvm.eh.resume</a>
+</h4>
+
+<div>
+
+<pre>
+  void %<a href="#llvm_eh_resume">llvm.eh.resume</a>(i8*, i32) noreturn
+</pre>
+
+<p>This intrinsic is used to resume propagation of an exception after
+   landing at a landing pad.  The first argument should be the result
+   of <a href="#llvm_eh_exception">llvm.eh.exception</a> for that
+   landing pad, and the second argument should be the result of
+   <a href="#llvm_eh_selector">llvm.eh.selector</a>.  When a call to
+   this intrinsic is inlined into an invoke, the call is transformed
+   into a branch to the invoke's unwind destination, using its
+   arguments in place of the calls
+   to <a href="#llvm_eh_exception">llvm.eh.exception</a> and
+   <a href="#llvm_eh_selector">llvm.eh.selector</a> there.</p>
+
+<p>This intrinsic is not implicitly <tt>nounwind</tt>; calls to it
+   will always throw.  It may not be invoked.</p>
+
+</div>
+
+<!-- ======================================================================= -->
+<h4>
   <a name="llvm_eh_sjlj_setjmp">llvm.eh.sjlj.setjmp</a>
 </h4>
 
@@ -637,7 +695,7 @@
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
   <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
-  Last modified: $Date: 2011-04-23 02:30:22 +0200 (Sat, 23 Apr 2011) $
+  Last modified: $Date: 2011-05-28 09:45:59 +0200 (Sat, 28 May 2011) $
 </address>
 
 </body>
diff --git a/docs/HowToSubmitABug.html b/docs/HowToSubmitABug.html
index 54f548c..81d0f99 100644
--- a/docs/HowToSubmitABug.html
+++ b/docs/HowToSubmitABug.html
@@ -151,7 +151,7 @@ Then run:</p>
 </div>
 
 <p>This command should do two things: it should print out a list of passes, and
-then it should crash in the same was as llvm-gcc.  If it doesn't crash, please
+then it should crash in the same way as llvm-gcc.  If it doesn't crash, please
 follow the instructions for a <a href="#front-end">front-end bug</a>.</p>
 
 <p>If this does crash, then you should be able to debug this with the following
@@ -340,7 +340,7 @@ the following:</p>
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
   <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a>
   <br>
-  Last modified: $Date: 2011-04-23 02:30:22 +0200 (Sat, 23 Apr 2011) $
+  Last modified: $Date: 2011-06-07 22:03:13 +0200 (Tue, 07 Jun 2011) $
 </address>
 
 </body>
diff --git a/docs/LangRef.html b/docs/LangRef.html
index 0e37e82..48bddfd 100644
--- a/docs/LangRef.html
+++ b/docs/LangRef.html
@@ -239,6 +239,8 @@
           <li><a href="#int_sin">'<tt>llvm.sin.*</tt>' Intrinsic</a></li>
           <li><a href="#int_cos">'<tt>llvm.cos.*</tt>' Intrinsic</a></li>
           <li><a href="#int_pow">'<tt>llvm.pow.*</tt>' Intrinsic</a></li>
+          <li><a href="#int_exp">'<tt>llvm.exp.*</tt>' Intrinsic</a></li>
+          <li><a href="#int_log">'<tt>llvm.log.*</tt>' Intrinsic</a></li>
         </ol>
       </li>
       <li><a href="#int_manip">Bit Manipulation Intrinsics</a>
@@ -2446,11 +2448,11 @@ entry:
 
   %narrowaddr = bitcast i32* @g to i16*
   %wideaddr = bitcast i32* @g to i64*
-  %trap3 = load 16* %narrowaddr      ; Returns a trap value.
-  %trap4 = load i64* %widaddr        ; Returns a trap value.
+  %trap3 = load i16* %narrowaddr     ; Returns a trap value.
+  %trap4 = load i64* %wideaddr       ; Returns a trap value.
 
-  %cmp = icmp i32 slt %trap, 0       ; Returns a trap value.
-  %br i1 %cmp, %true, %end           ; Branch to either destination.
+  %cmp = icmp slt i32 %trap, 0       ; Returns a trap value.
+  br i1 %cmp, label %true, label %end ; Branch to either destination.
 
 true:
   volatile store i32 0, i32* @g      ; This is control-dependent on %cmp, so
@@ -2467,19 +2469,19 @@ end:
                                      ; if %cmp is true, or the store in %entry
                                      ; otherwise, so this is undefined behavior.
 
-  %br i1 %cmp, %second_true, %second_end
+  br i1 %cmp, label %second_true, label %second_end
                                      ; The same branch again, but this time the
                                      ; true block doesn't have side effects.
 
 second_true:
   ; No side effects!
-  br label %end
+  ret void
 
 second_end:
   volatile store i32 0, i32* @g      ; This time, the instruction always depends
                                      ; on the store in %end. Also, it is
                                      ; control-equivalent to %end, so this is
-                                     ; well- defined (again, ignoring earlier
+                                     ; well-defined (again, ignoring earlier
                                      ; undefined behavior in this example).
 </pre>
 
@@ -4302,7 +4304,7 @@ that the invoke/unwind semantics are likely to change in future versions.</p>
 
 <h5>Syntax:</h5>
 <pre>
-  &lt;result&gt; = insertvalue &lt;aggregate type&gt; &lt;val&gt;, &lt;ty&gt; &lt;elt&gt;, &lt;idx&gt;    <i>; yields &lt;aggregate type&gt;</i>
+  &lt;result&gt; = insertvalue &lt;aggregate type&gt; &lt;val&gt;, &lt;ty&gt; &lt;elt&gt;, &lt;idx&gt;{, <idx>}*    <i>; yields &lt;aggregate type&gt;</i>
 </pre>
 
 <h5>Overview:</h5>
@@ -4326,8 +4328,9 @@ that the invoke/unwind semantics are likely to change in future versions.</p>
 
 <h5>Example:</h5>
 <pre>
-  %agg1 = insertvalue {i32, float} undef, i32 1, 0         <i>; yields {i32 1, float undef}</i>
-  %agg2 = insertvalue {i32, float} %agg1, float %val, 1    <i>; yields {i32 1, float %val}</i>
+  %agg1 = insertvalue {i32, float} undef, i32 1, 0              <i>; yields {i32 1, float undef}</i>
+  %agg2 = insertvalue {i32, float} %agg1, float %val, 1         <i>; yields {i32 1, float %val}</i>
+  %agg3 = insertvalue {i32, {float}} %agg1, float %val, 1, 0    <i>; yields {i32 1, float %val}</i>
 </pre>
 
 </div>
@@ -6496,6 +6499,72 @@ LLVM</a>.</p>
 
 </div>
 
+<!-- _______________________________________________________________________ -->
+<h4>
+  <a name="int_exp">'<tt>llvm.exp.*</tt>' Intrinsic</a>
+</h4>
+
+<div>
+
+<h5>Syntax:</h5>
+<p>This is an overloaded intrinsic. You can use <tt>llvm.exp</tt> on any
+   floating point or vector of floating point type. Not all targets support all
+   types however.</p>
+
+<pre>
+  declare float     @llvm.exp.f32(float  %Val)
+  declare double    @llvm.exp.f64(double %Val)
+  declare x86_fp80  @llvm.exp.f80(x86_fp80  %Val)
+  declare fp128     @llvm.exp.f128(fp128 %Val)
+  declare ppc_fp128 @llvm.exp.ppcf128(ppc_fp128  %Val)
+</pre>
+
+<h5>Overview:</h5>
+<p>The '<tt>llvm.exp.*</tt>' intrinsics perform the exp function.</p>
+
+<h5>Arguments:</h5>
+<p>The argument and return value are floating point numbers of the same
+   type.</p>
+
+<h5>Semantics:</h5>
+<p>This function returns the same values as the libm <tt>exp</tt> functions
+   would, and handles error conditions in the same way.</p>
+
+</div>
+
+<!-- _______________________________________________________________________ -->
+<h4>
+  <a name="int_log">'<tt>llvm.log.*</tt>' Intrinsic</a>
+</h4>
+
+<div>
+
+<h5>Syntax:</h5>
+<p>This is an overloaded intrinsic. You can use <tt>llvm.log</tt> on any
+   floating point or vector of floating point type. Not all targets support all
+   types however.</p>
+
+<pre>
+  declare float     @llvm.log.f32(float  %Val)
+  declare double    @llvm.log.f64(double %Val)
+  declare x86_fp80  @llvm.log.f80(x86_fp80  %Val)
+  declare fp128     @llvm.log.f128(fp128 %Val)
+  declare ppc_fp128 @llvm.log.ppcf128(ppc_fp128  %Val)
+</pre>
+
+<h5>Overview:</h5>
+<p>The '<tt>llvm.log.*</tt>' intrinsics perform the log function.</p>
+
+<h5>Arguments:</h5>
+<p>The argument and return value are floating point numbers of the same
+   type.</p>
+
+<h5>Semantics:</h5>
+<p>This function returns the same values as the libm <tt>log</tt> functions
+   would, and handles error conditions in the same way.</p>
+
+</div>
+
 <!-- ======================================================================= -->
 <h3>
   <a name="int_manip">Bit Manipulation Intrinsics</a>
@@ -7948,7 +8017,7 @@ LLVM</a>.</p>
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
   <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
-  Last modified: $Date: 2011-04-23 02:30:22 +0200 (Sat, 23 Apr 2011) $
+  Last modified: $Date: 2011-05-27 02:36:31 +0200 (Fri, 27 May 2011) $
 </address>
 
 </body>
diff --git a/docs/Projects.html b/docs/Projects.html
index 3c6d4ad..068acde 100644
--- a/docs/Projects.html
+++ b/docs/Projects.html
@@ -49,7 +49,7 @@ these facilities, a Makefile from a project must do the following things:</p>
     <li><tt>PROJ_SRC_ROOT</tt> - The root of the project's source tree.</li>
     <li><tt>PROJ_OBJ_ROOT</tt> - The root of the project's object tree.</li>
     <li><tt>PROJ_INSTALL_ROOT</tt> - The root installation directory.</li>
-    <li><tt>LEVEL</tt> - The relative path from the current directory to the 
+    <li><tt>LEVEL</tt> - The relative path from the current directory to the
     project's root ($PROJ_OBJ_ROOT).</li>
   </ul></li>
   <li>Include <tt>Makefile.config</tt> from <tt>$(LLVM_OBJ_ROOT)</tt>.</li>
@@ -59,9 +59,9 @@ these facilities, a Makefile from a project must do the following things:</p>
 <p>There are two ways that you can set all of these variables:</p>
 <ol>
   <li>You can write your own Makefiles which hard-code these values.</li>
-  <li>You can use the pre-made LLVM sample project. This sample project 
-  includes Makefiles, a configure script that can be used to configure the 
-  location of LLVM, and the ability to support multiple object directories 
+  <li>You can use the pre-made LLVM sample project. This sample project
+  includes Makefiles, a configure script that can be used to configure the
+  location of LLVM, and the ability to support multiple object directories
   from a single source directory.</li>
 </ol>
 
@@ -88,9 +88,9 @@ choosing.  You can place it anywhere you like.  Rename the directory to match
 the name of your project.</li>
 
 <li>
-If you downloaded LLVM using Subversion, remove all the directories named .svn 
-(and all the files therein) from your project's new source tree.  This will 
-keep Subversion from thinking that your project is inside 
+If you downloaded LLVM using Subversion, remove all the directories named .svn
+(and all the files therein) from your project's new source tree.  This will
+keep Subversion from thinking that your project is inside
 <tt>llvm/trunk/projects/sample</tt>.</li>
 
 <li>Add your source code and Makefiles to your source tree.</li>
@@ -139,7 +139,7 @@ can find LLVM:
 </ol>
 
 <p>That's it!  Now all you have to do is type <tt>gmake</tt> (or <tt>make</tt>
-if your on a GNU/Linux system) in the root of your object directory, and your 
+if your on a GNU/Linux system) in the root of your object directory, and your
 project should build.</p>
 
 </div>
@@ -209,7 +209,7 @@ directories:</p>
     test procedure uses RUN lines in the actual test case to determine
     how to run the test.  See the <a
     href="TestingGuide.html">TestingGuide</a> for more details. You
-    can easily write Makefile support similar to the Makefiles in 
+    can easily write Makefile support similar to the Makefiles in
     <tt>llvm/test</tt> to use Dejagnu to run your project's tests.<br></li>
     <li>
     LLVM contains an optional package called <tt>llvm-test</tt>
@@ -339,16 +339,41 @@ do:</p>
 
   <dt>USEDLIBS
   <dd>
-  This variable holds a space separated list of libraries that
-  should be linked into the program.  These libraries must either
-  be LLVM libraries or libraries that come from your <b>lib</b>
-  directory.  The libraries must be specified by their base name.
-  For example, to link libsample.a, you would set USEDLIBS to
-  <tt>sample</tt>.
+  This variable holds a space separated list of libraries that should
+  be linked into the program.  These libraries must be libraries that
+  come from your <b>lib</b> directory.  The libraries must be
+  specified without their "lib" prefix.  For example, to link
+  libsample.a, you would set USEDLIBS to
+  <tt>sample.a</tt>.
   <p>
   Note that this works only for statically linked libraries.
   <p>
 
+  <dt>LLVMLIBS
+  <dd>
+  This variable holds a space separated list of libraries that should
+  be linked into the program.  These libraries must be LLVM libraries.
+  The libraries must be specified without their "lib" prefix.  For
+  example, to link with a driver that performs an IR transformation
+  you might set LLVMLIBS to this minimal set of libraries
+  <tt>LLVMSupport.a LLVMCore.a LLVMBitReader.a LLVMAsmParser.a LLVMAnalysis.a LLVMTransformUtils.a LLVMScalarOpts.a LLVMTarget.a</tt>.
+  <p>
+  Note that this works only for statically linked libraries. LLVM is
+  split into a large number of static libraries, and the list of libraries you
+  require may be much longer than the list above. To see a full list
+  of libraries use:
+  <tt>llvm-config --libs all</tt>.
+  Using LINK_COMPONENTS as described below, obviates the need to set LLVMLIBS.
+  <p>
+
+  <dt>LINK_COMPONENTS
+  <dd>This variable holds a space separated list of components that
+  the LLVM Makefiles pass to the <tt>llvm-config</tt> tool to generate
+  a link line for the program. For example, to link with all LLVM
+  libraries use
+  <tt>LINK_COMPONENTS = all</tt>.
+  <p>
+
   <dt>LIBS
   <dd>
   To link dynamic libraries, add <tt>-l&lt;library base name&gt;</tt> to
@@ -361,6 +386,9 @@ do:</p>
   <tt>
   LIBS += -lsample
   </tt>
+  <p>
+  Note that LIBS must occur in the Makefile after the inclusion of Makefile.common.
+  <p>
 </dl>
 
 </div>
@@ -441,7 +469,7 @@ href="http://mail.cs.uiuc.edu/mailman/listinfo/llvmdev">LLVM Developers
 Mailing List</a>.</p>
 
 </div>
-  
+
 <!-- *********************************************************************** -->
 <hr>
 <address>
@@ -453,7 +481,7 @@ Mailing List</a>.</p>
   <a href="mailto:criswell@uiuc.edu">John Criswell</a><br>
   <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a>
   <br>
-  Last modified: $Date: 2011-04-23 02:30:22 +0200 (Sat, 23 Apr 2011) $
+  Last modified: $Date: 2011-06-03 04:20:48 +0200 (Fri, 03 Jun 2011) $
 </address>
 
 </body>
diff --git a/docs/ReleaseNotes.html b/docs/ReleaseNotes.html
index 71bf16e..12546c8 100644
--- a/docs/ReleaseNotes.html
+++ b/docs/ReleaseNotes.html
@@ -4,11 +4,11 @@
 <head>
   <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
   <link rel="stylesheet" href="llvm.css" type="text/css">
-  <title>LLVM 2.9 Release Notes</title>
+  <title>LLVM 3.0 Release Notes</title>
 </head>
 <body>
 
-<h1>LLVM 2.9 Release Notes</h1>
+<h1>LLVM 3.0 Release Notes</h1>
 
 <img align=right src="http://llvm.org/img/DragonSmall.png"
     width="136" height="136" alt="LLVM Dragon Logo">
@@ -16,8 +16,8 @@
 <ol>
   <li><a href="#intro">Introduction</a></li>
   <li><a href="#subproj">Sub-project Status Update</a></li>
-  <li><a href="#externalproj">External Projects Using LLVM 2.9</a></li>
-  <li><a href="#whatsnew">What's New in LLVM 2.9?</a></li>
+  <li><a href="#externalproj">External Projects Using LLVM 3.0</a></li>
+  <li><a href="#whatsnew">What's New in LLVM 3.0?</a></li>
   <li><a href="GettingStarted.html">Installation Instructions</a></li>
   <li><a href="#knownproblems">Known Problems</a></li>
   <li><a href="#additionalinfo">Additional Information</a></li>
@@ -28,10 +28,10 @@
 </div>
 
 <!--
-<h1 style="color:red">These are in-progress notes for the upcoming LLVM 2.9
+<h1 style="color:red">These are in-progress notes for the upcoming LLVM 3.0
 release.<br>
 You may prefer the
-<a href="http://llvm.org/releases/2.8/docs/ReleaseNotes.html">LLVM 2.8
+<a href="http://llvm.org/releases/2.9/docs/ReleaseNotes.html">LLVM 2.9
 Release Notes</a>.</h1>
  -->
 
@@ -44,7 +44,7 @@ Release Notes</a>.</h1>
 <div>
 
 <p>This document contains the release notes for the LLVM Compiler
-Infrastructure, release 2.9.  Here we describe the status of LLVM, including
+Infrastructure, release 3.0.  Here we describe the status of LLVM, including
 major improvements from the previous release and significant known problems.
 All LLVM releases may be downloaded from the <a
 href="http://llvm.org/releases/">LLVM releases web site</a>.</p>
@@ -79,7 +79,7 @@ current one.  To see the release notes for a specific release, please see the
 
 <div>
 <p>
-The LLVM 2.9 distribution currently consists of code from the core LLVM
+The LLVM 3.0 distribution currently consists of code from the core LLVM
 repository (which roughly includes the LLVM optimizers, code generators
 and supporting tools), the Clang repository and the llvm-gcc repository.  In
 addition to this code, the LLVM Project includes other sub-projects that are in
@@ -102,13 +102,7 @@ integrating with other development tools. Clang is considered a
 production-quality compiler for C, Objective-C, C++ and Objective-C++ on x86
 (32- and 64-bit), and for darwin/arm targets.</p>
 
-<p>In the LLVM 2.9 time-frame, the Clang team has made many improvements in C,
-C++ and Objective-C support.  C++ support is now generally rock solid, has
-been exercised on a broad variety of code, and has several new <a 
-href="http://clang.llvm.org/cxx_status.html#cxx0x">C++'0x features</a>
-implemented (such as rvalue references and variadic templates).  LLVM 2.9 has
-also brought in a large range of bug fixes and minor features (e.g. __label__
-support), and is much more compatible with the Linux Kernel.</p>  
+<p>In the LLVM 3.0 time-frame, the Clang team has made many improvements:</p>
   
 <p>If Clang rejects your code but another compiler accepts it, please take a
 look at the <a href="http://clang.llvm.org/compatibility.html">language
@@ -136,15 +130,11 @@ not known whether the compiled code actually works or not!
 </p>
 
 <p>
-The 2.9 release has the following notable changes:
+The 3.0 release has the following notable changes:
 <ul>
-<li>The plugin is much more stable when compiling Fortran.</li>
-<li>Inline assembly where an asm output is tied to an input of a different size
-is now supported in many more cases.</li>
-<li>Basic support for the __float128 type was added.  It is now possible to
-generate LLVM IR from programs using __float128 but code generation does not
-work yet.</li>
-<li>Compiling Java programs no longer systematically crashes the plugin.</li>
+<!--
+<li></li>
+-->
 </ul>
 
 </div>
@@ -165,13 +155,7 @@ function. The compiler-rt library provides highly optimized implementations of
 this and other low-level routines (some are 3x faster than the equivalent
 libgcc routines).</p>
 
-<p>In the LLVM 2.9 timeframe, compiler_rt has had several minor changes for
-  better ARM support, and a fairly major license change.  All of the code in the
-  compiler-rt project is now <a href="DeveloperPolicy.html#license">dual
-  licensed</a> under MIT and UIUC license, which allows you to use compiler-rt
-  in applications without the binary copyright reproduction clause.  If you
-  prefer the LLVM/UIUC license, you are free to continue using it under that
-  license as well.</p>
+<p>In the LLVM 3.0 timeframe,</p>
 
 </div>
 
@@ -189,7 +173,7 @@ libraries in the larger LLVM Project, such as the Clang expression parser, the
 LLVM disassembler and the LLVM JIT.</p>
 
 <p>
-LLDB is has advanced by leaps and bounds in the 2.9 timeframe.  It is
+LLDB is has advanced by leaps and bounds in the 3.0 timeframe.  It is
 dramatically more stable and useful, and includes both a new <a 
 href="http://lldb.llvm.org/tutorial.html">tutorial</a> and a <a
 href="http://lldb.llvm.org/lldb-gdb.html">side-by-side comparison with 
@@ -210,8 +194,7 @@ ground up to specifically target the forthcoming C++'0X standard and focus on
 delivering great performance.</p>
 
 <p>
-In the LLVM 2.9 timeframe, libc++ has had numerous bugs fixed, and is now being
-co-developed with Clang's C++'0x mode.</p>
+In the LLVM 3.0 timeframe,</p>
   
 <p>
 Like compiler_rt, libc++ is now <a href="DeveloperPolicy.html#license">dual
@@ -245,7 +228,7 @@ Like compiler_rt, libc++ is now <a href="DeveloperPolicy.html#license">dual
 <div>
 <p>The <a href="http://vmkit.llvm.org/">VMKit project</a> is an implementation
   of a Java Virtual Machine (Java VM or JVM) that uses LLVM for static and
-  just-in-time compilation. As of LLVM 2.9, VMKit now supports generational
+  just-in-time compilation. As of LLVM 3.0, VMKit now supports generational
   garbage collectors. The garbage collectors are provided by the MMTk framework,
   and VMKit can be configured to use one of the numerous implemented collectors
   of MMTk.
@@ -275,7 +258,7 @@ be used to verify some algorithms.
 
 <!-- *********************************************************************** -->
 <h2>
-  <a name="externalproj">External Open Source Projects Using LLVM 2.9</a>
+  <a name="externalproj">External Open Source Projects Using LLVM 3.0</a>
 </h2>
 <!-- *********************************************************************** -->
 
@@ -283,7 +266,7 @@ be used to verify some algorithms.
 
 <p>An exciting aspect of LLVM is that it is used as an enabling technology for
    a lot of other language and tools projects.  This section lists some of the
-   projects that have already been updated to work with LLVM 2.9.</p>
+   projects that have already been updated to work with LLVM 3.0.</p>
 
 <!--=========================================================================-->
 <h3>Crack Programming Language</h3>
@@ -344,7 +327,7 @@ bitcode with SystemC-specific information.</p>
   modules, and inline C, C++, Fortran and Faust code in Pure programs if
   the corresponding LLVM-enabled compilers are installed).</p>
   
-<p>Pure version 0.47 has been tested and is known to work with LLVM 2.9
+<p>Pure version 0.47 has been tested and is known to work with LLVM 3.0
   (and continues to work with older LLVM releases &gt;= 2.5).</p>
 </div>
 
@@ -363,7 +346,7 @@ code.
 </p>
 
 <p> OpenJDK 7 b112, IcedTea6 1.9 and IcedTea7 1.13 and later have been tested
-and are known to work with LLVM 2.9 (and continue to work with older LLVM
+and are known to work with LLVM 3.0 (and continue to work with older LLVM
 releases &gt;= 2.6 as well).</p>
 </div>
 
@@ -420,7 +403,7 @@ and parallelism.</p>
 audio signal processing. The name FAUST stands for Functional AUdio STream. Its
 programming model combines two approaches: functional programming and block
 diagram composition. In addition with the C, C++, JAVA output formats, the
-Faust compiler can now generate LLVM bitcode, and works with LLVM 2.7-2.9.</p>
+Faust compiler can now generate LLVM bitcode, and works with LLVM 2.7-3.0.</p>
 
 </div>
   
@@ -428,7 +411,7 @@ Faust compiler can now generate LLVM bitcode, and works with LLVM 2.7-2.9.</p>
 
 <!-- *********************************************************************** -->
 <h2>
-  <a name="whatsnew">What's New in LLVM 2.9?</a>
+  <a name="whatsnew">What's New in LLVM 3.0?</a>
 </h2>
 <!-- *********************************************************************** -->
 
@@ -446,24 +429,13 @@ in this section.
 
 <div>
 
-<p>LLVM 2.9 includes several major new capabilities:</p>
+<p>LLVM 3.0 includes several major new capabilities:</p>
 
 <ul>
-  
-<li>Type Based Alias Analysis (TBAA) is now implemented and turned on by default
-  in Clang.  This allows substantially better load/store optimization in some
-  cases.  TBAA can be disabled by passing -fno-strict-aliasing.
-</li>
-
-<li>This release has seen a continued focus on quality of debug information. 
-  LLVM now generates much higher fidelity debug information, particularly when
-  debugging optimized code.</li>
 
-<li>Inline assembly now supports multiple alternative constraints.</li>  
-
-<li>A new backend for the NVIDIA PTX virtual ISA (used to target its GPUs) is
-  under rapid development.  It is not generally useful in 2.9, but is making
-  rapid progress.</li>
+<!--
+<li></li>
+-->
   
 </ul>
   
@@ -479,19 +451,9 @@ in this section.
 expose new optimization opportunities:</p>
 
 <ul>
-<li>The <a href="LangRef.html#bitwiseops">udiv, ashr, lshr, and shl</a>
-  instructions now have support exact and nuw/nsw bits to indicate that they
-  don't overflow or shift out bits.  This is useful for optimization of <a
-    href="http://llvm.org/PR8862">pointer differences</a> and other cases.</li>
-  
-<li>LLVM IR now supports the <a href="LangRef.html#globalvars">unnamed_addr</a>
-  attribute to indicate that constant global variables with identical
-  initializers can be merged.  This fixed <a href="http://llvm.org/PR8927">an
-  issue</a> where LLVM would incorrectly merge two globals which were supposed
-  to have distinct addresses.</li>
-  
-<li>The new <a href="LangRef.html#fnattrs">hotpatch attribute</a> has been added
-  to allow runtime patching of functions.</li> 
+<!--
+<li></li>
+-->
 </ul>
 
 </div>
@@ -507,57 +469,9 @@ expose new optimization opportunities:</p>
 release includes a few major enhancements and additions to the optimizers:</p>
 
 <ul>
-<li>Link Time Optimization (LTO) has been improved to use MC for parsing inline
-  assembly and now can build large programs like Firefox 4 on both Mac OS X and
-  Linux.</li>
-  
-<li>The new -loop-idiom pass recognizes memset/memcpy loops (and memset_pattern
-  on darwin), turning them into library calls, which are typically better
-  optimized than inline code.  If you are building a libc and notice that your
-  memcpy and memset functions are compiled into infinite recursion, please build
-  with -ffreestanding or -fno-builtin to disable this pass.</li>
-  
-<li>A new -early-cse pass does a fast pass over functions to fold constants,
-  simplify expressions, perform simple dead store elimination, and perform
-  common subexpression elimination.  It does a good job at catching some of the
-  trivial redundancies that exist in unoptimized code, making later passes more
-  effective.</li>
-
-<li>A new -loop-instsimplify pass is used to clean up loop bodies in the loop
-  optimizer.</li>
-  
-<li>The new TargetLibraryInfo interface allows mid-level optimizations to know
-  whether the current target's runtime library has certain functions.  For
-  example, the optimizer can now transform integer-only printf calls to call
-  iprintf, allowing reduced code size for embedded C libraries (e.g. newlib).
-</li>
-    
-<li>LLVM has a new <a href="WritingAnLLVMPass.html#RegionPass">RegionPass</a>
-  infrastructure for region-based optimizations.</li>
-
-<li>Several optimizer passes have been substantially sped up:
-  GVN is much faster on functions with deep dominator trees and lots of basic
-  blocks.  The dominator tree and dominance frontier passes are much faster to
-  compute, and preserved by more passes (so they are computed less often).  The
-  -scalar-repl pass is also much faster and doesn't use DominanceFrontier.
-</li>
-
-<li>The Dead Store Elimination pass is more aggressive optimizing stores of
-  different types: e.g. a large store following a small one to the same address.
-  The MemCpyOptimizer pass handles several new forms of memcpy elimination.</li>
-  
-<li>LLVM now optimizes various idioms for overflow detection into check of the
-  flag register on various CPUs.  For example, we now compile:
-  
-  <pre>
-   unsigned long t = a+b;
-   if (t &lt; a) ...
-  </pre>
-  into:
-  <pre>
-   addq %rdi, %rbx
-   jno  LBB0_2
-  </pre>
+<!--
+<li></li>
+-->
 </li>
   
 </ul>
@@ -577,38 +491,9 @@ and a number of other related areas that CPU instruction-set level tools work
 in.</p>
 
 <ul>
-<li>ELF MC support has matured enough for the integrated assembler to be turned
-  on by default in Clang on X86-32 and X86-64 ELF systems.</li>
-  
-<li>MC supports and CodeGen uses the <tt>.file</tt> and <tt>.loc</tt> directives
-  for producing line number debug info. This produces more compact line
-  tables and easier to read .s files.</li>
-  
-<li>MC supports the <tt>.cfi_*</tt> directives for producing DWARF
-  frame information, but it is still not used by CodeGen by default.</li>
-
-  
-<li>The MC assembler now generates much better diagnostics for common errors,
-  is much faster at matching instructions, is much more bug-compatible with
-  the GAS assembler, and is now generally useful for a broad range of X86
-  assembly.</li>
-  
-<li>We now have some basic <a href="CodeGenerator.html#mc">internals
-  documentation</a> for MC.</li>
-  
-<li>.td files can now specify assembler aliases directly with the <a 
-   href="CodeGenerator.html#na_instparsing">MnemonicAlias and InstAlias</a>
-   tblgen classes.</li>
-  
-<li>LLVM now has an experimental format-independent object file manipulation
-  library (lib/Object).  It supports both PE/COFF and ELF.  The llvm-nm tool has
-  been extended to work with native object files, and the new llvm-objdump tool
-  supports disassembly of object files (but no relocations are displayed yet).
-</li>
-  
-<li>Win32 PE-COFF support in the MC assembler has made a lot of progress in the
-  2.9 timeframe, but is still not generally useful.</li>
-
+<!--
+<li></li>
+-->
 </ul>
 
 <p>For more information, please see the <a
@@ -630,34 +515,9 @@ infrastructure, which allows us to implement more aggressive algorithms and make
 it run faster:</p>
 
 <ul>
-<li>The pre-register-allocation (preRA) instruction scheduler models register
-  pressure much more accurately in some cases. This allows the adoption of more
-  aggressive scheduling heuristics without causing spills to be generated.
-</li>
-  
-<li>LiveDebugVariables is a new pass that keeps track of debugging information
-  for user variables that are promoted to registers in optimized builds.</li>  
-
-<li>The scheduler now models operand latency and pipeline forwarding.</li>
-
-<li>A major register allocator infrastructure rewrite is underway.  It is not on
-    by default for 2.9 and you are not advised to use it, but it has made
-    substantial progress in the 2.9 timeframe:
-  <ul>
-  <li>A new -regalloc=basic "basic" register allocator can be used as a simple
-      fallback when debugging.  It uses the new infrastructure.</li>
-  <li>New infrastructure is in place for live range splitting.  "SplitKit" can
-      break a live interval into smaller pieces while preserving SSA form, and
-      SpillPlacement can help find the best split points. This is a work in
-      progress so the API is changing quickly.</li>
-   <li>The inline spiller has learned to clean up after live range splitting. It
-      can hoist spills out of loops, and it can eliminate redundant spills.</li>
-   <li>Rematerialization works with live range splitting.</li>
-   <li>The new "greedy" register allocator using live range splitting. This will
-     be the default register allocator in the next LLVM release, but it is not
-     turned on by default in 2.9.</li>
-   </ul>
-</li>
+<!--
+<li></li>
+-->
 </ul>
 </div>
 
@@ -671,31 +531,11 @@ it run faster:</p>
 </p>
 
 <ul>
-<li>LLVM 2.9 includes a complete reimplementation of the MMX instruction set.
-  The reimplementation uses a new LLVM IR <a 
-  href="LangRef.html#t_x86mmx">x86_mmx</a> type to ensure that MMX operations
-  are <em>only</em> generated from source that uses MMX builtin operations. With
-  this, random types like &lt;2 x i32&gt; are not turned into MMX operations
-  (which can be catastrophic without proper "emms" insertion).  Because the X86
-  code generator always generates reliable code, the -disable-mmx flag is now
-  removed.
-</li>
-  
-<li>X86 support for FS/GS relative loads and stores using <a 
-    href="CodeGenerator.html#x86_memory">address space 256/257</a> works reliably
-    now.</li>
-  
-<li>LLVM 2.9 generates much better code in several cases by using adc/sbb to
-   avoid generation of conditional move instructions for conditional increment
-   and other idioms.</li>
+<li>The CRC32 intrinsics have been renamed.  The intrinsics were previously
+  @llvm.x86.sse42.crc32.[8|16|32] and @llvm.x86.sse42.crc64.[8|64].  They have
+  been renamed to @llvm.x86.sse42.crc32.32.[8|16|32] and 
+  @llvm.x86.sse42.crc32.64.[8|64].</li>
 
-<li>The X86 backend has adopted a new preRA scheduling mode, "list-ilp", to
-  shorten the height of instruction schedules without inducing register spills.
-</li>
-
-<li>The MC assembler supports 3dNow! and 3DNowA instructions.</li>
-  
-<li>Several bugs have been fixed for Windows x64 code generator.</li>
 </ul>
 
 </div>
@@ -710,17 +550,9 @@ it run faster:</p>
 </p>
 
 <ul>
-<li>The ARM backend now has a fast instruction selector, which dramatically
-     improves -O0 compile times.</li>
-<li>The ARM backend has new tuning for Cortex-A8 and Cortex-A9 CPUs.</li>
-<li>The __builtin_prefetch builtin (and llvm.prefetch intrinsic) is compiled
-    into prefetch instructions instead of being discarded.</li>
-
-<li>  The ARM backend preRA scheduler now models machine resources at cycle
-  granularity. This allows the scheduler to both accurately model
-  instruction latency and avoid overcommitting functional units.</li>
-
-<li>Countless ARM microoptimizations have landed in LLVM 2.9.</li>
+<!--
+<li></li>
+-->
 </ul>
 </div>
   
@@ -731,21 +563,9 @@ it run faster:</p>
 
 <div>
 <ul>
-<li>MicroBlaze: major updates for aggressive delay slot filler, MC-based
-  assembly printing, assembly instruction parsing, ELF .o file emission, and MC
-  instruction disassembler have landed.</li>
-
-<li>SPARC: Many improvements, including using the Y registers for
-  multiplications and addition of a simple delay slot filler.</li>
-
-<li>PowerPC: The backend has been largely MC'ized and is ready to support
-  directly writing out mach-o object files.  No one seems interested in finishing
-  this final step though.</li>
-
-<li>Mips: Improved o32 ABI support, including better varags handling.
-More instructions supported in codegen: madd, msub, rotr, rotrv and clo.
-It also now supports lowering block addresses.</li>
-
+<!--
+<li></li>
+-->
 </ul>
 </div>
 
@@ -757,30 +577,13 @@ It also now supports lowering block addresses.</li>
 <div>
 
 <p>If you're already an LLVM user or developer with out-of-tree changes based
-on LLVM 2.8, this section lists some "gotchas" that you may run into upgrading
+on LLVM 2.9, this section lists some "gotchas" that you may run into upgrading
 from the previous release.</p>
 
 <ul>
-<li><b>This is the last release to support the llvm-gcc frontend.</b></li>
-
-<li>LLVM has a new <a href="CodingStandards.html#ll_naming">naming
-  convention standard</a>, though the codebase hasn't fully adopted it yet.</li>
-  
-<li>The new DIBuilder class provides a simpler interface for front ends to
-    encode debug info in LLVM IR, and has replaced DIFactory.</li>
-
-<li>LLVM IR and other tools always work on normalized target triples (which have
-  been run through <tt>Triple::normalize</tt>).</li>
-
-<li>The target triple x86_64--mingw64 is obsoleted. Use x86_64--mingw32 
-  instead.</li>
-
-<li>The PointerTracking pass has been removed from mainline, and moved to The
-  ClamAV project (its only client).</li>
-    
-<li>The LoopIndexSplit, LiveValues, SimplifyHalfPowrLibCalls, GEPSplitter, and
-  PartialSpecialization passes were removed.  They were unmaintained,
-  buggy, or deemed to be a bad idea.</li>
+<!--
+<li></li>
+-->
 </ul>
 
 </div>
@@ -796,18 +599,9 @@ from the previous release.</p>
   LLVM API changes are:</p>
 
 <ul>
-<li>include/llvm/System merged into include/llvm/Support.</li>
-<li>The <a href="http://llvm.org/PR5207">llvm::APInt API</a> was significantly
-  cleaned up.</li>
-
-<li>In the code generator, MVT::Flag was renamed to MVT::Glue to more accurately
-  describe its behavior.</li>
-
-<li>The system_error header from C++0x was added, and is now pervasively used to
-  capture and handle i/o and other errors in LLVM.</li>
-  
-<li>The old sys::Path API has been deprecated in favor of the new PathV2 API,
-    which is more efficient and flexible.</li>
+<!--
+<li></ld>
+-->
 </ul>
 </div>
 
@@ -986,7 +780,7 @@ Depending on it for anything serious is not advised.</p>
 
 <div>
 
-<p><b>LLVM 2.9 will be the last release of llvm-gcc.</b></p>
+<p><b>LLVM 3.0 will be the last release of llvm-gcc.</b></p>
 
 <p>llvm-gcc is generally very stable for the C family of languages.  The only
    major language feature of GCC not supported by llvm-gcc is the
@@ -1041,7 +835,7 @@ lists</a>.</p>
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
 
   <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
-  Last modified: $Date: 2011-04-21 03:52:00 +0200 (Thu, 21 Apr 2011) $
+  Last modified: $Date: 2011-05-28 00:50:46 +0200 (Sat, 28 May 2011) $
 </address>
 
 </body>
diff --git a/docs/SourceLevelDebugging.html b/docs/SourceLevelDebugging.html
index 4cfb52e..bab42a8 100644
--- a/docs/SourceLevelDebugging.html
+++ b/docs/SourceLevelDebugging.html
@@ -174,22 +174,15 @@ height="369">
       as setting program variables, or calling functions that have been
       deleted.</li>
 
-  <li>LLVM optimizations gracefully interact with debugging information.  If
-      they are not aware of debug information, they are automatically disabled
-      as necessary in the cases that would invalidate the debug info.  This
-      retains the LLVM features, making it easy to write new
-      transformations.</li>
-
   <li>As desired, LLVM optimizations can be upgraded to be aware of the LLVM
       debugging information, allowing them to update the debugging information
       as they perform aggressive optimizations.  This means that, with effort,
       the LLVM optimizers could optimize debug code just as well as non-debug
       code.</li>
 
-  <li>LLVM debug information does not prevent many important optimizations from
+  <li>LLVM debug information does not prevent optimizations from
       happening (for example inlining, basic block reordering/merging/cleanup,
-      tail duplication, etc), further reducing the amount of the compiler that
-      eventually is "aware" of debugging information.</li>
+      tail duplication, etc).</li>
 
   <li>LLVM debug information is automatically optimized along with the rest of
       the program, using existing facilities.  For example, duplicate
@@ -342,7 +335,9 @@ height="369">
    that produced it.</p>
 
 <p>Compile unit descriptors provide the root context for objects declared in a
-   specific compilation unit. File descriptors are defined using this context.</p>
+   specific compilation unit. File descriptors are defined using this context.
+   These descriptors are collected by a named metadata 
+   <tt>!llvm.dbg.cu</tt>.
 
 </div>
 
@@ -1799,7 +1794,7 @@ enum Trees {
 
   <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
   <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
-  Last modified: $Date: 2011-04-23 02:30:22 +0200 (Sat, 23 Apr 2011) $
+  Last modified: $Date: 2011-05-31 20:06:14 +0200 (Tue, 31 May 2011) $
 </address>
 
 </body>
diff --git a/docs/TestingGuide.html b/docs/TestingGuide.html
index 4fc4c70..5a28c44 100644
--- a/docs/TestingGuide.html
+++ b/docs/TestingGuide.html
@@ -365,8 +365,8 @@ clang/test directory. </p>
   obtained by using Tcl's glob command.  Any directory that contains only
   directories does not need the <tt>dg.exp</tt> file.</p>
 
-  <p>The <tt>llvm-runtests</tt> function lookas at each file that is passed to
-  it and gathers any lines together that match "RUN:". This are the "RUN" lines
+  <p>The <tt>llvm-runtests</tt> function looks at each file that is passed to
+  it and gathers any lines together that match "RUN:". These are the "RUN" lines
   that specify how the test is to be run. So, each test script must contain
   RUN lines if it is to do anything. If there are no RUN lines, the
   <tt>llvm-runtests</tt> function will issue an error and the test will
@@ -1206,7 +1206,7 @@ example reports that can do fancy stuff.</p>
 
   John T. Criswell, Daniel Dunbar, Reid Spencer, and Tanya Lattner<br>
   <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
-  Last modified: $Date: 2011-04-23 02:30:22 +0200 (Sat, 23 Apr 2011) $
+  Last modified: $Date: 2011-05-18 20:07:16 +0200 (Wed, 18 May 2011) $
 </address>
 </body>
 </html>
diff --git a/docs/index.html b/docs/index.html
index a25148a..e53b4c3 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -43,9 +43,8 @@ your documentation.</p>
 <ul>
 <li><a href="LangRef.html">LLVM Language Reference Manual</a> - Defines the LLVM
 intermediate representation.</li>
-<li><a href="http://llvm.org/pubs/2008-10-04-ACAT-LLVM-Intro.html">Introduction to the LLVM Compiler </a> - Presentation describing LLVM.</li>
-<li><a href="http://llvm.org/pubs/2004-09-22-LCPCLLVMTutorial.html">The LLVM Compiler Framework and
-Infrastructure Tutorial</a> - Tutorial for writing passes, exploring the system.</li>
+<li><a href="http://llvm.org/pubs/2008-10-04-ACAT-LLVM-Intro.html">Introduction to the LLVM Compiler </a> - Presentation providing a users introduction to LLVM.</li>
+<li><a href="http://www.aosabook.org/en/llvm.html">Intro to LLVM</a> - book chapter providing a compiler hacker's introduction to LLVM.</li>
 <li><a href="http://llvm.org/pubs/2004-01-30-CGO-LLVM.html">LLVM: A Compilation Framework for
 Lifelong Program Analysis &amp; Transformation</a> - Design overview.</li>
 <li><a href="http://llvm.org/pubs/2002-12-LattnerMSThesis.html">LLVM: An Infrastructure for
@@ -285,7 +284,7 @@ times each day, making it a high volume list.</li>
   src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
 
   <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
-  Last modified: $Date: 2011-04-23 02:30:22 +0200 (Sat, 23 Apr 2011) $
+  Last modified: $Date: 2011-05-30 05:36:58 +0200 (Mon, 30 May 2011) $
 </address>
 </body></html>
 
diff --git a/examples/HowToUseJIT/HowToUseJIT.cpp b/examples/HowToUseJIT/HowToUseJIT.cpp
index 8e3b6dc..2fb2b5e 100644
--- a/examples/HowToUseJIT/HowToUseJIT.cpp
+++ b/examples/HowToUseJIT/HowToUseJIT.cpp
@@ -45,6 +45,8 @@
 #include "llvm/Target/TargetSelect.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/IRBuilder.h"
+
 using namespace llvm;
 
 int main() {
@@ -68,8 +70,12 @@ int main() {
   // because of the last argument.
   BasicBlock *BB = BasicBlock::Create(Context, "EntryBlock", Add1F);
 
+  // Create a basic block builder with default parameters.  The builder will
+  // automatically append instructions to the basic block `BB'.
+  IRBuilder<> builder(BB);
+
   // Get pointers to the constant `1'.
-  Value *One = ConstantInt::get(Type::getInt32Ty(Context), 1);
+  Value *One = builder.getInt32(1);
 
   // Get pointers to the integer argument of the add1 function...
   assert(Add1F->arg_begin() != Add1F->arg_end()); // Make sure there's an arg
@@ -77,15 +83,15 @@ int main() {
   ArgX->setName("AnArg");            // Give it a nice symbolic name for fun.
 
   // Create the add instruction, inserting it into the end of BB.
-  Instruction *Add = BinaryOperator::CreateAdd(One, ArgX, "addresult", BB);
+  Value *Add = builder.CreateAdd(One, ArgX);
 
   // Create the return instruction and add it to the basic block
-  ReturnInst::Create(Context, Add, BB);
+  builder.CreateRet(Add);
 
   // Now, function add1 is ready.
 
 
-  // Now we going to create function `foo', which returns an int and takes no
+  // Now we're going to create function `foo', which returns an int and takes no
   // arguments.
   Function *FooF =
     cast<Function>(M->getOrInsertFunction("foo", Type::getInt32Ty(Context),
@@ -94,15 +100,18 @@ int main() {
   // Add a basic block to the FooF function.
   BB = BasicBlock::Create(Context, "EntryBlock", FooF);
 
-  // Get pointers to the constant `10'.
-  Value *Ten = ConstantInt::get(Type::getInt32Ty(Context), 10);
+  // Tell the basic block builder to attach itself to the new basic block
+  builder.SetInsertPoint(BB);
+
+  // Get pointer to the constant `10'.
+  Value *Ten = builder.getInt32(10);
 
-  // Pass Ten to the call call:
-  CallInst *Add1CallRes = CallInst::Create(Add1F, Ten, "add1", BB);
+  // Pass Ten to the call to Add1F
+  CallInst *Add1CallRes = builder.CreateCall(Add1F, Ten);
   Add1CallRes->setTailCall(true);
 
   // Create the return instruction and add it to the basic block.
-  ReturnInst::Create(Context, Add1CallRes, BB);
+  builder.CreateRet(Add1CallRes);
 
   // Now we create the JIT.
   ExecutionEngine* EE = EngineBuilder(M).create();
diff --git a/include/llvm-c/Core.h b/include/llvm-c/Core.h
index 39c3cb4..2eccc11 100644
--- a/include/llvm-c/Core.h
+++ b/include/llvm-c/Core.h
@@ -282,6 +282,8 @@ typedef enum {
   LLVMRealPredicateTrue   /**< Always true (always folded) */
 } LLVMRealPredicate;
 
+void LLVMInitializeCore(LLVMPassRegistryRef R);
+
 
 /*===-- Error handling ----------------------------------------------------===*/
 
@@ -1164,6 +1166,7 @@ namespace llvm {
     for (LLVMValueRef *I = Vals, *E = Vals + Length; I != E; ++I)
       cast<T>(*I);
     #endif
+    (void)Length;
     return reinterpret_cast<T**>(Vals);
   }
   
diff --git a/include/llvm-c/Disassembler.h b/include/llvm-c/Disassembler.h
index 9f10973..3a3eb23 100644
--- a/include/llvm-c/Disassembler.h
+++ b/include/llvm-c/Disassembler.h
@@ -7,16 +7,16 @@
 |*                                                                            *|
 |*===----------------------------------------------------------------------===*|
 |*                                                                            *|
-|* This header provides public interface to a disassembler library.           *|
+|* This header provides a public interface to a disassembler library.         *|
 |* LLVM provides an implementation of this interface.                         *|
 |*                                                                            *|
 \*===----------------------------------------------------------------------===*/
 
 #ifndef LLVM_C_DISASSEMBLER_H
-#define LLVM_C_DISASSEMBLER_H  1
+#define LLVM_C_DISASSEMBLER_H
 
-#include <stddef.h>
 #include "llvm/Support/DataTypes.h"
+#include <stddef.h>
 
 /**
  * An opaque reference to a disassembler context.
@@ -38,14 +38,11 @@ typedef void *LLVMDisasmContextRef;
  * will be the instruction width.  The information is returned in TagBuf and is 
  * Triple specific with its specific information defined by the value of
  * TagType for that Triple.  If symbolic information is returned the function
- * returns 1 else it returns 0.
+ * returns 1, otherwise it returns 0.
  */
-typedef int (*LLVMOpInfoCallback)(void *DisInfo,
-                                  uint64_t PC,
-                                  uint64_t Offset,
-                                  uint64_t Size,
-                                  int TagType,
-                                  void *TagBuf);
+typedef int (*LLVMOpInfoCallback)(void *DisInfo, uint64_t PC,
+                                  uint64_t Offset, uint64_t Size,
+                                  int TagType, void *TagBuf);
 
 /**
  * The initial support in LLVM MC for the most general form of a relocatable
@@ -68,10 +65,11 @@ typedef int (*LLVMOpInfoCallback)(void *DisInfo,
  * operands like "_foo@GOT", ":lower16:_foo", etc.
  */
 struct LLVMOpInfoSymbol1 {
-  uint64_t Present; /* 1 if this symbol is present */
-  char *Name;     /* symbol name if not NULL */
-  uint64_t Value; /* symbol value if name is NULL */
+  uint64_t Present;  /* 1 if this symbol is present */
+  char *Name;        /* symbol name if not NULL */
+  uint64_t Value;    /* symbol value if name is NULL */
 };
+
 struct LLVMOpInfo1 {
   struct LLVMOpInfoSymbol1 AddSymbol;
   struct LLVMOpInfoSymbol1 SubtractSymbol;
@@ -92,11 +90,11 @@ struct LLVMOpInfo1 {
 
 /**
  * The type for the symbol lookup function.  This may be called by the
- * disassembler for such things like adding a comment for a PC plus a constant
+ * disassembler for things like adding a comment for a PC plus a constant
  * offset load instruction to use a symbol name instead of a load address value.
  * It is passed the block information is saved when the disassembler context is
  * created and a value of a symbol to look up.  If no symbol is found NULL is
- * to be returned.
+ * returned.
  */
 typedef const char *(*LLVMSymbolLookupCallback)(void *DisInfo,
                                                 uint64_t SymbolValue);
@@ -107,40 +105,33 @@ extern "C" {
 
 /**
  * Create a disassembler for the TripleName.  Symbolic disassembly is supported
- * by passing a block of information in the DisInfo parameter and specifing the
- * TagType and call back functions as described above.  These can all be passed
- * as NULL.  If successful this returns a disassembler context if not it
+ * by passing a block of information in the DisInfo parameter and specifying the
+ * TagType and callback functions as described above.  These can all be passed
+ * as NULL.  If successful, this returns a disassembler context.  If not, it
  * returns NULL.
  */
-extern LLVMDisasmContextRef
-LLVMCreateDisasm(const char *TripleName,
-                 void *DisInfo,
-                 int TagType,
-                 LLVMOpInfoCallback GetOpInfo,
-                 LLVMSymbolLookupCallback SymbolLookUp);
+LLVMDisasmContextRef LLVMCreateDisasm(const char *TripleName, void *DisInfo,
+                                      int TagType, LLVMOpInfoCallback GetOpInfo,
+                                      LLVMSymbolLookupCallback SymbolLookUp);
 
 /**
  * Dispose of a disassembler context.
  */
-extern void
-LLVMDisasmDispose(LLVMDisasmContextRef DC);
+void LLVMDisasmDispose(LLVMDisasmContextRef DC);
 
 /**
- * Disassmble a single instruction using the disassembler context specified in
- * the parameter DC.  The bytes of the instruction are specified in the parameter
- * Bytes, and contains at least BytesSize number of bytes.  The instruction is
- * at the address specified by the PC parameter.  If a valid instruction can be
- * disassembled its string is returned indirectly in OutString which whos size
- * is specified in the parameter OutStringSize.  This function returns the
- * number of bytes in the instruction or zero if there was no valid instruction.
+ * Disassemble a single instruction using the disassembler context specified in
+ * the parameter DC.  The bytes of the instruction are specified in the
+ * parameter Bytes, and contains at least BytesSize number of bytes.  The
+ * instruction is at the address specified by the PC parameter.  If a valid
+ * instruction can be disassembled, its string is returned indirectly in
+ * OutString whose size is specified in the parameter OutStringSize.  This
+ * function returns the number of bytes in the instruction or zero if there was
+ * no valid instruction.
  */
-extern size_t
-LLVMDisasmInstruction(LLVMDisasmContextRef DC,
-                      uint8_t *Bytes,
-                      uint64_t BytesSize,
-                      uint64_t PC,
-                      char *OutString,
-                      size_t OutStringSize);
+size_t LLVMDisasmInstruction(LLVMDisasmContextRef DC, uint8_t *Bytes,
+                             uint64_t BytesSize, uint64_t PC,
+                             char *OutString, size_t OutStringSize);
 
 #ifdef __cplusplus
 }
diff --git a/include/llvm/ADT/FoldingSet.h b/include/llvm/ADT/FoldingSet.h
index 52e0434..d2e0b8f 100644
--- a/include/llvm/ADT/FoldingSet.h
+++ b/include/llvm/ADT/FoldingSet.h
@@ -671,17 +671,10 @@ public:
 // Partial specializations of FoldingSetTrait.
 
 template<typename T> struct FoldingSetTrait<T*> {
-  static inline void Profile(const T *X, FoldingSetNodeID &ID) {
+  static inline void Profile(T *X, FoldingSetNodeID &ID) {
     ID.AddPointer(X);
   }
 };
-
-template<typename T> struct FoldingSetTrait<const T*> {
-  static inline void Profile(const T *X, FoldingSetNodeID &ID) {
-    ID.AddPointer(X);
-  }
-};
-
 } // End of namespace llvm.
 
 #endif
diff --git a/include/llvm/ADT/PackedVector.h b/include/llvm/ADT/PackedVector.h
new file mode 100644
index 0000000..272322a
--- /dev/null
+++ b/include/llvm/ADT/PackedVector.h
@@ -0,0 +1,158 @@
+//===- llvm/ADT/PackedVector.h - Packed values vector -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PackedVector class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_PACKEDVECTOR_H
+#define LLVM_ADT_PACKEDVECTOR_H
+
+#include "llvm/ADT/BitVector.h"
+#include <limits>
+
+namespace llvm {
+
+template <typename T, unsigned BitNum, bool isSigned>
+class PackedVectorBase;
+
+// This won't be necessary if we can specialize members without specializing
+// the parent template.
+template <typename T, unsigned BitNum>
+class PackedVectorBase<T, BitNum, false> {
+protected:
+  static T getValue(const llvm::BitVector &Bits, unsigned Idx) {
+    T val = T();
+    for (unsigned i = 0; i != BitNum; ++i)
+      val = T(val | ((Bits[(Idx << (BitNum-1)) + i] ? 1UL : 0UL) << i));
+    return val;
+  }
+
+  static void setValue(llvm::BitVector &Bits, unsigned Idx, T val) {
+    assert((val >> BitNum) == 0 && "value is too big");
+    for (unsigned i = 0; i != BitNum; ++i)
+      Bits[(Idx << (BitNum-1)) + i] = val & (T(1) << i);
+  }
+};
+
+template <typename T, unsigned BitNum>
+class PackedVectorBase<T, BitNum, true> {
+protected:
+  static T getValue(const llvm::BitVector &Bits, unsigned Idx) {
+    T val = T();
+    for (unsigned i = 0; i != BitNum-1; ++i)
+      val = T(val | ((Bits[(Idx << (BitNum-1)) + i] ? 1UL : 0UL) << i));
+    if (Bits[(Idx << (BitNum-1)) + BitNum-1])
+      val = ~val;
+    return val;
+  }
+
+  static void setValue(llvm::BitVector &Bits, unsigned Idx, T val) {
+    if (val < 0) {
+      val = ~val;
+      Bits.set((Idx << (BitNum-1)) + BitNum-1);
+    }
+    assert((val >> (BitNum-1)) == 0 && "value is too big");
+    for (unsigned i = 0; i != BitNum-1; ++i)
+      Bits[(Idx << (BitNum-1)) + i] = val & (T(1) << i);
+  }
+};
+
+/// \brief Store a vector of values using a specific number of bits for each
+/// value. Both signed and unsigned types can be used, e.g
+/// @code
+///   PackedVector<signed, 2> vec;
+/// @endcode
+/// will create a vector accepting values -2, -1, 0, 1. Any other value will hit
+/// an assertion.
+template <typename T, unsigned BitNum>
+class PackedVector : public PackedVectorBase<T, BitNum,
+                                            std::numeric_limits<T>::is_signed> {
+  llvm::BitVector Bits;
+  typedef PackedVectorBase<T, BitNum, std::numeric_limits<T>::is_signed> base;
+
+public:
+  class reference {
+    PackedVector &Vec;
+    const unsigned Idx;
+
+    reference();  // Undefined    
+  public:
+    reference(PackedVector &vec, unsigned idx) : Vec(vec), Idx(idx) { }    
+
+    reference &operator=(T val) {
+      Vec.setValue(Vec.Bits, Idx, val);
+      return *this;
+    }
+    operator T() {
+      return Vec.getValue(Vec.Bits, Idx);
+    }
+  };
+
+  PackedVector() { }
+  explicit PackedVector(unsigned size) : Bits(size << (BitNum-1)) { }
+
+  bool empty() const { return Bits.empty(); }
+
+  unsigned size() const { return Bits.size() >> (BitNum-1); }
+  
+  void clear() { Bits.clear(); }
+  
+  void resize(unsigned N) { Bits.resize(N << (BitNum-1)); }
+
+  void reserve(unsigned N) { Bits.reserve(N << (BitNum-1)); }
+
+  PackedVector &reset() {
+    Bits.reset();
+    return *this;
+  }
+
+  void push_back(T val) {
+    resize(size()+1);
+    (*this)[size()-1] = val;
+  }
+
+  reference operator[](unsigned Idx) {
+    return reference(*this, Idx);
+  }
+
+  T operator[](unsigned Idx) const {
+    return base::getValue(Bits, Idx);
+  }
+
+  bool operator==(const PackedVector &RHS) const {
+    return Bits == RHS.Bits;
+  }
+
+  bool operator!=(const PackedVector &RHS) const {
+    return Bits != RHS.Bits;
+  }
+
+  const PackedVector &operator=(const PackedVector &RHS) {
+    Bits = RHS.Bits;
+    return *this;
+  }
+
+  PackedVector &operator|=(const PackedVector &RHS) {
+    Bits |= RHS.Bits;
+    return *this;
+  }
+
+  void swap(PackedVector &RHS) {
+    Bits.swap(RHS.Bits);
+  }
+};
+
+// Leave BitNum=0 undefined. 
+template <typename T>
+class PackedVector<T, 0>;
+
+} // end llvm namespace
+
+#endif
diff --git a/include/llvm/ADT/StringRef.h b/include/llvm/ADT/StringRef.h
index 1766d2b..8396921 100644
--- a/include/llvm/ADT/StringRef.h
+++ b/include/llvm/ADT/StringRef.h
@@ -46,7 +46,14 @@ namespace llvm {
     // integer works around this bug.
     static size_t min(size_t a, size_t b) { return a < b ? a : b; }
     static size_t max(size_t a, size_t b) { return a > b ? a : b; }
-
+    
+    // Workaround memcmp issue with null pointers (undefined behavior)
+    // by providing a specialized version
+    static int compareMemory(const char *Lhs, const char *Rhs, size_t Length) {
+      if (Length == 0) { return 0; }
+      return ::memcmp(Lhs,Rhs,Length);
+    }
+    
   public:
     /// @name Constructors
     /// @{
@@ -56,11 +63,17 @@ namespace llvm {
 
     /// Construct a string ref from a cstring.
     /*implicit*/ StringRef(const char *Str)
-      : Data(Str), Length(::strlen(Str)) {}
+      : Data(Str) {
+        assert(Str && "StringRef cannot be built from a NULL argument");
+        Length = ::strlen(Str); // invoking strlen(NULL) is undefined behavior
+      }
 
     /// Construct a string ref from a pointer and length.
     /*implicit*/ StringRef(const char *data, size_t length)
-      : Data(data), Length(length) {}
+      : Data(data), Length(length) {
+        assert((data || length == 0) &&
+        "StringRef cannot be built from a NULL argument with non-null length");
+      }
 
     /// Construct a string ref from an std::string.
     /*implicit*/ StringRef(const std::string &Str)
@@ -104,7 +117,7 @@ namespace llvm {
     /// compare() when the relative ordering of inequal strings isn't needed.
     bool equals(StringRef RHS) const {
       return (Length == RHS.Length &&
-              memcmp(Data, RHS.Data, RHS.Length) == 0);
+              compareMemory(Data, RHS.Data, RHS.Length) == 0);
     }
 
     /// equals_lower - Check for string equality, ignoring case.
@@ -116,7 +129,7 @@ namespace llvm {
     /// is lexicographically less than, equal to, or greater than the \arg RHS.
     int compare(StringRef RHS) const {
       // Check the prefix for a mismatch.
-      if (int Res = memcmp(Data, RHS.Data, min(Length, RHS.Length)))
+      if (int Res = compareMemory(Data, RHS.Data, min(Length, RHS.Length)))
         return Res < 0 ? -1 : 1;
 
       // Otherwise the prefixes match, so we only need to check the lengths.
@@ -183,13 +196,13 @@ namespace llvm {
     /// startswith - Check if this string starts with the given \arg Prefix.
     bool startswith(StringRef Prefix) const {
       return Length >= Prefix.Length &&
-             memcmp(Data, Prefix.Data, Prefix.Length) == 0;
+             compareMemory(Data, Prefix.Data, Prefix.Length) == 0;
     }
 
     /// endswith - Check if this string ends with the given \arg Suffix.
     bool endswith(StringRef Suffix) const {
       return Length >= Suffix.Length &&
-             memcmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0;
+        compareMemory(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0;
     }
 
     /// @}
@@ -447,6 +460,10 @@ namespace llvm {
     return LHS.compare(RHS) != -1;
   }
 
+  inline std::string &operator+=(std::string &buffer, llvm::StringRef string) {
+    return buffer.append(string.data(), string.size());
+  }
+
   /// @}
 
   // StringRefs can be treated like a POD type.
diff --git a/include/llvm/ADT/Triple.h b/include/llvm/ADT/Triple.h
index 2659bce..078033d 100644
--- a/include/llvm/ADT/Triple.h
+++ b/include/llvm/ADT/Triple.h
@@ -225,7 +225,7 @@ public:
   /// if the environment component is present).
   StringRef getOSAndEnvironmentName() const;
 
-  /// getOSNumber - Parse the version number from the OS name component of the
+  /// getOSVersion - Parse the version number from the OS name component of the
   /// triple, if present.
   ///
   /// For example, "fooos1.2.3" would return (1, 2, 3).
diff --git a/include/llvm/Analysis/AliasAnalysis.h b/include/llvm/Analysis/AliasAnalysis.h
index 8f9708b..5d8edd1 100644
--- a/include/llvm/Analysis/AliasAnalysis.h
+++ b/include/llvm/Analysis/AliasAnalysis.h
@@ -38,6 +38,7 @@
 #define LLVM_ANALYSIS_ALIAS_ANALYSIS_H
 
 #include "llvm/Support/CallSite.h"
+#include "llvm/ADT/DenseMap.h"
 
 namespace llvm {
 
@@ -488,6 +489,32 @@ public:
   }
 };
 
+// Specialize DenseMapInfo for Location.
+template<>
+struct DenseMapInfo<AliasAnalysis::Location> {
+  static inline AliasAnalysis::Location getEmptyKey() {
+    return
+      AliasAnalysis::Location(DenseMapInfo<const Value *>::getEmptyKey(),
+                              0, 0);
+  }
+  static inline AliasAnalysis::Location getTombstoneKey() {
+    return
+      AliasAnalysis::Location(DenseMapInfo<const Value *>::getTombstoneKey(),
+                              0, 0);
+  }
+  static unsigned getHashValue(const AliasAnalysis::Location &Val) {
+    return DenseMapInfo<const Value *>::getHashValue(Val.Ptr) ^
+           DenseMapInfo<uint64_t>::getHashValue(Val.Size) ^
+           DenseMapInfo<const MDNode *>::getHashValue(Val.TBAATag);
+  }
+  static bool isEqual(const AliasAnalysis::Location &LHS,
+                      const AliasAnalysis::Location &RHS) {
+    return LHS.Ptr == RHS.Ptr &&
+           LHS.Size == RHS.Size &&
+           LHS.TBAATag == RHS.TBAATag;
+  }
+};
+
 /// isNoAliasCall - Return true if this pointer is returned by a noalias
 /// function.
 bool isNoAliasCall(const Value *V);
diff --git a/include/llvm/Analysis/BranchProbabilityInfo.h b/include/llvm/Analysis/BranchProbabilityInfo.h
new file mode 100644
index 0000000..91f289d
--- /dev/null
+++ b/include/llvm/Analysis/BranchProbabilityInfo.h
@@ -0,0 +1,78 @@
+//===--- BranchProbabilityInfo.h - Branch Probability Analysis --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is used to evaluate branch probabilties.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_BRANCHPROBABILITYINFO_H
+#define LLVM_ANALYSIS_BRANCHPROBABILITYINFO_H
+
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Analysis/LoopInfo.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class BranchProbabilityInfo : public FunctionPass {
+
+  // Default weight value. Used when we don't have information about the edge.
+  static const uint32_t DEFAULT_WEIGHT = 16;
+
+  typedef std::pair<BasicBlock *, BasicBlock *> Edge;
+
+  DenseMap<Edge, uint32_t> Weights;
+
+  // Get sum of the block successors' weights.
+  uint32_t getSumForBlock(BasicBlock *BB) const;
+
+public:
+  static char ID;
+
+  BranchProbabilityInfo() : FunctionPass(ID) {
+    initializeBranchProbabilityInfoPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addRequired<LoopInfo>();
+    AU.setPreservesAll();
+  }
+
+  bool runOnFunction(Function &F);
+
+  // Returned value is between 1 and UINT32_MAX. Look at
+  // BranchProbabilityInfo.cpp for details.
+  uint32_t getEdgeWeight(BasicBlock *Src, BasicBlock *Dst) const;
+
+  // Look at BranchProbabilityInfo.cpp for details. Use it with caution!
+  void setEdgeWeight(BasicBlock *Src, BasicBlock *Dst, uint32_t Weight);
+
+  // A 'Hot' edge is an edge which probability is >= 80%.
+  bool isEdgeHot(BasicBlock *Src, BasicBlock *Dst) const;
+
+  // Return a hot successor for the block BB or null if there isn't one.
+  BasicBlock *getHotSucc(BasicBlock *BB) const;
+
+  // Return a probability as a fraction between 0 (0% probability) and
+  // 1 (100% probability), however the value is never equal to 0, and can be 1
+  // only iff SRC block has only one successor.
+  BranchProbability getEdgeProbability(BasicBlock *Src, BasicBlock *Dst) const;
+
+  // Print value between 0 (0% probability) and 1 (100% probability),
+  // however the value is never equal to 0, and can be 1 only iff SRC block
+  // has only one successor.
+  raw_ostream &printEdgeProbability(raw_ostream &OS, BasicBlock *Src,
+                                    BasicBlock *Dst) const;
+};
+
+}
+
+#endif
diff --git a/include/llvm/Analysis/CallGraph.h b/include/llvm/Analysis/CallGraph.h
index 089f322..fb77da7 100644
--- a/include/llvm/Analysis/CallGraph.h
+++ b/include/llvm/Analysis/CallGraph.h
@@ -259,6 +259,9 @@ public:
   /// addCalledFunction - Add a function to the list of functions called by this
   /// one.
   void addCalledFunction(CallSite CS, CallGraphNode *M) {
+    assert(!CS.getInstruction() ||
+           !CS.getCalledFunction() ||
+           !CS.getCalledFunction()->isIntrinsic());
     CalledFunctions.push_back(std::make_pair(CS.getInstruction(), M));
     M->AddRef();
   }
diff --git a/include/llvm/Analysis/DIBuilder.h b/include/llvm/Analysis/DIBuilder.h
index 5846dbf..96c6587 100644
--- a/include/llvm/Analysis/DIBuilder.h
+++ b/include/llvm/Analysis/DIBuilder.h
@@ -117,8 +117,9 @@ namespace llvm {
     /// @param Name        Typedef name.
     /// @param File        File where this type is defined.
     /// @param LineNo      Line number.
+    /// @param Context     The surrounding context for the typedef.
     DIType createTypedef(DIType Ty, StringRef Name, DIFile File, 
-                         unsigned LineNo);
+                         unsigned LineNo, DIDescriptor Context);
 
     /// createFriend - Create debugging information entry for a 'friend'.
     DIType createFriend(DIType Ty, DIType FriendTy);
diff --git a/include/llvm/Analysis/DebugInfo.h b/include/llvm/Analysis/DebugInfo.h
index c6cc8f7..fbee5a6 100644
--- a/include/llvm/Analysis/DebugInfo.h
+++ b/include/llvm/Analysis/DebugInfo.h
@@ -49,15 +49,16 @@ namespace llvm {
   class DIDescriptor {
   public:
     enum {
-      FlagPrivate          = 1 << 0,
-      FlagProtected        = 1 << 1,
-      FlagFwdDecl          = 1 << 2,
-      FlagAppleBlock       = 1 << 3,
-      FlagBlockByrefStruct = 1 << 4,
-      FlagVirtual          = 1 << 5,
-      FlagArtificial       = 1 << 6,
-      FlagExplicit         = 1 << 7,
-      FlagPrototyped       = 1 << 8
+      FlagPrivate            = 1 << 0,
+      FlagProtected          = 1 << 1,
+      FlagFwdDecl            = 1 << 2,
+      FlagAppleBlock         = 1 << 3,
+      FlagBlockByrefStruct   = 1 << 4,
+      FlagVirtual            = 1 << 5,
+      FlagArtificial         = 1 << 6,
+      FlagExplicit           = 1 << 7,
+      FlagPrototyped         = 1 << 8,
+      FlagObjcClassComplete  = 1 << 9
     };
   protected:
     const MDNode *DbgNode;
@@ -271,6 +272,9 @@ namespace llvm {
     bool isArtificial() const {
       return (getFlags() & FlagArtificial) != 0;
     }
+    bool isObjcClassComplete() const {
+      return (getFlags() & FlagObjcClassComplete) != 0;
+    }
     bool isValid() const {
       return DbgNode && (isBasicType() || isDerivedType() || isCompositeType());
     }
diff --git a/include/llvm/Analysis/FindUsedTypes.h b/include/llvm/Analysis/FindUsedTypes.h
index fc57e1a..3e5da57 100644
--- a/include/llvm/Analysis/FindUsedTypes.h
+++ b/include/llvm/Analysis/FindUsedTypes.h
@@ -14,8 +14,8 @@
 #ifndef LLVM_ANALYSIS_FINDUSEDTYPES_H
 #define LLVM_ANALYSIS_FINDUSEDTYPES_H
 
+#include "llvm/ADT/SetVector.h"
 #include "llvm/Pass.h"
-#include <set>
 
 namespace llvm {
 
@@ -23,7 +23,7 @@ class Type;
 class Value;
 
 class FindUsedTypes : public ModulePass {
-  std::set<const Type *> UsedTypes;
+  SetVector<const Type *> UsedTypes;
 public:
   static char ID; // Pass identification, replacement for typeid
   FindUsedTypes() : ModulePass(ID) {
@@ -33,7 +33,7 @@ public:
   /// getTypes - After the pass has been run, return the set containing all of
   /// the types used in the module.
   ///
-  const std::set<const Type *> &getTypes() const { return UsedTypes; }
+  const SetVector<const Type *> &getTypes() const { return UsedTypes; }
 
   /// Print the types found in the module.  If the optional Module parameter is
   /// passed in, then the types are printed symbolically if possible, using the
diff --git a/include/llvm/Analysis/IVUsers.h b/include/llvm/Analysis/IVUsers.h
index e56d24d..1b78fe4 100644
--- a/include/llvm/Analysis/IVUsers.h
+++ b/include/llvm/Analysis/IVUsers.h
@@ -37,8 +37,8 @@ class TargetData;
 class IVStrideUse : public CallbackVH, public ilist_node<IVStrideUse> {
   friend class IVUsers;
 public:
-  IVStrideUse(IVUsers *P, Instruction* U, Value *O)
-    : CallbackVH(U), Parent(P), OperandValToReplace(O) {
+  IVStrideUse(IVUsers *P, Instruction* U, Value *O, Value *PN)
+    : CallbackVH(U), Parent(P), OperandValToReplace(O), Phi(PN) {
   }
 
   /// getUser - Return the user instruction for this use.
@@ -51,6 +51,11 @@ public:
     setValPtr(NewUser);
   }
 
+  /// getPhi - Return the phi node that represents this IV.
+  PHINode *getPhi() const {
+    return cast<PHINode>(Phi);
+  }
+
   /// getOperandValToReplace - Return the Value of the operand in the user
   /// instruction that this IVStrideUse is representing.
   Value *getOperandValToReplace() const {
@@ -81,6 +86,9 @@ private:
   /// that this IVStrideUse is representing.
   WeakVH OperandValToReplace;
 
+  /// Phi - The loop header phi that represents this IV.
+  WeakVH Phi;
+
   /// PostIncLoops - The set of loops for which Expr has been adjusted to
   /// use post-inc mode. This corresponds with SCEVExpander's post-inc concept.
   PostIncLoopSet PostIncLoops;
@@ -143,9 +151,9 @@ public:
   /// AddUsersIfInteresting - Inspect the specified Instruction.  If it is a
   /// reducible SCEV, recursively add its users to the IVUsesByStride set and
   /// return true.  Otherwise, return false.
-  bool AddUsersIfInteresting(Instruction *I);
+  bool AddUsersIfInteresting(Instruction *I, PHINode *Phi);
 
-  IVStrideUse &AddUser(Instruction *User, Value *Operand);
+  IVStrideUse &AddUser(Instruction *User, Value *Operand, PHINode *Phi);
 
   /// getReplacementExpr - Return a SCEV expression which computes the
   /// value of the OperandValToReplace of the given IVStrideUse.
diff --git a/include/llvm/Analysis/RegionPass.h b/include/llvm/Analysis/RegionPass.h
index 5403e09..1a93859 100644
--- a/include/llvm/Analysis/RegionPass.h
+++ b/include/llvm/Analysis/RegionPass.h
@@ -109,7 +109,7 @@ public:
   /// @brief Print passes managed by this manager.
   void dumpPassStructure(unsigned Offset);
 
-  /// @brief Print passes contained by this manager.
+  /// @brief Get passes contained by this manager.
   Pass *getContainedPass(unsigned N) {
     assert(N < PassVector.size() && "Pass number out of range!");
     Pass *FP = static_cast<Pass *>(PassVector[N]);
diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h
index a62f6a8..554524a 100644
--- a/include/llvm/Analysis/ScalarEvolution.h
+++ b/include/llvm/Analysis/ScalarEvolution.h
@@ -270,30 +270,30 @@ namespace llvm {
 
     /// BackedgeTakenCounts - Cache the backedge-taken count of the loops for
     /// this function as they are computed.
-    std::map<const Loop*, BackedgeTakenInfo> BackedgeTakenCounts;
+    DenseMap<const Loop*, BackedgeTakenInfo> BackedgeTakenCounts;
 
     /// ConstantEvolutionLoopExitValue - This map contains entries for all of
     /// the PHI instructions that we attempt to compute constant evolutions for.
     /// This allows us to avoid potentially expensive recomputation of these
     /// properties.  An instruction maps to null if we are unable to compute its
     /// exit value.
-    std::map<PHINode*, Constant*> ConstantEvolutionLoopExitValue;
+    DenseMap<PHINode*, Constant*> ConstantEvolutionLoopExitValue;
 
     /// ValuesAtScopes - This map contains entries for all the expressions
     /// that we attempt to compute getSCEVAtScope information for, which can
     /// be expensive in extreme cases.
-    std::map<const SCEV *,
+    DenseMap<const SCEV *,
              std::map<const Loop *, const SCEV *> > ValuesAtScopes;
 
     /// LoopDispositions - Memoized computeLoopDisposition results.
-    std::map<const SCEV *,
+    DenseMap<const SCEV *,
              std::map<const Loop *, LoopDisposition> > LoopDispositions;
 
     /// computeLoopDisposition - Compute a LoopDisposition value.
     LoopDisposition computeLoopDisposition(const SCEV *S, const Loop *L);
 
     /// BlockDispositions - Memoized computeBlockDisposition results.
-    std::map<const SCEV *,
+    DenseMap<const SCEV *,
              std::map<const BasicBlock *, BlockDisposition> > BlockDispositions;
 
     /// computeBlockDisposition - Compute a BlockDisposition value.
diff --git a/include/llvm/Argument.h b/include/llvm/Argument.h
index 71c001f..ff86378 100644
--- a/include/llvm/Argument.h
+++ b/include/llvm/Argument.h
@@ -51,6 +51,9 @@ public:
   /// hasByValAttr - Return true if this argument has the byval attribute on it
   /// in its containing function.
   bool hasByValAttr() const;
+  
+  /// getParamAlignment - If this is a byval argument, return its alignment.
+  unsigned getParamAlignment() const;
 
   /// hasNestAttr - Return true if this argument has the nest attribute on
   /// it in its containing function.
diff --git a/include/llvm/Attributes.h b/include/llvm/Attributes.h
index da6188b..8b69d6e 100644
--- a/include/llvm/Attributes.h
+++ b/include/llvm/Attributes.h
@@ -67,6 +67,20 @@ const Attributes StackAlignment  = 7<<26; ///< Alignment of stack for
                                           ///alignstack(1))
 const Attributes Hotpatch    = 1<<29;     ///< Function should have special
                                           ///'hotpatch' sequence in prologue
+const Attributes UWTable     = 1<<30;     ///< Function must be in a unwind
+                                          ///table
+
+/// Note that uwtable is about the ABI or the user mandating an entry in the
+/// unwind table. The nounwind attribute is about an exception passing by the
+/// function.
+/// In a theoretical system that uses tables for profiling and sjlj for
+/// exceptions, they would be fully independent. In a normal system that
+/// uses tables for both, the semantics are:
+/// nil                = Needs an entry because an exception might pass by.
+/// nounwind           = No need for an entry
+/// uwtable            = Needs an entry because the ABI says so and because
+///                      an exception might pass by.
+/// uwtable + nounwind = Needs an entry because the ABI says so.
 
 /// @brief Attributes that only apply to function parameters.
 const Attributes ParameterOnly = ByVal | Nest | StructRet | NoCapture;
@@ -76,7 +90,7 @@ const Attributes ParameterOnly = ByVal | Nest | StructRet | NoCapture;
 const Attributes FunctionOnly = NoReturn | NoUnwind | ReadNone | ReadOnly |
   NoInline | AlwaysInline | OptimizeForSize | StackProtect | StackProtectReq |
   NoRedZone | NoImplicitFloat | Naked | InlineHint | StackAlignment |
-  Hotpatch;
+  Hotpatch | UWTable;
 
 /// @brief Parameter attributes that do not apply to vararg call arguments.
 const Attributes VarArgsIncompatible = StructRet;
diff --git a/include/llvm/CodeGen/AsmPrinter.h b/include/llvm/CodeGen/AsmPrinter.h
index 58395ba..5eea099 100644
--- a/include/llvm/CodeGen/AsmPrinter.h
+++ b/include/llvm/CodeGen/AsmPrinter.h
@@ -185,7 +185,14 @@ namespace llvm {
 
     void emitPrologLabel(const MachineInstr &MI);
 
-    bool needsCFIMoves();
+    enum CFIMoveType {
+      CFI_M_None,
+      CFI_M_EH,
+      CFI_M_Debug
+    };
+    CFIMoveType needsCFIMoves();
+
+    bool needsSEHMoves();
 
     /// EmitConstantPool - Print to the current output stream assembly
     /// representations of the constants in the constant pool MCP. This is
@@ -381,10 +388,6 @@ namespace llvm {
     /// operands.
     virtual MachineLocation getDebugValueLocation(const MachineInstr *MI) const;
 
-    /// getDwarfRegOpSize - get size required to emit given machine location
-    /// using dwarf encoding.
-    virtual unsigned getDwarfRegOpSize(const MachineLocation &MLoc) const;
-
     /// getISAEncoding - Get the value for DW_AT_APPLE_isa. Zero if no isa
     /// encoding specified.
     virtual unsigned getISAEncoding() { return 0; }
@@ -396,12 +399,9 @@ namespace llvm {
     // Dwarf Lowering Routines
     //===------------------------------------------------------------------===//
 
-    /// EmitFrameMoves - Emit frame instructions to describe the layout of the
+    /// EmitCFIFrameMove - Emit frame instruction to describe the layout of the
     /// frame.
-    void EmitFrameMoves(const std::vector<MachineMove> &Moves,
-                        MCSymbol *BaseLabel, bool isEH) const;
     void EmitCFIFrameMove(const MachineMove &Move) const;
-    void EmitCFIFrameMoves(const std::vector<MachineMove> &Moves) const;
 
     //===------------------------------------------------------------------===//
     // Inline Asm Support
diff --git a/include/llvm/CodeGen/CallingConvLower.h b/include/llvm/CodeGen/CallingConvLower.h
index 9018ea3..77dc644 100644
--- a/include/llvm/CodeGen/CallingConvLower.h
+++ b/include/llvm/CodeGen/CallingConvLower.h
@@ -16,6 +16,7 @@
 #define LLVM_CODEGEN_CALLINGCONVLOWER_H
 
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/Target/TargetCallingConv.h"
 #include "llvm/CallingConv.h"
@@ -141,14 +142,19 @@ typedef bool CCCustomFn(unsigned &ValNo, MVT &ValVT,
                         MVT &LocVT, CCValAssign::LocInfo &LocInfo,
                         ISD::ArgFlagsTy &ArgFlags, CCState &State);
 
-typedef enum { Invalid, Prologue, Call } ParmContext;
+/// ParmContext - This enum tracks whether calling convention lowering is in
+/// the context of prologue or call generation. Not all backends make use of
+/// this information.
+typedef enum { Unknown, Prologue, Call } ParmContext;
 
 /// CCState - This class holds information needed while lowering arguments and
 /// return values.  It captures which registers are already assigned and which
 /// stack slots are used.  It provides accessors to allocate these values.
 class CCState {
+private:
   CallingConv::ID CallingConv;
   bool IsVarArg;
+  MachineFunction &MF;
   const TargetMachine &TM;
   const TargetRegisterInfo &TRI;
   SmallVector<CCValAssign, 16> &Locs;
@@ -158,10 +164,14 @@ class CCState {
   SmallVector<uint32_t, 16> UsedRegs;
   unsigned FirstByValReg;
   bool FirstByValRegValid;
+
+protected:
   ParmContext CallOrPrologue;
+
 public:
-  CCState(CallingConv::ID CC, bool isVarArg, const TargetMachine &TM,
-          SmallVector<CCValAssign, 16> &locs, LLVMContext &C);
+  CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
+          const TargetMachine &TM, SmallVector<CCValAssign, 16> &locs,
+          LLVMContext &C);
 
   void addLoc(const CCValAssign &V) {
     Locs.push_back(V);
@@ -169,6 +179,7 @@ public:
 
   LLVMContext &getContext() const { return Context; }
   const TargetMachine &getTarget() const { return TM; }
+  MachineFunction &getMachineFunction() const { return MF; }
   CallingConv::ID getCallingConv() const { return CallingConv; }
   bool isVarArg() const { return IsVarArg; }
 
@@ -301,7 +312,6 @@ public:
   bool isFirstByValRegValid() { return FirstByValRegValid; }
 
   ParmContext getCallOrPrologue() { return CallOrPrologue; }
-  void setCallOrPrologue(ParmContext pc) { CallOrPrologue = pc; }
 
 private:
   /// MarkAllocated - Mark a register and all of its aliases as allocated.
diff --git a/include/llvm/CodeGen/FastISel.h b/include/llvm/CodeGen/FastISel.h
index 10c4c33d..962a4e2 100644
--- a/include/llvm/CodeGen/FastISel.h
+++ b/include/llvm/CodeGen/FastISel.h
@@ -241,6 +241,15 @@ protected:
                            unsigned Op0, bool Op0IsKill,
                            unsigned Op1, bool Op1IsKill);
 
+  /// FastEmitInst_rrr - Emit a MachineInstr with three register operands
+  /// and a result register in the given register class.
+  ///
+  unsigned FastEmitInst_rrr(unsigned MachineInstOpcode,
+                           const TargetRegisterClass *RC,
+                           unsigned Op0, bool Op0IsKill,
+                           unsigned Op1, bool Op1IsKill,
+                           unsigned Op2, bool Op2IsKill);
+
   /// FastEmitInst_ri - Emit a MachineInstr with a register operand,
   /// an immediate, and a result register in the given register class.
   ///
@@ -301,7 +310,7 @@ protected:
   /// the CFG.
   void FastEmitBranch(MachineBasicBlock *MBB, DebugLoc DL);
 
-  unsigned UpdateValueMap(const Value* I, unsigned Reg);
+  void UpdateValueMap(const Value* I, unsigned Reg, unsigned NumRegs = 1);
 
   unsigned createResultReg(const TargetRegisterClass *RC);
 
@@ -334,6 +343,8 @@ private:
 
   bool SelectCast(const User *I, unsigned Opcode);
 
+  bool SelectExtractValue(const User *I);
+
   /// HandlePHINodesInSuccessorBlocks - Handle PHI nodes in successor blocks.
   /// Emit code to ensure constants are copied into registers when needed.
   /// Remember the virtual registers that need to be added to the Machine PHI
diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h
index f0de936..e765cad 100644
--- a/include/llvm/CodeGen/ISDOpcodes.h
+++ b/include/llvm/CodeGen/ISDOpcodes.h
@@ -107,11 +107,11 @@ namespace ISD {
     // and returns an outchain.
     EH_SJLJ_LONGJMP,
 
-    // OUTCHAIN = EH_SJLJ_DISPATCHSETUP(INCHAIN, context)
+    // OUTCHAIN = EH_SJLJ_DISPATCHSETUP(INCHAIN, setjmpval)
     // This corresponds to the eh.sjlj.dispatchsetup intrinsic. It takes an
-    // input chain and a pointer to the sjlj function context as inputs and
-    // returns an outchain. By default, this does nothing. Targets can lower
-    // this to unwind setup code if needed.
+    // input chain and the value returning from setjmp as inputs and returns an
+    // outchain. By default, this does nothing. Targets can lower this to unwind
+    // setup code if needed.
     EH_SJLJ_DISPATCHSETUP,
 
     // TargetConstant* - Like Constant*, but the DAG does not do any folding,
diff --git a/include/llvm/CodeGen/LiveInterval.h b/include/llvm/CodeGen/LiveInterval.h
index c5285ce..5fd4d3d 100644
--- a/include/llvm/CodeGen/LiveInterval.h
+++ b/include/llvm/CodeGen/LiveInterval.h
@@ -492,9 +492,10 @@ namespace llvm {
 
     /// Returns true if the live interval is zero length, i.e. no live ranges
     /// span instructions. It doesn't pay to spill such an interval.
-    bool isZeroLength() const {
+    bool isZeroLength(SlotIndexes *Indexes) const {
       for (const_iterator i = begin(), e = end(); i != e; ++i)
-        if (i->end.getPrevIndex() > i->start)
+        if (Indexes->getNextNonNullIndex(i->start).getBaseIndex() <
+            i->end.getBaseIndex())
           return false;
       return true;
     }
diff --git a/include/llvm/CodeGen/MachineInstr.h b/include/llvm/CodeGen/MachineInstr.h
index 2724689..c36dd69 100644
--- a/include/llvm/CodeGen/MachineInstr.h
+++ b/include/llvm/CodeGen/MachineInstr.h
@@ -229,6 +229,7 @@ public:
 
   enum MICheckType {
     CheckDefs,      // Check all operands for equality
+    CheckKillDead,  // Check all operands including kill / dead markers
     IgnoreDefs,     // Ignore all definitions
     IgnoreVRegDefs  // Ignore virtual register definitions
   };
diff --git a/include/llvm/CodeGen/MachineInstrBuilder.h b/include/llvm/CodeGen/MachineInstrBuilder.h
index 967e019..c8183a3 100644
--- a/include/llvm/CodeGen/MachineInstrBuilder.h
+++ b/include/llvm/CodeGen/MachineInstrBuilder.h
@@ -88,7 +88,7 @@ public:
     return *this;
   }
 
-  const MachineInstrBuilder &addFrameIndex(unsigned Idx) const {
+  const MachineInstrBuilder &addFrameIndex(int Idx) const {
     MI->addOperand(MachineOperand::CreateFI(Idx));
     return *this;
   }
diff --git a/include/llvm/CodeGen/MachineModuleInfo.h b/include/llvm/CodeGen/MachineModuleInfo.h
index 6bc80b0..fa185c4 100644
--- a/include/llvm/CodeGen/MachineModuleInfo.h
+++ b/include/llvm/CodeGen/MachineModuleInfo.h
@@ -52,27 +52,13 @@ namespace llvm {
 class Constant;
 class GlobalVariable;
 class MDNode;
+class MMIAddrLabelMap;
 class MachineBasicBlock;
 class MachineFunction;
 class Module;
 class PointerType;
 class StructType;
 
-/// MachineModuleInfoImpl - This class can be derived from and used by targets
-/// to hold private target-specific information for each Module.  Objects of
-/// type are accessed/created with MMI::getInfo and destroyed when the
-/// MachineModuleInfo is destroyed.
-class MachineModuleInfoImpl {
-public:
-  typedef PointerIntPair<MCSymbol*, 1, bool> StubValueTy;
-  virtual ~MachineModuleInfoImpl();
-  typedef std::vector<std::pair<MCSymbol*, StubValueTy> > SymbolListTy;
-protected:
-  static SymbolListTy GetSortedStubs(const DenseMap<MCSymbol*, StubValueTy>&);
-};
-
-
-
 //===----------------------------------------------------------------------===//
 /// LandingPadInfo - This structure is used to retain landing pad info for
 /// the current function.
@@ -89,7 +75,20 @@ struct LandingPadInfo {
     : LandingPadBlock(MBB), LandingPadLabel(0), Personality(0) {}
 };
 
-class MMIAddrLabelMap;
+//===----------------------------------------------------------------------===//
+/// MachineModuleInfoImpl - This class can be derived from and used by targets
+/// to hold private target-specific information for each Module.  Objects of
+/// type are accessed/created with MMI::getInfo and destroyed when the
+/// MachineModuleInfo is destroyed.
+/// 
+class MachineModuleInfoImpl {
+public:
+  typedef PointerIntPair<MCSymbol*, 1, bool> StubValueTy;
+  virtual ~MachineModuleInfoImpl();
+  typedef std::vector<std::pair<MCSymbol*, StubValueTy> > SymbolListTy;
+protected:
+  static SymbolListTy GetSortedStubs(const DenseMap<MCSymbol*, StubValueTy>&);
+};
 
 //===----------------------------------------------------------------------===//
 /// MachineModuleInfo - This class contains meta information specific to a
diff --git a/include/llvm/CodeGen/MachineOperand.h b/include/llvm/CodeGen/MachineOperand.h
index 8acc949..140c6e8 100644
--- a/include/llvm/CodeGen/MachineOperand.h
+++ b/include/llvm/CodeGen/MachineOperand.h
@@ -94,8 +94,8 @@ private:
   /// not a real instruction.  Such uses should be ignored during codegen.
   bool IsDebug : 1;
 
-  /// SmallContents - Thisreally should be part of the Contents union, but lives
-  /// out here so we can get a better packed struct.
+  /// SmallContents - This really should be part of the Contents union, but
+  /// lives out here so we can get a better packed struct.
   /// MO_Register: Register number.
   /// OffsetedInfo: Low bits of offset.
   union {
@@ -473,7 +473,7 @@ public:
     Op.setTargetFlags(TargetFlags);
     return Op;
   }
-  static MachineOperand CreateFI(unsigned Idx) {
+  static MachineOperand CreateFI(int Idx) {
     MachineOperand Op(MachineOperand::MO_FrameIndex);
     Op.setIndex(Idx);
     return Op;
diff --git a/include/llvm/CodeGen/PseudoSourceValue.h b/include/llvm/CodeGen/PseudoSourceValue.h
index bace631..7dab4f9 100644
--- a/include/llvm/CodeGen/PseudoSourceValue.h
+++ b/include/llvm/CodeGen/PseudoSourceValue.h
@@ -21,7 +21,7 @@ namespace llvm {
   class raw_ostream;
 
   /// PseudoSourceValue - Special value supplied for machine level alias
-  /// analysis. It indicates that the a memory access references the functions
+  /// analysis. It indicates that a memory access references the functions
   /// stack frame (e.g., a spill slot), below the stack frame (e.g., argument
   /// space), or constant pool.
   class PseudoSourceValue : public Value {
diff --git a/include/llvm/CodeGen/RegAllocPBQP.h b/include/llvm/CodeGen/RegAllocPBQP.h
index 7e8745e..8139c65 100644
--- a/include/llvm/CodeGen/RegAllocPBQP.h
+++ b/include/llvm/CodeGen/RegAllocPBQP.h
@@ -94,7 +94,7 @@ namespace llvm {
     typedef std::map<PBQP::Graph::ConstNodeItr, unsigned,
                      PBQP::NodeItrComparator>  Node2VReg;
     typedef DenseMap<unsigned, PBQP::Graph::NodeItr> VReg2Node;
-    typedef std::map<unsigned, AllowedSet> AllowedSetMap;
+    typedef DenseMap<unsigned, AllowedSet> AllowedSetMap;
 
     PBQP::Graph graph;
     Node2VReg node2VReg;
diff --git a/include/llvm/CodeGen/ScheduleDAG.h b/include/llvm/CodeGen/ScheduleDAG.h
index 2eb3db3..2f01948 100644
--- a/include/llvm/CodeGen/ScheduleDAG.h
+++ b/include/llvm/CodeGen/ScheduleDAG.h
@@ -265,7 +265,6 @@ namespace llvm {
     bool isCloned         : 1;          // True if this node has been cloned.
     Sched::Preference SchedulingPref;   // Scheduling preference.
 
-    SmallVector<MachineInstr*, 4> DbgInstrList; // dbg_values referencing this.
   private:
     bool isDepthCurrent   : 1;          // True if Depth is current.
     bool isHeightCurrent  : 1;          // True if Height is current.
diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h
index 92fd0c9..1c42bef 100644
--- a/include/llvm/CodeGen/SelectionDAG.h
+++ b/include/llvm/CodeGen/SelectionDAG.h
@@ -284,7 +284,7 @@ public:
   ///
   /// Note that this is an involved process that may invalidate pointers into
   /// the graph.
-  void Legalize(CodeGenOpt::Level OptLevel);
+  void Legalize();
 
   /// LegalizeVectors - This transforms the SelectionDAG into a SelectionDAG
   /// that only uses vector math operations supported by the target.  This is
@@ -985,10 +985,6 @@ public:
   /// other positive zero.
   bool isEqualTo(SDValue A, SDValue B) const;
 
-  /// isVerifiedDebugInfoDesc - Returns true if the specified SDValue has
-  /// been verified as a debug information descriptor.
-  bool isVerifiedDebugInfoDesc(SDValue Op) const;
-
   /// UnrollVectorOp - Utility function used by legalize and lowering to
   /// "unroll" a vector operation by splitting out the scalars and operating
   /// on each element individually.  If the ResNE is 0, fully unroll the vector
diff --git a/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h b/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
index 829f580..54e5751 100644
--- a/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
+++ b/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
@@ -58,6 +58,10 @@ public:
   virtual void Initialize(MCContext &Ctx, const TargetMachine &TM);
 
   virtual const MCSection *getEHFrameSection() const;
+  virtual const MCSection *getWin64EHFuncTableSection(StringRef) const {
+    return NULL;
+  }
+  virtual const MCSection *getWin64EHTableSection(StringRef) const{return NULL;}
 
   virtual void emitPersonalityValue(MCStreamer &Streamer,
                                     const TargetMachine &TM,
@@ -133,6 +137,10 @@ public:
   virtual void Initialize(MCContext &Ctx, const TargetMachine &TM);
 
   virtual const MCSection *getEHFrameSection() const;
+  virtual const MCSection *getWin64EHFuncTableSection(StringRef) const {
+    return NULL;
+  }
+  virtual const MCSection *getWin64EHTableSection(StringRef) const{return NULL;}
 
   virtual const MCSection *
   SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
@@ -196,6 +204,8 @@ public:
 
 class TargetLoweringObjectFileCOFF : public TargetLoweringObjectFile {
   const MCSection *DrectveSection;
+  const MCSection *PDataSection;
+  const MCSection *XDataSection;
 public:
   TargetLoweringObjectFileCOFF() {}
   ~TargetLoweringObjectFileCOFF() {}
@@ -203,6 +213,8 @@ public:
   virtual void Initialize(MCContext &Ctx, const TargetMachine &TM);
 
   virtual const MCSection *getEHFrameSection() const;
+  virtual const MCSection *getWin64EHFuncTableSection(StringRef) const;
+  virtual const MCSection *getWin64EHTableSection(StringRef) const;
 
   virtual const MCSection *getDrectveSection() const { return DrectveSection; }
 
diff --git a/include/llvm/CompilerDriver/Common.td b/include/llvm/CompilerDriver/Common.td
index 84e8783..6ba30aa 100644
--- a/include/llvm/CompilerDriver/Common.td
+++ b/include/llvm/CompilerDriver/Common.td
@@ -56,8 +56,11 @@ def forward_not_split;
 def case;
 
 // Boolean constants.
-def true;
-def false;
+class Bool<bit val> {
+      bit Value = val;
+}
+def true : Bool<1>;
+def false : Bool<0>;
 
 // Boolean operators.
 def and;
diff --git a/include/llvm/Config/llvm-config.h.cmake b/include/llvm/Config/llvm-config.h.cmake
index 9a9cb3b..ee81f7a 100644
--- a/include/llvm/Config/llvm-config.h.cmake
+++ b/include/llvm/Config/llvm-config.h.cmake
@@ -95,7 +95,7 @@
 #cmakedefine LLVM_PATH_TWOPI "${LLVM_PATH_TWOPI}"
 
 /* Define to path to xdot.py program if found or 'echo xdot.py' otherwise */
-#cmakedefine LLVM_PATH_XDOT_PY "${LLVM_PATH_XDOT.PY}"
+#cmakedefine LLVM_PATH_XDOT_PY "${LLVM_PATH_XDOT_PY}"
 
 /* Installation prefix directory */
 #cmakedefine LLVM_PREFIX "${LLVM_PREFIX}"
diff --git a/include/llvm/DefaultPasses.h b/include/llvm/DefaultPasses.h
new file mode 100644
index 0000000..e2e58a5b
--- /dev/null
+++ b/include/llvm/DefaultPasses.h
@@ -0,0 +1,167 @@
+//===- llvm/DefaultPasses.h - Default Pass Support code --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This file defines the infrastructure for registering the standard pass list.
+// This defines sets of standard optimizations that plugins can modify and
+// front ends can use.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEFAULT_PASS_SUPPORT_H
+#define LLVM_DEFAULT_PASS_SUPPORT_H
+
+namespace llvm {
+
+class PassManagerBase;
+
+/// Unique identifiers for the default standard passes.  The addresses of
+/// these symbols are used to uniquely identify passes from the default list.
+namespace DefaultStandardPasses {
+extern unsigned char AggressiveDCEID;
+extern unsigned char ArgumentPromotionID;
+extern unsigned char BasicAliasAnalysisID;
+extern unsigned char CFGSimplificationID;
+extern unsigned char ConstantMergeID;
+extern unsigned char CorrelatedValuePropagationID;
+extern unsigned char DeadArgEliminationID;
+extern unsigned char DeadStoreEliminationID;
+extern unsigned char DeadTypeEliminationID;
+extern unsigned char EarlyCSEID;
+extern unsigned char FunctionAttrsID;
+extern unsigned char FunctionInliningID;
+extern unsigned char GVNID;
+extern unsigned char GlobalDCEID;
+extern unsigned char GlobalOptimizerID;
+extern unsigned char GlobalsModRefID;
+extern unsigned char IPSCCPID;
+extern unsigned char IndVarSimplifyID;
+extern unsigned char InlinerPlaceholderID;
+extern unsigned char InstructionCombiningID;
+extern unsigned char JumpThreadingID;
+extern unsigned char LICMID;
+extern unsigned char LoopDeletionID;
+extern unsigned char LoopIdiomID;
+extern unsigned char LoopRotateID;
+extern unsigned char LoopUnrollID;
+extern unsigned char LoopUnswitchID;
+extern unsigned char MemCpyOptID;
+extern unsigned char PruneEHID;
+extern unsigned char ReassociateID;
+extern unsigned char SCCPID;
+extern unsigned char ScalarReplAggregatesID;
+extern unsigned char SimplifyLibCallsID;
+extern unsigned char StripDeadPrototypesID;
+extern unsigned char TailCallEliminationID;
+extern unsigned char TypeBasedAliasAnalysisID;
+}
+
+/// StandardPass - The class responsible for maintaining the lists of standard 
+class StandardPass {
+  friend class RegisterStandardPassLists;
+  public:
+  /// Predefined standard sets of passes
+  enum StandardSet {
+    AliasAnalysis,
+    Function,
+    Module,
+    LTO
+  };
+  /// Flags to specify whether a pass should be enabled.  Passes registered
+  /// with the standard sets may specify a minimum optimization level and one
+  /// or more flags that must be set when constructing the set for the pass to
+  /// be used.
+  enum OptimizationFlags {
+    /// Optimize for size was requested.
+    OptimizeSize = 1<<0,
+    /// Allow passes which may make global module changes.
+    UnitAtATime = 1<<1,
+    /// UnrollLoops - Allow loop unrolling.
+    UnrollLoops = 1<<2,
+    /// Allow library calls to be simplified.
+    SimplifyLibCalls = 1<<3,
+    /// Whether the module may have code using exceptions.
+    HaveExceptions = 1<<4,
+    // Run an inliner pass as part of this set.
+    RunInliner = 1<<5
+  };
+  enum OptimizationFlagComponents {
+    /// The low bits are used to store the optimization level.  When requesting
+    /// passes, this should store the requested optimisation level.  When
+    /// setting passes, this should set the minimum optimization level at which
+    /// the pass will run.
+    OptimizationLevelMask=0xf,
+    /// The maximum optimisation level at which the pass is run.
+    MaxOptimizationLevelMask=0xf0,
+    // Flags that must be set
+    RequiredFlagMask=0xff00,
+    // Flags that may not be set.
+    DisallowedFlagMask=0xff0000,
+    MaxOptimizationLevelShift=4,
+    RequiredFlagShift=8,
+    DisallowedFlagShift=16
+  };
+  /// Returns the optimisation level from a set of flags.
+  static unsigned OptimizationLevel(unsigned flags) {
+      return flags & OptimizationLevelMask;
+  }
+  /// Returns the maximum optimization level for this set of flags
+  static unsigned MaxOptimizationLevel(unsigned flags) {
+      return (flags & MaxOptimizationLevelMask) >> 4;
+  }
+  /// Constructs a set of flags from the specified minimum and maximum
+  /// optimisation level
+  static unsigned OptimzationFlags(unsigned minLevel=0, unsigned maxLevel=0xf,
+      unsigned requiredFlags=0, unsigned disallowedFlags=0) {
+    return ((minLevel & OptimizationLevelMask) |
+            ((maxLevel<<MaxOptimizationLevelShift) & MaxOptimizationLevelMask)
+            | ((requiredFlags<<RequiredFlagShift) & RequiredFlagMask)
+            | ((disallowedFlags<<DisallowedFlagShift) & DisallowedFlagMask));
+  }
+  /// Returns the flags that must be set for this to match
+  static unsigned RequiredFlags(unsigned flags) {
+      return (flags & RequiredFlagMask) >> RequiredFlagShift;
+  }
+  /// Returns the flags that must not be set for this to match
+  static unsigned DisallowedFlags(unsigned flags) {
+      return (flags & DisallowedFlagMask) >> DisallowedFlagShift;
+  }
+  /// Register a standard pass in the specified set.  If flags is non-zero,
+  /// then the pass will only be returned when the specified flags are set.
+  template<typename passName>
+  class RegisterStandardPass {
+    public:
+    RegisterStandardPass(StandardSet set, unsigned char *runBefore=0,
+        unsigned flags=0, unsigned char *ID=0) {
+      // Use the pass's ID if one is not specified
+      RegisterDefaultPass(PassInfo::NormalCtor_t(callDefaultCtor<passName>),
+               ID ? ID : (unsigned char*)&passName::ID, runBefore, set, flags);
+    }
+  };
+  /// Adds the passes from the specified set to the provided pass manager
+  static void AddPassesFromSet(PassManagerBase *PM,
+                               StandardSet set,
+                               unsigned flags=0,
+                               bool VerifyEach=false,
+                               Pass *inliner=0);
+  private:
+  /// Registers the default passes.  This is set by RegisterStandardPassLists
+  /// and is called lazily.
+  static void (*RegisterDefaultPasses)(void);
+  /// Creates the verifier pass that is inserted when a VerifyEach is passed to
+  /// AddPassesFromSet()
+  static Pass* (*CreateVerifierPass)(void);
+  /// Registers the pass
+  static void RegisterDefaultPass(PassInfo::NormalCtor_t constructor,
+                                  unsigned char *newPass,
+                                  unsigned char *oldPass,
+                                  StandardSet set,
+                                  unsigned flags=0);
+};
+
+} // namespace llvm
+
+#endif
diff --git a/include/llvm/ExecutionEngine/ExecutionEngine.h b/include/llvm/ExecutionEngine/ExecutionEngine.h
index a01ad3a..88b21cd 100644
--- a/include/llvm/ExecutionEngine/ExecutionEngine.h
+++ b/include/llvm/ExecutionEngine/ExecutionEngine.h
@@ -135,20 +135,14 @@ protected:
     JITMemoryManager *JMM,
     CodeGenOpt::Level OptLevel,
     bool GVsWithCode,
-    CodeModel::Model CMM,
-    StringRef MArch,
-    StringRef MCPU,
-    const SmallVectorImpl<std::string>& MAttrs);
+    TargetMachine *TM);
   static ExecutionEngine *(*MCJITCtor)(
     Module *M,
     std::string *ErrorStr,
     JITMemoryManager *JMM,
     CodeGenOpt::Level OptLevel,
     bool GVsWithCode,
-    CodeModel::Model CMM,
-    StringRef MArch,
-    StringRef MCPU,
-    const SmallVectorImpl<std::string>& MAttrs);
+    TargetMachine *TM);
   static ExecutionEngine *(*InterpCtor)(Module *M,
                                         std::string *ErrorStr);
 
@@ -569,6 +563,14 @@ public:
     return *this;
   }
 
+  /// selectTarget - Pick a target either via -march or by guessing the native
+  /// arch.  Add any CPU features specified via -mcpu or -mattr.
+  static TargetMachine *selectTarget(Module *M,
+                                     StringRef MArch,
+                                     StringRef MCPU,
+                                     const SmallVectorImpl<std::string>& MAttrs,
+                                     std::string *Err);
+
   ExecutionEngine *create();
 };
 
diff --git a/include/llvm/Function.h b/include/llvm/Function.h
index 9a0825a..1edc176 100644
--- a/include/llvm/Function.h
+++ b/include/llvm/Function.h
@@ -253,6 +253,23 @@ public:
     else removeFnAttr(Attribute::NoUnwind);
   }
 
+  /// @brief True if the ABI mandates (or the user requested) that this
+  /// function be in a unwind table.
+  bool hasUWTable() const {
+    return hasFnAttr(Attribute::UWTable);
+  }
+  void setHasUWTable(bool HasUWTable = true) {
+    if (HasUWTable)
+      addFnAttr(Attribute::UWTable);
+    else
+      removeFnAttr(Attribute::UWTable);
+  }
+
+  /// @brief True if this function needs an unwind table.
+  bool needsUnwindTableEntry() const {
+    return hasUWTable() || !doesNotThrow();
+  }
+
   /// @brief Determine if the function returns a structure through first 
   /// pointer argument.
   bool hasStructRetAttr() const {
@@ -414,6 +431,10 @@ public:
   ///
   bool hasAddressTaken(const User** = 0) const;
 
+  /// callsFunctionThatReturnsTwice - Return true if the function has a call to
+  /// setjmp or other function that gcc recognizes as "returning twice".
+  bool callsFunctionThatReturnsTwice() const;
+
 private:
   // Shadow Value::setValueSubclassData with a private forwarding method so that
   // subclasses cannot accidentally use it.
diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h
index cca0194..372eaba 100644
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@@ -66,6 +66,7 @@ void initializeBasicAliasAnalysisPass(PassRegistry&);
 void initializeBasicCallGraphPass(PassRegistry&);
 void initializeBlockExtractorPassPass(PassRegistry&);
 void initializeBlockPlacementPass(PassRegistry&);
+void initializeBranchProbabilityInfoPass(PassRegistry&);
 void initializeBreakCriticalEdgesPass(PassRegistry&);
 void initializeCFGOnlyPrinterPass(PassRegistry&);
 void initializeCFGOnlyViewerPass(PassRegistry&);
diff --git a/include/llvm/IntrinsicInst.h b/include/llvm/IntrinsicInst.h
index 74c30fb..24e5fe7 100644
--- a/include/llvm/IntrinsicInst.h
+++ b/include/llvm/IntrinsicInst.h
@@ -139,7 +139,7 @@ namespace llvm {
       return !getVolatileCst()->isZero();
     }
 
-    unsigned getAddressSpace() const {
+    unsigned getDestAddressSpace() const {
       return cast<PointerType>(getRawDest()->getType())->getAddressSpace();
     }
 
@@ -227,6 +227,10 @@ namespace llvm {
     /// value is guaranteed to be a pointer.
     Value *getSource() const { return getRawSource()->stripPointerCasts(); }
 
+    unsigned getSourceAddressSpace() const {
+      return cast<PointerType>(getRawSource()->getType())->getAddressSpace();
+    }
+
     void setSource(Value *Ptr) {
       assert(getRawSource()->getType() == Ptr->getType() &&
              "setSource called with pointer of wrong type!");
diff --git a/include/llvm/Intrinsics.td b/include/llvm/Intrinsics.td
index a63cd6a..d8f249a 100644
--- a/include/llvm/Intrinsics.td
+++ b/include/llvm/Intrinsics.td
@@ -47,6 +47,9 @@ def IntrReadWriteArgMem : IntrinsicProperty;
 // Commutative - This intrinsic is commutative: X op Y == Y op X.
 def Commutative : IntrinsicProperty;
 
+// Throws - This intrinsic can throw.
+def Throws : IntrinsicProperty;
+
 // NoCapture - The specified argument pointer is not captured by the intrinsic.
 class NoCapture<int argNo> : IntrinsicProperty {
   int ArgNo = argNo;
@@ -292,6 +295,7 @@ let Properties = [IntrNoMem] in {
 def int_eh_exception : Intrinsic<[llvm_ptr_ty], [], [IntrReadMem]>;
 def int_eh_selector  : Intrinsic<[llvm_i32_ty],
                                  [llvm_ptr_ty, llvm_ptr_ty, llvm_vararg_ty]>;
+def int_eh_resume    : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], [Throws]>;
 
 def int_eh_typeid_for : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty]>;
 
@@ -307,7 +311,7 @@ let Properties = [IntrNoMem] in {
   def int_eh_sjlj_lsda    : Intrinsic<[llvm_ptr_ty]>;
   def int_eh_sjlj_callsite: Intrinsic<[], [llvm_i32_ty]>;
 }
-def int_eh_sjlj_dispatch_setup : Intrinsic<[], []>;
+def int_eh_sjlj_dispatch_setup : Intrinsic<[], [llvm_i32_ty], [IntrReadMem]>;
 def int_eh_sjlj_setjmp  : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty]>;
 def int_eh_sjlj_longjmp : Intrinsic<[], [llvm_ptr_ty]>;
 
diff --git a/include/llvm/IntrinsicsARM.td b/include/llvm/IntrinsicsARM.td
index 03e9261..fa8034e 100644
--- a/include/llvm/IntrinsicsARM.td
+++ b/include/llvm/IntrinsicsARM.td
@@ -36,6 +36,16 @@ let TargetPrefix = "arm" in {  // All intrinsics start with "llvm.arm.".
 }
 
 //===----------------------------------------------------------------------===//
+// Load and Store exclusive doubleword
+
+let TargetPrefix = "arm" in {  // All intrinsics start with "llvm.arm.".
+  def int_arm_strexd : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty,
+                                  llvm_ptr_ty], [IntrReadWriteArgMem]>;
+  def int_arm_ldrexd : Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_ptr_ty],
+                                 [IntrReadArgMem]>;
+}
+
+//===----------------------------------------------------------------------===//
 // VFP
 
 let TargetPrefix = "arm" in {  // All intrinsics start with "llvm.arm.".
@@ -50,6 +60,43 @@ let TargetPrefix = "arm" in {  // All intrinsics start with "llvm.arm.".
 }
 
 //===----------------------------------------------------------------------===//
+// Coprocessor
+
+let TargetPrefix = "arm" in {  // All intrinsics start with "llvm.arm.".
+  // Move to coprocessor
+  def int_arm_mcr : GCCBuiltin<"__builtin_arm_mcr">,
+     Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                    llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+  def int_arm_mcr2 : GCCBuiltin<"__builtin_arm_mcr2">,
+     Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                    llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+
+  // Move from coprocessor
+  def int_arm_mrc : GCCBuiltin<"__builtin_arm_mrc">,
+     Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                               llvm_i32_ty, llvm_i32_ty], []>;
+  def int_arm_mrc2 : GCCBuiltin<"__builtin_arm_mrc2">,
+     Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                               llvm_i32_ty, llvm_i32_ty], []>;
+
+  // Coprocessor data processing
+  def int_arm_cdp : GCCBuiltin<"__builtin_arm_cdp">,
+     Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                    llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+  def int_arm_cdp2 : GCCBuiltin<"__builtin_arm_cdp2">,
+     Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                    llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+
+  // Move from two registers to coprocessor
+  def int_arm_mcrr : GCCBuiltin<"__builtin_arm_mcrr">,
+     Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                    llvm_i32_ty, llvm_i32_ty], []>;
+  def int_arm_mcrr2 : GCCBuiltin<"__builtin_arm_mcrr2">,
+     Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                    llvm_i32_ty, llvm_i32_ty], []>;
+}
+
+//===----------------------------------------------------------------------===//
 // Advanced SIMD (NEON)
 
 let TargetPrefix = "arm" in {  // All intrinsics start with "llvm.arm.".
diff --git a/include/llvm/IntrinsicsX86.td b/include/llvm/IntrinsicsX86.td
index b44101a..d445a01 100644
--- a/include/llvm/IntrinsicsX86.td
+++ b/include/llvm/IntrinsicsX86.td
@@ -224,9 +224,6 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 
 // Cacheability support ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse_movnt_ps : GCCBuiltin<"__builtin_ia32_movntps">,
-              Intrinsic<[], [llvm_ptr_ty,
-                         llvm_v4f32_ty], []>;
   def int_x86_sse_sfence : GCCBuiltin<"__builtin_ia32_sfence">,
               Intrinsic<[], [], []>;
 }
@@ -536,19 +533,6 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
                          llvm_v4i32_ty], []>;
 }
 
-// Cacheability support ops
-let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse2_movnt_dq : GCCBuiltin<"__builtin_ia32_movntdq">,
-              Intrinsic<[], [llvm_ptr_ty,
-                         llvm_v2i64_ty], []>;
-  def int_x86_sse2_movnt_pd : GCCBuiltin<"__builtin_ia32_movntpd">,
-              Intrinsic<[], [llvm_ptr_ty,
-                         llvm_v2f64_ty], []>;
-  def int_x86_sse2_movnt_i : GCCBuiltin<"__builtin_ia32_movnti">,
-              Intrinsic<[], [llvm_ptr_ty,
-                         llvm_i32_ty], []>;
-}
-
 // Misc.
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_sse2_packsswb_128 : GCCBuiltin<"__builtin_ia32_packsswb128">,
@@ -964,19 +948,19 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
 // Miscellaneous
 // CRC Instruction
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_sse42_crc32_8         : GCCBuiltin<"__builtin_ia32_crc32qi">,
+  def int_x86_sse42_crc32_32_8       : GCCBuiltin<"__builtin_ia32_crc32qi">,
           Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i8_ty],
                     [IntrNoMem]>;
-  def int_x86_sse42_crc32_16         : GCCBuiltin<"__builtin_ia32_crc32hi">,
+  def int_x86_sse42_crc32_32_16      : GCCBuiltin<"__builtin_ia32_crc32hi">,
           Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i16_ty],
                     [IntrNoMem]>;
-  def int_x86_sse42_crc32_32         : GCCBuiltin<"__builtin_ia32_crc32si">,
+  def int_x86_sse42_crc32_32_32      : GCCBuiltin<"__builtin_ia32_crc32si">,
           Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
                     [IntrNoMem]>;
-  def int_x86_sse42_crc64_8         :
+  def int_x86_sse42_crc32_64_8       :
           Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i8_ty],
                     [IntrNoMem]>;
-  def int_x86_sse42_crc64_64         : GCCBuiltin<"__builtin_ia32_crc32di">,
+  def int_x86_sse42_crc32_64_64      : GCCBuiltin<"__builtin_ia32_crc32di">,
           Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
                     [IntrNoMem]>;
 }
diff --git a/include/llvm/IntrinsicsXCore.td b/include/llvm/IntrinsicsXCore.td
index e633af0..a062fc4 100644
--- a/include/llvm/IntrinsicsXCore.td
+++ b/include/llvm/IntrinsicsXCore.td
@@ -11,6 +11,12 @@
 let TargetPrefix = "xcore" in {  // All intrinsics start with "llvm.xcore.".
   // Miscellaneous instructions.
   def int_xcore_bitrev : Intrinsic<[llvm_i32_ty],[llvm_i32_ty],[IntrNoMem]>;
+  def int_xcore_crc8 : Intrinsic<[llvm_i32_ty, llvm_i32_ty],
+                                 [llvm_i32_ty,llvm_i32_ty,llvm_i32_ty],
+                                 [IntrNoMem]>;
+  def int_xcore_crc32 : Intrinsic<[llvm_i32_ty],
+                                  [llvm_i32_ty,llvm_i32_ty,llvm_i32_ty],
+                                  [IntrNoMem]>;
   def int_xcore_getid : Intrinsic<[llvm_i32_ty],[],[IntrNoMem]>;
   def int_xcore_getps : Intrinsic<[llvm_i32_ty],[llvm_i32_ty]>;
   def int_xcore_setps : Intrinsic<[],[llvm_i32_ty, llvm_i32_ty]>;
diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h
index 88ee65a..c02e161 100644
--- a/include/llvm/LinkAllPasses.h
+++ b/include/llvm/LinkAllPasses.h
@@ -70,7 +70,7 @@ namespace {
       (void) llvm::createEdgeProfilerPass();
       (void) llvm::createOptimalEdgeProfilerPass();
       (void) llvm::createPathProfilerPass();
-      (void) llvm::createGCOVProfilerPass(true, true);
+      (void) llvm::createGCOVProfilerPass(true, true, false);
       (void) llvm::createFunctionInliningPass();
       (void) llvm::createAlwaysInlinerPass();
       (void) llvm::createGlobalDCEPass();
diff --git a/include/llvm/MC/MCAsmInfo.h b/include/llvm/MC/MCAsmInfo.h
index 8733161..775d22b 100644
--- a/include/llvm/MC/MCAsmInfo.h
+++ b/include/llvm/MC/MCAsmInfo.h
@@ -26,12 +26,12 @@ namespace llvm {
   class MCSymbol;
   class MCContext;
 
-  /// MCAsmInfo - This class is intended to be used as a base class for asm
-  /// properties and features specific to the target.
   namespace ExceptionHandling {
-    enum ExceptionsType { None, DwarfTable, DwarfCFI, SjLj, ARM };
+    enum ExceptionsType { None, DwarfCFI, SjLj, ARM, Win64 };
   }
 
+  /// MCAsmInfo - This class is intended to be used as a base class for asm
+  /// properties and features specific to the target.
   class MCAsmInfo {
   protected:
     //===------------------------------------------------------------------===//
@@ -269,9 +269,6 @@ namespace llvm {
     /// SupportsExceptionHandling - True if target supports exception handling.
     ExceptionHandling::ExceptionsType ExceptionsType; // Defaults to None
 
-    /// RequiresFrameSection - true if the Dwarf2 output needs a frame section
-    bool DwarfRequiresFrameSection;          // Defaults to true.
-
     /// DwarfUsesInlineInfoSection - True if DwarfDebugInlineSection is used to
     /// encode inline subroutine information.
     bool DwarfUsesInlineInfoSection;         // Defaults to false.
@@ -279,9 +276,9 @@ namespace llvm {
     /// DwarfSectionOffsetDirective - Special section offset directive.
     const char* DwarfSectionOffsetDirective; // Defaults to NULL
 
-    /// DwarfUsesAbsoluteLabelForStmtList - True if DW_AT_stmt_list needs
-    /// absolute label instead of offset.
-    bool DwarfUsesAbsoluteLabelForStmtList;  // Defaults to true;
+    /// DwarfRequiresRelocationForSectionOffset - True if we need to produce a
+    // relocation when we want a section offset in dwarf.
+    bool DwarfRequiresRelocationForSectionOffset;  // Defaults to true;
 
     // DwarfUsesLabelOffsetDifference - True if Dwarf2 output can
     // use EmitLabelOffsetDifference.
@@ -462,13 +459,9 @@ namespace llvm {
     }
     bool isExceptionHandlingDwarf() const {
       return
-        (ExceptionsType == ExceptionHandling::DwarfTable ||
-         ExceptionsType == ExceptionHandling::DwarfCFI ||
-         ExceptionsType == ExceptionHandling::ARM);
-    }
-
-    bool doesDwarfRequireFrameSection() const {
-      return DwarfRequiresFrameSection;
+        (ExceptionsType == ExceptionHandling::DwarfCFI ||
+         ExceptionsType == ExceptionHandling::ARM ||
+         ExceptionsType == ExceptionHandling::Win64);
     }
     bool doesDwarfUsesInlineInfoSection() const {
       return DwarfUsesInlineInfoSection;
@@ -476,8 +469,8 @@ namespace llvm {
     const char *getDwarfSectionOffsetDirective() const {
       return DwarfSectionOffsetDirective;
     }
-    bool doesDwarfUsesAbsoluteLabelForStmtList() const {
-      return DwarfUsesAbsoluteLabelForStmtList;
+    bool doesDwarfRequireRelocationForSectionOffset() const {
+      return DwarfRequiresRelocationForSectionOffset;
     }
     bool doesDwarfUsesLabelOffsetForRanges() const {
       return DwarfUsesLabelOffsetForRanges;
diff --git a/include/llvm/MC/MCDwarf.h b/include/llvm/MC/MCDwarf.h
index 3bbcf3e..90c3728 100644
--- a/include/llvm/MC/MCDwarf.h
+++ b/include/llvm/MC/MCDwarf.h
@@ -281,11 +281,10 @@ namespace llvm {
     //
     // This emits the frame info section.
     //
-    static void Emit(MCStreamer &streamer, bool usingCFI);
-    static void EmitDarwin(MCStreamer &streamer, bool usingCFI);
+    static void Emit(MCStreamer &streamer, bool usingCFI,
+                     bool isEH);
     static void EmitAdvanceLoc(MCStreamer &Streamer, uint64_t AddrDelta);
-    static void EncodeAdvanceLoc(uint64_t AddrDelta, raw_ostream &OS,
-                                 const TargetAsmInfo &AsmInfo);
+    static void EncodeAdvanceLoc(uint64_t AddrDelta, raw_ostream &OS);
   };
 } // end namespace llvm
 
diff --git a/include/llvm/MC/MCELFSymbolFlags.h b/include/llvm/MC/MCELFSymbolFlags.h
index d798fb0..2225ea0 100644
--- a/include/llvm/MC/MCELFSymbolFlags.h
+++ b/include/llvm/MC/MCELFSymbolFlags.h
@@ -49,7 +49,8 @@ namespace llvm {
       ELF_STV_Hidden    = (ELF::STV_HIDDEN    << ELF_STV_Shift),
       ELF_STV_Protected = (ELF::STV_PROTECTED << ELF_STV_Shift),
 
-      ELF_Other_Weakref = (1                  << ELF_Other_Shift)
+      ELF_Other_Weakref = (1                  << ELF_Other_Shift),
+      ELF_Other_ThumbFunc = (2                << ELF_Other_Shift)
   };
 
 } // end namespace llvm
diff --git a/include/llvm/MC/MCExpr.h b/include/llvm/MC/MCExpr.h
index 521fde6..0f28599 100644
--- a/include/llvm/MC/MCExpr.h
+++ b/include/llvm/MC/MCExpr.h
@@ -171,8 +171,10 @@ public:
     VK_ARM_GOTTPOFF,
 
     VK_PPC_TOC,
-    VK_PPC_HA16,  // ha16(symbol)
-    VK_PPC_LO16   // lo16(symbol)
+    VK_PPC_DARWIN_HA16,  // ha16(symbol)
+    VK_PPC_DARWIN_LO16,  // lo16(symbol)
+    VK_PPC_GAS_HA16,     // symbol@ha
+    VK_PPC_GAS_LO16      // symbol@l
   };
 
 private:
diff --git a/include/llvm/MC/MCInstPrinter.h b/include/llvm/MC/MCInstPrinter.h
index 0669558..39002da 100644
--- a/include/llvm/MC/MCInstPrinter.h
+++ b/include/llvm/MC/MCInstPrinter.h
@@ -45,8 +45,8 @@ public:
   /// "MOV32ri") or empty if we can't resolve it.
   virtual StringRef getOpcodeName(unsigned Opcode) const;
 
-  /// getRegName - Return the assembler register name.
-  virtual StringRef getRegName(unsigned RegNo) const;
+  /// printRegName - Print the assembler register name.
+  virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
 
   unsigned getAvailableFeatures() const { return AvailableFeatures; }
   void setAvailableFeatures(unsigned Value) { AvailableFeatures = Value; }
diff --git a/include/llvm/MC/MCParser/MCAsmLexer.h b/include/llvm/MC/MCParser/MCAsmLexer.h
index 606725a..47c580f 100644
--- a/include/llvm/MC/MCParser/MCAsmLexer.h
+++ b/include/llvm/MC/MCParser/MCAsmLexer.h
@@ -44,6 +44,7 @@ public:
     Colon,
     Plus, Minus, Tilde,
     Slash,    // '/'
+    BackSlash, // '\'
     LParen, RParen, LBrac, RBrac, LCurly, RCurly,
     Star, Dot, Comma, Dollar, Equal, EqualEqual,
 
diff --git a/include/llvm/MC/MCParser/MCAsmParser.h b/include/llvm/MC/MCParser/MCAsmParser.h
index 54979d9..7376693 100644
--- a/include/llvm/MC/MCParser/MCAsmParser.h
+++ b/include/llvm/MC/MCParser/MCAsmParser.h
@@ -71,7 +71,9 @@ public:
 
   /// Warning - Emit a warning at the location \arg L, with the message \arg
   /// Msg.
-  virtual void Warning(SMLoc L, const Twine &Msg) = 0;
+  ///
+  /// \return The return value is true, if warnings are fatal.
+  virtual bool Warning(SMLoc L, const Twine &Msg) = 0;
 
   /// Error - Emit an error at the location \arg L, with the message \arg
   /// Msg.
diff --git a/include/llvm/MC/MCParser/MCAsmParserExtension.h b/include/llvm/MC/MCParser/MCAsmParserExtension.h
index ceb57f5..4e2aee9 100644
--- a/include/llvm/MC/MCParser/MCAsmParserExtension.h
+++ b/include/llvm/MC/MCParser/MCAsmParserExtension.h
@@ -56,7 +56,7 @@ public:
   MCAsmParser &getParser() { return *Parser; }
   SourceMgr &getSourceManager() { return getParser().getSourceManager(); }
   MCStreamer &getStreamer() { return getParser().getStreamer(); }
-  void Warning(SMLoc L, const Twine &Msg) {
+  bool Warning(SMLoc L, const Twine &Msg) {
     return getParser().Warning(L, Msg);
   }
   bool Error(SMLoc L, const Twine &Msg) {
diff --git a/include/llvm/MC/MCStreamer.h b/include/llvm/MC/MCStreamer.h
index b005c8b..c05a925 100644
--- a/include/llvm/MC/MCStreamer.h
+++ b/include/llvm/MC/MCStreamer.h
@@ -18,6 +18,7 @@
 #include "llvm/Support/DataTypes.h"
 #include "llvm/MC/MCDirectives.h"
 #include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCWin64EH.h"
 
 namespace llvm {
   class MCAsmInfo;
@@ -50,10 +51,18 @@ namespace llvm {
     MCStreamer(const MCStreamer&); // DO NOT IMPLEMENT
     MCStreamer &operator=(const MCStreamer&); // DO NOT IMPLEMENT
 
+    bool EmitEHFrame;
+    bool EmitDebugFrame;
+
     std::vector<MCDwarfFrameInfo> FrameInfos;
     MCDwarfFrameInfo *getCurrentFrameInfo();
     void EnsureValidFrame();
 
+    std::vector<MCWin64EHUnwindInfo *> W64UnwindInfos;
+    MCWin64EHUnwindInfo *CurrentW64UnwindInfo;
+    void setCurrentW64UnwindInfo(MCWin64EHUnwindInfo *Frame);
+    void EnsureValidW64UnwindInfo();
+
     const MCSymbol* LastNonPrivate;
 
     /// SectionStack - This is stack of current and previous section
@@ -67,8 +76,12 @@ namespace llvm {
     const MCExpr *BuildSymbolDiff(MCContext &Context, const MCSymbol *A,
                                   const MCSymbol *B);
 
-    const MCExpr *ForceExpAbs(MCStreamer *Streamer, MCContext &Context,
-                              const MCExpr* Expr);
+    const MCExpr *ForceExpAbs(const MCExpr* Expr);
+
+    void EmitFrames(bool usingCFI);
+
+    MCWin64EHUnwindInfo *getCurrentW64UnwindInfo(){return CurrentW64UnwindInfo;}
+    void EmitW64Tables();
 
   public:
     virtual ~MCStreamer();
@@ -83,6 +96,14 @@ namespace llvm {
       return FrameInfos[i];
     }
 
+    unsigned getNumW64UnwindInfos() {
+      return W64UnwindInfos.size();
+    }
+
+    MCWin64EHUnwindInfo &getW64UnwindInfo(unsigned i) {
+      return *W64UnwindInfos[i];
+    }
+
     /// @name Assembly File Formatting.
     /// @{
 
@@ -174,6 +195,17 @@ namespace llvm {
       }
     }
 
+    /// SwitchSectionNoChange - Set the current section where code is being
+    /// emitted to @p Section.  This is required to update CurSection. This
+    /// version does not call ChangeSection.
+    void SwitchSectionNoChange(const MCSection *Section) {
+      assert(Section && "Cannot switch to a null section!");
+      const MCSection *curSection = SectionStack.back().first;
+      SectionStack.back().second = curSection;
+      if (Section != curSection)
+        SectionStack.back().first = Section;
+    }
+
     /// InitSections - Create the default sections and set the initial one.
     virtual void InitSections() = 0;
 
@@ -288,6 +320,7 @@ namespace llvm {
     /// if non-zero.  This must be a power of 2 on some targets.
     virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
                                 uint64_t Size, unsigned ByteAlignment = 0) = 0;
+
     /// @}
     /// @name Generating Data
     /// @{
@@ -436,6 +469,7 @@ namespace llvm {
     void EmitDwarfSetLineAddr(int64_t LineDelta, const MCSymbol *Label,
                               int PointerSize);
 
+    virtual void EmitCFISections(bool EH, bool Debug);
     virtual void EmitCFIStartProc();
     virtual void EmitCFIEndProc();
     virtual void EmitCFIDefCfa(int64_t Register, int64_t Offset);
@@ -450,6 +484,21 @@ namespace llvm {
     virtual void EmitCFIRelOffset(int64_t Register, int64_t Offset);
     virtual void EmitCFIAdjustCfaOffset(int64_t Adjustment);
 
+    virtual void EmitWin64EHStartProc(const MCSymbol *Symbol);
+    virtual void EmitWin64EHEndProc();
+    virtual void EmitWin64EHStartChained();
+    virtual void EmitWin64EHEndChained();
+    virtual void EmitWin64EHHandler(const MCSymbol *Sym, bool Unwind,
+                                    bool Except);
+    virtual void EmitWin64EHHandlerData();
+    virtual void EmitWin64EHPushReg(unsigned Register);
+    virtual void EmitWin64EHSetFrame(unsigned Register, unsigned Offset);
+    virtual void EmitWin64EHAllocStack(unsigned Size);
+    virtual void EmitWin64EHSaveReg(unsigned Register, unsigned Offset);
+    virtual void EmitWin64EHSaveXMM(unsigned Register, unsigned Offset);
+    virtual void EmitWin64EHPushFrame(bool Code);
+    virtual void EmitWin64EHEndProlog();
+
     /// EmitInstruction - Emit the given @p Instruction into the current
     /// section.
     virtual void EmitInstruction(const MCInst &Inst) = 0;
diff --git a/include/llvm/MC/MCWin64EH.h b/include/llvm/MC/MCWin64EH.h
new file mode 100644
index 0000000..eb4665a
--- /dev/null
+++ b/include/llvm/MC/MCWin64EH.h
@@ -0,0 +1,93 @@
+//===- MCWin64EH.h - Machine Code Win64 EH support --------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains declarations to support the Win64 Exception Handling
+// scheme in MC.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_MC_MCWIN64EH_H
+#define LLVM_MC_MCWIN64EH_H
+
+#include "llvm/Support/Win64EH.h"
+#include <cassert>
+#include <vector>
+
+namespace llvm {
+  class StringRef;
+  class MCStreamer;
+  class MCSymbol;
+
+  class MCWin64EHInstruction {
+  public:
+    typedef Win64EH::UnwindOpcodes OpType;
+  private:
+    OpType Operation;
+    MCSymbol *Label;
+    unsigned Offset;
+    unsigned Register;
+  public:
+    MCWin64EHInstruction(OpType Op, MCSymbol *L, unsigned Reg)
+      : Operation(Op), Label(L), Offset(0), Register(Reg) {
+     assert(Op == Win64EH::UOP_PushNonVol);
+    }
+    MCWin64EHInstruction(MCSymbol *L, unsigned Size)
+      : Operation(Size>128 ? Win64EH::UOP_AllocLarge : Win64EH::UOP_AllocSmall),
+        Label(L), Offset(Size) { }
+    MCWin64EHInstruction(OpType Op, MCSymbol *L, unsigned Reg, unsigned Off)
+      : Operation(Op), Label(L), Offset(Off), Register(Reg) {
+      assert(Op == Win64EH::UOP_SetFPReg ||
+             Op == Win64EH::UOP_SaveNonVol ||
+             Op == Win64EH::UOP_SaveNonVolBig ||
+             Op == Win64EH::UOP_SaveXMM128 ||
+             Op == Win64EH::UOP_SaveXMM128Big);
+    }
+    MCWin64EHInstruction(OpType Op, MCSymbol *L, bool Code)
+      : Operation(Op), Label(L), Offset(Code ? 1 : 0) {
+      assert(Op == Win64EH::UOP_PushMachFrame);
+    }
+    OpType getOperation() const { return Operation; }
+    MCSymbol *getLabel() const { return Label; }
+    unsigned getOffset() const { return Offset; }
+    unsigned getSize() const { return Offset; }
+    unsigned getRegister() const { return Register; }
+    bool isPushCodeFrame() const { return Offset == 1; }
+  };
+
+  struct MCWin64EHUnwindInfo {
+    MCWin64EHUnwindInfo() : Begin(0), End(0), ExceptionHandler(0),
+                            Function(0), PrologEnd(0), Symbol(0),
+                            HandlesUnwind(false), HandlesExceptions(false),
+                            LastFrameInst(-1), ChainedParent(0),
+                            Instructions() {}
+    MCSymbol *Begin;
+    MCSymbol *End;
+    const MCSymbol *ExceptionHandler;
+    const MCSymbol *Function;
+    MCSymbol *PrologEnd;
+    MCSymbol *Symbol;
+    bool HandlesUnwind;
+    bool HandlesExceptions;
+    int LastFrameInst;
+    MCWin64EHUnwindInfo *ChainedParent;
+    std::vector<MCWin64EHInstruction> Instructions;
+  };
+
+  class MCWin64EHUnwindEmitter {
+  public:
+    static StringRef GetSectionSuffix(const MCSymbol *func);
+    //
+    // This emits the unwind info sections (.pdata and .xdata in PE/COFF).
+    //
+    static void Emit(MCStreamer &streamer);
+    static void EmitUnwindInfo(MCStreamer &streamer, MCWin64EHUnwindInfo *info);
+  };
+} // end namespace llvm
+
+#endif
diff --git a/include/llvm/Metadata.h b/include/llvm/Metadata.h
index c323025..887e33c 100644
--- a/include/llvm/Metadata.h
+++ b/include/llvm/Metadata.h
@@ -34,7 +34,7 @@ template<typename ValueSubClass, typename ItemParentClass>
 //===----------------------------------------------------------------------===//
 /// MDString - a single uniqued string.
 /// These are used to efficiently contain a byte sequence for metadata.
-/// MDString is always unnamd.
+/// MDString is always unnamed.
 class MDString : public Value {
   MDString(const MDString &);            // DO NOT IMPLEMENT
 
diff --git a/include/llvm/Operator.h b/include/llvm/Operator.h
index ff2a0ad..e9aa499 100644
--- a/include/llvm/Operator.h
+++ b/include/llvm/Operator.h
@@ -186,28 +186,46 @@ public:
 };
 
 class AddOperator
-  : public ConcreteOperator<OverflowingBinaryOperator, Instruction::Add> {};
+  : public ConcreteOperator<OverflowingBinaryOperator, Instruction::Add> {
+  ~AddOperator(); // DO NOT IMPLEMENT
+};
 class SubOperator
-  : public ConcreteOperator<OverflowingBinaryOperator, Instruction::Sub> {};
+  : public ConcreteOperator<OverflowingBinaryOperator, Instruction::Sub> {
+  ~SubOperator(); // DO NOT IMPLEMENT
+};
 class MulOperator
-  : public ConcreteOperator<OverflowingBinaryOperator, Instruction::Mul> {};
+  : public ConcreteOperator<OverflowingBinaryOperator, Instruction::Mul> {
+  ~MulOperator(); // DO NOT IMPLEMENT
+};
 class ShlOperator
-  : public ConcreteOperator<OverflowingBinaryOperator, Instruction::Shl> {};
+  : public ConcreteOperator<OverflowingBinaryOperator, Instruction::Shl> {
+  ~ShlOperator(); // DO NOT IMPLEMENT
+};
 
   
 class SDivOperator
-  : public ConcreteOperator<PossiblyExactOperator, Instruction::SDiv> {};
+  : public ConcreteOperator<PossiblyExactOperator, Instruction::SDiv> {
+  ~SDivOperator(); // DO NOT IMPLEMENT
+};
 class UDivOperator
-  : public ConcreteOperator<PossiblyExactOperator, Instruction::UDiv> {};
+  : public ConcreteOperator<PossiblyExactOperator, Instruction::UDiv> {
+  ~UDivOperator(); // DO NOT IMPLEMENT
+};
 class AShrOperator
-  : public ConcreteOperator<PossiblyExactOperator, Instruction::AShr> {};
+  : public ConcreteOperator<PossiblyExactOperator, Instruction::AShr> {
+  ~AShrOperator(); // DO NOT IMPLEMENT
+};
 class LShrOperator
-  : public ConcreteOperator<PossiblyExactOperator, Instruction::LShr> {};
+  : public ConcreteOperator<PossiblyExactOperator, Instruction::LShr> {
+  ~LShrOperator(); // DO NOT IMPLEMENT
+};
   
   
   
 class GEPOperator
   : public ConcreteOperator<Operator, Instruction::GetElementPtr> {
+  ~GEPOperator(); // DO NOT IMPLEMENT
+
   enum {
     IsInBounds = (1 << 0)
   };
diff --git a/include/llvm/Support/BranchProbability.h b/include/llvm/Support/BranchProbability.h
new file mode 100644
index 0000000..7ba6491
--- /dev/null
+++ b/include/llvm/Support/BranchProbability.h
@@ -0,0 +1,50 @@
+//===- BranchProbability.h - Branch Probability Analysis --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Definition of BranchProbability shared by IR and Machine Instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_BRANCHPROBABILITY_H
+#define LLVM_SUPPORT_BRANCHPROBABILITY_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+class raw_ostream;
+class BranchProbabilityInfo;
+class MachineBranchProbabilityInfo;
+class MachineBasicBlock;
+
+// This class represents Branch Probability as a non-negative fraction.
+class BranchProbability {
+  friend class BranchProbabilityInfo;
+  friend class MachineBranchProbabilityInfo;
+  friend class MachineBasicBlock;
+
+  // Numerator
+  uint32_t N;
+
+  // Denominator
+  uint32_t D;
+
+  BranchProbability(uint32_t n, uint32_t d);
+
+public:
+  raw_ostream &print(raw_ostream &OS) const;
+
+  void dump() const;
+};
+
+raw_ostream &operator<<(raw_ostream &OS, const BranchProbability &Prob);
+
+}
+
+#endif
diff --git a/include/llvm/Support/Casting.h b/include/llvm/Support/Casting.h
index abb5a9a..3aab436 100644
--- a/include/llvm/Support/Casting.h
+++ b/include/llvm/Support/Casting.h
@@ -23,8 +23,6 @@ namespace llvm {
 //                          isa<x> Support Templates
 //===----------------------------------------------------------------------===//
 
-template<typename FromCl> struct isa_impl_cl;
-
 // Define a template that can be specialized by smart pointers to reflect the
 // fact that they are automatically dereferenced, and are not involved with the
 // template selection process...  the default implementation is a noop.
@@ -43,12 +41,9 @@ template<typename From> struct simplify_type<const From> {
   }
 };
 
-
-// isa<X> - Return true if the parameter to the template is an instance of the
-// template type argument.  Used like this:
-//
-//  if (isa<Type*>(myVal)) { ... }
-//
+// The core of the implementation of isa<X> is here; To and From should be
+// the names of classes.  This template can be specialized to customize the
+// implementation of isa<> without rewriting it from scratch.
 template <typename To, typename From>
 struct isa_impl {
   static inline bool doit(const From &Val) {
@@ -56,66 +51,63 @@ struct isa_impl {
   }
 };
 
-template<typename To, typename From, typename SimpleType>
-struct isa_impl_wrap {
-  // When From != SimplifiedType, we can simplify the type some more by using
-  // the simplify_type template.
-  static bool doit(const From &Val) {
-    return isa_impl_cl<const SimpleType>::template
-                    isa<To>(simplify_type<const From>::getSimplifiedValue(Val));
+template <typename To, typename From> struct isa_impl_cl {
+  static inline bool doit(const From &Val) {
+    return isa_impl<To, From>::doit(Val);
   }
 };
 
-template<typename To, typename FromTy>
-struct isa_impl_wrap<To, const FromTy, const FromTy> {
-  // When From == SimpleType, we are as simple as we are going to get.
-  static bool doit(const FromTy &Val) {
-    return isa_impl<To,FromTy>::doit(Val);
+template <typename To, typename From> struct isa_impl_cl<To, const From> {
+  static inline bool doit(const From &Val) {
+    return isa_impl<To, From>::doit(Val);
   }
 };
 
-// isa_impl_cl - Use class partial specialization to transform types to a single
-// canonical form for isa_impl.
-//
-template<typename FromCl>
-struct isa_impl_cl {
-  template<class ToCl>
-  static bool isa(const FromCl &Val) {
-    return isa_impl_wrap<ToCl,const FromCl,
-                   typename simplify_type<const FromCl>::SimpleType>::doit(Val);
+template <typename To, typename From> struct isa_impl_cl<To, From*> {
+  static inline bool doit(const From *Val) {
+    return isa_impl<To, From>::doit(*Val);
   }
 };
 
-// Specialization used to strip const qualifiers off of the FromCl type...
-template<typename FromCl>
-struct isa_impl_cl<const FromCl> {
-  template<class ToCl>
-  static bool isa(const FromCl &Val) {
-    return isa_impl_cl<FromCl>::template isa<ToCl>(Val);
+template <typename To, typename From> struct isa_impl_cl<To, const From*> {
+  static inline bool doit(const From *Val) {
+    return isa_impl<To, From>::doit(*Val);
   }
 };
 
-// Define pointer traits in terms of base traits...
-template<class FromCl>
-struct isa_impl_cl<FromCl*> {
-  template<class ToCl>
-  static bool isa(FromCl *Val) {
-    return isa_impl_cl<FromCl>::template isa<ToCl>(*Val);
+template <typename To, typename From> struct isa_impl_cl<To, const From*const> {
+  static inline bool doit(const From *Val) {
+    return isa_impl<To, From>::doit(*Val);
   }
 };
 
-// Define reference traits in terms of base traits...
-template<class FromCl>
-struct isa_impl_cl<FromCl&> {
-  template<class ToCl>
-  static bool isa(FromCl &Val) {
-    return isa_impl_cl<FromCl>::template isa<ToCl>(&Val);
+template<typename To, typename From, typename SimpleFrom>
+struct isa_impl_wrap {
+  // When From != SimplifiedType, we can simplify the type some more by using
+  // the simplify_type template.
+  static bool doit(const From &Val) {
+    return isa_impl_wrap<To, SimpleFrom,
+      typename simplify_type<SimpleFrom>::SimpleType>::doit(
+                          simplify_type<From>::getSimplifiedValue(Val));
+  }
+};
+
+template<typename To, typename FromTy>
+struct isa_impl_wrap<To, FromTy, FromTy> {
+  // When From == SimpleType, we are as simple as we are going to get.
+  static bool doit(const FromTy &Val) {
+    return isa_impl_cl<To,FromTy>::doit(Val);
   }
 };
 
+// isa<X> - Return true if the parameter to the template is an instance of the
+// template type argument.  Used like this:
+//
+//  if (isa<Type>(myVal)) { ... }
+//
 template <class X, class Y>
 inline bool isa(const Y &Val) {
-  return isa_impl_cl<Y>::template isa<X>(Val);
+  return isa_impl_wrap<X, Y, typename simplify_type<Y>::SimpleType>::doit(Val);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/include/llvm/Support/CrashRecoveryContext.h b/include/llvm/Support/CrashRecoveryContext.h
index db835e8..4c0a5e2 100644
--- a/include/llvm/Support/CrashRecoveryContext.h
+++ b/include/llvm/Support/CrashRecoveryContext.h
@@ -186,8 +186,13 @@ public:
   }
 
   ~CrashRecoveryContextCleanupRegistrar() {
+    unregister();
+  }
+  
+  void unregister() {
     if (cleanup && !cleanup->cleanupFired)
-        cleanup->getContext()->unregisterCleanup(cleanup);
+      cleanup->getContext()->unregisterCleanup(cleanup);
+    cleanup = 0;
   }
 };
 }
diff --git a/include/llvm/Support/Dwarf.h b/include/llvm/Support/Dwarf.h
index f6d680b..70bac0c 100644
--- a/include/llvm/Support/Dwarf.h
+++ b/include/llvm/Support/Dwarf.h
@@ -235,6 +235,7 @@ enum dwarf_constants {
   DW_AT_APPLE_property_getter = 0x3fe9,
   DW_AT_APPLE_property_setter = 0x3fea,
   DW_AT_APPLE_property_attribute = 0x3feb,
+  DW_AT_APPLE_objc_complete_type = 0x3fec,
 
   // Attribute form encodings
   DW_FORM_addr = 0x01,
diff --git a/include/llvm/Support/IRBuilder.h b/include/llvm/Support/IRBuilder.h
index 3878e79..6a7c277 100644
--- a/include/llvm/Support/IRBuilder.h
+++ b/include/llvm/Support/IRBuilder.h
@@ -80,6 +80,7 @@ public:
   void SetInsertPoint(Instruction *I) {
     BB = I->getParent();
     InsertPt = I;
+    SetCurrentDebugLocation(I->getDebugLoc());
   }
   
   /// SetInsertPoint - This specifies that created instructions should be
@@ -106,6 +107,10 @@ public:
       I->setDebugLoc(CurDbgLocation);
   }
 
+  /// getCurrentFunctionReturnType - Get the return type of the current function
+  /// that we're emitting into.
+  const Type *getCurrentFunctionReturnType() const;
+  
   /// InsertPoint - A saved insertion point.
   class InsertPoint {
     BasicBlock *Block;
@@ -194,6 +199,7 @@ public:
     return ConstantInt::get(getInt64Ty(), C);
   }
   
+  /// getInt - Get a constant integer value.
   ConstantInt *getInt(const APInt &AI) {
     return ConstantInt::get(Context, AI);
   }
@@ -246,10 +252,10 @@ public:
     return Type::getInt8PtrTy(Context, AddrSpace);
   }
 
-  /// getCurrentFunctionReturnType - Get the return type of the current function
-  /// that we're emitting into.
-  const Type *getCurrentFunctionReturnType() const;
-  
+  //===--------------------------------------------------------------------===//
+  // Intrinsic creation methods
+  //===--------------------------------------------------------------------===//
+
   /// CreateMemSet - Create and insert a memset to the specified pointer and the
   /// specified value.  If the pointer isn't an i8*, it will be converted.  If a
   /// TBAA tag is specified, it will be added to the instruction.
@@ -282,6 +288,15 @@ public:
   
   CallInst *CreateMemMove(Value *Dst, Value *Src, Value *Size, unsigned Align,
                           bool isVolatile = false, MDNode *TBAATag = 0);  
+
+  /// CreateLifetimeStart - Create a lifetime.start intrinsic.  If the pointer
+  /// isn't i8* it will be converted.
+  CallInst *CreateLifetimeStart(Value *Ptr, ConstantInt *Size = 0);
+
+  /// CreateLifetimeEnd - Create a lifetime.end intrinsic.  If the pointer isn't
+  /// i8* it will be converted.
+  CallInst *CreateLifetimeEnd(Value *Ptr, ConstantInt *Size = 0);
+
 private:
   Value *getCastedInt8PtrValue(Value *Ptr);
 };
@@ -324,6 +339,7 @@ public:
   explicit IRBuilder(Instruction *IP)
     : IRBuilderBase(IP->getContext()), Folder() {
     SetInsertPoint(IP);
+    SetCurrentDebugLocation(IP->getDebugLoc());
   }
   
   IRBuilder(BasicBlock *TheBB, BasicBlock::iterator IP, const T& F)
diff --git a/include/llvm/Support/MemoryBuffer.h b/include/llvm/Support/MemoryBuffer.h
index d912e86..5e55bd9 100644
--- a/include/llvm/Support/MemoryBuffer.h
+++ b/include/llvm/Support/MemoryBuffer.h
@@ -81,7 +81,7 @@ public:
                                 bool RequiresNullTerminator = true);
 
   /// getMemBuffer - Open the specified memory range as a MemoryBuffer.  Note
-  /// that InputData must be null terminated.
+  /// that InputData must be null terminated if RequiresNullTerminator is true.
   static MemoryBuffer *getMemBuffer(StringRef InputData,
                                     StringRef BufferName = "",
                                     bool RequiresNullTerminator = true);
diff --git a/include/llvm/Support/PassManagerBuilder.h b/include/llvm/Support/PassManagerBuilder.h
new file mode 100644
index 0000000..513bb88
--- /dev/null
+++ b/include/llvm/Support/PassManagerBuilder.h
@@ -0,0 +1,322 @@
+//===-- llvm/Support/PassManagerBuilder.h - Build Standard Pass -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PassManagerBuilder class, which is used to set up a
+// "standard" optimization sequence suitable for languages like C and C++.
+//
+// These are implemented as inline functions so that we do not have to worry
+// about link issues.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_PASSMANAGERBUILDER_H
+#define LLVM_SUPPORT_PASSMANAGERBUILDER_H
+
+#include "llvm/PassManager.h"
+#include "llvm/DefaultPasses.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/IPO.h"
+
+namespace llvm {
+  
+/// PassManagerBuilder - This class is used to set up a standard optimization
+/// sequence for languages like C and C++, allowing some APIs to customize the
+/// pass sequence in various ways. A simple example of using it would be:
+///
+///  PassManagerBuilder Builder;
+///  Builder.OptLevel = 2;
+///  Builder.populateFunctionPassManager(FPM);
+///  Builder.populateModulePassManager(MPM);
+///
+/// In addition to setting up the basic passes, PassManagerBuilder allows
+/// frontends to vend a plugin API, where plugins are allowed to add extensions
+/// to the default pass manager.  They do this by specifying where in the pass
+/// pipeline they want to be added, along with a callback function that adds
+/// the pass(es).  For example, a plugin that wanted to add a loop optimization
+/// could do something like this:
+///
+/// static void addMyLoopPass(const PMBuilder &Builder, PassManagerBase &PM) {
+///   if (Builder.getOptLevel() > 2 && Builder.getOptSizeLevel() == 0)
+///     PM.add(createMyAwesomePass());
+/// }
+///   ...
+///   Builder.addExtension(PassManagerBuilder::EP_LoopOptimizerEnd,
+///                        addMyLoopPass);
+///   ...
+class PassManagerBuilder {
+public:
+  
+  /// Extensions are passed the builder itself (so they can see how it is
+  /// configured) as well as the pass manager to add stuff to.
+  typedef void (*ExtensionFn)(const PassManagerBuilder &Builder,
+                              PassManagerBase &PM);
+  enum ExtensionPointTy {
+    /// EP_EarlyAsPossible - This extension point allows adding passes before
+    /// any other transformations, allowing them to see the code as it is coming
+    /// out of the frontend.
+    EP_EarlyAsPossible,
+    
+    /// EP_LoopOptimizerEnd - This extension point allows adding loop passes to
+    /// the end of the loop optimizer.
+    EP_LoopOptimizerEnd
+  };
+  
+  /// The Optimization Level - Specify the basic optimization level.
+  ///    0 = -O0, 1 = -O1, 2 = -O2, 3 = -O3
+  unsigned OptLevel;
+  
+  /// SizeLevel - How much we're optimizing for size.
+  ///    0 = none, 1 = -Os, 2 = -Oz
+  unsigned SizeLevel;
+  
+  /// LibraryInfo - Specifies information about the runtime library for the
+  /// optimizer.  If this is non-null, it is added to both the function and
+  /// per-module pass pipeline.
+  TargetLibraryInfo *LibraryInfo;
+  
+  /// Inliner - Specifies the inliner to use.  If this is non-null, it is
+  /// added to the per-module passes.
+  Pass *Inliner;
+  
+  bool DisableSimplifyLibCalls;
+  bool DisableUnitAtATime;
+  bool DisableUnrollLoops;
+  
+private:
+  /// ExtensionList - This is list of all of the extensions that are registered.
+  std::vector<std::pair<ExtensionPointTy, ExtensionFn> > Extensions;
+  
+public:
+  PassManagerBuilder() {
+    OptLevel = 2;
+    SizeLevel = 0;
+    LibraryInfo = 0;
+    Inliner = 0;
+    DisableSimplifyLibCalls = false;
+    DisableUnitAtATime = false;
+    DisableUnrollLoops = false;
+  }
+  
+  ~PassManagerBuilder() {
+    delete LibraryInfo;
+    delete Inliner;
+  }
+  
+  void addExtension(ExtensionPointTy Ty, ExtensionFn Fn) {
+    Extensions.push_back(std::make_pair(Ty, Fn));
+  }
+  
+private:
+  void addExtensionsToPM(ExtensionPointTy ETy, PassManagerBase &PM) const {
+    for (unsigned i = 0, e = Extensions.size(); i != e; ++i)
+      if (Extensions[i].first == ETy)
+        Extensions[i].second(*this, PM);
+  }
+  
+  void addInitialAliasAnalysisPasses(PassManagerBase &PM) const {
+    // Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that
+    // BasicAliasAnalysis wins if they disagree. This is intended to help
+    // support "obvious" type-punning idioms.
+    PM.add(createTypeBasedAliasAnalysisPass());
+    PM.add(createBasicAliasAnalysisPass());
+  }
+public:
+  
+  /// populateFunctionPassManager - This fills in the function pass manager,
+  /// which is expected to be run on each function immediately as it is
+  /// generated.  The idea is to reduce the size of the IR in memory.
+  void populateFunctionPassManager(FunctionPassManager &FPM) {
+    addExtensionsToPM(EP_EarlyAsPossible, FPM);
+    
+    // Add LibraryInfo if we have some.
+    if (LibraryInfo) FPM.add(new TargetLibraryInfo(*LibraryInfo));
+
+    if (OptLevel == 0) return;
+
+    addInitialAliasAnalysisPasses(FPM);
+    
+    FPM.add(createCFGSimplificationPass());
+    FPM.add(createScalarReplAggregatesPass());
+    FPM.add(createEarlyCSEPass());
+  }
+  
+  /// populateModulePassManager - This sets up the primary pass manager.
+  void populateModulePassManager(PassManagerBase &MPM) {
+    // If all optimizations are disabled, just run the always-inline pass.
+    if (OptLevel == 0) {
+      if (Inliner) {
+        MPM.add(Inliner);
+        Inliner = 0;
+      }
+      return;
+    }
+      
+    // Add LibraryInfo if we have some.
+    if (LibraryInfo) MPM.add(new TargetLibraryInfo(*LibraryInfo));
+
+    addInitialAliasAnalysisPasses(MPM);
+    
+    if (!DisableUnitAtATime) {
+      MPM.add(createGlobalOptimizerPass());     // Optimize out global vars
+      
+      MPM.add(createIPSCCPPass());              // IP SCCP
+      MPM.add(createDeadArgEliminationPass());  // Dead argument elimination
+      
+      MPM.add(createInstructionCombiningPass());// Clean up after IPCP & DAE
+      MPM.add(createCFGSimplificationPass());   // Clean up after IPCP & DAE
+    }
+    
+    // Start of CallGraph SCC passes.
+    if (!DisableUnitAtATime)
+      MPM.add(createPruneEHPass());             // Remove dead EH info
+    if (Inliner) {
+      MPM.add(Inliner);
+      Inliner = 0;
+    }
+    if (!DisableUnitAtATime)
+      MPM.add(createFunctionAttrsPass());       // Set readonly/readnone attrs
+    if (OptLevel > 2)
+      MPM.add(createArgumentPromotionPass());   // Scalarize uninlined fn args
+    
+    // Start of function pass.
+    // Break up aggregate allocas, using SSAUpdater.
+    MPM.add(createScalarReplAggregatesPass(-1, false));
+    MPM.add(createEarlyCSEPass());              // Catch trivial redundancies
+    if (!DisableSimplifyLibCalls)
+      MPM.add(createSimplifyLibCallsPass());    // Library Call Optimizations
+    MPM.add(createJumpThreadingPass());         // Thread jumps.
+    MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals
+    MPM.add(createCFGSimplificationPass());     // Merge & remove BBs
+    MPM.add(createInstructionCombiningPass());  // Combine silly seq's
+    
+    MPM.add(createTailCallEliminationPass());   // Eliminate tail calls
+    MPM.add(createCFGSimplificationPass());     // Merge & remove BBs
+    MPM.add(createReassociatePass());           // Reassociate expressions
+    MPM.add(createLoopRotatePass());            // Rotate Loop
+    MPM.add(createLICMPass());                  // Hoist loop invariants
+    MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3));
+    MPM.add(createInstructionCombiningPass());  
+    MPM.add(createIndVarSimplifyPass());        // Canonicalize indvars
+    MPM.add(createLoopIdiomPass());             // Recognize idioms like memset.
+    MPM.add(createLoopDeletionPass());          // Delete dead loops
+    if (!DisableUnrollLoops)
+      MPM.add(createLoopUnrollPass());          // Unroll small loops
+    addExtensionsToPM(EP_LoopOptimizerEnd, MPM);
+
+    if (OptLevel > 1)
+      MPM.add(createGVNPass());                 // Remove redundancies
+    MPM.add(createMemCpyOptPass());             // Remove memcpy / form memset
+    MPM.add(createSCCPPass());                  // Constant prop with SCCP
+    
+    // Run instcombine after redundancy elimination to exploit opportunities
+    // opened up by them.
+    MPM.add(createInstructionCombiningPass());
+    MPM.add(createJumpThreadingPass());         // Thread jumps
+    MPM.add(createCorrelatedValuePropagationPass());
+    MPM.add(createDeadStoreEliminationPass());  // Delete dead stores
+    MPM.add(createAggressiveDCEPass());         // Delete dead instructions
+    MPM.add(createCFGSimplificationPass());     // Merge & remove BBs
+    MPM.add(createInstructionCombiningPass());  // Clean up after everything.
+    
+    if (!DisableUnitAtATime) {
+      MPM.add(createStripDeadPrototypesPass()); // Get rid of dead prototypes
+      MPM.add(createDeadTypeEliminationPass()); // Eliminate dead types
+      
+      // GlobalOpt already deletes dead functions and globals, at -O3 try a
+      // late pass of GlobalDCE.  It is capable of deleting dead cycles.
+      if (OptLevel > 2)
+        MPM.add(createGlobalDCEPass());         // Remove dead fns and globals.
+      
+      if (OptLevel > 1)
+        MPM.add(createConstantMergePass());     // Merge dup global constants
+    }
+  }
+  
+  void populateLTOPassManager(PassManagerBase &PM, bool Internalize,
+                              bool RunInliner) {
+    // Provide AliasAnalysis services for optimizations.
+    addInitialAliasAnalysisPasses(PM);
+    
+    // Now that composite has been compiled, scan through the module, looking
+    // for a main function.  If main is defined, mark all other functions
+    // internal.
+    if (Internalize)
+      PM.add(createInternalizePass(true));
+    
+    // Propagate constants at call sites into the functions they call.  This
+    // opens opportunities for globalopt (and inlining) by substituting function
+    // pointers passed as arguments to direct uses of functions.  
+    PM.add(createIPSCCPPass());
+    
+    // Now that we internalized some globals, see if we can hack on them!
+    PM.add(createGlobalOptimizerPass());
+    
+    // Linking modules together can lead to duplicated global constants, only
+    // keep one copy of each constant.
+    PM.add(createConstantMergePass());
+    
+    // Remove unused arguments from functions.
+    PM.add(createDeadArgEliminationPass());
+    
+    // Reduce the code after globalopt and ipsccp.  Both can open up significant
+    // simplification opportunities, and both can propagate functions through
+    // function pointers.  When this happens, we often have to resolve varargs
+    // calls, etc, so let instcombine do this.
+    PM.add(createInstructionCombiningPass());
+
+    // Inline small functions
+    if (RunInliner)
+      PM.add(createFunctionInliningPass());
+    
+    PM.add(createPruneEHPass());   // Remove dead EH info.
+    
+    // Optimize globals again if we ran the inliner.
+    if (RunInliner)
+      PM.add(createGlobalOptimizerPass());
+    PM.add(createGlobalDCEPass()); // Remove dead functions.
+    
+    // If we didn't decide to inline a function, check to see if we can
+    // transform it to pass arguments by value instead of by reference.
+    PM.add(createArgumentPromotionPass());
+    
+    // The IPO passes may leave cruft around.  Clean up after them.
+    PM.add(createInstructionCombiningPass());
+    PM.add(createJumpThreadingPass());
+    // Break up allocas
+    PM.add(createScalarReplAggregatesPass());
+    
+    // Run a few AA driven optimizations here and now, to cleanup the code.
+    PM.add(createFunctionAttrsPass()); // Add nocapture.
+    PM.add(createGlobalsModRefPass()); // IP alias analysis.
+    
+    PM.add(createLICMPass());      // Hoist loop invariants.
+    PM.add(createGVNPass());       // Remove redundancies.
+    PM.add(createMemCpyOptPass()); // Remove dead memcpys.
+    // Nuke dead stores.
+    PM.add(createDeadStoreEliminationPass());
+    
+    // Cleanup and simplify the code after the scalar optimizations.
+    PM.add(createInstructionCombiningPass());
+    
+    PM.add(createJumpThreadingPass());
+    
+    // Delete basic blocks, which optimization passes may have killed.
+    PM.add(createCFGSimplificationPass());
+   
+    // Now that we have optimized the program, discard unreachable functions.
+    PM.add(createGlobalDCEPass());
+  }
+};
+
+  
+} // end namespace llvm
+#endif
diff --git a/include/llvm/Support/PatternMatch.h b/include/llvm/Support/PatternMatch.h
index 172480e..f0fb516 100644
--- a/include/llvm/Support/PatternMatch.h
+++ b/include/llvm/Support/PatternMatch.h
@@ -694,6 +694,99 @@ inline brc_match<Cond_t> m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F) {
   return brc_match<Cond_t>(C, T, F);
 }
 
+
+//===----------------------------------------------------------------------===//
+// Matchers for max/min idioms, eg: "select (sgt x, y), x, y" -> smax(x,y).
+//
+
+template<typename LHS_t, typename RHS_t, typename Pred_t>
+struct MaxMin_match {
+  LHS_t L;
+  RHS_t R;
+
+  MaxMin_match(const LHS_t &LHS, const RHS_t &RHS)
+    : L(LHS), R(RHS) {}
+
+  template<typename OpTy>
+  bool match(OpTy *V) {
+    // Look for "(x pred y) ? x : y" or "(x pred y) ? y : x".
+    SelectInst *SI = dyn_cast<SelectInst>(V);
+    if (!SI)
+      return false;
+    ICmpInst *Cmp = dyn_cast<ICmpInst>(SI->getCondition());
+    if (!Cmp)
+      return false;
+    // At this point we have a select conditioned on a comparison.  Check that
+    // it is the values returned by the select that are being compared.
+    Value *TrueVal = SI->getTrueValue();
+    Value *FalseVal = SI->getFalseValue();
+    Value *LHS = Cmp->getOperand(0);
+    Value *RHS = Cmp->getOperand(1);
+    if ((TrueVal != LHS || FalseVal != RHS) &&
+        (TrueVal != RHS || FalseVal != LHS))
+      return false;
+    ICmpInst::Predicate Pred = LHS == TrueVal ?
+      Cmp->getPredicate() : Cmp->getSwappedPredicate();
+    // Does "(x pred y) ? x : y" represent the desired max/min operation?
+    if (!Pred_t::match(Pred))
+      return false;
+    // It does!  Bind the operands.
+    return L.match(LHS) && R.match(RHS);
+  }
+};
+
+/// smax_pred_ty - Helper class for identifying signed max predicates.
+struct smax_pred_ty {
+  static bool match(ICmpInst::Predicate Pred) {
+    return Pred == CmpInst::ICMP_SGT || Pred == CmpInst::ICMP_SGE;
+  }
+};
+
+/// smin_pred_ty - Helper class for identifying signed min predicates.
+struct smin_pred_ty {
+  static bool match(ICmpInst::Predicate Pred) {
+    return Pred == CmpInst::ICMP_SLT || Pred == CmpInst::ICMP_SLE;
+  }
+};
+
+/// umax_pred_ty - Helper class for identifying unsigned max predicates.
+struct umax_pred_ty {
+  static bool match(ICmpInst::Predicate Pred) {
+    return Pred == CmpInst::ICMP_UGT || Pred == CmpInst::ICMP_UGE;
+  }
+};
+
+/// umin_pred_ty - Helper class for identifying unsigned min predicates.
+struct umin_pred_ty {
+  static bool match(ICmpInst::Predicate Pred) {
+    return Pred == CmpInst::ICMP_ULT || Pred == CmpInst::ICMP_ULE;
+  }
+};
+
+template<typename LHS, typename RHS>
+inline MaxMin_match<LHS, RHS, smax_pred_ty>
+m_SMax(const LHS &L, const RHS &R) {
+  return MaxMin_match<LHS, RHS, smax_pred_ty>(L, R);
+}
+
+template<typename LHS, typename RHS>
+inline MaxMin_match<LHS, RHS, smin_pred_ty>
+m_SMin(const LHS &L, const RHS &R) {
+  return MaxMin_match<LHS, RHS, smin_pred_ty>(L, R);
+}
+
+template<typename LHS, typename RHS>
+inline MaxMin_match<LHS, RHS, umax_pred_ty>
+m_UMax(const LHS &L, const RHS &R) {
+  return MaxMin_match<LHS, RHS, umax_pred_ty>(L, R);
+}
+
+template<typename LHS, typename RHS>
+inline MaxMin_match<LHS, RHS, umin_pred_ty>
+m_UMin(const LHS &L, const RHS &R) {
+  return MaxMin_match<LHS, RHS, umin_pred_ty>(L, R);
+}
+
 } // end namespace PatternMatch
 } // end namespace llvm
 
diff --git a/include/llvm/Support/Program.h b/include/llvm/Support/Program.h
index 96b3566..a502657 100644
--- a/include/llvm/Support/Program.h
+++ b/include/llvm/Support/Program.h
@@ -85,8 +85,9 @@ namespace sys {
     /// This function waits for the program to exit. This function will block
     /// the current program until the invoked program exits.
     /// @returns an integer result code indicating the status of the program.
-    /// A zero or positive value indicates the result code of the program. A
-    /// negative value is the signal number on which it terminated.
+    /// A zero or positive value indicates the result code of the program.
+    /// -1 indicates failure to execute
+    /// -2 indicates a crash during execution or timeout
     /// @see Execute
     /// @brief Waits for the program to exit.
     int Wait
diff --git a/include/llvm/Support/SourceMgr.h b/include/llvm/Support/SourceMgr.h
index 2a712e4..030db8f 100644
--- a/include/llvm/Support/SourceMgr.h
+++ b/include/llvm/Support/SourceMgr.h
@@ -106,7 +106,9 @@ public:
   /// AddIncludeFile - Search for a file with the specified name in the current
   /// directory or in one of the IncludeDirs.  If no file is found, this returns
   /// ~0, otherwise it returns the buffer ID of the stacked file.
-  unsigned AddIncludeFile(const std::string &Filename, SMLoc IncludeLoc);
+  /// The full path to the included file can be found in IncludedFile.
+  unsigned AddIncludeFile(const std::string &Filename, SMLoc IncludeLoc,
+                          std::string &IncludedFile);
 
   /// FindBufferContainingLoc - Return the ID of the buffer containing the
   /// specified location, returning -1 if not found.
diff --git a/include/llvm/Support/StandardPasses.h b/include/llvm/Support/StandardPasses.h
deleted file mode 100644
index 8dfd6f9..0000000
--- a/include/llvm/Support/StandardPasses.h
+++ /dev/null
@@ -1,244 +0,0 @@
-//===-- llvm/Support/StandardPasses.h - Standard pass lists -----*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines utility functions for creating a "standard" set of
-// optimization passes, so that compilers and tools which use optimization
-// passes use the same set of standard passes.
-//
-// These are implemented as inline functions so that we do not have to worry
-// about link issues.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_SUPPORT_STANDARDPASSES_H
-#define LLVM_SUPPORT_STANDARDPASSES_H
-
-#include "llvm/PassManager.h"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/Analysis/Verifier.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/IPO.h"
-
-namespace llvm {
-
-  static inline void createStandardAliasAnalysisPasses(PassManagerBase *PM) {
-    // Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that
-    // BasicAliasAnalysis wins if they disagree. This is intended to help
-    // support "obvious" type-punning idioms.
-    PM->add(createTypeBasedAliasAnalysisPass());
-    PM->add(createBasicAliasAnalysisPass());
-  }
-
-  /// createStandardFunctionPasses - Add the standard list of function passes to
-  /// the provided pass manager.
-  ///
-  /// \arg OptimizationLevel - The optimization level, corresponding to -O0,
-  /// -O1, etc.
-  static inline void createStandardFunctionPasses(PassManagerBase *PM,
-                                                  unsigned OptimizationLevel) {
-    if (OptimizationLevel > 0) {
-      createStandardAliasAnalysisPasses(PM);
-      PM->add(createCFGSimplificationPass());
-      PM->add(createScalarReplAggregatesPass());
-      PM->add(createEarlyCSEPass());
-    }
-  }
-
-  /// createStandardModulePasses - Add the standard list of module passes to the
-  /// provided pass manager.
-  ///
-  /// \arg OptimizationLevel - The optimization level, corresponding to -O0,
-  /// -O1, etc.
-  /// \arg OptimizeSize - Whether the transformations should optimize for size.
-  /// \arg UnitAtATime - Allow passes which may make global module changes.
-  /// \arg UnrollLoops - Allow loop unrolling.
-  /// \arg SimplifyLibCalls - Allow library calls to be simplified.
-  /// \arg HaveExceptions - Whether the module may have code using exceptions.
-  /// \arg InliningPass - The inlining pass to use, if any, or null. This will
-  /// always be added, even at -O0.a
-  static inline void createStandardModulePasses(PassManagerBase *PM,
-                                                unsigned OptimizationLevel,
-                                                bool OptimizeSize,
-                                                bool UnitAtATime,
-                                                bool UnrollLoops,
-                                                bool SimplifyLibCalls,
-                                                bool HaveExceptions,
-                                                Pass *InliningPass) {
-    createStandardAliasAnalysisPasses(PM);
-
-    // If all optimizations are disabled, just run the always-inline pass.
-    if (OptimizationLevel == 0) {
-      if (InliningPass)
-        PM->add(InliningPass);
-      return;
-    }
-    
-    if (UnitAtATime) {
-      PM->add(createGlobalOptimizerPass());     // Optimize out global vars
-      
-      PM->add(createIPSCCPPass());              // IP SCCP
-      PM->add(createDeadArgEliminationPass());  // Dead argument elimination
-      
-      PM->add(createInstructionCombiningPass());// Clean up after IPCP & DAE
-      PM->add(createCFGSimplificationPass());   // Clean up after IPCP & DAE
-    }
-    
-    // Start of CallGraph SCC passes.
-    if (UnitAtATime && HaveExceptions)
-      PM->add(createPruneEHPass());             // Remove dead EH info
-    if (InliningPass)
-      PM->add(InliningPass);
-    if (UnitAtATime)
-      PM->add(createFunctionAttrsPass());       // Set readonly/readnone attrs
-    if (OptimizationLevel > 2)
-      PM->add(createArgumentPromotionPass());   // Scalarize uninlined fn args
-    
-    // Start of function pass.
-    // Break up aggregate allocas, using SSAUpdater.
-    PM->add(createScalarReplAggregatesPass(-1, false));
-    PM->add(createEarlyCSEPass());              // Catch trivial redundancies
-    if (SimplifyLibCalls)
-      PM->add(createSimplifyLibCallsPass());    // Library Call Optimizations
-    PM->add(createJumpThreadingPass());         // Thread jumps.
-    PM->add(createCorrelatedValuePropagationPass()); // Propagate conditionals
-    PM->add(createCFGSimplificationPass());     // Merge & remove BBs
-    PM->add(createInstructionCombiningPass());  // Combine silly seq's
-    
-    PM->add(createTailCallEliminationPass());   // Eliminate tail calls
-    PM->add(createCFGSimplificationPass());     // Merge & remove BBs
-    PM->add(createReassociatePass());           // Reassociate expressions
-    PM->add(createLoopRotatePass());            // Rotate Loop
-    PM->add(createLICMPass());                  // Hoist loop invariants
-    PM->add(createLoopUnswitchPass(OptimizeSize || OptimizationLevel < 3));
-    PM->add(createInstructionCombiningPass());  
-    PM->add(createIndVarSimplifyPass());        // Canonicalize indvars
-    PM->add(createLoopIdiomPass());             // Recognize idioms like memset.
-    PM->add(createLoopDeletionPass());          // Delete dead loops
-    if (UnrollLoops)
-      PM->add(createLoopUnrollPass());          // Unroll small loops
-    if (OptimizationLevel > 1)
-      PM->add(createGVNPass());                 // Remove redundancies
-    PM->add(createMemCpyOptPass());             // Remove memcpy / form memset
-    PM->add(createSCCPPass());                  // Constant prop with SCCP
-  
-    // Run instcombine after redundancy elimination to exploit opportunities
-    // opened up by them.
-    PM->add(createInstructionCombiningPass());
-    PM->add(createJumpThreadingPass());         // Thread jumps
-    PM->add(createCorrelatedValuePropagationPass());
-    PM->add(createDeadStoreEliminationPass());  // Delete dead stores
-    PM->add(createAggressiveDCEPass());         // Delete dead instructions
-    PM->add(createCFGSimplificationPass());     // Merge & remove BBs
-    PM->add(createInstructionCombiningPass());  // Clean up after everything.
-
-    if (UnitAtATime) {
-      PM->add(createStripDeadPrototypesPass()); // Get rid of dead prototypes
-      PM->add(createDeadTypeEliminationPass()); // Eliminate dead types
-
-      // GlobalOpt already deletes dead functions and globals, at -O3 try a
-      // late pass of GlobalDCE.  It is capable of deleting dead cycles.
-      if (OptimizationLevel > 2)
-        PM->add(createGlobalDCEPass());         // Remove dead fns and globals.
-    
-      if (OptimizationLevel > 1)
-        PM->add(createConstantMergePass());       // Merge dup global constants
-    }
-  }
-
-  static inline void addOnePass(PassManagerBase *PM, Pass *P, bool AndVerify) {
-    PM->add(P);
-
-    if (AndVerify)
-      PM->add(createVerifierPass());
-  }
-
-  /// createStandardLTOPasses - Add the standard list of module passes suitable
-  /// for link time optimization.
-  ///
-  /// Internalize - Run the internalize pass.
-  /// RunInliner - Use a function inlining pass.
-  /// VerifyEach - Run the verifier after each pass.
-  static inline void createStandardLTOPasses(PassManagerBase *PM,
-                                             bool Internalize,
-                                             bool RunInliner,
-                                             bool VerifyEach) {
-    // Provide AliasAnalysis services for optimizations.
-    createStandardAliasAnalysisPasses(PM);
-
-    // Now that composite has been compiled, scan through the module, looking
-    // for a main function.  If main is defined, mark all other functions
-    // internal.
-    if (Internalize)
-      addOnePass(PM, createInternalizePass(true), VerifyEach);
-
-    // Propagate constants at call sites into the functions they call.  This
-    // opens opportunities for globalopt (and inlining) by substituting function
-    // pointers passed as arguments to direct uses of functions.  
-    addOnePass(PM, createIPSCCPPass(), VerifyEach);
-
-    // Now that we internalized some globals, see if we can hack on them!
-    addOnePass(PM, createGlobalOptimizerPass(), VerifyEach);
-    
-    // Linking modules together can lead to duplicated global constants, only
-    // keep one copy of each constant...
-    addOnePass(PM, createConstantMergePass(), VerifyEach);
-    
-    // Remove unused arguments from functions...
-    addOnePass(PM, createDeadArgEliminationPass(), VerifyEach);
-
-    // Reduce the code after globalopt and ipsccp.  Both can open up significant
-    // simplification opportunities, and both can propagate functions through
-    // function pointers.  When this happens, we often have to resolve varargs
-    // calls, etc, so let instcombine do this.
-    addOnePass(PM, createInstructionCombiningPass(), VerifyEach);
-
-    // Inline small functions
-    if (RunInliner)
-      addOnePass(PM, createFunctionInliningPass(), VerifyEach);
-
-    addOnePass(PM, createPruneEHPass(), VerifyEach);   // Remove dead EH info.
-    // Optimize globals again if we ran the inliner.
-    if (RunInliner)
-      addOnePass(PM, createGlobalOptimizerPass(), VerifyEach);
-    addOnePass(PM, createGlobalDCEPass(), VerifyEach); // Remove dead functions.
-
-    // If we didn't decide to inline a function, check to see if we can
-    // transform it to pass arguments by value instead of by reference.
-    addOnePass(PM, createArgumentPromotionPass(), VerifyEach);
-
-    // The IPO passes may leave cruft around.  Clean up after them.
-    addOnePass(PM, createInstructionCombiningPass(), VerifyEach);
-    addOnePass(PM, createJumpThreadingPass(), VerifyEach);
-    // Break up allocas
-    addOnePass(PM, createScalarReplAggregatesPass(), VerifyEach);
-
-    // Run a few AA driven optimizations here and now, to cleanup the code.
-    addOnePass(PM, createFunctionAttrsPass(), VerifyEach); // Add nocapture.
-    addOnePass(PM, createGlobalsModRefPass(), VerifyEach); // IP alias analysis.
-
-    addOnePass(PM, createLICMPass(), VerifyEach);      // Hoist loop invariants.
-    addOnePass(PM, createGVNPass(), VerifyEach);       // Remove redundancies.
-    addOnePass(PM, createMemCpyOptPass(), VerifyEach); // Remove dead memcpys.
-    // Nuke dead stores.
-    addOnePass(PM, createDeadStoreEliminationPass(), VerifyEach);
-
-    // Cleanup and simplify the code after the scalar optimizations.
-    addOnePass(PM, createInstructionCombiningPass(), VerifyEach);
-
-    addOnePass(PM, createJumpThreadingPass(), VerifyEach);
-    
-    // Delete basic blocks, which optimization passes may have killed.
-    addOnePass(PM, createCFGSimplificationPass(), VerifyEach);
-
-    // Now that we have optimized the program, discard unreachable functions.
-    addOnePass(PM, createGlobalDCEPass(), VerifyEach);
-  }
-}
-
-#endif
diff --git a/include/llvm/Support/Win64EH.h b/include/llvm/Support/Win64EH.h
new file mode 100644
index 0000000..8d74e10
--- /dev/null
+++ b/include/llvm/Support/Win64EH.h
@@ -0,0 +1,100 @@
+//===-- llvm/Support/Win64EH.h ---Win64 EH Constants-------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains constants and structures used for implementing
+// exception handling on Win64 platforms. For more information, see
+// http://msdn.microsoft.com/en-us/library/1eyas8tf.aspx
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_WIN64EH_H
+#define LLVM_SUPPORT_WIN64EH_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+namespace Win64EH {
+
+/// UnwindOpcodes - Enumeration whose values specify a single operation in
+/// the prolog of a function.
+enum UnwindOpcodes {
+  UOP_PushNonVol = 0,
+  UOP_AllocLarge,
+  UOP_AllocSmall,
+  UOP_SetFPReg,
+  UOP_SaveNonVol,
+  UOP_SaveNonVolBig,
+  UOP_SaveXMM128 = 8,
+  UOP_SaveXMM128Big,
+  UOP_PushMachFrame
+};
+
+/// UnwindCode - This union describes a single operation in a function prolog,
+/// or part thereof.
+union UnwindCode {
+  struct {
+    uint8_t codeOffset;
+    uint8_t unwindOp:4,
+            opInfo:4;
+  } u;
+  uint16_t frameOffset;
+};
+
+enum {
+  /// UNW_ExceptionHandler - Specifies that this function has an exception
+  /// handler.
+  UNW_ExceptionHandler = 0x01,
+  /// UNW_TerminateHandler - Specifies that this function has a termination
+  /// handler.
+  UNW_TerminateHandler = 0x02,
+  /// UNW_ChainInfo - Specifies that this UnwindInfo structure is chained to
+  /// another one.
+  UNW_ChainInfo = 0x04
+};
+
+/// RuntimeFunction - An entry in the table of functions with unwind info.
+struct RuntimeFunction {
+  uint64_t startAddress;
+  uint64_t endAddress;
+  uint64_t unwindInfoOffset;
+};
+
+/// UnwindInfo - An entry in the exception table.
+struct UnwindInfo {
+  uint8_t version:3,
+          flags:5;
+  uint8_t prologSize;
+  uint8_t numCodes;
+  uint8_t frameRegister:4,
+          frameOffset:4;
+  UnwindCode unwindCodes[1];
+
+  void *getLanguageSpecificData() {
+    return reinterpret_cast<void *>(&unwindCodes[(numCodes+1) & ~1]);
+  }
+  uint64_t getLanguageSpecificHandlerOffset() {
+    return *reinterpret_cast<uint64_t *>(getLanguageSpecificData());
+  }
+  void setLanguageSpecificHandlerOffset(uint64_t offset) {
+    *reinterpret_cast<uint64_t *>(getLanguageSpecificData()) = offset;
+  }
+  RuntimeFunction *getChainedFunctionEntry() {
+    return reinterpret_cast<RuntimeFunction *>(getLanguageSpecificData());
+  }
+  void *getExceptionData() {
+    return reinterpret_cast<void *>(reinterpret_cast<uint64_t *>(
+                                                  getLanguageSpecificData())+1);
+  }
+};
+
+
+} // End of namespace Win64EH
+} // End of namespace llvm
+
+#endif
diff --git a/include/llvm/Target/Target.td b/include/llvm/Target/Target.td
index 68f0515..ab6a4e2 100644
--- a/include/llvm/Target/Target.td
+++ b/include/llvm/Target/Target.td
@@ -128,6 +128,11 @@ class RegisterClass<string namespace, list<ValueType> regTypes, int alignment,
   // dags: (RegClass SubRegIndex, SubRegindex, ...)
   list<dag> SubRegClasses = [];
 
+  // isAllocatable - Specify that the register class can be used for virtual
+  // registers and register allocation.  Some register classes are only used to
+  // model instruction operand constraints, and should have isAllocatable = 0.
+  bit isAllocatable = 1;
+
   // MethodProtos/MethodBodies - These members can be used to insert arbitrary
   // code into a generated register class.   The normal usage of this is to
   // overload virtual methods.
@@ -151,6 +156,14 @@ class DwarfRegNum<list<int> Numbers> {
   list<int> DwarfNumbers = Numbers;
 }
 
+// DwarfRegAlias - This class declares that a given register uses the same dwarf
+// numbers as another one. This is useful for making it clear that the two
+// registers do have the same number. It also lets us build a mapping
+// from dwarf register number to llvm register.
+class DwarfRegAlias<Register reg> {
+      Register DwarfAlias = reg;
+}
+
 //===----------------------------------------------------------------------===//
 // Pull in the common support for scheduling
 //
diff --git a/include/llvm/Target/TargetAsmInfo.h b/include/llvm/Target/TargetAsmInfo.h
index 0271b67..743a2d4 100644
--- a/include/llvm/Target/TargetAsmInfo.h
+++ b/include/llvm/Target/TargetAsmInfo.h
@@ -22,6 +22,7 @@
 namespace llvm {
   class MCSection;
   class MCContext;
+  class MachineFunction;
   class TargetMachine;
   class TargetLoweringObjectFile;
 
@@ -58,6 +59,18 @@ public:
     return TLOF->getEHFrameSection();
   }
 
+  const MCSection *getDwarfFrameSection() const {
+    return TLOF->getDwarfFrameSection();
+  }
+
+  const MCSection *getWin64EHFuncTableSection(StringRef Suffix) const {
+    return TLOF->getWin64EHFuncTableSection(Suffix);
+  }
+
+  const MCSection *getWin64EHTableSection(StringRef Suffix) const {
+    return TLOF->getWin64EHTableSection(Suffix);
+  }
+
   unsigned getFDEEncoding(bool CFI) const {
     return TLOF->getFDEEncoding(CFI);
   }
@@ -66,6 +79,10 @@ public:
     return TLOF->isFunctionEHFrameSymbolPrivate();
   }
 
+  const unsigned *getCalleeSavedRegs(MachineFunction *MF = 0) const {
+    return TRI->getCalleeSavedRegs(MF);
+  }
+
   unsigned getDwarfRARegNum(bool isEH) const {
     return TRI->getDwarfRegNum(TRI->getRARegister(), isEH);
   }
@@ -77,6 +94,14 @@ public:
   int getDwarfRegNum(unsigned RegNum, bool isEH) const {
     return TRI->getDwarfRegNum(RegNum, isEH);
   }
+
+  int getLLVMRegNum(unsigned DwarfRegNum, bool isEH) const {
+    return TRI->getLLVMRegNum(DwarfRegNum, isEH);
+  }
+
+  int getSEHRegNum(unsigned RegNum) const {
+    return TRI->getSEHRegNum(RegNum);
+  }
 };
 
 }
diff --git a/include/llvm/Target/TargetInstrItineraries.h b/include/llvm/Target/TargetInstrItineraries.h
index 198d585..6011402 100644
--- a/include/llvm/Target/TargetInstrItineraries.h
+++ b/include/llvm/Target/TargetInstrItineraries.h
@@ -122,7 +122,8 @@ public:
 
   InstrItineraryData(const InstrStage *S, const unsigned *OS,
                      const unsigned *F, const InstrItinerary *I)
-    : Stages(S), OperandCycles(OS), Forwardings(F), Itineraries(I) {}
+    : Stages(S), OperandCycles(OS), Forwardings(F), Itineraries(I),
+      IssueWidth(0) {}
 
   /// isEmpty - Returns true if there are no itineraries.
   ///
diff --git a/include/llvm/Target/TargetLibraryInfo.h b/include/llvm/Target/TargetLibraryInfo.h
index 0914b5d..02a1a3c 100644
--- a/include/llvm/Target/TargetLibraryInfo.h
+++ b/include/llvm/Target/TargetLibraryInfo.h
@@ -51,6 +51,7 @@ public:
   static char ID;
   TargetLibraryInfo();
   TargetLibraryInfo(const Triple &T);
+  explicit TargetLibraryInfo(const TargetLibraryInfo &TLI);
   
   /// has - This function is used by optimizations that want to match on or form
   /// a given library function.
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 17d761c..3e36fb7 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -94,6 +94,19 @@ public:
     Custom      // Use the LowerOperation hook to implement custom lowering.
   };
 
+  /// LegalizeAction - This enum indicates whether a types are legal for a
+  /// target, and if not, what action should be used to make them valid.
+  enum LegalizeTypeAction {
+    TypeLegal,           // The target natively supports this type.
+    TypePromoteInteger,  // Replace this integer with a larger one.
+    TypeExpandInteger,   // Split this integer into two of half the size.
+    TypeSoftenFloat,     // Convert this float to a same size integer type.
+    TypeExpandFloat,     // Split this float into two of half the size.
+    TypeScalarizeVector, // Replace this one-element vector with its element.
+    TypeSplitVector,     // Split this vector into two of half the size.
+    TypeWidenVector      // This vector should be widened into a larger vector.
+  };
+
   enum BooleanContent { // How the target represents true/false values.
     UndefinedBooleanContent,    // Only bit 0 counts, the rest can hold garbage.
     ZeroOrOneBooleanContent,        // All bits zero except for bit 0.
@@ -200,71 +213,20 @@ public:
   }
 
   class ValueTypeActionImpl {
-    /// ValueTypeActions - For each value type, keep a LegalizeAction enum
+    /// ValueTypeActions - For each value type, keep a LegalizeTypeAction enum
     /// that indicates how instruction selection should deal with the type.
     uint8_t ValueTypeActions[MVT::LAST_VALUETYPE];
 
-    LegalizeAction getExtendedTypeAction(EVT VT) const {
-      // Handle non-vector integers.
-      if (!VT.isVector()) {
-        assert(VT.isInteger() && "Unsupported extended type!");
-        unsigned BitSize = VT.getSizeInBits();
-        // First promote to a power-of-two size, then expand if necessary.
-        if (BitSize < 8 || !isPowerOf2_32(BitSize))
-          return Promote;
-        return Expand;
-      }
-
-      // Vectors with only one element are always scalarized.
-      if (VT.getVectorNumElements() == 1)
-        return Expand;
-
-      // Vectors with a number of elements that is not a power of two are always
-      // widened, for example <3 x float> -> <4 x float>.
-      if (!VT.isPow2VectorType())
-        return Promote;
-
-      // Vectors with a crazy element type are always expanded, for example
-      // <4 x i2> is expanded into two vectors of type <2 x i2>.
-      if (!VT.getVectorElementType().isSimple())
-        return Expand;
-
-      // If this type is smaller than a legal vector type then widen it,
-      // otherwise expand it.  E.g. <2 x float> -> <4 x float>.
-      MVT EltType = VT.getVectorElementType().getSimpleVT();
-      unsigned NumElts = VT.getVectorNumElements();
-      while (1) {
-        // Round up to the next power of 2.
-        NumElts = (unsigned)NextPowerOf2(NumElts);
-
-        // If there is no simple vector type with this many elements then there
-        // cannot be a larger legal vector type.  Note that this assumes that
-        // there are no skipped intermediate vector types in the simple types.
-        MVT LargerVector = MVT::getVectorVT(EltType, NumElts);
-        if (LargerVector == MVT())
-          return Expand;
-
-        // If this type is legal then widen the vector.
-        if (getTypeAction(LargerVector) == Legal)
-          return Promote;
-      }
-    }
   public:
     ValueTypeActionImpl() {
       std::fill(ValueTypeActions, array_endof(ValueTypeActions), 0);
     }
 
-    LegalizeAction getTypeAction(EVT VT) const {
-      if (!VT.isExtended())
-        return getTypeAction(VT.getSimpleVT());
-      return getExtendedTypeAction(VT);
-    }
-
-    LegalizeAction getTypeAction(MVT VT) const {
-      return (LegalizeAction)ValueTypeActions[VT.SimpleTy];
+    LegalizeTypeAction getTypeAction(MVT VT) const {
+      return (LegalizeTypeAction)ValueTypeActions[VT.SimpleTy];
     }
 
-    void setTypeAction(EVT VT, LegalizeAction Action) {
+    void setTypeAction(EVT VT, LegalizeTypeAction Action) {
       unsigned I = VT.getSimpleVT().SimpleTy;
       ValueTypeActions[I] = Action;
     }
@@ -278,10 +240,10 @@ public:
   /// it is already legal (return 'Legal') or we need to promote it to a larger
   /// type (return 'Promote'), or we need to expand it into multiple registers
   /// of smaller integer type (return 'Expand').  'Custom' is not an option.
-  LegalizeAction getTypeAction(EVT VT) const {
-    return ValueTypeActions.getTypeAction(VT);
+  LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const {
+    return getTypeConversion(Context, VT).first;
   }
-  LegalizeAction getTypeAction(MVT VT) const {
+  LegalizeTypeAction getTypeAction(MVT VT) const {
     return ValueTypeActions.getTypeAction(VT);
   }
 
@@ -292,38 +254,7 @@ public:
   /// to get to the smaller register. For illegal floating point types, this
   /// returns the integer type to transform to.
   EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const {
-    if (VT.isSimple()) {
-      assert((unsigned)VT.getSimpleVT().SimpleTy <
-             array_lengthof(TransformToType));
-      EVT NVT = TransformToType[VT.getSimpleVT().SimpleTy];
-      assert(getTypeAction(NVT) != Promote &&
-             "Promote may not follow Expand or Promote");
-      return NVT;
-    }
-
-    if (VT.isVector()) {
-      EVT NVT = VT.getPow2VectorType(Context);
-      if (NVT == VT) {
-        // Vector length is a power of 2 - split to half the size.
-        unsigned NumElts = VT.getVectorNumElements();
-        EVT EltVT = VT.getVectorElementType();
-        return (NumElts == 1) ?
-          EltVT : EVT::getVectorVT(Context, EltVT, NumElts / 2);
-      }
-      // Promote to a power of two size, avoiding multi-step promotion.
-      return getTypeAction(NVT) == Promote ?
-        getTypeToTransformTo(Context, NVT) : NVT;
-    } else if (VT.isInteger()) {
-      EVT NVT = VT.getRoundIntegerType(Context);
-      if (NVT == VT)      // Size is a power of two - expand to half the size.
-        return EVT::getIntegerVT(Context, VT.getSizeInBits() / 2);
-
-      // Promote to a power of two size, avoiding multi-step promotion.
-      return getTypeAction(NVT) == Promote ?
-        getTypeToTransformTo(Context, NVT) : NVT;
-    }
-    assert(0 && "Unsupported extended type!");
-    return MVT(MVT::Other); // Not reached
+    return getTypeConversion(Context, VT).second;
   }
 
   /// getTypeToExpandTo - For types supported by the target, this is an
@@ -333,7 +264,7 @@ public:
   EVT getTypeToExpandTo(LLVMContext &Context, EVT VT) const {
     assert(!VT.isVector());
     while (true) {
-      switch (getTypeAction(VT)) {
+      switch (getTypeAction(Context, VT)) {
       case Legal:
         return VT;
       case Expand:
@@ -761,6 +692,18 @@ public:
     return MinStackArgumentAlignment;
   }
 
+  /// getMinFunctionAlignment - return the minimum function alignment.
+  ///
+  unsigned getMinFunctionAlignment() const {
+    return MinFunctionAlignment;
+  }
+
+  /// getPrefFunctionAlignment - return the preferred function alignment.
+  ///
+  unsigned getPrefFunctionAlignment() const {
+    return PrefFunctionAlignment;
+  }
+
   /// getPrefLoopAlignment - return the preferred loop alignment.
   ///
   unsigned getPrefLoopAlignment() const {
@@ -824,9 +767,6 @@ public:
   /// PIC relocation models.
   virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
 
-  /// getFunctionAlignment - Return the Log2 alignment of this function.
-  virtual unsigned getFunctionAlignment(const Function *) const = 0;
-
   /// getStackCookieLocation - Return true if the target stores stack
   /// protector cookies at a fixed offset in some non-standard address
   /// space, and populates the address space and offset as
@@ -1167,6 +1107,18 @@ protected:
     JumpBufAlignment = Align;
   }
 
+  /// setMinFunctionAlignment - Set the target's minimum function alignment.
+  void setMinFunctionAlignment(unsigned Align) {
+    MinFunctionAlignment = Align;
+  }
+
+  /// setPrefFunctionAlignment - Set the target's preferred function alignment.
+  /// This should be set if there is a performance benefit to
+  /// higher-than-minimum alignment
+  void setPrefFunctionAlignment(unsigned Align) {
+    PrefFunctionAlignment = Align;
+  }
+
   /// setPrefLoopAlignment - Set the target's preferred loop alignment. Default
   /// alignment is zero, it means the target does not care about loop alignment.
   void setPrefLoopAlignment(unsigned Align) {
@@ -1259,7 +1211,8 @@ public:
   /// return values described by the Outs array can fit into the return
   /// registers.  If false is returned, an sret-demotion is performed.
   ///
-  virtual bool CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
+  virtual bool CanLowerReturn(CallingConv::ID CallConv,
+			      MachineFunction &MF, bool isVarArg,
                const SmallVectorImpl<ISD::OutputArg> &Outs,
                LLVMContext &Context) const
   {
@@ -1497,7 +1450,7 @@ public:
 
   /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
   /// vector.  If it is invalid, don't add anything to Ops.
-  virtual void LowerAsmOperandForConstraint(SDValue Op, char ConstraintLetter,
+  virtual void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
                                             std::vector<SDValue> &Ops,
                                             SelectionDAG &DAG) const;
 
@@ -1583,6 +1536,14 @@ public:
     return true;
   }
 
+  /// isLegalAddImmediate - Return true if the specified immediate is legal
+  /// add immediate, that is the target has add instructions which can add
+  /// a register with the immediate without having to materialize the
+  /// immediate into a register.
+  virtual bool isLegalAddImmediate(int64_t Imm) const {
+    return true;
+  }
+
   //===--------------------------------------------------------------------===//
   // Div utility functions
   //
@@ -1637,6 +1598,13 @@ private:
   const TargetData *TD;
   const TargetLoweringObjectFile &TLOF;
 
+  /// We are in the process of implementing a new TypeLegalization action
+  /// which is the promotion of vector elements. This feature is under
+  /// development. Until this feature is complete, it is only enabled using a
+  /// flag. We pass this flag using a member because of circular dep issues.
+  /// This member will be removed with the flag once we complete the transition.
+  bool mayPromoteElements;
+
   /// PointerTy - The type to use for pointers, usually i32 or i64.
   ///
   MVT PointerTy;
@@ -1693,7 +1661,18 @@ private:
   ///
   unsigned MinStackArgumentAlignment;
 
-  /// PrefLoopAlignment - The perferred loop alignment.
+  /// MinFunctionAlignment - The minimum function alignment (used when
+  /// optimizing for size, and to prevent explicitly provided alignment
+  /// from leading to incorrect code).
+  ///
+  unsigned MinFunctionAlignment;
+
+  /// PrefFunctionAlignment - The preferred function alignment (used when
+  /// alignment unspecified and optimizing for speed).
+  ///
+  unsigned PrefFunctionAlignment;
+
+  /// PrefLoopAlignment - The preferred loop alignment.
   ///
   unsigned PrefLoopAlignment;
 
@@ -1774,6 +1753,128 @@ private:
 
   ValueTypeActionImpl ValueTypeActions;
 
+  typedef std::pair<LegalizeTypeAction, EVT> LegalizeKind;
+
+  LegalizeKind
+  getTypeConversion(LLVMContext &Context, EVT VT) const {
+    // If this is a simple type, use the ComputeRegisterProp mechanism.
+    if (VT.isSimple()) {
+      assert((unsigned)VT.getSimpleVT().SimpleTy <
+             array_lengthof(TransformToType));
+      EVT NVT = TransformToType[VT.getSimpleVT().SimpleTy];
+      LegalizeTypeAction LA = ValueTypeActions.getTypeAction(VT.getSimpleVT());
+
+      assert(
+        (!(NVT.isSimple() && LA != TypeLegal) ||
+         ValueTypeActions.getTypeAction(NVT.getSimpleVT()) != TypePromoteInteger)
+         && "Promote may not follow Expand or Promote");
+
+      return LegalizeKind(LA, NVT);
+    }
+
+    // Handle Extended Scalar Types.
+    if (!VT.isVector()) {
+      assert(VT.isInteger() && "Float types must be simple");
+      unsigned BitSize = VT.getSizeInBits();
+      // First promote to a power-of-two size, then expand if necessary.
+      if (BitSize < 8 || !isPowerOf2_32(BitSize)) {
+        EVT NVT = VT.getRoundIntegerType(Context);
+        assert(NVT != VT && "Unable to round integer VT");
+        LegalizeKind NextStep = getTypeConversion(Context, NVT);
+        // Avoid multi-step promotion.
+        if (NextStep.first == TypePromoteInteger) return NextStep;
+        // Return rounded integer type.
+        return LegalizeKind(TypePromoteInteger, NVT);
+      }
+
+      return LegalizeKind(TypeExpandInteger,
+                          EVT::getIntegerVT(Context, VT.getSizeInBits()/2));
+    }
+
+    // Handle vector types.
+    unsigned NumElts = VT.getVectorNumElements();
+    EVT EltVT = VT.getVectorElementType();
+
+    // Vectors with only one element are always scalarized.
+    if (NumElts == 1)
+      return LegalizeKind(TypeScalarizeVector, EltVT);
+
+    // If we allow the promotion of vector elements using a flag,
+    // then try to widen vector elements until a legal type is found.
+    if (mayPromoteElements && EltVT.isInteger()) {
+      // Vectors with a number of elements that is not a power of two are always
+      // widened, for example <3 x float> -> <4 x float>.
+      if (!VT.isPow2VectorType()) {
+        NumElts = (unsigned)NextPowerOf2(NumElts);
+        EVT NVT = EVT::getVectorVT(Context, EltVT, NumElts);
+        return LegalizeKind(TypeWidenVector, NVT);
+      }
+
+      // Examine the element type.
+      LegalizeKind LK = getTypeConversion(Context, EltVT);
+
+      // If type is to be expanded, split the vector.
+      //  <4 x i140> -> <2 x i140>
+      if (LK.first == TypeExpandInteger)
+        return LegalizeKind(TypeSplitVector,
+                            EVT::getVectorVT(Context, EltVT, NumElts / 2));
+
+      // Promote the integer element types until a legal vector type is found
+      // or until the element integer type is too big. If a legal type was not
+      // found, fallback to the usual mechanism of widening/splitting the
+      // vector.
+      while (1) {
+        // Increase the bitwidth of the element to the next pow-of-two
+        // (which is greater than 8 bits).
+        EltVT = EVT::getIntegerVT(Context, 1 + EltVT.getSizeInBits()
+                                 ).getRoundIntegerType(Context);
+
+        // Stop trying when getting a non-simple element type.
+        // Note that vector elements may be greater than legal vector element
+        // types. Example: X86 XMM registers hold 64bit element on 32bit systems.
+        if (!EltVT.isSimple()) break;
+
+        // Build a new vector type and check if it is legal.
+        MVT NVT = MVT::getVectorVT(EltVT.getSimpleVT(), NumElts);
+
+        // Found a legal promoted vector type.
+        if (ValueTypeActions.getTypeAction(NVT) == TypeLegal)
+          return LegalizeKind(TypePromoteInteger,
+                              EVT::getVectorVT(Context, EltVT, NumElts));
+      }
+    }
+
+    // Try to widen the vector until a legal type is found.
+    // If there is no wider legal type, split the vector.
+    while (1) {
+      // Round up to the next power of 2.
+      NumElts = (unsigned)NextPowerOf2(NumElts);
+
+      // If there is no simple vector type with this many elements then there
+      // cannot be a larger legal vector type.  Note that this assumes that
+      // there are no skipped intermediate vector types in the simple types.
+      MVT LargerVector = MVT::getVectorVT(EltVT.getSimpleVT(), NumElts);
+      if (LargerVector == MVT()) break;
+
+      // If this type is legal then widen the vector.
+      if (ValueTypeActions.getTypeAction(LargerVector) == TypeLegal)
+        return LegalizeKind(TypeWidenVector, LargerVector);
+    }
+
+    // Widen odd vectors to next power of two.
+    if (!VT.isPow2VectorType()) {
+      EVT NVT = VT.getPow2VectorType(Context);
+      return LegalizeKind(TypeWidenVector, NVT);
+    }
+
+    // Vectors with illegal element types are expanded.
+    EVT NVT = EVT::getVectorVT(Context, EltVT, VT.getVectorNumElements() / 2);
+    return LegalizeKind(TypeSplitVector, NVT);
+
+    assert(false && "Unable to handle this kind of vector type");
+    return LegalizeKind(TypeLegal, VT);
+  }
+
   std::vector<std::pair<EVT, TargetRegisterClass*> > AvailableRegClasses;
 
   /// TargetDAGCombineArray - Targets can specify ISD nodes that they would
diff --git a/include/llvm/Target/TargetLoweringObjectFile.h b/include/llvm/Target/TargetLoweringObjectFile.h
index 7402ed6..3991035 100644
--- a/include/llvm/Target/TargetLoweringObjectFile.h
+++ b/include/llvm/Target/TargetLoweringObjectFile.h
@@ -97,10 +97,6 @@ protected:
   /// weak_definition of constant 0 for an omitted EH frame.
   bool SupportsWeakOmittedEHFrame;
   
-  /// IsFunctionEHSymbolGlobal - This flag is set to true if the ".eh" symbol
-  /// for a function should be marked .globl.
-  bool IsFunctionEHSymbolGlobal;
-  
   /// IsFunctionEHFrameSymbolPrivate - This flag is set to true if the
   /// "EH_frame" symbol for EH information should be an assembler temporary (aka
   /// private linkage, aka an L or .L label) or false if it should be a normal
@@ -119,9 +115,6 @@ public:
     Ctx = &ctx;
   }
   
-  bool isFunctionEHSymbolGlobal() const {
-    return IsFunctionEHSymbolGlobal;
-  }
   bool isFunctionEHFrameSymbolPrivate() const {
     return IsFunctionEHFrameSymbolPrivate;
   }
@@ -162,6 +155,8 @@ public:
   const MCSection *getTLSExtraDataSection() const {
     return TLSExtraDataSection;
   }
+  virtual const MCSection *getWin64EHFuncTableSection(StringRef suffix)const=0;
+  virtual const MCSection *getWin64EHTableSection(StringRef suffix) const = 0;
   
   /// shouldEmitUsedDirectiveFor - This hook allows targets to selectively
   /// decide not to emit the UsedDirective for some symbols in llvm.used.
diff --git a/include/llvm/Target/TargetOptions.h b/include/llvm/Target/TargetOptions.h
index 62190c1..beed039 100644
--- a/include/llvm/Target/TargetOptions.h
+++ b/include/llvm/Target/TargetOptions.h
@@ -125,10 +125,6 @@ namespace llvm {
   /// flag is hidden and is only for debugging the debug info.
   extern bool JITEmitDebugInfoToDisk;
 
-  /// UnwindTablesMandatory - This flag indicates that unwind tables should
-  /// be emitted for all functions.
-  extern bool UnwindTablesMandatory;
-
   /// GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is
   /// specified on the commandline. When the flag is on, participating targets
   /// will perform tail call optimization on all calls which use the fastcc
diff --git a/include/llvm/Target/TargetRegisterInfo.h b/include/llvm/Target/TargetRegisterInfo.h
index 205e76f..f6a8414 100644
--- a/include/llvm/Target/TargetRegisterInfo.h
+++ b/include/llvm/Target/TargetRegisterInfo.h
@@ -47,6 +47,7 @@ struct TargetRegisterDesc {
   const unsigned *SubRegs;      // Sub-register set, described above
   const unsigned *SuperRegs;    // Super-register set, described above
   unsigned CostPerUse;          // Extra cost of instructions using register.
+  bool inAllocatableClass;      // Register belongs to an allocatable regclass.
 };
 
 class TargetRegisterClass {
@@ -66,6 +67,7 @@ private:
   const sc_iterator SuperRegClasses;
   const unsigned RegSize, Alignment;    // Size & Alignment of register in bytes
   const int CopyCost;
+  const bool Allocatable;
   const iterator RegsBegin, RegsEnd;
   DenseSet<unsigned> RegSet;
 public:
@@ -76,11 +78,12 @@ public:
                       const TargetRegisterClass * const *supcs,
                       const TargetRegisterClass * const *subregcs,
                       const TargetRegisterClass * const *superregcs,
-                      unsigned RS, unsigned Al, int CC,
+                      unsigned RS, unsigned Al, int CC, bool Allocable,
                       iterator RB, iterator RE)
     : ID(id), Name(name), VTs(vts), SubClasses(subcs), SuperClasses(supcs),
     SubRegClasses(subregcs), SuperRegClasses(superregcs),
-    RegSize(RS), Alignment(Al), CopyCost(CC), RegsBegin(RB), RegsEnd(RE) {
+    RegSize(RS), Alignment(Al), CopyCost(CC), Allocatable(Allocable),
+    RegsBegin(RB), RegsEnd(RE) {
       for (iterator I = RegsBegin, E = RegsEnd; I != E; ++I)
         RegSet.insert(*I);
     }
@@ -182,6 +185,12 @@ public:
     return false;
   }
 
+  /// hasSubClassEq - Returns true if RC is a subclass of or equal to this
+  /// class.
+  bool hasSubClassEq(const TargetRegisterClass *RC) const {
+    return RC == this || hasSubClass(RC);
+  }
+
   /// subclasses_begin / subclasses_end - Loop over all of the classes
   /// that are proper subsets of this register class.
   sc_iterator subclasses_begin() const {
@@ -203,6 +212,12 @@ public:
     return false;
   }
 
+  /// hasSuperClassEq - Returns true if RC is a superclass of or equal to this
+  /// class.
+  bool hasSuperClassEq(const TargetRegisterClass *RC) const {
+    return RC == this || hasSuperClass(RC);
+  }
+
   /// superclasses_begin / superclasses_end - Loop over all of the classes
   /// that are proper supersets of this register class.
   sc_iterator superclasses_begin() const {
@@ -256,6 +271,10 @@ public:
   /// this class. A negative number means the register class is very expensive
   /// to copy e.g. status flag register classes.
   int getCopyCost() const { return CopyCost; }
+
+  /// isAllocatable - Return true if this register class may be used to create
+  /// virtual registers.
+  bool isAllocatable() const { return Allocatable; }
 };
 
 
@@ -351,13 +370,13 @@ public:
   /// The first virtual register in a function will get the index 0.
   static unsigned virtReg2Index(unsigned Reg) {
     assert(isVirtualRegister(Reg) && "Not a virtual register");
-    return Reg - (1u << 31);
+    return Reg & ~(1u << 31);
   }
 
   /// index2VirtReg - Convert a 0-based index to a virtual register number.
   /// This is the inverse operation of VirtReg2IndexFunctor below.
   static unsigned index2VirtReg(unsigned Index) {
-    return Index + (1u << 31);
+    return Index | (1u << 31);
   }
 
   /// getMinimalPhysRegClass - Returns the Register Class of a physical
@@ -802,6 +821,8 @@ public:
   /// debugging info.
   virtual int getDwarfRegNum(unsigned RegNum, bool isEH) const = 0;
 
+  virtual int getLLVMRegNum(unsigned RegNum, bool isEH) const = 0;
+
   /// getFrameRegister - This method should return the register used as a base
   /// for values allocated in the current stack frame.
   virtual unsigned getFrameRegister(const MachineFunction &MF) const = 0;
@@ -809,6 +830,12 @@ public:
   /// getRARegister - This method should return the register where the return
   /// address can be found.
   virtual unsigned getRARegister() const = 0;
+
+  /// getSEHRegNum - Map a target register to an equivalent SEH register
+  /// number.  Returns -1 if there is no equivalent value.
+  virtual int getSEHRegNum(unsigned i) const {
+    return i;
+  }
 };
 
 
diff --git a/include/llvm/Target/TargetSelectionDAG.td b/include/llvm/Target/TargetSelectionDAG.td
index ff8d07d..672117f 100644
--- a/include/llvm/Target/TargetSelectionDAG.td
+++ b/include/llvm/Target/TargetSelectionDAG.td
@@ -354,6 +354,7 @@ def fmul       : SDNode<"ISD::FMUL"       , SDTFPBinOp, [SDNPCommutative]>;
 def fdiv       : SDNode<"ISD::FDIV"       , SDTFPBinOp>;
 def frem       : SDNode<"ISD::FREM"       , SDTFPBinOp>;
 def fabs       : SDNode<"ISD::FABS"       , SDTFPUnaryOp>;
+def fgetsign   : SDNode<"ISD::FGETSIGN"   , SDTFPToIntOp>;
 def fneg       : SDNode<"ISD::FNEG"       , SDTFPUnaryOp>;
 def fsqrt      : SDNode<"ISD::FSQRT"      , SDTFPUnaryOp>;
 def fsin       : SDNode<"ISD::FSIN"       , SDTFPUnaryOp>;
diff --git a/include/llvm/Transforms/Instrumentation.h b/include/llvm/Transforms/Instrumentation.h
index 088775a..8d55231 100644
--- a/include/llvm/Transforms/Instrumentation.h
+++ b/include/llvm/Transforms/Instrumentation.h
@@ -28,7 +28,8 @@ ModulePass *createOptimalEdgeProfilerPass();
 ModulePass *createPathProfilerPass();
 
 // Insert GCOV profiling instrumentation
-ModulePass *createGCOVProfilerPass(bool EmitNotes = true, bool EmitData = true);
+ModulePass *createGCOVProfilerPass(bool EmitNotes = true, bool EmitData = true,
+                                   bool Use402Format = false);
 
 } // End llvm namespace
 
diff --git a/include/llvm/Transforms/Utils/Local.h b/include/llvm/Transforms/Utils/Local.h
index e61dcb3..7f99dbc 100644
--- a/include/llvm/Transforms/Utils/Local.h
+++ b/include/llvm/Transforms/Utils/Local.h
@@ -43,8 +43,10 @@ template<typename T> class SmallVectorImpl;
 /// constant value, convert it into an unconditional branch to the constant
 /// destination.  This is a nontrivial operation because the successors of this
 /// basic block must have their PHI nodes updated.
-///
-bool ConstantFoldTerminator(BasicBlock *BB);
+/// Also calls RecursivelyDeleteTriviallyDeadInstructions() on any branch/switch
+/// conditions and indirectbr addresses this might make dead if
+/// DeleteDeadConditions is true.
+bool ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions = false);
 
 //===----------------------------------------------------------------------===//
 //  Local dead code elimination.
@@ -176,6 +178,10 @@ bool ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
 /// of llvm.dbg.value intrinsics.
 bool LowerDbgDeclare(Function &F);
 
+/// FindAllocaDbgDeclare - Finds the llvm.dbg.declare intrinsic corresponding to
+/// an alloca, if any.
+DbgDeclareInst *FindAllocaDbgDeclare(Value *V);
+
 } // End llvm namespace
 
 #endif
diff --git a/include/llvm/Transforms/Utils/SSAUpdater.h b/include/llvm/Transforms/Utils/SSAUpdater.h
index b4048b9..51c8467 100644
--- a/include/llvm/Transforms/Utils/SSAUpdater.h
+++ b/include/llvm/Transforms/Utils/SSAUpdater.h
@@ -21,6 +21,8 @@ namespace llvm {
   class PHINode;
   template<typename T> class SmallVectorImpl;
   template<typename T> class SSAUpdaterTraits;
+  class DbgDeclareInst;
+  class DIBuilder;
   class BumpPtrAllocator;
 
 /// SSAUpdater - This class updates SSA form for a set of values defined in
@@ -120,9 +122,12 @@ private:
 class LoadAndStorePromoter {
 protected:
   SSAUpdater &SSA;
+  DbgDeclareInst *DDI;
+  DIBuilder *DIB;
 public:
   LoadAndStorePromoter(const SmallVectorImpl<Instruction*> &Insts,
-                       SSAUpdater &S, StringRef Name = StringRef());
+                       SSAUpdater &S, DbgDeclareInst *DDI, DIBuilder *DIB,
+                       StringRef Name = StringRef());
   virtual ~LoadAndStorePromoter() {}
   
   /// run - This does the promotion.  Insts is a list of loads and stores to
diff --git a/include/llvm/Type.h b/include/llvm/Type.h
index 0939d67..3bda91d 100644
--- a/include/llvm/Type.h
+++ b/include/llvm/Type.h
@@ -273,6 +273,9 @@ public:
   /// @brief Determine if this type could be losslessly bitcast to Ty
   bool canLosslesslyBitCastTo(const Type *Ty) const;
 
+  /// isEmptyTy - Return true if this type is empty, that is, it has no
+  /// elements or all its elements are empty.
+  bool isEmptyTy() const;
 
   /// Here are some useful little methods to query what type derived types are
   /// Note that all other types can just compare to see if this == Type::xxxTy;
diff --git a/lib/Analysis/Analysis.cpp b/lib/Analysis/Analysis.cpp
index 6ebe100..e57ba78 100644
--- a/lib/Analysis/Analysis.cpp
+++ b/lib/Analysis/Analysis.cpp
@@ -23,6 +23,7 @@ void llvm::initializeAnalysis(PassRegistry &Registry) {
   initializeAliasSetPrinterPass(Registry);
   initializeNoAAPass(Registry);
   initializeBasicAliasAnalysisPass(Registry);
+  initializeBranchProbabilityInfoPass(Registry);
   initializeCFGViewerPass(Registry);
   initializeCFGPrinterPass(Registry);
   initializeCFGOnlyViewerPass(Registry);
diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
index f1bb8a3..8330ea7 100644
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@@ -281,17 +281,20 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
       continue;
     }
 
-    if (const Instruction *I = dyn_cast<Instruction>(V))
-      // TODO: Get a DominatorTree and use it here.
-      if (const Value *Simplified =
-            SimplifyInstruction(const_cast<Instruction *>(I), TD)) {
-        V = Simplified;
-        continue;
-      }
-    
     const GEPOperator *GEPOp = dyn_cast<GEPOperator>(Op);
-    if (GEPOp == 0)
+    if (GEPOp == 0) {
+      // If it's not a GEP, hand it off to SimplifyInstruction to see if it
+      // can come up with something. This matches what GetUnderlyingObject does.
+      if (const Instruction *I = dyn_cast<Instruction>(V))
+        // TODO: Get a DominatorTree and use it here.
+        if (const Value *Simplified =
+              SimplifyInstruction(const_cast<Instruction *>(I), TD)) {
+          V = Simplified;
+          continue;
+        }
+    
       return V;
+    }
     
     // Don't attempt to analyze GEPs over unsized objects.
     if (!cast<PointerType>(GEPOp->getOperand(0)->getType())
@@ -448,7 +451,13 @@ namespace {
   /// BasicAliasAnalysis - This is the primary alias analysis implementation.
   struct BasicAliasAnalysis : public ImmutablePass, public AliasAnalysis {
     static char ID; // Class identification, replacement for typeinfo
-    BasicAliasAnalysis() : ImmutablePass(ID) {
+    BasicAliasAnalysis() : ImmutablePass(ID),
+                           // AliasCache rarely has more than 1 or 2 elements,
+                           // so start it off fairly small so that clear()
+                           // doesn't have to tromp through 64 (the default)
+                           // elements on each alias query. This really wants
+                           // something like a SmallDenseMap.
+                           AliasCache(8) {
       initializeBasicAliasAnalysisPass(*PassRegistry::getPassRegistry());
     }
 
@@ -462,12 +471,12 @@ namespace {
 
     virtual AliasResult alias(const Location &LocA,
                               const Location &LocB) {
-      assert(Visited.empty() && "Visited must be cleared after use!");
+      assert(AliasCache.empty() && "AliasCache must be cleared after use!");
       assert(notDifferentParent(LocA.Ptr, LocB.Ptr) &&
              "BasicAliasAnalysis doesn't support interprocedural queries.");
       AliasResult Alias = aliasCheck(LocA.Ptr, LocA.Size, LocA.TBAATag,
                                      LocB.Ptr, LocB.Size, LocB.TBAATag);
-      Visited.clear();
+      AliasCache.clear();
       return Alias;
     }
 
@@ -503,7 +512,12 @@ namespace {
     }
     
   private:
-    // Visited - Track instructions visited by a aliasPHI, aliasSelect(), and aliasGEP().
+    // AliasCache - Track alias queries to guard against recursion.
+    typedef std::pair<Location, Location> LocPair;
+    typedef DenseMap<LocPair, AliasResult> AliasCacheTy;
+    AliasCacheTy AliasCache;
+
+    // Visited - Track instructions visited by pointsToConstantMemory.
     SmallPtrSet<const Value*, 16> Visited;
 
     // aliasGEP - Provide a bunch of ad-hoc rules to disambiguate a GEP
@@ -680,9 +694,12 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS,
     unsigned ArgNo = 0;
     for (ImmutableCallSite::arg_iterator CI = CS.arg_begin(), CE = CS.arg_end();
          CI != CE; ++CI, ++ArgNo) {
-      // Only look at the no-capture pointer arguments.
+      // Only look at the no-capture or byval pointer arguments.  If this
+      // pointer were passed to arguments that were neither of these, then it
+      // couldn't be no-capture.
       if (!(*CI)->getType()->isPointerTy() ||
-          !CS.paramHasAttr(ArgNo+1, Attribute::NoCapture))
+          (!CS.paramHasAttr(ArgNo+1, Attribute::NoCapture) &&
+           !CS.paramHasAttr(ArgNo+1, Attribute::ByVal)))
         continue;
       
       // If this is a no-capture pointer argument, see if we can tell that it
@@ -816,13 +833,6 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
                              const MDNode *V2TBAAInfo,
                              const Value *UnderlyingV1,
                              const Value *UnderlyingV2) {
-  // If this GEP has been visited before, we're on a use-def cycle.
-  // Such cycles are only valid when PHI nodes are involved or in unreachable
-  // code. The visitPHI function catches cycles containing PHIs, but there
-  // could still be a cycle without PHIs in unreachable code.
-  if (!Visited.insert(GEP1))
-    return MayAlias;
-
   int64_t GEP1BaseOffset;
   SmallVector<VariableGEPIndex, 4> GEP1VariableIndices;
 
@@ -940,7 +950,30 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
       return NoAlias;
   }
   
-  return MayAlias;
+  // Statically, we can see that the base objects are the same, but the
+  // pointers have dynamic offsets which we can't resolve. And none of our
+  // little tricks above worked.
+  //
+  // TODO: Returning PartialAlias instead of MayAlias is a mild hack; the
+  // practical effect of this is protecting TBAA in the case of dynamic
+  // indices into arrays of unions. An alternative way to solve this would
+  // be to have clang emit extra metadata for unions and/or union accesses.
+  // A union-specific solution wouldn't handle the problem for malloc'd
+  // memory however.
+  return PartialAlias;
+}
+
+static AliasAnalysis::AliasResult
+MergeAliasResults(AliasAnalysis::AliasResult A, AliasAnalysis::AliasResult B) {
+  // If the results agree, take it.
+  if (A == B)
+    return A;
+  // A mix of PartialAlias and MustAlias is PartialAlias.
+  if ((A == AliasAnalysis::PartialAlias && B == AliasAnalysis::MustAlias) ||
+      (B == AliasAnalysis::PartialAlias && A == AliasAnalysis::MustAlias))
+    return AliasAnalysis::PartialAlias;
+  // Otherwise, we don't know anything.
+  return AliasAnalysis::MayAlias;
 }
 
 /// aliasSelect - Provide a bunch of ad-hoc rules to disambiguate a Select
@@ -950,13 +983,6 @@ BasicAliasAnalysis::aliasSelect(const SelectInst *SI, uint64_t SISize,
                                 const MDNode *SITBAAInfo,
                                 const Value *V2, uint64_t V2Size,
                                 const MDNode *V2TBAAInfo) {
-  // If this select has been visited before, we're on a use-def cycle.
-  // Such cycles are only valid when PHI nodes are involved or in unreachable
-  // code. The visitPHI function catches cycles containing PHIs, but there
-  // could still be a cycle without PHIs in unreachable code.
-  if (!Visited.insert(SI))
-    return MayAlias;
-
   // If the values are Selects with the same condition, we can do a more precise
   // check: just check for aliases between the values on corresponding arms.
   if (const SelectInst *SI2 = dyn_cast<SelectInst>(V2))
@@ -969,9 +995,7 @@ BasicAliasAnalysis::aliasSelect(const SelectInst *SI, uint64_t SISize,
       AliasResult ThisAlias =
         aliasCheck(SI->getFalseValue(), SISize, SITBAAInfo,
                    SI2->getFalseValue(), V2Size, V2TBAAInfo);
-      if (ThisAlias != Alias)
-        return MayAlias;
-      return Alias;
+      return MergeAliasResults(ThisAlias, Alias);
     }
 
   // If both arms of the Select node NoAlias or MustAlias V2, then returns
@@ -981,16 +1005,9 @@ BasicAliasAnalysis::aliasSelect(const SelectInst *SI, uint64_t SISize,
   if (Alias == MayAlias)
     return MayAlias;
 
-  // If V2 is visited, the recursive case will have been caught in the
-  // above aliasCheck call, so these subsequent calls to aliasCheck
-  // don't need to assume that V2 is being visited recursively.
-  Visited.erase(V2);
-
   AliasResult ThisAlias =
     aliasCheck(V2, V2Size, V2TBAAInfo, SI->getFalseValue(), SISize, SITBAAInfo);
-  if (ThisAlias != Alias)
-    return MayAlias;
-  return Alias;
+  return MergeAliasResults(ThisAlias, Alias);
 }
 
 // aliasPHI - Provide a bunch of ad-hoc rules to disambiguate a PHI instruction
@@ -1000,10 +1017,6 @@ BasicAliasAnalysis::aliasPHI(const PHINode *PN, uint64_t PNSize,
                              const MDNode *PNTBAAInfo,
                              const Value *V2, uint64_t V2Size,
                              const MDNode *V2TBAAInfo) {
-  // The PHI node has already been visited, avoid recursion any further.
-  if (!Visited.insert(PN))
-    return MayAlias;
-
   // If the values are PHIs in the same block, we can do a more precise
   // as well as efficient check: just check for aliases between the values
   // on corresponding edges.
@@ -1020,8 +1033,9 @@ BasicAliasAnalysis::aliasPHI(const PHINode *PN, uint64_t PNSize,
           aliasCheck(PN->getIncomingValue(i), PNSize, PNTBAAInfo,
                      PN2->getIncomingValueForBlock(PN->getIncomingBlock(i)),
                      V2Size, V2TBAAInfo);
-        if (ThisAlias != Alias)
-          return MayAlias;
+        Alias = MergeAliasResults(ThisAlias, Alias);
+        if (Alias == MayAlias)
+          break;
       }
       return Alias;
     }
@@ -1052,15 +1066,11 @@ BasicAliasAnalysis::aliasPHI(const PHINode *PN, uint64_t PNSize,
   for (unsigned i = 1, e = V1Srcs.size(); i != e; ++i) {
     Value *V = V1Srcs[i];
 
-    // If V2 is visited, the recursive case will have been caught in the
-    // above aliasCheck call, so these subsequent calls to aliasCheck
-    // don't need to assume that V2 is being visited recursively.
-    Visited.erase(V2);
-
     AliasResult ThisAlias = aliasCheck(V2, V2Size, V2TBAAInfo,
                                        V, PNSize, PNTBAAInfo);
-    if (ThisAlias != Alias || ThisAlias == MayAlias)
-      return MayAlias;
+    Alias = MergeAliasResults(ThisAlias, Alias);
+    if (Alias == MayAlias)
+      break;
   }
 
   return Alias;
@@ -1145,6 +1155,17 @@ BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size,
         (V2Size != UnknownSize && isObjectSmallerThan(O1, V2Size, *TD)))
       return NoAlias;
   
+  // Check the cache before climbing up use-def chains. This also terminates
+  // otherwise infinitely recursive queries.
+  LocPair Locs(Location(V1, V1Size, V1TBAAInfo),
+               Location(V2, V2Size, V2TBAAInfo));
+  if (V1 > V2)
+    std::swap(Locs.first, Locs.second);
+  std::pair<AliasCacheTy::iterator, bool> Pair =
+    AliasCache.insert(std::make_pair(Locs, MayAlias));
+  if (!Pair.second)
+    return Pair.first->second;
+
   // FIXME: This isn't aggressively handling alias(GEP, PHI) for example: if the
   // GEP can't simplify, we don't even look at the PHI cases.
   if (!isa<GEPOperator>(V1) && isa<GEPOperator>(V2)) {
@@ -1154,7 +1175,7 @@ BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size,
   }
   if (const GEPOperator *GV1 = dyn_cast<GEPOperator>(V1)) {
     AliasResult Result = aliasGEP(GV1, V1Size, V2, V2Size, V2TBAAInfo, O1, O2);
-    if (Result != MayAlias) return Result;
+    if (Result != MayAlias) return AliasCache[Locs] = Result;
   }
 
   if (isa<PHINode>(V2) && !isa<PHINode>(V1)) {
@@ -1164,7 +1185,7 @@ BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size,
   if (const PHINode *PN = dyn_cast<PHINode>(V1)) {
     AliasResult Result = aliasPHI(PN, V1Size, V1TBAAInfo,
                                   V2, V2Size, V2TBAAInfo);
-    if (Result != MayAlias) return Result;
+    if (Result != MayAlias) return AliasCache[Locs] = Result;
   }
 
   if (isa<SelectInst>(V2) && !isa<SelectInst>(V1)) {
@@ -1174,7 +1195,7 @@ BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size,
   if (const SelectInst *S1 = dyn_cast<SelectInst>(V1)) {
     AliasResult Result = aliasSelect(S1, V1Size, V1TBAAInfo,
                                      V2, V2Size, V2TBAAInfo);
-    if (Result != MayAlias) return Result;
+    if (Result != MayAlias) return AliasCache[Locs] = Result;
   }
 
   // If both pointers are pointing into the same object and one of them
@@ -1183,8 +1204,10 @@ BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size,
   if (TD && O1 == O2)
     if ((V1Size != UnknownSize && isObjectSize(O1, V1Size, *TD)) ||
         (V2Size != UnknownSize && isObjectSize(O2, V2Size, *TD)))
-      return PartialAlias;
+      return AliasCache[Locs] = PartialAlias;
 
-  return AliasAnalysis::alias(Location(V1, V1Size, V1TBAAInfo),
-                              Location(V2, V2Size, V2TBAAInfo));
+  AliasResult Result =
+    AliasAnalysis::alias(Location(V1, V1Size, V1TBAAInfo),
+                         Location(V2, V2Size, V2TBAAInfo));
+  return AliasCache[Locs] = Result;
 }
diff --git a/lib/Analysis/BranchProbabilityInfo.cpp b/lib/Analysis/BranchProbabilityInfo.cpp
new file mode 100644
index 0000000..812fac0
--- /dev/null
+++ b/lib/Analysis/BranchProbabilityInfo.cpp
@@ -0,0 +1,357 @@
+//===-- BranchProbabilityInfo.cpp - Branch Probability Analysis -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Loops should be simplified before this analysis.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Instructions.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+INITIALIZE_PASS_BEGIN(BranchProbabilityInfo, "branch-prob",
+                      "Branch Probability Analysis", false, true)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_END(BranchProbabilityInfo, "branch-prob",
+                    "Branch Probability Analysis", false, true)
+
+char BranchProbabilityInfo::ID = 0;
+
+
+// Please note that BranchProbabilityAnalysis is not a FunctionPass.
+// It is created by BranchProbabilityInfo (which is a FunctionPass), which
+// provides a clear interface. Thanks to that, all heuristics and other
+// private methods are hidden in the .cpp file.
+class BranchProbabilityAnalysis {
+
+  typedef std::pair<BasicBlock *, BasicBlock *> Edge;
+
+  DenseMap<Edge, uint32_t> *Weights;
+
+  BranchProbabilityInfo *BP;
+
+  LoopInfo *LI;
+
+
+  // Weights are for internal use only. They are used by heuristics to help to
+  // estimate edges' probability. Example:
+  //
+  // Using "Loop Branch Heuristics" we predict weights of edges for the
+  // block BB2.
+  //         ...
+  //          |
+  //          V
+  //         BB1<-+
+  //          |   |
+  //          |   | (Weight = 128)
+  //          V   |
+  //         BB2--+
+  //          |
+  //          | (Weight = 4)
+  //          V
+  //         BB3
+  //
+  // Probability of the edge BB2->BB1 = 128 / (128 + 4) = 0.9696..
+  // Probability of the edge BB2->BB3 = 4 / (128 + 4) = 0.0303..
+
+  static const uint32_t LBH_TAKEN_WEIGHT = 128;
+  static const uint32_t LBH_NONTAKEN_WEIGHT = 4;
+
+  // Standard weight value. Used when none of the heuristics set weight for
+  // the edge.
+  static const uint32_t NORMAL_WEIGHT = 16;
+
+  // Minimum weight of an edge. Please note, that weight is NEVER 0.
+  static const uint32_t MIN_WEIGHT = 1;
+
+  // Return TRUE if BB leads directly to a Return Instruction.
+  static bool isReturningBlock(BasicBlock *BB) {
+    SmallPtrSet<BasicBlock *, 8> Visited;
+
+    while (true) {
+      TerminatorInst *TI = BB->getTerminator();
+      if (isa<ReturnInst>(TI))
+        return true;
+
+      if (TI->getNumSuccessors() > 1)
+        break;
+
+      // It is unreachable block which we can consider as a return instruction.
+      if (TI->getNumSuccessors() == 0)
+        return true;
+
+      Visited.insert(BB);
+      BB = TI->getSuccessor(0);
+
+      // Stop if cycle is detected.
+      if (Visited.count(BB))
+        return false;
+    }
+
+    return false;
+  }
+
+  // Multiply Edge Weight by two.
+  void incEdgeWeight(BasicBlock *Src, BasicBlock *Dst) {
+    uint32_t Weight = BP->getEdgeWeight(Src, Dst);
+    uint32_t MaxWeight = getMaxWeightFor(Src);
+
+    if (Weight * 2 > MaxWeight)
+      BP->setEdgeWeight(Src, Dst, MaxWeight);
+    else
+      BP->setEdgeWeight(Src, Dst, Weight * 2);
+  }
+
+  // Divide Edge Weight by two.
+  void decEdgeWeight(BasicBlock *Src, BasicBlock *Dst) {
+    uint32_t Weight = BP->getEdgeWeight(Src, Dst);
+
+    assert(Weight > 0);
+    if (Weight / 2 < MIN_WEIGHT)
+      BP->setEdgeWeight(Src, Dst, MIN_WEIGHT);
+    else
+      BP->setEdgeWeight(Src, Dst, Weight / 2);
+  }
+
+
+  uint32_t getMaxWeightFor(BasicBlock *BB) const {
+    return UINT32_MAX / BB->getTerminator()->getNumSuccessors();
+  }
+
+public:
+  BranchProbabilityAnalysis(DenseMap<Edge, uint32_t> *W,
+                            BranchProbabilityInfo *BP, LoopInfo *LI)
+    : Weights(W), BP(BP), LI(LI) {
+  }
+
+  // Return Heuristics
+  void calcReturnHeuristics(BasicBlock *BB);
+
+  // Pointer Heuristics
+  void calcPointerHeuristics(BasicBlock *BB);
+
+  // Loop Branch Heuristics
+  void calcLoopBranchHeuristics(BasicBlock *BB);
+
+  bool runOnFunction(Function &F);
+};
+
+// Calculate Edge Weights using "Return Heuristics". Predict a successor which
+// leads directly to Return Instruction will not be taken.
+void BranchProbabilityAnalysis::calcReturnHeuristics(BasicBlock *BB){
+  if (BB->getTerminator()->getNumSuccessors() == 1)
+    return;
+
+  for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
+    BasicBlock *Succ = *I;
+    if (isReturningBlock(Succ)) {
+      decEdgeWeight(BB, Succ);
+    }
+  }
+}
+
+// Calculate Edge Weights using "Pointer Heuristics". Predict a comparsion
+// between two pointer or pointer and NULL will fail.
+void BranchProbabilityAnalysis::calcPointerHeuristics(BasicBlock *BB) {
+  BranchInst * BI = dyn_cast<BranchInst>(BB->getTerminator());
+  if (!BI || !BI->isConditional())
+    return;
+
+  Value *Cond = BI->getCondition();
+  ICmpInst *CI = dyn_cast<ICmpInst>(Cond);
+  if (!CI)
+    return;
+
+  Value *LHS = CI->getOperand(0);
+
+  if (!LHS->getType()->isPointerTy())
+    return;
+
+  assert(CI->getOperand(1)->getType()->isPointerTy());
+
+  BasicBlock *Taken = BI->getSuccessor(0);
+  BasicBlock *NonTaken = BI->getSuccessor(1);
+
+  // p != 0   ->   isProb = true
+  // p == 0   ->   isProb = false
+  // p != q   ->   isProb = true
+  // p == q   ->   isProb = false;
+  bool isProb = !CI->isEquality();
+  if (!isProb)
+    std::swap(Taken, NonTaken);
+
+  incEdgeWeight(BB, Taken);
+  decEdgeWeight(BB, NonTaken);
+}
+
+// Calculate Edge Weights using "Loop Branch Heuristics". Predict backedges
+// as taken, exiting edges as not-taken.
+void BranchProbabilityAnalysis::calcLoopBranchHeuristics(BasicBlock *BB) {
+  uint32_t numSuccs = BB->getTerminator()->getNumSuccessors();
+
+  Loop *L = LI->getLoopFor(BB);
+  if (!L)
+    return;
+
+  SmallVector<BasicBlock *, 8> BackEdges;
+  SmallVector<BasicBlock *, 8> ExitingEdges;
+
+  for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
+    BasicBlock *Succ = *I;
+    Loop *SuccL = LI->getLoopFor(Succ);
+    if (SuccL != L)
+      ExitingEdges.push_back(Succ);
+    else if (Succ == L->getHeader())
+      BackEdges.push_back(Succ);
+  }
+
+  if (uint32_t numBackEdges = BackEdges.size()) {
+    uint32_t backWeight = LBH_TAKEN_WEIGHT / numBackEdges;
+    if (backWeight < NORMAL_WEIGHT)
+      backWeight = NORMAL_WEIGHT;
+
+    for (SmallVector<BasicBlock *, 8>::iterator EI = BackEdges.begin(),
+         EE = BackEdges.end(); EI != EE; ++EI) {
+      BasicBlock *Back = *EI;
+      BP->setEdgeWeight(BB, Back, backWeight);
+    }
+  }
+
+  uint32_t numExitingEdges = ExitingEdges.size();
+  if (uint32_t numNonExitingEdges = numSuccs - numExitingEdges) {
+    uint32_t exitWeight = LBH_NONTAKEN_WEIGHT / numNonExitingEdges;
+    if (exitWeight < MIN_WEIGHT)
+      exitWeight = MIN_WEIGHT;
+
+    for (SmallVector<BasicBlock *, 8>::iterator EI = ExitingEdges.begin(),
+         EE = ExitingEdges.end(); EI != EE; ++EI) {
+      BasicBlock *Exiting = *EI;
+      BP->setEdgeWeight(BB, Exiting, exitWeight);
+    }
+  }
+}
+
+bool BranchProbabilityAnalysis::runOnFunction(Function &F) {
+
+  for (Function::iterator I = F.begin(), E = F.end(); I != E; ) {
+    BasicBlock *BB = I++;
+
+    // Only LBH uses setEdgeWeight method.
+    calcLoopBranchHeuristics(BB);
+
+    // PH and RH use only incEdgeWeight and decEwdgeWeight methods to
+    // not efface LBH results.
+    calcPointerHeuristics(BB);
+    calcReturnHeuristics(BB);
+  }
+
+  return false;
+}
+
+
+bool BranchProbabilityInfo::runOnFunction(Function &F) {
+  LoopInfo &LI = getAnalysis<LoopInfo>();
+  BranchProbabilityAnalysis BPA(&Weights, this, &LI);
+  return BPA.runOnFunction(F);
+}
+
+uint32_t BranchProbabilityInfo::getSumForBlock(BasicBlock *BB) const {
+  uint32_t Sum = 0;
+
+  for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
+    BasicBlock *Succ = *I;
+    uint32_t Weight = getEdgeWeight(BB, Succ);
+    uint32_t PrevSum = Sum;
+
+    Sum += Weight;
+    assert(Sum > PrevSum); (void) PrevSum;
+  }
+
+  return Sum;
+}
+
+bool BranchProbabilityInfo::isEdgeHot(BasicBlock *Src, BasicBlock *Dst) const {
+  // Hot probability is at least 4/5 = 80%
+  uint32_t Weight = getEdgeWeight(Src, Dst);
+  uint32_t Sum = getSumForBlock(Src);
+
+  // FIXME: Implement BranchProbability::compare then change this code to
+  // compare this BranchProbability against a static "hot" BranchProbability.
+  return (uint64_t)Weight * 5 > (uint64_t)Sum * 4;
+}
+
+BasicBlock *BranchProbabilityInfo::getHotSucc(BasicBlock *BB) const {
+  uint32_t Sum = 0;
+  uint32_t MaxWeight = 0;
+  BasicBlock *MaxSucc = 0;
+
+  for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
+    BasicBlock *Succ = *I;
+    uint32_t Weight = getEdgeWeight(BB, Succ);
+    uint32_t PrevSum = Sum;
+
+    Sum += Weight;
+    assert(Sum > PrevSum); (void) PrevSum;
+
+    if (Weight > MaxWeight) {
+      MaxWeight = Weight;
+      MaxSucc = Succ;
+    }
+  }
+
+  // FIXME: Use BranchProbability::compare.
+  if ((uint64_t)MaxWeight * 5 > (uint64_t)Sum * 4)
+    return MaxSucc;
+
+  return 0;
+}
+
+// Return edge's weight. If can't find it, return DEFAULT_WEIGHT value.
+uint32_t
+BranchProbabilityInfo::getEdgeWeight(BasicBlock *Src, BasicBlock *Dst) const {
+  Edge E(Src, Dst);
+  DenseMap<Edge, uint32_t>::const_iterator I = Weights.find(E);
+
+  if (I != Weights.end())
+    return I->second;
+
+  return DEFAULT_WEIGHT;
+}
+
+void BranchProbabilityInfo::setEdgeWeight(BasicBlock *Src, BasicBlock *Dst,
+                                     uint32_t Weight) {
+  Weights[std::make_pair(Src, Dst)] = Weight;
+  DEBUG(dbgs() << "set edge " << Src->getNameStr() << " -> "
+               << Dst->getNameStr() << " weight to " << Weight
+               << (isEdgeHot(Src, Dst) ? " [is HOT now]\n" : "\n"));
+}
+
+
+BranchProbability BranchProbabilityInfo::
+getEdgeProbability(BasicBlock *Src, BasicBlock *Dst) const {
+
+  uint32_t N = getEdgeWeight(Src, Dst);
+  uint32_t D = getSumForBlock(Src);
+
+  return BranchProbability(N, D);
+}
+
+raw_ostream &
+BranchProbabilityInfo::printEdgeProbability(raw_ostream &OS, BasicBlock *Src,
+                                            BasicBlock *Dst) const {
+  BranchProbability Prob = getEdgeProbability(Src, Dst);
+
+  OS << "edge " << Src->getNameStr() << " -> " << Dst->getNameStr()
+     << " probability is " << Prob
+     << (isEdgeHot(Src, Dst) ? " [HOT edge]\n" : "\n");
+
+  return OS;
+}
diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt
index 6be5617..1a975bf 100644
--- a/lib/Analysis/CMakeLists.txt
+++ b/lib/Analysis/CMakeLists.txt
@@ -6,6 +6,7 @@ add_llvm_library(LLVMAnalysis
   AliasSetTracker.cpp
   Analysis.cpp
   BasicAliasAnalysis.cpp
+  BranchProbabilityInfo.cpp
   CFGPrinter.cpp
   CaptureTracking.cpp
   ConstantFolding.cpp
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index 5de2b04..08a6065 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -1085,7 +1085,7 @@ llvm::canConstantFoldCallTo(const Function *F) {
   case 'c':
     return Name == "cos" || Name == "ceil" || Name == "cosf" || Name == "cosh";
   case 'e':
-    return Name == "exp";
+    return Name == "exp" || Name == "exp2";
   case 'f':
     return Name == "fabs" || Name == "fmod" || Name == "floor";
   case 'l':
@@ -1221,6 +1221,12 @@ llvm::ConstantFoldCall(Function *F,
       case 'e':
         if (Name == "exp")
           return ConstantFoldFP(exp, V, Ty);
+  
+        if (Name == "exp2") {
+          // Constant fold exp2(x) as pow(2,x) in case the host doesn't have a
+          // C99 library.
+          return ConstantFoldBinaryFP(pow, 2.0, V, Ty);
+        }
         break;
       case 'f':
         if (Name == "fabs")
diff --git a/lib/Analysis/DIBuilder.cpp b/lib/Analysis/DIBuilder.cpp
index dc98c9e..ef5d03a 100644
--- a/lib/Analysis/DIBuilder.cpp
+++ b/lib/Analysis/DIBuilder.cpp
@@ -51,6 +51,10 @@ void DIBuilder::createCompileUnit(unsigned Lang, StringRef Filename,
     ConstantInt::get(Type::getInt32Ty(VMContext), RunTimeVer)
   };
   TheCU = DICompileUnit(MDNode::get(VMContext, Elts));
+
+  // Create a named metadata so that it is easier to find cu in a module.
+  NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.cu");
+  NMD->addOperand(TheCU);
 }
 
 /// createFile - Create a file descriptor to hold debugging information
@@ -156,12 +160,12 @@ DIType DIBuilder::createReferenceType(DIType RTy) {
 
 /// createTypedef - Create debugging information entry for a typedef.
 DIType DIBuilder::createTypedef(DIType Ty, StringRef Name, DIFile File,
-                                unsigned LineNo) {
+                                unsigned LineNo, DIDescriptor Context) {
   // typedefs are encoded in DIDerivedType format.
   assert(Ty.Verify() && "Invalid typedef type!");
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_typedef),
-    Ty.getContext(),
+    Context,
     MDString::get(VMContext, Name),
     File,
     ConstantInt::get(Type::getInt32Ty(VMContext), LineNo),
diff --git a/lib/Analysis/IPA/CallGraph.cpp b/lib/Analysis/IPA/CallGraph.cpp
index 690c4b4..2e79eab 100644
--- a/lib/Analysis/IPA/CallGraph.cpp
+++ b/lib/Analysis/IPA/CallGraph.cpp
@@ -148,7 +148,7 @@ private:
       for (BasicBlock::iterator II = BB->begin(), IE = BB->end();
            II != IE; ++II) {
         CallSite CS(cast<Value>(II));
-        if (CS && !isa<DbgInfoIntrinsic>(II)) {
+        if (CS && !isa<IntrinsicInst>(II)) {
           const Function *Callee = CS.getCalledFunction();
           if (Callee)
             Node->addCalledFunction(CS, getOrInsertFunction(Callee));
diff --git a/lib/Analysis/IPA/CallGraphSCCPass.cpp b/lib/Analysis/IPA/CallGraphSCCPass.cpp
index 725ab72..659ffab 100644
--- a/lib/Analysis/IPA/CallGraphSCCPass.cpp
+++ b/lib/Analysis/IPA/CallGraphSCCPass.cpp
@@ -245,8 +245,8 @@ bool CGPassManager::RefreshCallGraph(CallGraphSCC &CurSCC,
     
     for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
       for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
-          CallSite CS(cast<Value>(I));
-        if (!CS || isa<DbgInfoIntrinsic>(I)) continue;
+        CallSite CS(cast<Value>(I));
+        if (!CS || isa<IntrinsicInst>(I)) continue;
         
         // If this call site already existed in the callgraph, just verify it
         // matches up to expectations and remove it from CallSites.
diff --git a/lib/Analysis/IPA/FindUsedTypes.cpp b/lib/Analysis/IPA/FindUsedTypes.cpp
index 06ae34c..dde2556 100644
--- a/lib/Analysis/IPA/FindUsedTypes.cpp
+++ b/lib/Analysis/IPA/FindUsedTypes.cpp
@@ -32,7 +32,7 @@ INITIALIZE_PASS(FindUsedTypes, "print-used-types",
 void FindUsedTypes::IncorporateType(const Type *Ty) {
   // If ty doesn't already exist in the used types map, add it now, otherwise
   // return.
-  if (!UsedTypes.insert(Ty).second) return;  // Already contain Ty.
+  if (!UsedTypes.insert(Ty)) return;  // Already contain Ty.
 
   // Make sure to add any types this type references now.
   //
@@ -94,7 +94,7 @@ bool FindUsedTypes::runOnModule(Module &m) {
 //
 void FindUsedTypes::print(raw_ostream &OS, const Module *M) const {
   OS << "Types in use by this module:\n";
-  for (std::set<const Type *>::const_iterator I = UsedTypes.begin(),
+  for (SetVector<const Type *>::const_iterator I = UsedTypes.begin(),
        E = UsedTypes.end(); I != E; ++I) {
     OS << "   ";
     WriteTypeSymbolic(OS, *I, M);
diff --git a/lib/Analysis/IVUsers.cpp b/lib/Analysis/IVUsers.cpp
index 2cda791..a0c42f0 100644
--- a/lib/Analysis/IVUsers.cpp
+++ b/lib/Analysis/IVUsers.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetData.h"
 #include "llvm/Assembly/Writer.h"
 #include "llvm/ADT/STLExtras.h"
@@ -38,6 +39,15 @@ INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
 INITIALIZE_PASS_END(IVUsers, "iv-users",
                       "Induction Variable Users", false, true)
 
+// IVUsers behavior currently depends on this temporary indvars mode. The
+// option must be defined upstream from its uses.
+namespace llvm {
+  bool DisableIVRewrite = false;
+}
+cl::opt<bool, true> DisableIVRewriteOpt(
+  "disable-iv-rewrite", cl::Hidden, cl::location(llvm::DisableIVRewrite),
+  cl::desc("Disable canonical induction variable rewriting"));
+
 Pass *llvm::createIVUsersPass() {
   return new IVUsers();
 }
@@ -79,7 +89,7 @@ static bool isInteresting(const SCEV *S, const Instruction *I, const Loop *L,
 /// AddUsersIfInteresting - Inspect the specified instruction.  If it is a
 /// reducible SCEV, recursively add its users to the IVUsesByStride set and
 /// return true.  Otherwise, return false.
-bool IVUsers::AddUsersIfInteresting(Instruction *I) {
+bool IVUsers::AddUsersIfInteresting(Instruction *I, PHINode *Phi) {
   if (!SE->isSCEVable(I->getType()))
     return false;   // Void and FP expressions cannot be reduced.
 
@@ -90,6 +100,11 @@ bool IVUsers::AddUsersIfInteresting(Instruction *I) {
   if (Width > 64 || (TD && !TD->isLegalInteger(Width)))
     return false;
 
+  // We expect Sign/Zero extension to be eliminated from the IR before analyzing
+  // any downstream uses.
+  if (DisableIVRewrite && (isa<SExtInst>(I) || isa<ZExtInst>(I)))
+    return false;
+
   if (!Processed.insert(I))
     return true;    // Instruction already handled.
 
@@ -121,13 +136,13 @@ bool IVUsers::AddUsersIfInteresting(Instruction *I) {
     bool AddUserToIVUsers = false;
     if (LI->getLoopFor(User->getParent()) != L) {
       if (isa<PHINode>(User) || Processed.count(User) ||
-          !AddUsersIfInteresting(User)) {
+          !AddUsersIfInteresting(User, Phi)) {
         DEBUG(dbgs() << "FOUND USER in other loop: " << *User << '\n'
                      << "   OF SCEV: " << *ISE << '\n');
         AddUserToIVUsers = true;
       }
     } else if (Processed.count(User) ||
-               !AddUsersIfInteresting(User)) {
+               !AddUsersIfInteresting(User, Phi)) {
       DEBUG(dbgs() << "FOUND USER: " << *User << '\n'
                    << "   OF SCEV: " << *ISE << '\n');
       AddUserToIVUsers = true;
@@ -135,9 +150,11 @@ bool IVUsers::AddUsersIfInteresting(Instruction *I) {
 
     if (AddUserToIVUsers) {
       // Okay, we found a user that we cannot reduce.
-      IVUses.push_back(new IVStrideUse(this, User, I));
+      IVUses.push_back(new IVStrideUse(this, User, I, Phi));
       IVStrideUse &NewUse = IVUses.back();
-      // Transform the expression into a normalized form.
+      // Autodetect the post-inc loop set, populating NewUse.PostIncLoops.
+      // The regular return value here is discarded; instead of recording
+      // it, we just recompute it when we need it.
       ISE = TransformForPostIncUse(NormalizeAutodetect,
                                    ISE, User, I,
                                    NewUse.PostIncLoops,
@@ -148,8 +165,8 @@ bool IVUsers::AddUsersIfInteresting(Instruction *I) {
   return true;
 }
 
-IVStrideUse &IVUsers::AddUser(Instruction *User, Value *Operand) {
-  IVUses.push_back(new IVStrideUse(this, User, Operand));
+IVStrideUse &IVUsers::AddUser(Instruction *User, Value *Operand, PHINode *Phi) {
+  IVUses.push_back(new IVStrideUse(this, User, Operand, Phi));
   return IVUses.back();
 }
 
@@ -177,7 +194,7 @@ bool IVUsers::runOnLoop(Loop *l, LPPassManager &LPM) {
   // them by stride.  Start by finding all of the PHI nodes in the header for
   // this loop.  If they are induction variables, inspect their uses.
   for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I)
-    (void)AddUsersIfInteresting(I);
+    (void)AddUsersIfInteresting(I, cast<PHINode>(I));
 
   return false;
 }
diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp
index a820ecf..efde598 100644
--- a/lib/Analysis/InlineCost.cpp
+++ b/lib/Analysis/InlineCost.cpp
@@ -66,21 +66,13 @@ void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB) {
 
       ImmutableCallSite CS(cast<Instruction>(II));
 
-      // If this function contains a call to setjmp or _setjmp, never inline
-      // it.  This is a hack because we depend on the user marking their local
-      // variables as volatile if they are live across a setjmp call, and they
-      // probably won't do this in callers.
       if (const Function *F = CS.getCalledFunction()) {
         // If a function is both internal and has a single use, then it is 
         // extremely likely to get inlined in the future (it was probably 
         // exposed by an interleaved devirtualization pass).
         if (F->hasInternalLinkage() && F->hasOneUse())
           ++NumInlineCandidates;
-        
-        if (F->isDeclaration() && 
-            (F->getName() == "setjmp" || F->getName() == "_setjmp"))
-          callsSetJmp = true;
-       
+
         // If this call is to function itself, then the function is recursive.
         // Inlining it into other functions is a bad idea, because this is
         // basically just a form of loop peeling, and our metrics aren't useful
@@ -226,6 +218,13 @@ unsigned CodeMetrics::CountCodeReductionForAlloca(Value *V) {
 /// analyzeFunction - Fill in the current structure with information gleaned
 /// from the specified function.
 void CodeMetrics::analyzeFunction(Function *F) {
+  // If this function contains a call to setjmp or _setjmp, never inline
+  // it.  This is a hack because we depend on the user marking their local
+  // variables as volatile if they are live across a setjmp call, and they
+  // probably won't do this in callers.
+  if (F->callsFunctionThatReturnsTwice())
+    callsSetJmp = true;
+
   // Look at the size of the callee.
   for (Function::const_iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
     analyzeBasicBlock(&*BB);
@@ -594,7 +593,7 @@ InlineCostAnalyzer::growCachedCostInfo(Function *Caller, Function *Callee) {
   CodeMetrics &CallerMetrics = CachedFunctionInfo[Caller].Metrics;
 
   // For small functions we prefer to recalculate the cost for better accuracy.
-  if (CallerMetrics.NumBlocks < 10 || CallerMetrics.NumInsts < 1000) {
+  if (CallerMetrics.NumBlocks < 10 && CallerMetrics.NumInsts < 1000) {
     resetCachedCostInfo(Caller);
     return;
   }
diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index 9d6d339..9d78f8b 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -913,8 +913,6 @@ static Value *SimplifyRem(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1,
     }
   }
 
-  bool isSigned = Opcode == Instruction::SRem;
-
   // X % undef -> undef
   if (match(Op1, m_Undef()))
     return Op1;
@@ -1378,6 +1376,26 @@ static const Type *GetCompareTy(Value *Op) {
   return CmpInst::makeCmpResultType(Op->getType());
 }
 
+/// ExtractEquivalentCondition - Rummage around inside V looking for something
+/// equivalent to the comparison "LHS Pred RHS".  Return such a value if found,
+/// otherwise return null.  Helper function for analyzing max/min idioms.
+static Value *ExtractEquivalentCondition(Value *V, CmpInst::Predicate Pred,
+                                         Value *LHS, Value *RHS) {
+  SelectInst *SI = dyn_cast<SelectInst>(V);
+  if (!SI)
+    return 0;
+  CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition());
+  if (!Cmp)
+    return 0;
+  Value *CmpLHS = Cmp->getOperand(0), *CmpRHS = Cmp->getOperand(1);
+  if (Pred == Cmp->getPredicate() && LHS == CmpLHS && RHS == CmpRHS)
+    return Cmp;
+  if (Pred == CmpInst::getSwappedPredicate(Cmp->getPredicate()) &&
+      LHS == CmpRHS && RHS == CmpLHS)
+    return Cmp;
+  return 0;
+}
+
 /// SimplifyICmpInst - Given operands for an ICmpInst, see if we can
 /// fold the result.  If not, this returns null.
 static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
@@ -1460,46 +1478,48 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
     default:
       assert(false && "Unknown ICmp predicate!");
     case ICmpInst::ICMP_ULT:
-      return ConstantInt::getFalse(LHS->getContext());
+      // getNullValue also works for vectors, unlike getFalse.
+      return Constant::getNullValue(ITy);
     case ICmpInst::ICMP_UGE:
-      return ConstantInt::getTrue(LHS->getContext());
+      // getAllOnesValue also works for vectors, unlike getTrue.
+      return ConstantInt::getAllOnesValue(ITy);
     case ICmpInst::ICMP_EQ:
     case ICmpInst::ICMP_ULE:
       if (isKnownNonZero(LHS, TD))
-        return ConstantInt::getFalse(LHS->getContext());
+        return Constant::getNullValue(ITy);
       break;
     case ICmpInst::ICMP_NE:
     case ICmpInst::ICMP_UGT:
       if (isKnownNonZero(LHS, TD))
-        return ConstantInt::getTrue(LHS->getContext());
+        return ConstantInt::getAllOnesValue(ITy);
       break;
     case ICmpInst::ICMP_SLT:
       ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, TD);
       if (LHSKnownNegative)
-        return ConstantInt::getTrue(LHS->getContext());
+        return ConstantInt::getAllOnesValue(ITy);
       if (LHSKnownNonNegative)
-        return ConstantInt::getFalse(LHS->getContext());
+        return Constant::getNullValue(ITy);
       break;
     case ICmpInst::ICMP_SLE:
       ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, TD);
       if (LHSKnownNegative)
-        return ConstantInt::getTrue(LHS->getContext());
+        return ConstantInt::getAllOnesValue(ITy);
       if (LHSKnownNonNegative && isKnownNonZero(LHS, TD))
-        return ConstantInt::getFalse(LHS->getContext());
+        return Constant::getNullValue(ITy);
       break;
     case ICmpInst::ICMP_SGE:
       ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, TD);
       if (LHSKnownNegative)
-        return ConstantInt::getFalse(LHS->getContext());
+        return Constant::getNullValue(ITy);
       if (LHSKnownNonNegative)
-        return ConstantInt::getTrue(LHS->getContext());
+        return ConstantInt::getAllOnesValue(ITy);
       break;
     case ICmpInst::ICMP_SGT:
       ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, TD);
       if (LHSKnownNegative)
-        return ConstantInt::getFalse(LHS->getContext());
+        return Constant::getNullValue(ITy);
       if (LHSKnownNonNegative && isKnownNonZero(LHS, TD))
-        return ConstantInt::getTrue(LHS->getContext());
+        return ConstantInt::getAllOnesValue(ITy);
       break;
     }
   }
@@ -1791,7 +1811,8 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
     case ICmpInst::ICMP_EQ:
     case ICmpInst::ICMP_UGT:
     case ICmpInst::ICMP_UGE:
-      return ConstantInt::getFalse(RHS->getContext());
+      // getNullValue also works for vectors, unlike getFalse.
+      return Constant::getNullValue(ITy);
     case ICmpInst::ICMP_SLT:
     case ICmpInst::ICMP_SLE:
       ComputeSignBit(LHS, KnownNonNegative, KnownNegative, TD);
@@ -1801,7 +1822,8 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
     case ICmpInst::ICMP_NE:
     case ICmpInst::ICMP_ULT:
     case ICmpInst::ICMP_ULE:
-      return ConstantInt::getTrue(RHS->getContext());
+      // getAllOnesValue also works for vectors, unlike getTrue.
+      return Constant::getAllOnesValue(ITy);
     }
   }
   if (RBO && match(RBO, m_URem(m_Value(), m_Specific(LHS)))) {
@@ -1818,7 +1840,8 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
     case ICmpInst::ICMP_NE:
     case ICmpInst::ICMP_UGT:
     case ICmpInst::ICMP_UGE:
-      return ConstantInt::getTrue(RHS->getContext());
+      // getAllOnesValue also works for vectors, unlike getTrue.
+      return Constant::getAllOnesValue(ITy);
     case ICmpInst::ICMP_SLT:
     case ICmpInst::ICMP_SLE:
       ComputeSignBit(RHS, KnownNonNegative, KnownNegative, TD);
@@ -1828,7 +1851,8 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
     case ICmpInst::ICMP_EQ:
     case ICmpInst::ICMP_ULT:
     case ICmpInst::ICMP_ULE:
-      return ConstantInt::getFalse(RHS->getContext());
+      // getNullValue also works for vectors, unlike getFalse.
+      return Constant::getNullValue(ITy);
     }
   }
 
@@ -1843,7 +1867,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
       // fall-through
     case Instruction::SDiv:
     case Instruction::AShr:
-      if (!LBO->isExact() && !RBO->isExact())
+      if (!LBO->isExact() || !RBO->isExact())
         break;
       if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0),
                                       RBO->getOperand(0), TD, DT, MaxRecurse-1))
@@ -1864,6 +1888,194 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
     }
   }
 
+  // Simplify comparisons involving max/min.
+  Value *A, *B;
+  CmpInst::Predicate P = CmpInst::BAD_ICMP_PREDICATE;
+  CmpInst::Predicate EqP; // Chosen so that "A == max/min(A,B)" iff "A EqP B".
+
+  // Signed variants on "max(a,b)>=a -> true".
+  if (match(LHS, m_SMax(m_Value(A), m_Value(B))) && (A == RHS || B == RHS)) {
+    if (A != RHS) std::swap(A, B); // smax(A, B) pred A.
+    EqP = CmpInst::ICMP_SGE; // "A == smax(A, B)" iff "A sge B".
+    // We analyze this as smax(A, B) pred A.
+    P = Pred;
+  } else if (match(RHS, m_SMax(m_Value(A), m_Value(B))) &&
+             (A == LHS || B == LHS)) {
+    if (A != LHS) std::swap(A, B); // A pred smax(A, B).
+    EqP = CmpInst::ICMP_SGE; // "A == smax(A, B)" iff "A sge B".
+    // We analyze this as smax(A, B) swapped-pred A.
+    P = CmpInst::getSwappedPredicate(Pred);
+  } else if (match(LHS, m_SMin(m_Value(A), m_Value(B))) &&
+             (A == RHS || B == RHS)) {
+    if (A != RHS) std::swap(A, B); // smin(A, B) pred A.
+    EqP = CmpInst::ICMP_SLE; // "A == smin(A, B)" iff "A sle B".
+    // We analyze this as smax(-A, -B) swapped-pred -A.
+    // Note that we do not need to actually form -A or -B thanks to EqP.
+    P = CmpInst::getSwappedPredicate(Pred);
+  } else if (match(RHS, m_SMin(m_Value(A), m_Value(B))) &&
+             (A == LHS || B == LHS)) {
+    if (A != LHS) std::swap(A, B); // A pred smin(A, B).
+    EqP = CmpInst::ICMP_SLE; // "A == smin(A, B)" iff "A sle B".
+    // We analyze this as smax(-A, -B) pred -A.
+    // Note that we do not need to actually form -A or -B thanks to EqP.
+    P = Pred;
+  }
+  if (P != CmpInst::BAD_ICMP_PREDICATE) {
+    // Cases correspond to "max(A, B) p A".
+    switch (P) {
+    default:
+      break;
+    case CmpInst::ICMP_EQ:
+    case CmpInst::ICMP_SLE:
+      // Equivalent to "A EqP B".  This may be the same as the condition tested
+      // in the max/min; if so, we can just return that.
+      if (Value *V = ExtractEquivalentCondition(LHS, EqP, A, B))
+        return V;
+      if (Value *V = ExtractEquivalentCondition(RHS, EqP, A, B))
+        return V;
+      // Otherwise, see if "A EqP B" simplifies.
+      if (MaxRecurse)
+        if (Value *V = SimplifyICmpInst(EqP, A, B, TD, DT, MaxRecurse-1))
+          return V;
+      break;
+    case CmpInst::ICMP_NE:
+    case CmpInst::ICMP_SGT: {
+      CmpInst::Predicate InvEqP = CmpInst::getInversePredicate(EqP);
+      // Equivalent to "A InvEqP B".  This may be the same as the condition
+      // tested in the max/min; if so, we can just return that.
+      if (Value *V = ExtractEquivalentCondition(LHS, InvEqP, A, B))
+        return V;
+      if (Value *V = ExtractEquivalentCondition(RHS, InvEqP, A, B))
+        return V;
+      // Otherwise, see if "A InvEqP B" simplifies.
+      if (MaxRecurse)
+        if (Value *V = SimplifyICmpInst(InvEqP, A, B, TD, DT, MaxRecurse-1))
+          return V;
+      break;
+    }
+    case CmpInst::ICMP_SGE:
+      // Always true.
+      return Constant::getAllOnesValue(ITy);
+    case CmpInst::ICMP_SLT:
+      // Always false.
+      return Constant::getNullValue(ITy);
+    }
+  }
+
+  // Unsigned variants on "max(a,b)>=a -> true".
+  P = CmpInst::BAD_ICMP_PREDICATE;
+  if (match(LHS, m_UMax(m_Value(A), m_Value(B))) && (A == RHS || B == RHS)) {
+    if (A != RHS) std::swap(A, B); // umax(A, B) pred A.
+    EqP = CmpInst::ICMP_UGE; // "A == umax(A, B)" iff "A uge B".
+    // We analyze this as umax(A, B) pred A.
+    P = Pred;
+  } else if (match(RHS, m_UMax(m_Value(A), m_Value(B))) &&
+             (A == LHS || B == LHS)) {
+    if (A != LHS) std::swap(A, B); // A pred umax(A, B).
+    EqP = CmpInst::ICMP_UGE; // "A == umax(A, B)" iff "A uge B".
+    // We analyze this as umax(A, B) swapped-pred A.
+    P = CmpInst::getSwappedPredicate(Pred);
+  } else if (match(LHS, m_UMin(m_Value(A), m_Value(B))) &&
+             (A == RHS || B == RHS)) {
+    if (A != RHS) std::swap(A, B); // umin(A, B) pred A.
+    EqP = CmpInst::ICMP_ULE; // "A == umin(A, B)" iff "A ule B".
+    // We analyze this as umax(-A, -B) swapped-pred -A.
+    // Note that we do not need to actually form -A or -B thanks to EqP.
+    P = CmpInst::getSwappedPredicate(Pred);
+  } else if (match(RHS, m_UMin(m_Value(A), m_Value(B))) &&
+             (A == LHS || B == LHS)) {
+    if (A != LHS) std::swap(A, B); // A pred umin(A, B).
+    EqP = CmpInst::ICMP_ULE; // "A == umin(A, B)" iff "A ule B".
+    // We analyze this as umax(-A, -B) pred -A.
+    // Note that we do not need to actually form -A or -B thanks to EqP.
+    P = Pred;
+  }
+  if (P != CmpInst::BAD_ICMP_PREDICATE) {
+    // Cases correspond to "max(A, B) p A".
+    switch (P) {
+    default:
+      break;
+    case CmpInst::ICMP_EQ:
+    case CmpInst::ICMP_ULE:
+      // Equivalent to "A EqP B".  This may be the same as the condition tested
+      // in the max/min; if so, we can just return that.
+      if (Value *V = ExtractEquivalentCondition(LHS, EqP, A, B))
+        return V;
+      if (Value *V = ExtractEquivalentCondition(RHS, EqP, A, B))
+        return V;
+      // Otherwise, see if "A EqP B" simplifies.
+      if (MaxRecurse)
+        if (Value *V = SimplifyICmpInst(EqP, A, B, TD, DT, MaxRecurse-1))
+          return V;
+      break;
+    case CmpInst::ICMP_NE:
+    case CmpInst::ICMP_UGT: {
+      CmpInst::Predicate InvEqP = CmpInst::getInversePredicate(EqP);
+      // Equivalent to "A InvEqP B".  This may be the same as the condition
+      // tested in the max/min; if so, we can just return that.
+      if (Value *V = ExtractEquivalentCondition(LHS, InvEqP, A, B))
+        return V;
+      if (Value *V = ExtractEquivalentCondition(RHS, InvEqP, A, B))
+        return V;
+      // Otherwise, see if "A InvEqP B" simplifies.
+      if (MaxRecurse)
+        if (Value *V = SimplifyICmpInst(InvEqP, A, B, TD, DT, MaxRecurse-1))
+          return V;
+      break;
+    }
+    case CmpInst::ICMP_UGE:
+      // Always true.
+      return Constant::getAllOnesValue(ITy);
+    case CmpInst::ICMP_ULT:
+      // Always false.
+      return Constant::getNullValue(ITy);
+    }
+  }
+
+  // Variants on "max(x,y) >= min(x,z)".
+  Value *C, *D;
+  if (match(LHS, m_SMax(m_Value(A), m_Value(B))) &&
+      match(RHS, m_SMin(m_Value(C), m_Value(D))) &&
+      (A == C || A == D || B == C || B == D)) {
+    // max(x, ?) pred min(x, ?).
+    if (Pred == CmpInst::ICMP_SGE)
+      // Always true.
+      return Constant::getAllOnesValue(ITy);
+    if (Pred == CmpInst::ICMP_SLT)
+      // Always false.
+      return Constant::getNullValue(ITy);
+  } else if (match(LHS, m_SMin(m_Value(A), m_Value(B))) &&
+             match(RHS, m_SMax(m_Value(C), m_Value(D))) &&
+             (A == C || A == D || B == C || B == D)) {
+    // min(x, ?) pred max(x, ?).
+    if (Pred == CmpInst::ICMP_SLE)
+      // Always true.
+      return Constant::getAllOnesValue(ITy);
+    if (Pred == CmpInst::ICMP_SGT)
+      // Always false.
+      return Constant::getNullValue(ITy);
+  } else if (match(LHS, m_UMax(m_Value(A), m_Value(B))) &&
+             match(RHS, m_UMin(m_Value(C), m_Value(D))) &&
+             (A == C || A == D || B == C || B == D)) {
+    // max(x, ?) pred min(x, ?).
+    if (Pred == CmpInst::ICMP_UGE)
+      // Always true.
+      return Constant::getAllOnesValue(ITy);
+    if (Pred == CmpInst::ICMP_ULT)
+      // Always false.
+      return Constant::getNullValue(ITy);
+  } else if (match(LHS, m_UMin(m_Value(A), m_Value(B))) &&
+             match(RHS, m_UMax(m_Value(C), m_Value(D))) &&
+             (A == C || A == D || B == C || B == D)) {
+    // min(x, ?) pred max(x, ?).
+    if (Pred == CmpInst::ICMP_ULE)
+      // Always true.
+      return Constant::getAllOnesValue(ITy);
+    if (Pred == CmpInst::ICMP_UGT)
+      // Always false.
+      return Constant::getNullValue(ITy);
+  }
+
   // If the comparison is with the result of a select instruction, check whether
   // comparing with either branch of the select always yields the same value.
   if (isa<SelectInst>(LHS) || isa<SelectInst>(RHS))
diff --git a/lib/Analysis/LazyValueInfo.cpp b/lib/Analysis/LazyValueInfo.cpp
index d5f0b5c..6e27597 100644
--- a/lib/Analysis/LazyValueInfo.cpp
+++ b/lib/Analysis/LazyValueInfo.cpp
@@ -589,16 +589,18 @@ static bool InstructionDereferencesPointer(Instruction *I, Value *Ptr) {
   }
   if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) {
     if (MI->isVolatile()) return false;
-    if (MI->getAddressSpace() != 0) return false;
 
     // FIXME: check whether it has a valuerange that excludes zero?
     ConstantInt *Len = dyn_cast<ConstantInt>(MI->getLength());
     if (!Len || Len->isZero()) return false;
 
-    if (MI->getRawDest() == Ptr || MI->getDest() == Ptr)
-      return true;
+    if (MI->getDestAddressSpace() == 0)
+      if (MI->getRawDest() == Ptr || MI->getDest() == Ptr)
+        return true;
     if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI))
-      return MTI->getRawSource() == Ptr || MTI->getSource() == Ptr;
+      if (MTI->getSourceAddressSpace() == 0)
+        if (MTI->getRawSource() == Ptr || MTI->getSource() == Ptr)
+          return true;
   }
   return false;
 }
diff --git a/lib/Analysis/Loads.cpp b/lib/Analysis/Loads.cpp
index ab34fd6..c5c676b 100644
--- a/lib/Analysis/Loads.cpp
+++ b/lib/Analysis/Loads.cpp
@@ -31,7 +31,7 @@ using namespace llvm;
 static bool AreEquivalentAddressValues(const Value *A, const Value *B) {
   // Test if the values are trivially equivalent.
   if (A == B) return true;
-  
+
   // Test if the values come from identical arithmetic instructions.
   // Use isIdenticalToWhenDefined instead of isIdenticalTo because
   // this function is only used when one address use dominates the
@@ -42,7 +42,7 @@ static bool AreEquivalentAddressValues(const Value *A, const Value *B) {
     if (const Instruction *BI = dyn_cast<Instruction>(B))
       if (cast<Instruction>(A)->isIdenticalToWhenDefined(BI))
         return true;
-  
+
   // Otherwise they may not be equivalent.
   return false;
 }
diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp
index ce7fab6..5f640c0 100644
--- a/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -374,10 +374,16 @@ getPointerDependencyFrom(const AliasAnalysis::Location &MemLoc, bool isLoad,
         if (R == AliasAnalysis::MustAlias)
           return MemDepResult::getDef(Inst);
 
+#if 0 // FIXME: Temporarily disabled. GVN is cleverly rewriting loads
+      // in terms of clobbering loads, but since it does this by looking
+      // at the clobbering load directly, it doesn't know about any
+      // phi translation that may have happened along the way.
+
         // If we have a partial alias, then return this as a clobber for the
         // client to handle.
         if (R == AliasAnalysis::PartialAlias)
           return MemDepResult::getClobber(Inst);
+#endif
         
         // Random may-alias loads don't depend on each other without a
         // dependence.
@@ -497,7 +503,7 @@ MemDepResult MemoryDependenceAnalysis::getDependency(Instruction *QueryInst) {
       // If we can do a pointer scan, make it happen.
       bool isLoad = !(MR & AliasAnalysis::Mod);
       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(QueryInst))
-        isLoad |= II->getIntrinsicID() == Intrinsic::lifetime_end;
+        isLoad |= II->getIntrinsicID() == Intrinsic::lifetime_start;
 
       LocalCache = getPointerDependencyFrom(MemLoc, isLoad, ScanPos,
                                             QueryParent);
@@ -937,6 +943,9 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
   SmallVector<BasicBlock*, 32> Worklist;
   Worklist.push_back(StartBB);
   
+  // PredList used inside loop.
+  SmallVector<std::pair<BasicBlock*, PHITransAddr>, 16> PredList;
+
   // Keep track of the entries that we know are sorted.  Previously cached
   // entries will all be sorted.  The entries we add we only sort on demand (we
   // don't insert every element into its sorted position).  We know that we
@@ -973,22 +982,29 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
     // the same Pointer.
     if (!Pointer.NeedsPHITranslationFromBlock(BB)) {
       SkipFirstBlock = false;
+      SmallVector<BasicBlock*, 16> NewBlocks;
       for (BasicBlock **PI = PredCache->GetPreds(BB); *PI; ++PI) {
         // Verify that we haven't looked at this block yet.
         std::pair<DenseMap<BasicBlock*,Value*>::iterator, bool>
           InsertRes = Visited.insert(std::make_pair(*PI, Pointer.getAddr()));
         if (InsertRes.second) {
           // First time we've looked at *PI.
-          Worklist.push_back(*PI);
+          NewBlocks.push_back(*PI);
           continue;
         }
         
         // If we have seen this block before, but it was with a different
         // pointer then we have a phi translation failure and we have to treat
         // this as a clobber.
-        if (InsertRes.first->second != Pointer.getAddr())
+        if (InsertRes.first->second != Pointer.getAddr()) {
+          // Make sure to clean up the Visited map before continuing on to
+          // PredTranslationFailure.
+          for (unsigned i = 0; i < NewBlocks.size(); i++)
+            Visited.erase(NewBlocks[i]);
           goto PredTranslationFailure;
+        }
       }
+      Worklist.append(NewBlocks.begin(), NewBlocks.end());
       continue;
     }
     
@@ -1007,13 +1023,15 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
       NumSortedEntries = Cache->size();
     }
     Cache = 0;
-    
+
+    PredList.clear();
     for (BasicBlock **PI = PredCache->GetPreds(BB); *PI; ++PI) {
       BasicBlock *Pred = *PI;
-      
+      PredList.push_back(std::make_pair(Pred, Pointer));
+
       // Get the PHI translated pointer in this predecessor.  This can fail if
       // not translatable, in which case the getAddr() returns null.
-      PHITransAddr PredPointer(Pointer);
+      PHITransAddr &PredPointer = PredList.back().second;
       PredPointer.PHITranslateValue(BB, Pred, 0);
 
       Value *PredPtrVal = PredPointer.getAddr();
@@ -1027,6 +1045,9 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
         InsertRes = Visited.insert(std::make_pair(Pred, PredPtrVal));
 
       if (!InsertRes.second) {
+        // We found the pred; take it off the list of preds to visit.
+        PredList.pop_back();
+
         // If the predecessor was visited with PredPtr, then we already did
         // the analysis and can ignore it.
         if (InsertRes.first->second == PredPtrVal)
@@ -1035,14 +1056,47 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
         // Otherwise, the block was previously analyzed with a different
         // pointer.  We can't represent the result of this case, so we just
         // treat this as a phi translation failure.
+
+        // Make sure to clean up the Visited map before continuing on to
+        // PredTranslationFailure.
+        for (unsigned i = 0; i < PredList.size(); i++)
+          Visited.erase(PredList[i].first);
+
         goto PredTranslationFailure;
       }
-      
+    }
+
+    // Actually process results here; this need to be a separate loop to avoid
+    // calling getNonLocalPointerDepFromBB for blocks we don't want to return
+    // any results for.  (getNonLocalPointerDepFromBB will modify our 
+    // datastructures in ways the code after the PredTranslationFailure label
+    // doesn't expect.)
+    for (unsigned i = 0; i < PredList.size(); i++) {
+      BasicBlock *Pred = PredList[i].first;
+      PHITransAddr &PredPointer = PredList[i].second;
+      Value *PredPtrVal = PredPointer.getAddr();
+
+      bool CanTranslate = true;
       // If PHI translation was unable to find an available pointer in this
       // predecessor, then we have to assume that the pointer is clobbered in
       // that predecessor.  We can still do PRE of the load, which would insert
       // a computation of the pointer in this predecessor.
-      if (PredPtrVal == 0) {
+      if (PredPtrVal == 0)
+        CanTranslate = false;
+
+      // FIXME: it is entirely possible that PHI translating will end up with
+      // the same value.  Consider PHI translating something like:
+      // X = phi [x, bb1], [y, bb2].  PHI translating for bb1 doesn't *need*
+      // to recurse here, pedantically speaking.
+
+      // If getNonLocalPointerDepFromBB fails here, that means the cached
+      // result conflicted with the Visited list; we have to conservatively
+      // assume a clobber, but this also does not block PRE of the load.
+      if (!CanTranslate ||
+          getNonLocalPointerDepFromBB(PredPointer,
+                                      Loc.getWithNewPtr(PredPtrVal),
+                                      isLoad, Pred,
+                                      Result, Visited)) {
         // Add the entry to the Result list.
         NonLocalDepResult Entry(Pred,
                                 MemDepResult::getClobber(Pred->getTerminator()),
@@ -1058,19 +1112,6 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
         NLPI.Pair = BBSkipFirstBlockPair();
         continue;
       }
-
-      // FIXME: it is entirely possible that PHI translating will end up with
-      // the same value.  Consider PHI translating something like:
-      // X = phi [x, bb1], [y, bb2].  PHI translating for bb1 doesn't *need*
-      // to recurse here, pedantically speaking.
-      
-      // If we have a problem phi translating, fall through to the code below
-      // to handle the failure condition.
-      if (getNonLocalPointerDepFromBB(PredPointer,
-                                      Loc.getWithNewPtr(PredPointer.getAddr()),
-                                      isLoad, Pred,
-                                      Result, Visited))
-        goto PredTranslationFailure;
     }
     
     // Refresh the CacheInfo/Cache pointer so that it isn't invalidated.
@@ -1087,6 +1128,9 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
     continue;
 
   PredTranslationFailure:
+    // The following code is "failure"; we can't produce a sane translation
+    // for the given block.  It assumes that we haven't modified any of
+    // our datastructures while processing the current block.
     
     if (Cache == 0) {
       // Refresh the CacheInfo/Cache pointer if it got invalidated.
@@ -1117,8 +1161,8 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
       
       assert(I->getResult().isNonLocal() &&
              "Should only be here with transparent block");
-      I->setResult(MemDepResult::getClobber(BB->begin()));
-      ReverseNonLocalPtrDeps[BB->begin()].insert(CacheKey);
+      I->setResult(MemDepResult::getClobber(BB->getTerminator()));
+      ReverseNonLocalPtrDeps[BB->getTerminator()].insert(CacheKey);
       Result.push_back(NonLocalDepResult(I->getBB(), I->getResult(),
                                          Pointer.getAddr()));
       break;
diff --git a/lib/Analysis/RegionPass.cpp b/lib/Analysis/RegionPass.cpp
index 3269dcc..80eda79 100644
--- a/lib/Analysis/RegionPass.cpp
+++ b/lib/Analysis/RegionPass.cpp
@@ -249,7 +249,7 @@ void RegionPass::assignPassManager(PMStack &PMS,
     assert (!PMS.empty() && "Unable to create Region Pass Manager");
     PMDataManager *PMD = PMS.top();
 
-    // [1] Create new Call Graph Pass Manager
+    // [1] Create new Region Pass Manager
     RGPM = new RGPassManager(PMD->getDepth() + 1);
     RGPM->populateInheritedAnalysis(PMS);
 
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index bab4619..025718e 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -1035,6 +1035,93 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op,
   return S;
 }
 
+// Get the limit of a recurrence such that incrementing by Step cannot cause
+// signed overflow as long as the value of the recurrence within the loop does
+// not exceed this limit before incrementing.
+static const SCEV *getOverflowLimitForStep(const SCEV *Step,
+                                           ICmpInst::Predicate *Pred,
+                                           ScalarEvolution *SE) {
+  unsigned BitWidth = SE->getTypeSizeInBits(Step->getType());
+  if (SE->isKnownPositive(Step)) {
+    *Pred = ICmpInst::ICMP_SLT;
+    return SE->getConstant(APInt::getSignedMinValue(BitWidth) -
+                           SE->getSignedRange(Step).getSignedMax());
+  }
+  if (SE->isKnownNegative(Step)) {
+    *Pred = ICmpInst::ICMP_SGT;
+    return SE->getConstant(APInt::getSignedMaxValue(BitWidth) -
+                       SE->getSignedRange(Step).getSignedMin());
+  }
+  return 0;
+}
+
+// The recurrence AR has been shown to have no signed wrap. Typically, if we can
+// prove NSW for AR, then we can just as easily prove NSW for its preincrement
+// or postincrement sibling. This allows normalizing a sign extended AddRec as
+// such: {sext(Step + Start),+,Step} => {(Step + sext(Start),+,Step} As a
+// result, the expression "Step + sext(PreIncAR)" is congruent with
+// "sext(PostIncAR)"
+static const SCEV *getPreStartForSignExtend(const SCEVAddRecExpr *AR,
+                                            const Type *Ty,
+                                            ScalarEvolution *SE) {
+  const Loop *L = AR->getLoop();
+  const SCEV *Start = AR->getStart();
+  const SCEV *Step = AR->getStepRecurrence(*SE);
+
+  // Check for a simple looking step prior to loop entry.
+  const SCEVAddExpr *SA = dyn_cast<SCEVAddExpr>(Start);
+  if (!SA || SA->getNumOperands() != 2 || SA->getOperand(0) != Step)
+    return 0;
+
+  // This is a postinc AR. Check for overflow on the preinc recurrence using the
+  // same three conditions that getSignExtendedExpr checks.
+
+  // 1. NSW flags on the step increment.
+  const SCEV *PreStart = SA->getOperand(1);
+  const SCEVAddRecExpr *PreAR = dyn_cast<SCEVAddRecExpr>(
+    SE->getAddRecExpr(PreStart, Step, L, SCEV::FlagAnyWrap));
+
+  if (PreAR && PreAR->getNoWrapFlags(SCEV::FlagNSW))
+    return PreStart;
+
+  // 2. Direct overflow check on the step operation's expression.
+  unsigned BitWidth = SE->getTypeSizeInBits(AR->getType());
+  const Type *WideTy = IntegerType::get(SE->getContext(), BitWidth * 2);
+  const SCEV *OperandExtendedStart =
+    SE->getAddExpr(SE->getSignExtendExpr(PreStart, WideTy),
+                   SE->getSignExtendExpr(Step, WideTy));
+  if (SE->getSignExtendExpr(Start, WideTy) == OperandExtendedStart) {
+    // Cache knowledge of PreAR NSW.
+    if (PreAR)
+      const_cast<SCEVAddRecExpr *>(PreAR)->setNoWrapFlags(SCEV::FlagNSW);
+    // FIXME: this optimization needs a unit test
+    DEBUG(dbgs() << "SCEV: untested prestart overflow check\n");
+    return PreStart;
+  }
+
+  // 3. Loop precondition.
+  ICmpInst::Predicate Pred;
+  const SCEV *OverflowLimit = getOverflowLimitForStep(Step, &Pred, SE);
+
+  if (OverflowLimit &&
+      SE->isLoopEntryGuardedByCond(L, Pred, PreStart, OverflowLimit)) {
+    return PreStart;
+  }
+  return 0;
+}
+
+// Get the normalized sign-extended expression for this AddRec's Start.
+static const SCEV *getSignExtendAddRecStart(const SCEVAddRecExpr *AR,
+                                            const Type *Ty,
+                                            ScalarEvolution *SE) {
+  const SCEV *PreStart = getPreStartForSignExtend(AR, Ty, SE);
+  if (!PreStart)
+    return SE->getSignExtendExpr(AR->getStart(), Ty);
+
+  return SE->getAddExpr(SE->getSignExtendExpr(AR->getStepRecurrence(*SE), Ty),
+                        SE->getSignExtendExpr(PreStart, Ty));
+}
+
 const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
                                                const Type *Ty) {
   assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) &&
@@ -1097,7 +1184,7 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
       // If we have special knowledge that this addrec won't overflow,
       // we don't need to do any further analysis.
       if (AR->getNoWrapFlags(SCEV::FlagNSW))
-        return getAddRecExpr(getSignExtendExpr(Start, Ty),
+        return getAddRecExpr(getSignExtendAddRecStart(AR, Ty, this),
                              getSignExtendExpr(Step, Ty),
                              L, SCEV::FlagNSW);
 
@@ -1133,7 +1220,7 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
             // Cache knowledge of AR NSW, which is propagated to this AddRec.
             const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW);
             // Return the expression with the addrec on the outside.
-            return getAddRecExpr(getSignExtendExpr(Start, Ty),
+            return getAddRecExpr(getSignExtendAddRecStart(AR, Ty, this),
                                  getSignExtendExpr(Step, Ty),
                                  L, AR->getNoWrapFlags());
           }
@@ -1149,7 +1236,7 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
             // Cache knowledge of AR NSW, which is propagated to this AddRec.
             const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW);
             // Return the expression with the addrec on the outside.
-            return getAddRecExpr(getSignExtendExpr(Start, Ty),
+            return getAddRecExpr(getSignExtendAddRecStart(AR, Ty, this),
                                  getZeroExtendExpr(Step, Ty),
                                  L, AR->getNoWrapFlags());
           }
@@ -1159,34 +1246,18 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
         // the addrec is safe. Also, if the entry is guarded by a comparison
         // with the start value and the backedge is guarded by a comparison
         // with the post-inc value, the addrec is safe.
-        if (isKnownPositive(Step)) {
-          const SCEV *N = getConstant(APInt::getSignedMinValue(BitWidth) -
-                                      getSignedRange(Step).getSignedMax());
-          if (isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_SLT, AR, N) ||
-              (isLoopEntryGuardedByCond(L, ICmpInst::ICMP_SLT, Start, N) &&
-               isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_SLT,
-                                           AR->getPostIncExpr(*this), N))) {
-            // Cache knowledge of AR NSW, which is propagated to this AddRec.
-            const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW);
-            // Return the expression with the addrec on the outside.
-            return getAddRecExpr(getSignExtendExpr(Start, Ty),
-                                 getSignExtendExpr(Step, Ty),
-                                 L, AR->getNoWrapFlags());
-          }
-        } else if (isKnownNegative(Step)) {
-          const SCEV *N = getConstant(APInt::getSignedMaxValue(BitWidth) -
-                                      getSignedRange(Step).getSignedMin());
-          if (isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_SGT, AR, N) ||
-              (isLoopEntryGuardedByCond(L, ICmpInst::ICMP_SGT, Start, N) &&
-               isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_SGT,
-                                           AR->getPostIncExpr(*this), N))) {
-            // Cache knowledge of AR NSW, which is propagated to this AddRec.
-            const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW);
-            // Return the expression with the addrec on the outside.
-            return getAddRecExpr(getSignExtendExpr(Start, Ty),
-                                 getSignExtendExpr(Step, Ty),
-                                 L, AR->getNoWrapFlags());
-          }
+        ICmpInst::Predicate Pred;
+        const SCEV *OverflowLimit = getOverflowLimitForStep(Step, &Pred, this);
+        if (OverflowLimit &&
+            (isLoopBackedgeGuardedByCond(L, Pred, AR, OverflowLimit) ||
+             (isLoopEntryGuardedByCond(L, Pred, Start, OverflowLimit) &&
+              isLoopBackedgeGuardedByCond(L, Pred, AR->getPostIncExpr(*this),
+                                          OverflowLimit)))) {
+          // Cache knowledge of AR NSW, then propagate NSW to the wide AddRec.
+          const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW);
+          return getAddRecExpr(getSignExtendAddRecStart(AR, Ty, this),
+                               getSignExtendExpr(Step, Ty),
+                               L, AR->getNoWrapFlags());
         }
       }
     }
@@ -3783,7 +3854,7 @@ ScalarEvolution::getBackedgeTakenInfo(const Loop *L) {
   // update the value. The temporary CouldNotCompute value tells SCEV
   // code elsewhere that it shouldn't attempt to request a new
   // backedge-taken count, which could result in infinite recursion.
-  std::pair<std::map<const Loop *, BackedgeTakenInfo>::iterator, bool> Pair =
+  std::pair<DenseMap<const Loop *, BackedgeTakenInfo>::iterator, bool> Pair =
     BackedgeTakenCounts.insert(std::make_pair(L, getCouldNotCompute()));
   if (!Pair.second)
     return Pair.first->second;
@@ -4433,7 +4504,7 @@ Constant *
 ScalarEvolution::getConstantEvolutionLoopExitValue(PHINode *PN,
                                                    const APInt &BEs,
                                                    const Loop *L) {
-  std::map<PHINode*, Constant*>::const_iterator I =
+  DenseMap<PHINode*, Constant*>::const_iterator I =
     ConstantEvolutionLoopExitValue.find(PN);
   if (I != ConstantEvolutionLoopExitValue.end())
     return I->second;
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index 8f18dd2..dab5aeb 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -131,8 +131,18 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask,
     }
     return;
   }
+  
+  if (Argument *A = dyn_cast<Argument>(V)) {
+    // Get alignment information off byval arguments if specified in the IR.
+    if (A->hasByValAttr())
+      if (unsigned Align = A->getParamAlignment())
+        KnownZero = Mask & APInt::getLowBitsSet(BitWidth,
+                                                CountTrailingZeros_32(Align));
+    return;
+  }
 
-  KnownZero.clearAllBits(); KnownOne.clearAllBits();   // Start out not knowing anything.
+  // Start out not knowing anything.
+  KnownZero.clearAllBits(); KnownOne.clearAllBits();
 
   if (Depth == MaxDepth || Mask == 0)
     return;  // Limit search depth.
@@ -670,6 +680,10 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask,
         KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - LowBits);
         break;
       }
+      case Intrinsic::x86_sse42_crc32_64_8:
+      case Intrinsic::x86_sse42_crc32_64_64:
+        KnownZero = APInt::getHighBitsSet(64, 32);
+        break;
       }
     }
     break;
diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp
index 857fa1e..014e816 100644
--- a/lib/AsmParser/LLLexer.cpp
+++ b/lib/AsmParser/LLLexer.cpp
@@ -308,16 +308,8 @@ lltok::Kind LLLexer::LexAt() {
   }
 
   // Handle GlobalVarName: @[-a-zA-Z$._][-a-zA-Z$._0-9]*
-  if (isalpha(CurPtr[0]) || CurPtr[0] == '-' || CurPtr[0] == '$' ||
-      CurPtr[0] == '.' || CurPtr[0] == '_') {
-    ++CurPtr;
-    while (isalnum(CurPtr[0]) || CurPtr[0] == '-' || CurPtr[0] == '$' ||
-           CurPtr[0] == '.' || CurPtr[0] == '_')
-      ++CurPtr;
-
-    StrVal.assign(TokStart+1, CurPtr);   // Skip @
+  if (ReadVarName())
     return lltok::GlobalVar;
-  }
 
   // Handle GlobalVarID: @[0-9]+
   if (isdigit(CurPtr[0])) {
@@ -334,6 +326,39 @@ lltok::Kind LLLexer::LexAt() {
   return lltok::Error;
 }
 
+/// ReadString - Read a string until the closing quote.
+lltok::Kind LLLexer::ReadString(lltok::Kind kind) {
+  const char *Start = CurPtr;
+  while (1) {
+    int CurChar = getNextChar();
+
+    if (CurChar == EOF) {
+      Error("end of file in string constant");
+      return lltok::Error;
+    }
+    if (CurChar == '"') {
+      StrVal.assign(Start, CurPtr-1);
+      UnEscapeLexed(StrVal);
+      return kind;
+    }
+  }
+}
+
+/// ReadVarName - Read the rest of a token containing a variable name.
+bool LLLexer::ReadVarName() {
+  const char *NameStart = CurPtr;
+  if (isalpha(CurPtr[0]) || CurPtr[0] == '-' || CurPtr[0] == '$' ||
+      CurPtr[0] == '.' || CurPtr[0] == '_') {
+    ++CurPtr;
+    while (isalnum(CurPtr[0]) || CurPtr[0] == '-' || CurPtr[0] == '$' ||
+           CurPtr[0] == '.' || CurPtr[0] == '_')
+      ++CurPtr;
+
+    StrVal.assign(NameStart, CurPtr);
+    return true;
+  }
+  return false;
+}
 
 /// LexPercent - Lex all tokens that start with a % character:
 ///   LocalVar   ::= %\"[^\"]*\"
@@ -343,33 +368,12 @@ lltok::Kind LLLexer::LexPercent() {
   // Handle LocalVarName: %\"[^\"]*\"
   if (CurPtr[0] == '"') {
     ++CurPtr;
-
-    while (1) {
-      int CurChar = getNextChar();
-
-      if (CurChar == EOF) {
-        Error("end of file in string constant");
-        return lltok::Error;
-      }
-      if (CurChar == '"') {
-        StrVal.assign(TokStart+2, CurPtr-1);
-        UnEscapeLexed(StrVal);
-        return lltok::LocalVar;
-      }
-    }
+    return ReadString(lltok::LocalVar);
   }
 
   // Handle LocalVarName: %[-a-zA-Z$._][-a-zA-Z$._0-9]*
-  if (isalpha(CurPtr[0]) || CurPtr[0] == '-' || CurPtr[0] == '$' ||
-      CurPtr[0] == '.' || CurPtr[0] == '_') {
-    ++CurPtr;
-    while (isalnum(CurPtr[0]) || CurPtr[0] == '-' || CurPtr[0] == '$' ||
-           CurPtr[0] == '.' || CurPtr[0] == '_')
-      ++CurPtr;
-
-    StrVal.assign(TokStart+1, CurPtr);   // Skip %
+  if (ReadVarName())
     return lltok::LocalVar;
-  }
 
   // Handle LocalVarID: %[0-9]+
   if (isdigit(CurPtr[0])) {
@@ -390,27 +394,16 @@ lltok::Kind LLLexer::LexPercent() {
 ///   QuoteLabel        "[^"]+":
 ///   StringConstant    "[^"]*"
 lltok::Kind LLLexer::LexQuote() {
-  while (1) {
-    int CurChar = getNextChar();
-
-    if (CurChar == EOF) {
-      Error("end of file in quoted string");
-      return lltok::Error;
-    }
-
-    if (CurChar != '"') continue;
-
-    if (CurPtr[0] != ':') {
-      StrVal.assign(TokStart+1, CurPtr-1);
-      UnEscapeLexed(StrVal);
-      return lltok::StringConstant;
-    }
+  lltok::Kind kind = ReadString(lltok::StringConstant);
+  if (kind == lltok::Error || kind == lltok::Eof)
+    return kind;
 
+  if (CurPtr[0] == ':') {
     ++CurPtr;
-    StrVal.assign(TokStart+1, CurPtr-2);
-    UnEscapeLexed(StrVal);
-    return lltok::LabelStr;
+    kind = lltok::LabelStr;
   }
+
+  return kind;
 }
 
 static bool JustWhitespaceNewLine(const char *&Ptr) {
@@ -565,6 +558,7 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(nest);
   KEYWORD(readnone);
   KEYWORD(readonly);
+  KEYWORD(uwtable);
 
   KEYWORD(inlinehint);
   KEYWORD(noinline);
diff --git a/lib/AsmParser/LLLexer.h b/lib/AsmParser/LLLexer.h
index 09ae801..4fe705e 100644
--- a/lib/AsmParser/LLLexer.h
+++ b/lib/AsmParser/LLLexer.h
@@ -71,6 +71,9 @@ namespace llvm {
 
     int getNextChar();
     void SkipLineComment();
+    lltok::Kind ReadString(lltok::Kind kind);
+    bool ReadVarName();
+
     lltok::Kind LexIdentifier();
     lltok::Kind LexDigitOrNegative();
     lltok::Kind LexPositive();
diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp
index a2c53be..81e0747 100644
--- a/lib/AsmParser/LLParser.cpp
+++ b/lib/AsmParser/LLParser.cpp
@@ -972,6 +972,7 @@ bool LLParser::ParseOptionalAttrs(unsigned &Attrs, unsigned AttrKind) {
 
     case lltok::kw_noreturn:        Attrs |= Attribute::NoReturn; break;
     case lltok::kw_nounwind:        Attrs |= Attribute::NoUnwind; break;
+    case lltok::kw_uwtable:         Attrs |= Attribute::UWTable; break;
     case lltok::kw_noinline:        Attrs |= Attribute::NoInline; break;
     case lltok::kw_readnone:        Attrs |= Attribute::ReadNone; break;
     case lltok::kw_readonly:        Attrs |= Attribute::ReadOnly; break;
@@ -3003,7 +3004,6 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB,
   case lltok::kw_sub:
   case lltok::kw_mul:
   case lltok::kw_shl: {
-    LocTy ModifierLoc = Lex.getLoc();
     bool NUW = EatIfPresent(lltok::kw_nuw);
     bool NSW = EatIfPresent(lltok::kw_nsw);
     if (!NUW) NUW = EatIfPresent(lltok::kw_nuw);
diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h
index 576da19..02f97a3 100644
--- a/lib/AsmParser/LLToken.h
+++ b/lib/AsmParser/LLToken.h
@@ -87,6 +87,7 @@ namespace lltok {
     kw_nest,
     kw_readnone,
     kw_readonly,
+    kw_uwtable,
 
     kw_inlinehint,
     kw_noinline,
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index 19f57cf..bc995ae 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -1588,8 +1588,18 @@ bool BitcodeReader::ParseBitcodeInto(Module *M) {
   while (!Stream.AtEndOfStream()) {
     unsigned Code = Stream.ReadCode();
 
-    if (Code != bitc::ENTER_SUBBLOCK)
+    if (Code != bitc::ENTER_SUBBLOCK) {
+
+      // The ranlib in xcode 4 will align archive members by appending newlines to the
+      // end of them. If this file size is a multiple of 4 but not 8, we have to read and
+      // ignore these final 4 bytes :-(
+      if (Stream.GetAbbrevIDWidth() == 2 && Code == 2 &&
+          Stream.Read(6) == 2 && Stream.Read(24) == 0xa0a0a &&
+	  Stream.AtEndOfStream())
+        return false;
+
       return Error("Invalid record at top-level");
+    }
 
     unsigned BlockID = Stream.ReadSubBlockID();
 
@@ -1842,7 +1852,6 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
         FunctionBBs[i] = BasicBlock::Create(Context, "", F);
       CurBB = FunctionBBs[0];
       continue;
-
         
     case bitc::FUNC_CODE_DEBUG_LOC_AGAIN:  // DEBUG_LOC_AGAIN
       // This record indicates that the last instruction is at the same
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp
index e34137f..6972a45 100644
--- a/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -100,8 +100,6 @@ static unsigned GetEncodedBinaryOpcode(unsigned Opcode) {
   }
 }
 
-
-
 static void WriteStringRecord(unsigned Code, const std::string &Str,
                               unsigned AbbrevToUse, BitstreamWriter &Stream) {
   SmallVector<unsigned, 64> Vals;
@@ -447,7 +445,6 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
     Vals.clear();
   }
 
-
   // Emit the alias information.
   for (Module::const_alias_iterator AI = M->alias_begin(), E = M->alias_end();
        AI != E; ++AI) {
@@ -871,8 +868,6 @@ static void WriteConstants(unsigned FirstVal, unsigned LastVal,
         break;
       }
     } else if (const BlockAddress *BA = dyn_cast<BlockAddress>(C)) {
-      assert(BA->getFunction() == BA->getBasicBlock()->getParent() &&
-             "Malformed blockaddress");
       Code = bitc::CST_CODE_BLOCKADDRESS;
       Record.push_back(VE.getTypeID(BA->getFunction()->getType()));
       Record.push_back(VE.getValueID(BA->getFunction()));
@@ -1514,9 +1509,9 @@ static void WriteModule(const Module *M, BitstreamWriter &Stream) {
   WriteModuleMetadata(M, VE, Stream);
 
   // Emit function bodies.
-  for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I)
-    if (!I->isDeclaration())
-      WriteFunction(*I, VE, Stream);
+  for (Module::const_iterator F = M->begin(), E = M->end(); F != E; ++F)
+    if (!F->isDeclaration())
+      WriteFunction(*F, VE, Stream);
 
   // Emit metadata.
   WriteModuleMetadataStore(M, Stream);
diff --git a/lib/Bitcode/Writer/ValueEnumerator.cpp b/lib/Bitcode/Writer/ValueEnumerator.cpp
index 21f004a..5138c3c 100644
--- a/lib/Bitcode/Writer/ValueEnumerator.cpp
+++ b/lib/Bitcode/Writer/ValueEnumerator.cpp
@@ -452,7 +452,6 @@ void ValueEnumerator::EnumerateAttributes(const AttrListPtr &PAL) {
   }
 }
 
-
 void ValueEnumerator::incorporateFunction(const Function &F) {
   InstructionCount = 0;
   NumModuleValues = Values.size();
diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
index 5c809f7..dca1d29 100644
--- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp
+++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
@@ -719,7 +719,9 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies(
                               const std::vector<SUnit>& SUnits,
                               MachineBasicBlock::iterator Begin,
                               MachineBasicBlock::iterator End,
-                              unsigned InsertPosIndex) {
+                              unsigned InsertPosIndex,
+                              DbgValueVector &DbgValues) {
+
   std::vector<unsigned> &KillIndices = State->GetKillIndices();
   std::vector<unsigned> &DefIndices = State->GetDefIndices();
   std::multimap<unsigned, AggressiveAntiDepState::RegisterReference>&
@@ -923,14 +925,10 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies(
               // sure to update that as well.
               const SUnit *SU = MISUnitMap[Q->second.Operand->getParent()];
               if (!SU) continue;
-              for (unsigned i = 0, e = SU->DbgInstrList.size() ; i < e ; ++i) {
-                MachineInstr *DI = SU->DbgInstrList[i];
-                assert (DI->getNumOperands()==3 && DI->getOperand(0).isReg() &&
-                        DI->getOperand(0).getReg()
-                        && "Non register dbg_value attached to SUnit!");
-                if (DI->getOperand(0).getReg() == AntiDepReg)
-                  DI->getOperand(0).setReg(NewReg);
-              }
+              for (DbgValueVector::iterator DVI = DbgValues.begin(),
+                     DVE = DbgValues.end(); DVI != DVE; ++DVI)
+                if (DVI->second == Q->second.Operand->getParent())
+                  UpdateDbgValue(DVI->first, AntiDepReg, NewReg);
             }
 
             // We just went back in time and modified history; the
diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.h b/lib/CodeGen/AggressiveAntiDepBreaker.h
index 9d715cc..b7ddafc 100644
--- a/lib/CodeGen/AggressiveAntiDepBreaker.h
+++ b/lib/CodeGen/AggressiveAntiDepBreaker.h
@@ -146,7 +146,8 @@ namespace llvm {
     unsigned BreakAntiDependencies(const std::vector<SUnit>& SUnits,
                                    MachineBasicBlock::iterator Begin,
                                    MachineBasicBlock::iterator End,
-                                   unsigned InsertPosIndex);
+                                   unsigned InsertPosIndex,
+                                   DbgValueVector &DbgValues);
 
     /// Observe - Update liveness information to account for the current
     /// instruction, which will not be scheduled.
diff --git a/lib/CodeGen/AllocationOrder.cpp b/lib/CodeGen/AllocationOrder.cpp
index 20c7625..a8ee2b6 100644
--- a/lib/CodeGen/AllocationOrder.cpp
+++ b/lib/CodeGen/AllocationOrder.cpp
@@ -15,6 +15,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AllocationOrder.h"
+#include "RegisterClassInfo.h"
 #include "VirtRegMap.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 
@@ -23,8 +24,8 @@ using namespace llvm;
 // Compare VirtRegMap::getRegAllocPref().
 AllocationOrder::AllocationOrder(unsigned VirtReg,
                                  const VirtRegMap &VRM,
-                                 const BitVector &ReservedRegs)
-  : Pos(0), Reserved(ReservedRegs) {
+                                 const RegisterClassInfo &RegClassInfo)
+  : Begin(0), End(0), Pos(0), RCI(RegClassInfo), OwnedBegin(false) {
   const TargetRegisterClass *RC = VRM.getRegInfo().getRegClass(VirtReg);
   std::pair<unsigned, unsigned> HintPair =
     VRM.getRegInfo().getRegAllocationHint(VirtReg);
@@ -36,33 +37,45 @@ AllocationOrder::AllocationOrder(unsigned VirtReg,
   if (TargetRegisterInfo::isVirtualRegister(Hint))
     Hint = VRM.getPhys(Hint);
 
-  // The remaining allocation order may depend on the hint.
-  tie(Begin, End) = VRM.getTargetRegInfo()
-        .getAllocationOrder(RC, HintPair.first, Hint, VRM.getMachineFunction());
+  // The first hint pair component indicates a target-specific hint.
+  if (HintPair.first) {
+    const TargetRegisterInfo &TRI = VRM.getTargetRegInfo();
+    // The remaining allocation order may depend on the hint.
+    const unsigned *B, *E;
+    tie(B, E) = TRI.getAllocationOrder(RC, HintPair.first, Hint,
+                                       VRM.getMachineFunction());
 
-  // Target-dependent hints require resolution.
-  if (HintPair.first)
-    Hint = VRM.getTargetRegInfo().ResolveRegAllocHint(HintPair.first, Hint,
-                                                      VRM.getMachineFunction());
+    // Empty allocation order?
+    if (B == E)
+      return;
+
+    // Copy the allocation order with reserved registers removed.
+    OwnedBegin = true;
+    unsigned *P = new unsigned[E - B];
+    Begin = P;
+    for (; B != E; ++B)
+      if (!RCI.isReserved(*B))
+        *P++ = *B;
+    End = P;
+
+    // Target-dependent hints require resolution.
+    Hint = TRI.ResolveRegAllocHint(HintPair.first, Hint,
+                                   VRM.getMachineFunction());
+  } else {
+    // If there is no hint or just a normal hint, use the cached allocation
+    // order from RegisterClassInfo.
+    ArrayRef<unsigned> O = RCI.getOrder(RC);
+    Begin = O.begin();
+    End = O.end();
+  }
 
   // The hint must be a valid physreg for allocation.
   if (Hint && (!TargetRegisterInfo::isPhysicalRegister(Hint) ||
-               !RC->contains(Hint) || ReservedRegs.test(Hint)))
+               !RC->contains(Hint) || RCI.isReserved(Hint)))
     Hint = 0;
 }
 
-unsigned AllocationOrder::next() {
-  // First take the hint.
-  if (!Pos) {
-    Pos = Begin;
-    if (Hint)
-      return Hint;
-  }
-  // Then look at the order from TRI.
-  while(Pos != End) {
-    unsigned Reg = *Pos++;
-    if (Reg != Hint && !Reserved.test(Reg))
-      return Reg;
-  }
-  return 0;
+AllocationOrder::~AllocationOrder() {
+  if (OwnedBegin)
+    delete [] Begin;
 }
diff --git a/lib/CodeGen/AllocationOrder.h b/lib/CodeGen/AllocationOrder.h
index 61fd8f8..d1e48a1 100644
--- a/lib/CodeGen/AllocationOrder.h
+++ b/lib/CodeGen/AllocationOrder.h
@@ -19,15 +19,16 @@
 
 namespace llvm {
 
-class BitVector;
+class RegisterClassInfo;
 class VirtRegMap;
 
 class AllocationOrder {
   const unsigned *Begin;
   const unsigned *End;
   const unsigned *Pos;
-  const BitVector &Reserved;
+  const RegisterClassInfo &RCI;
   unsigned Hint;
+  bool OwnedBegin;
 public:
 
   /// AllocationOrder - Create a new AllocationOrder for VirtReg.
@@ -37,12 +38,28 @@ public:
   ///        TargetRegisterInfo::getReservedRegs().
   AllocationOrder(unsigned VirtReg,
                   const VirtRegMap &VRM,
-                  const BitVector &ReservedRegs);
+                  const RegisterClassInfo &RegClassInfo);
+
+  ~AllocationOrder();
 
   /// next - Return the next physical register in the allocation order, or 0.
   /// It is safe to call next again after it returned 0.
   /// It will keep returning 0 until rewind() is called.
-  unsigned next();
+  unsigned next() {
+    // First take the hint.
+    if (!Pos) {
+      Pos = Begin;
+      if (Hint)
+        return Hint;
+    }
+    // Then look at the order from TRI.
+    while (Pos != End) {
+      unsigned Reg = *Pos++;
+      if (Reg != Hint)
+        return Reg;
+    }
+    return 0;
+  }
 
   /// rewind - Start over from the beginning.
   void rewind() { Pos = 0; }
diff --git a/lib/CodeGen/AntiDepBreaker.h b/lib/CodeGen/AntiDepBreaker.h
index 086b757..df47f98 100644
--- a/lib/CodeGen/AntiDepBreaker.h
+++ b/lib/CodeGen/AntiDepBreaker.h
@@ -30,6 +30,9 @@ namespace llvm {
 /// anti-dependencies.
 class AntiDepBreaker {
 public:
+  typedef std::vector<std::pair<MachineInstr *, MachineInstr *> > 
+    DbgValueVector;
+
   virtual ~AntiDepBreaker();
 
   /// Start - Initialize anti-dep breaking for a new basic block.
@@ -40,9 +43,10 @@ public:
   /// the number of anti-dependencies broken.
   ///
   virtual unsigned BreakAntiDependencies(const std::vector<SUnit>& SUnits,
-                                MachineBasicBlock::iterator Begin,
-                                MachineBasicBlock::iterator End,
-                                unsigned InsertPosIndex) =0;
+                                         MachineBasicBlock::iterator Begin,
+                                         MachineBasicBlock::iterator End,
+                                         unsigned InsertPosIndex,
+                                         DbgValueVector &DbgValues) = 0;
   
   /// Observe - Update liveness information to account for the current
   /// instruction, which will not be scheduled.
@@ -52,6 +56,14 @@ public:
   
   /// Finish - Finish anti-dep breaking for a basic block.
   virtual void FinishBlock() =0;
+
+  /// UpdateDbgValue - Update DBG_VALUE if dependency breaker is updating
+  /// other machine instruction to use NewReg.
+  void UpdateDbgValue(MachineInstr *MI, unsigned OldReg, unsigned NewReg) {
+    assert (MI->isDebugValue() && "MI is not DBG_VALUE!");
+    if (MI && MI->getOperand(0).isReg() && MI->getOperand(0).getReg() == OldReg)
+      MI->getOperand(0).setReg(NewReg);
+  }
 };
 
 }
diff --git a/lib/CodeGen/AsmPrinter/ARMException.cpp b/lib/CodeGen/AsmPrinter/ARMException.cpp
index 0db28a6..5861fa4 100644
--- a/lib/CodeGen/AsmPrinter/ARMException.cpp
+++ b/lib/CodeGen/AsmPrinter/ARMException.cpp
@@ -52,7 +52,7 @@ void ARMException::EndModule() {
 /// being emitted immediately after the function entry point.
 void ARMException::BeginFunction(const MachineFunction *MF) {
   Asm->OutStreamer.EmitFnStart();
-  if (!Asm->MF->getFunction()->doesNotThrow() || UnwindTablesMandatory)
+  if (Asm->MF->getFunction()->needsUnwindTableEntry())
     Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_begin",
                                                   Asm->getFunctionNumber()));
 }
@@ -60,7 +60,7 @@ void ARMException::BeginFunction(const MachineFunction *MF) {
 /// EndFunction - Gather and emit post-function exception information.
 ///
 void ARMException::EndFunction() {
-  if (Asm->MF->getFunction()->doesNotThrow() && !UnwindTablesMandatory)
+  if (!Asm->MF->getFunction()->needsUnwindTableEntry())
     Asm->OutStreamer.EmitCantUnwind();
   else {
     Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_end",
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 8116f8d..161afba 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -189,21 +189,22 @@ bool AsmPrinter::doInitialization(Module &M) {
   if (MAI->doesSupportDebugInformation())
     DD = new DwarfDebug(this, &M);
 
-  if (MAI->doesSupportExceptionHandling())
-    switch (MAI->getExceptionHandlingType()) {
-    default:
-    case ExceptionHandling::DwarfTable:
-      DE = new DwarfTableException(this);
-      break;
-    case ExceptionHandling::DwarfCFI:
-      DE = new DwarfCFIException(this);
-      break;
-    case ExceptionHandling::ARM:
-      DE = new ARMException(this);
-      break;
-    }
+  switch (MAI->getExceptionHandlingType()) {
+  case ExceptionHandling::None:
+    return false;
+  case ExceptionHandling::SjLj:
+  case ExceptionHandling::DwarfCFI:
+    DE = new DwarfCFIException(this);
+    return false;
+  case ExceptionHandling::ARM:
+    DE = new ARMException(this);
+    return false;
+  case ExceptionHandling::Win64:
+    DE = new Win64Exception(this);
+    return false;
+  }
 
-  return false;
+  llvm_unreachable("Unknown exception type.");
 }
 
 void AsmPrinter::EmitLinkage(unsigned Linkage, MCSymbol *GVSym) const {
@@ -268,7 +269,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
   }
 
   MCSymbol *GVSym = Mang->getSymbol(GV);
-  EmitVisibility(GVSym, GV->getVisibility());
+  EmitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
 
   if (!GV->hasInitializer())   // External globals require no extra code.
     return;
@@ -592,30 +593,29 @@ static bool EmitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) {
   return true;
 }
 
-bool AsmPrinter::needsCFIMoves() {
-  if (UnwindTablesMandatory)
-    return true;
+AsmPrinter::CFIMoveType AsmPrinter::needsCFIMoves() {
+  if (MAI->getExceptionHandlingType() == ExceptionHandling::DwarfCFI &&
+      MF->getFunction()->needsUnwindTableEntry())
+    return CFI_M_EH;
 
-  if (MMI->hasDebugInfo() && !MAI->doesDwarfRequireFrameSection())
-    return true;
+  if (MMI->hasDebugInfo())
+    return CFI_M_Debug;
 
-  if (MF->getFunction()->doesNotThrow())
-    return false;
+  return CFI_M_None;
+}
 
-  return true;
+bool AsmPrinter::needsSEHMoves() {
+  return MAI->getExceptionHandlingType() == ExceptionHandling::Win64 &&
+    MF->getFunction()->needsUnwindTableEntry();
 }
 
 void AsmPrinter::emitPrologLabel(const MachineInstr &MI) {
   MCSymbol *Label = MI.getOperand(0).getMCSymbol();
 
-  if (MAI->doesDwarfRequireFrameSection() ||
-      MAI->getExceptionHandlingType() != ExceptionHandling::DwarfCFI)
-    OutStreamer.EmitLabel(Label);
-
   if (MAI->getExceptionHandlingType() != ExceptionHandling::DwarfCFI)
     return;
 
-  if (!needsCFIMoves())
+  if (needsCFIMoves() == CFI_M_None)
     return;
 
   MachineModuleInfo &MMI = MF->getMMI();
@@ -768,30 +768,25 @@ getDebugValueLocation(const MachineInstr *MI) const {
   return MachineLocation();
 }
 
-/// getDwarfRegOpSize - get size required to emit given machine location using
-/// dwarf encoding.
-unsigned AsmPrinter::getDwarfRegOpSize(const MachineLocation &MLoc) const {
-  const TargetRegisterInfo *RI = TM.getRegisterInfo();
-  unsigned DWReg = RI->getDwarfRegNum(MLoc.getReg(), false);
-  if (int Offset = MLoc.getOffset()) {
-    // If the value is at a certain offset from frame register then
-    // use DW_OP_breg.
-    if (DWReg < 32)
-      return 1 + MCAsmInfo::getSLEB128Size(Offset);
-    else
-      return 1 + MCAsmInfo::getULEB128Size(MLoc.getReg()) 
-        + MCAsmInfo::getSLEB128Size(Offset);
-  }
-  if (DWReg < 32)
-    return 1;
-
-  return 1 + MCAsmInfo::getULEB128Size(DWReg);
-}
-
 /// EmitDwarfRegOp - Emit dwarf register operation.
 void AsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const {
   const TargetRegisterInfo *TRI = TM.getRegisterInfo();
-  unsigned Reg = TRI->getDwarfRegNum(MLoc.getReg(), false);
+  int Reg = TRI->getDwarfRegNum(MLoc.getReg(), false);
+
+  for (const unsigned *SR = TRI->getSuperRegisters(MLoc.getReg());
+       *SR && Reg < 0; ++SR) {
+    Reg = TRI->getDwarfRegNum(*SR, false);
+    // FIXME: Get the bit range this register uses of the superregister
+    // so that we can produce a DW_OP_bit_piece
+  }
+
+  // FIXME: Handle cases like a super register being encoded as
+  // DW_OP_reg 32 DW_OP_piece 4 DW_OP_reg 33
+
+  // FIXME: We have no reasonable way of handling errors in here. The
+  // caller might be in the middle of an dwarf expression. We should
+  // probably assert that Reg >= 0 once debug info generation is more mature.
+
   if (int Offset =  MLoc.getOffset()) {
     if (Reg < 32) {
       OutStreamer.AddComment(
@@ -816,6 +811,8 @@ void AsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const {
       EmitULEB128(Reg);
     }
   }
+
+  // FIXME: Produce a DW_OP_bit_piece if we used a superregister
 }
 
 bool AsmPrinter::doFinalization(Module &M) {
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
index 6d1708a..dd5b0e2 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
@@ -206,78 +206,7 @@ void AsmPrinter::EmitSectionOffset(const MCSymbol *Label,
 // Dwarf Lowering Routines
 //===----------------------------------------------------------------------===//
 
-
-/// EmitFrameMoves - Emit frame instructions to describe the layout of the
-/// frame.
-void AsmPrinter::EmitFrameMoves(const std::vector<MachineMove> &Moves,
-                                MCSymbol *BaseLabel, bool isEH) const {
-  const TargetRegisterInfo *RI = TM.getRegisterInfo();
-  
-  int stackGrowth = TM.getTargetData()->getPointerSize();
-  if (TM.getFrameLowering()->getStackGrowthDirection() !=
-      TargetFrameLowering::StackGrowsUp)
-    stackGrowth *= -1;
-  
-  for (unsigned i = 0, N = Moves.size(); i < N; ++i) {
-    const MachineMove &Move = Moves[i];
-    MCSymbol *Label = Move.getLabel();
-    // Throw out move if the label is invalid.
-    if (Label && !Label->isDefined()) continue; // Not emitted, in dead code.
-    
-    const MachineLocation &Dst = Move.getDestination();
-    const MachineLocation &Src = Move.getSource();
-    
-    // Advance row if new location.
-    if (BaseLabel && Label) {
-      MCSymbol *ThisSym = Label;
-      if (ThisSym != BaseLabel) {
-        EmitCFAByte(dwarf::DW_CFA_advance_loc4);
-        EmitLabelDifference(ThisSym, BaseLabel, 4);
-        BaseLabel = ThisSym;
-      }
-    }
-    
-    // If advancing cfa.
-    if (Dst.isReg() && Dst.getReg() == MachineLocation::VirtualFP) {
-      assert(!Src.isReg() && "Machine move not supported yet.");
-      
-      if (Src.getReg() == MachineLocation::VirtualFP) {
-        EmitCFAByte(dwarf::DW_CFA_def_cfa_offset);
-      } else {
-        EmitCFAByte(dwarf::DW_CFA_def_cfa);
-        EmitULEB128(RI->getDwarfRegNum(Src.getReg(), isEH), "Register");
-      }
-      
-      EmitULEB128(-Src.getOffset(), "Offset");
-      continue;
-    }
-    
-    if (Src.isReg() && Src.getReg() == MachineLocation::VirtualFP) {
-      assert(Dst.isReg() && "Machine move not supported yet.");
-      EmitCFAByte(dwarf::DW_CFA_def_cfa_register);
-      EmitULEB128(RI->getDwarfRegNum(Dst.getReg(), isEH), "Register");
-      continue;
-    }
-    
-    unsigned Reg = RI->getDwarfRegNum(Src.getReg(), isEH);
-    int Offset = Dst.getOffset() / stackGrowth;
-    
-    if (Offset < 0) {
-      EmitCFAByte(dwarf::DW_CFA_offset_extended_sf);
-      EmitULEB128(Reg, "Reg");
-      EmitSLEB128(Offset, "Offset");
-    } else if (Reg < 64) {
-      EmitCFAByte(dwarf::DW_CFA_offset + Reg);
-      EmitULEB128(Offset, "Offset");
-    } else {
-      EmitCFAByte(dwarf::DW_CFA_offset_extended);
-      EmitULEB128(Reg, "Reg");
-      EmitULEB128(Offset, "Offset");
-    }
-  }
-}
-
-/// EmitFrameMoves - Emit a frame instruction.
+/// EmitCFIFrameMove - Emit a frame instruction.
 void AsmPrinter::EmitCFIFrameMove(const MachineMove &Move) const {
   const TargetRegisterInfo *RI = TM.getRegisterInfo();
 
@@ -286,13 +215,12 @@ void AsmPrinter::EmitCFIFrameMove(const MachineMove &Move) const {
 
   // If advancing cfa.
   if (Dst.isReg() && Dst.getReg() == MachineLocation::VirtualFP) {
-    assert(!Src.isReg() && "Machine move not supported yet.");
-
     if (Src.getReg() == MachineLocation::VirtualFP) {
       OutStreamer.EmitCFIDefCfaOffset(-Src.getOffset());
     } else {
-      assert("Machine move not supported yet");
       // Reg + Offset
+      OutStreamer.EmitCFIDefCfa(RI->getDwarfRegNum(Src.getReg(), true),
+                                Src.getOffset());
     }
   } else if (Src.isReg() && Src.getReg() == MachineLocation::VirtualFP) {
     assert(Dst.isReg() && "Machine move not supported yet.");
@@ -303,16 +231,3 @@ void AsmPrinter::EmitCFIFrameMove(const MachineMove &Move) const {
                               Dst.getOffset());
   }
 }
-
-/// EmitFrameMoves - Emit frame instructions to describe the layout of the
-/// frame.
-void AsmPrinter::EmitCFIFrameMoves(const std::vector<MachineMove> &Moves) const {
-  for (unsigned i = 0, N = Moves.size(); i < N; ++i) {
-    const MachineMove &Move = Moves[i];
-    MCSymbol *Label = Move.getLabel();
-    // Throw out move if the label is invalid.
-    if (Label && !Label->isDefined()) continue; // Not emitted, in dead code.
-
-    EmitCFIFrameMove(Move);
-  }
-}
diff --git a/lib/CodeGen/AsmPrinter/CMakeLists.txt b/lib/CodeGen/AsmPrinter/CMakeLists.txt
index d2be552..4da7876 100644
--- a/lib/CodeGen/AsmPrinter/CMakeLists.txt
+++ b/lib/CodeGen/AsmPrinter/CMakeLists.txt
@@ -8,7 +8,7 @@ add_llvm_library(LLVMAsmPrinter
   DwarfCompileUnit.cpp
   DwarfDebug.cpp
   DwarfException.cpp
-  DwarfTableException.cpp
   OcamlGCPrinter.cpp
+  Win64Exception.cpp
   )
 
diff --git a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
index dbd52c4..91b7d08 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
@@ -41,14 +41,17 @@ using namespace llvm;
 
 DwarfCFIException::DwarfCFIException(AsmPrinter *A)
   : DwarfException(A),
-    shouldEmitPersonality(false), shouldEmitLSDA(false), shouldEmitMoves(false)
-    {}
+    shouldEmitPersonality(false), shouldEmitLSDA(false), shouldEmitMoves(false),
+    moveTypeModule(AsmPrinter::CFI_M_None) {}
 
 DwarfCFIException::~DwarfCFIException() {}
 
 /// EndModule - Emit all exception information that should come after the
 /// content.
 void DwarfCFIException::EndModule() {
+  if (moveTypeModule == AsmPrinter::CFI_M_Debug)
+    Asm->OutStreamer.EmitCFISections(false, true);
+
   if (!Asm->MAI->isExceptionHandlingDwarf())
     return;
 
@@ -87,7 +90,13 @@ void DwarfCFIException::BeginFunction(const MachineFunction *MF) {
   bool hasLandingPads = !MMI->getLandingPads().empty();
 
   // See if we need frame move info.
-  shouldEmitMoves = Asm->needsCFIMoves();
+  AsmPrinter::CFIMoveType MoveType = Asm->needsCFIMoves();
+  if (MoveType == AsmPrinter::CFI_M_EH ||
+      (MoveType == AsmPrinter::CFI_M_Debug &&
+       moveTypeModule == AsmPrinter::CFI_M_None))
+    moveTypeModule = MoveType;
+
+  shouldEmitMoves = MoveType != AsmPrinter::CFI_M_None;
 
   const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
   unsigned PerEncoding = TLOF.getPersonalityEncoding();
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 7ce0cfe..bff1a35 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -440,11 +440,27 @@ void CompileUnit::addBlockByrefAddress(DbgVariable *&DV, DIE *Die,
 }
 
 /// addConstantValue - Add constant value entry in variable DIE.
-bool CompileUnit::addConstantValue(DIE *Die, const MachineOperand &MO) {
+bool CompileUnit::addConstantValue(DIE *Die, const MachineOperand &MO,
+                                   DIType Ty) {
   assert (MO.isImm() && "Invalid machine operand!");
   DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
-  unsigned Imm = MO.getImm();
-  addUInt(Block, 0, dwarf::DW_FORM_udata, Imm);
+  unsigned form = dwarf::DW_FORM_udata;
+  switch (Ty.getSizeInBits()) {
+    case 8: form = dwarf::DW_FORM_data1; break;
+    case 16: form = dwarf::DW_FORM_data2; break;
+    case 32: form = dwarf::DW_FORM_data4; break;
+    case 64: form = dwarf::DW_FORM_data8; break;
+    default: break;
+  }
+
+  DIBasicType BTy(Ty);
+  if (BTy.Verify() &&
+      (BTy.getEncoding()  == dwarf::DW_ATE_signed 
+       || BTy.getEncoding() == dwarf::DW_ATE_signed_char))
+    addSInt(Block, 0, form, MO.getImm());
+  else
+    addUInt(Block, 0, form, MO.getImm());
+
   addBlock(Die, dwarf::DW_AT_const_value, 0, Block);
   return true;
 }
@@ -477,13 +493,21 @@ bool CompileUnit::addConstantFPValue(DIE *Die, const MachineOperand &MO) {
 /// addConstantValue - Add constant value entry in variable DIE.
 bool CompileUnit::addConstantValue(DIE *Die, ConstantInt *CI,
                                    bool Unsigned) {
-  if (CI->getBitWidth() <= 64) {
+  unsigned CIBitWidth = CI->getBitWidth();
+  if (CIBitWidth <= 64) {
+    unsigned form = 0;
+    switch (CIBitWidth) {
+    case 8: form = dwarf::DW_FORM_data1; break;
+    case 16: form = dwarf::DW_FORM_data2; break;
+    case 32: form = dwarf::DW_FORM_data4; break;
+    case 64: form = dwarf::DW_FORM_data8; break;
+    default: 
+      form = Unsigned ? dwarf::DW_FORM_udata : dwarf::DW_FORM_sdata;
+    }
     if (Unsigned)
-      addUInt(Die, dwarf::DW_AT_const_value, dwarf::DW_FORM_udata,
-              CI->getZExtValue());
+      addUInt(Die, dwarf::DW_AT_const_value, form, CI->getZExtValue());
     else
-      addSInt(Die, dwarf::DW_AT_const_value, dwarf::DW_FORM_sdata,
-              CI->getSExtValue());
+      addSInt(Die, dwarf::DW_AT_const_value, form, CI->getSExtValue());
     return true;
   }
 
@@ -581,8 +605,37 @@ void CompileUnit::addType(DIE *Entity, DIType Ty) {
   // Set up proxy.
   Entry = createDIEEntry(Buffer);
   insertDIEEntry(Ty, Entry);
-
   Entity->addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, Entry);
+
+  // If this is a complete composite type then include it in the
+  // list of global types.
+  addGlobalType(Ty);
+}
+
+/// addGlobalType - Add a new global type to the compile unit.
+///
+void CompileUnit::addGlobalType(DIType Ty) {
+  DIDescriptor Context = Ty.getContext();
+  if (Ty.isCompositeType() && !Ty.getName().empty() && !Ty.isForwardDecl() 
+      && (Context.isCompileUnit() || Context.isFile() || Context.isNameSpace()))
+    if (DIEEntry *Entry = getDIEEntry(Ty))
+      GlobalTypes[Ty.getName()] = Entry->getEntry();
+}
+
+/// addPubTypes - Add type for pubtypes section.
+void CompileUnit::addPubTypes(DISubprogram SP) {
+  DICompositeType SPTy = SP.getType();
+  unsigned SPTag = SPTy.getTag();
+  if (SPTag != dwarf::DW_TAG_subroutine_type)
+    return;
+
+  DIArray Args = SPTy.getTypeArray();
+  for (unsigned i = 0, e = Args.getNumElements(); i != e; ++i) {
+    DIType ATy(Args.getElement(i));
+    if (!ATy.Verify())
+      continue;
+    addGlobalType(ATy);
+  }
 }
 
 /// constructTypeDIE - Construct basic type die from DIBasicType.
@@ -745,6 +798,10 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
       addToContextOwner(&Buffer, Context);
     }
 
+    if (CTy.isObjcClassComplete())
+      addUInt(&Buffer, dwarf::DW_AT_APPLE_objc_complete_type,
+              dwarf::DW_FORM_flag, 1);
+
     if (Tag == dwarf::DW_TAG_class_type) 
       addTemplateParams(Buffer, CTy.getTemplateParams());
 
@@ -809,6 +866,20 @@ CompileUnit::getOrCreateTemplateValueParameterDIE(DITemplateValueParameter TPV)
   return ParamDIE;
 }
 
+/// getOrCreateNameSpace - Create a DIE for DINameSpace.
+DIE *CompileUnit::getOrCreateNameSpace(DINameSpace NS) {
+  DIE *NDie = getDIE(NS);
+  if (NDie)
+    return NDie;
+  NDie = new DIE(dwarf::DW_TAG_namespace);
+  insertDIE(NS, NDie);
+  if (!NS.getName().empty())
+    addString(NDie, dwarf::DW_AT_name, dwarf::DW_FORM_string, NS.getName());
+  addSourceLine(NDie, NS);
+  addToContextOwner(NDie, NS.getContext());
+  return NDie;
+}
+
 /// constructSubrangeDIE - Construct subrange DIE from DISubrange.
 void CompileUnit::constructSubrangeDIE(DIE &Buffer, DISubrange SR, DIE *IndexTy){
   DIE *DW_Subrange = new DIE(dwarf::DW_TAG_subrange_type);
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
index f4f6fb8..60a9b28 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -87,9 +87,7 @@ public:
 
   /// addGlobalType - Add a new global type to the compile unit.
   ///
-  void addGlobalType(StringRef Name, DIE *Die) {
-    GlobalTypes[Name] = Die;
-  }
+  void addGlobalType(DIType Ty);
 
   /// getDIE - Returns the debug information entry map slot for the
   /// specified debug variable.
@@ -104,7 +102,7 @@ public:
     MDNodeToDieMap.insert(std::make_pair(N, D));
   }
 
-  /// getDIEEntry - Returns the debug information entry for the speciefied
+  /// getDIEEntry - Returns the debug information entry for the specified
   /// debug variable.
   DIEEntry *getDIEEntry(const MDNode *N) {
     DenseMap<const MDNode *, DIEEntry *>::iterator I =
@@ -182,7 +180,7 @@ public:
                   const MachineLocation &Location);
 
   /// addConstantValue - Add constant value entry in variable DIE.
-  bool addConstantValue(DIE *Die, const MachineOperand &MO);
+  bool addConstantValue(DIE *Die, const MachineOperand &MO, DIType Ty);
   bool addConstantValue(DIE *Die, ConstantInt *CI, bool Unsigned);
 
   /// addConstantFPValue - Add constant value entry in variable DIE.
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 26da800..8845bfa 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -56,10 +56,6 @@ static cl::opt<bool> UnknownLocations("use-unknown-locations", cl::Hidden,
      cl::desc("Make an absence of debug location information explicit."),
      cl::init(false));
 
-#ifndef NDEBUG
-STATISTIC(BlocksWithoutLineNo, "Number of blocks without any line number");
-#endif
-
 namespace {
   const char *DWARFGroupName = "DWARF Emission";
   const char *DbgTimerName = "DWARF Debug Writer";
@@ -260,7 +256,7 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M)
     CurrentFnDbgScope(0), PrevLabel(NULL) {
   NextStringPoolNumber = 0;
 
-  DwarfFrameSectionSym = DwarfInfoSectionSym = DwarfAbbrevSectionSym = 0;
+  DwarfInfoSectionSym = DwarfAbbrevSectionSym = 0;
   DwarfStrSectionSym = TextSectionSym = 0;
   DwarfDebugRangeSectionSym = DwarfDebugLocSectionSym = 0;
   FunctionBeginSym = FunctionEndSym = 0;
@@ -332,6 +328,11 @@ DIE *DwarfDebug::createSubprogramDIE(DISubprogram SP) {
   // Add function template parameters.
   SPCU->addTemplateParams(*SPDie, SP.getTemplateParams());
 
+  StringRef LinkageName = SP.getLinkageName();
+  if (!LinkageName.empty())
+    SPCU->addString(SPDie, dwarf::DW_AT_MIPS_linkage_name, dwarf::DW_FORM_string,
+                    getRealLinkageName(LinkageName));
+
   // If this DIE is going to refer declaration info using AT_specification
   // then there is no need to add other attributes.
   if (SP.getFunctionDeclaration().isSubprogram())
@@ -342,11 +343,6 @@ DIE *DwarfDebug::createSubprogramDIE(DISubprogram SP) {
     SPCU->addString(SPDie, dwarf::DW_AT_name, dwarf::DW_FORM_string, 
                     SP.getName());
 
-  StringRef LinkageName = SP.getLinkageName();
-  if (!LinkageName.empty())
-    SPCU->addString(SPDie, dwarf::DW_AT_MIPS_linkage_name, dwarf::DW_FORM_string,
-                    getRealLinkageName(LinkageName));
-
   SPCU->addSourceLine(SPDie, SP);
 
   if (SP.isPrototyped()) 
@@ -588,12 +584,14 @@ DIE *DwarfDebug::constructInlinedScopeDIE(DbgScope *Scope) {
   if (!Scope->getScopeNode())
     return NULL;
   DIScope DS(Scope->getScopeNode());
-  DIE *ScopeDIE = new DIE(dwarf::DW_TAG_inlined_subroutine);
-
   DISubprogram InlinedSP = getDISubprogram(DS);
   CompileUnit *TheCU = getCompileUnit(InlinedSP);
   DIE *OriginDIE = TheCU->getDIE(InlinedSP);
-  assert(OriginDIE && "Unable to find Origin DIE!");
+  if (!OriginDIE) {
+    DEBUG(dbgs() << "Unable to find original DIE for inlined subprogram.");
+    return NULL;
+  }
+  DIE *ScopeDIE = new DIE(dwarf::DW_TAG_inlined_subroutine);
   TheCU->addDIEEntry(ScopeDIE, dwarf::DW_AT_abstract_origin,
                      dwarf::DW_FORM_ref4, OriginDIE);
 
@@ -714,8 +712,9 @@ DIE *DwarfDebug::constructVariableDIE(DbgVariable *DV, DbgScope *Scope) {
         updated = true;
       }
       else if (DVInsn->getOperand(0).isImm())
-        updated = VariableCU->addConstantValue(VariableDie, 
-                                               DVInsn->getOperand(0));
+        updated = 
+          VariableCU->addConstantValue(VariableDie, DVInsn->getOperand(0),
+                                       DV->getType());
       else if (DVInsn->getOperand(0).isFPImm())
         updated =
           VariableCU->addConstantFPValue(VariableDie, DVInsn->getOperand(0));
@@ -750,26 +749,6 @@ DIE *DwarfDebug::constructVariableDIE(DbgVariable *DV, DbgScope *Scope) {
 
 }
 
-void CompileUnit::addPubTypes(DISubprogram SP) {
-  DICompositeType SPTy = SP.getType();
-  unsigned SPTag = SPTy.getTag();
-  if (SPTag != dwarf::DW_TAG_subroutine_type)
-    return;
-
-  DIArray Args = SPTy.getTypeArray();
-  for (unsigned i = 0, e = Args.getNumElements(); i != e; ++i) {
-    DIType ATy(Args.getElement(i));
-    if (!ATy.Verify())
-      continue;
-    DICompositeType CATy = getDICompositeType(ATy);
-    if (DIDescriptor(CATy).Verify() && !CATy.getName().empty()
-        && !CATy.isForwardDecl()) {
-      if (DIEEntry *Entry = getDIEEntry(CATy))
-        addGlobalType(CATy.getName(), Entry->getEntry());
-    }
-  }
-}
-
 /// constructScopeDIE - Construct a DIE for this scope.
 DIE *DwarfDebug::constructScopeDIE(DbgScope *Scope) {
   if (!Scope || !Scope->getScopeNode())
@@ -840,11 +819,9 @@ unsigned DwarfDebug::GetOrCreateSourceID(StringRef FileName,
     return GetOrCreateSourceID("<stdin>", StringRef());
 
   // MCStream expects full path name as filename.
-  if (!DirName.empty() && !FileName.startswith("/")) {
-    std::string FullPathName(DirName.data());
-    if (!DirName.endswith("/"))
-      FullPathName += "/";
-    FullPathName += FileName.data();
+  if (!DirName.empty() && !sys::path::is_absolute(FileName)) {
+    SmallString<128> FullPathName = DirName;
+    sys::path::append(FullPathName, FileName);
     // Here FullPathName will be copied into StringMap by GetOrCreateSourceID.
     return GetOrCreateSourceID(StringRef(FullPathName), StringRef());
   }
@@ -862,20 +839,6 @@ unsigned DwarfDebug::GetOrCreateSourceID(StringRef FileName,
   return SrcId;
 }
 
-/// getOrCreateNameSpace - Create a DIE for DINameSpace.
-DIE *CompileUnit::getOrCreateNameSpace(DINameSpace NS) {
-  DIE *NDie = getDIE(NS);
-  if (NDie)
-    return NDie;
-  NDie = new DIE(dwarf::DW_TAG_namespace);
-  insertDIE(NS, NDie);
-  if (!NS.getName().empty())
-    addString(NDie, dwarf::DW_AT_name, dwarf::DW_FORM_string, NS.getName());
-  addSourceLine(NDie, NS);
-  addToContextOwner(NDie, NS.getContext());
-  return NDie;
-}
-
 /// constructCompileUnit - Create new CompileUnit for the given
 /// metadata node with tag DW_TAG_compile_unit.
 void DwarfDebug::constructCompileUnit(const MDNode *N) {
@@ -896,8 +859,8 @@ void DwarfDebug::constructCompileUnit(const MDNode *N) {
   NewCU->addUInt(Die, dwarf::DW_AT_entry_pc, dwarf::DW_FORM_addr, 0);
   // DW_AT_stmt_list is a offset of line number information for this
   // compile unit in debug_line section.
-  if (Asm->MAI->doesDwarfUsesAbsoluteLabelForStmtList())
-    NewCU->addLabel(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_addr,
+  if(Asm->MAI->doesDwarfRequireRelocationForSectionOffset())
+    NewCU->addLabel(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4,
                     Asm->GetTempSymbol("section_line"));
   else
     NewCU->addUInt(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4, 0);
@@ -1020,12 +983,7 @@ void DwarfDebug::constructGlobalVariableDIE(const MDNode *N) {
                      getRealLinkageName(LinkageName));
   // Add type.
   TheCU->addType(VariableDIE, GTy);
-  if (GTy.isCompositeType() && !GTy.getName().empty()
-      && !GTy.isForwardDecl()) {
-    DIEEntry *Entry = TheCU->getDIEEntry(GTy);
-    assert(Entry && "Missing global type!");
-    TheCU->addGlobalType(GTy.getName(), Entry->getEntry());
-  }
+
   // Add scoping info.
   if (!GV.isLocalToUnit()) {
     TheCU->addUInt(VariableDIE, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1);
@@ -1113,44 +1071,68 @@ void DwarfDebug::beginModule(Module *M) {
   if (DisableDebugInfoPrinting)
     return;
 
-  DebugInfoFinder DbgFinder;
-  DbgFinder.processModule(*M);
+  // If module has named metadata anchors then use them, otherwise scan the module
+  // using debug info finder to collect debug info.
+  NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
+  if (CU_Nodes) {
 
-  bool HasDebugInfo = false;
+    NamedMDNode *GV_Nodes = M->getNamedMetadata("llvm.dbg.gv");
+    NamedMDNode *SP_Nodes = M->getNamedMetadata("llvm.dbg.sp");
+    if (!GV_Nodes && !SP_Nodes)
+      // If there are not any global variables or any functions then
+      // there is not any debug info in this module.
+      return;
 
-  // Scan all the compile-units to see if there are any marked as the main unit.
-  // if not, we do not generate debug info.
-  for (DebugInfoFinder::iterator I = DbgFinder.compile_unit_begin(),
-       E = DbgFinder.compile_unit_end(); I != E; ++I) {
-    if (DICompileUnit(*I).isMain()) {
-      HasDebugInfo = true;
-      break;
-    }
-  }
+    for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i)
+      constructCompileUnit(CU_Nodes->getOperand(i));
 
-  if (!HasDebugInfo) return;
+    if (GV_Nodes)
+      for (unsigned i = 0, e = GV_Nodes->getNumOperands(); i != e; ++i)
+        constructGlobalVariableDIE(GV_Nodes->getOperand(i));
+
+    if (SP_Nodes)
+      for (unsigned i = 0, e = SP_Nodes->getNumOperands(); i != e; ++i)
+        constructSubprogramDIE(SP_Nodes->getOperand(i));
+    
+  } else {
 
+    DebugInfoFinder DbgFinder;
+    DbgFinder.processModule(*M);
+    
+    bool HasDebugInfo = false;
+    // Scan all the compile-units to see if there are any marked as the main unit.
+    // if not, we do not generate debug info.
+    for (DebugInfoFinder::iterator I = DbgFinder.compile_unit_begin(),
+           E = DbgFinder.compile_unit_end(); I != E; ++I) {
+      if (DICompileUnit(*I).isMain()) {
+        HasDebugInfo = true;
+        break;
+      }
+    }
+    if (!HasDebugInfo) return;
+    
+    // Create all the compile unit DIEs.
+    for (DebugInfoFinder::iterator I = DbgFinder.compile_unit_begin(),
+           E = DbgFinder.compile_unit_end(); I != E; ++I)
+      constructCompileUnit(*I);
+    
+    // Create DIEs for each global variable.
+    for (DebugInfoFinder::iterator I = DbgFinder.global_variable_begin(),
+           E = DbgFinder.global_variable_end(); I != E; ++I)
+      constructGlobalVariableDIE(*I);
+    
+    // Create DIEs for each subprogram.
+    for (DebugInfoFinder::iterator I = DbgFinder.subprogram_begin(),
+           E = DbgFinder.subprogram_end(); I != E; ++I)
+      constructSubprogramDIE(*I);
+  }
+  
   // Tell MMI that we have debug info.
   MMI->setDebugInfoAvailability(true);
-
+  
   // Emit initial sections.
   EmitSectionLabels();
 
-  // Create all the compile unit DIEs.
-  for (DebugInfoFinder::iterator I = DbgFinder.compile_unit_begin(),
-         E = DbgFinder.compile_unit_end(); I != E; ++I)
-    constructCompileUnit(*I);
-
-  // Create DIEs for each subprogram.
-  for (DebugInfoFinder::iterator I = DbgFinder.subprogram_begin(),
-         E = DbgFinder.subprogram_end(); I != E; ++I)
-    constructSubprogramDIE(*I);
-
-  // Create DIEs for each global variable.
-  for (DebugInfoFinder::iterator I = DbgFinder.global_variable_begin(),
-         E = DbgFinder.global_variable_end(); I != E; ++I)
-    constructGlobalVariableDIE(*I);
-
   //getOrCreateTypeDIE
   if (NamedMDNode *NMD = M->getNamedMetadata("llvm.dbg.enum"))
     for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
@@ -1239,14 +1221,6 @@ void DwarfDebug::endModule() {
     Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("section_end", i));
   }
 
-  // Emit common frame information.
-  emitCommonDebugFrame();
-
-  // Emit function debug frame information
-  for (std::vector<FunctionDebugFrameInfo>::iterator I = DebugFrames.begin(),
-         E = DebugFrames.end(); I != E; ++I)
-    emitFunctionDebugFrame(*I);
-
   // Compute DIE offsets and sizes.
   computeSizeAndOffsets();
 
@@ -1434,15 +1408,10 @@ DwarfDebug::collectVariableInfo(const MachineFunction *MF,
            HI = History.begin(), HE = History.end(); HI != HE; ++HI) {
       const MachineInstr *Begin = *HI;
       assert(Begin->isDebugValue() && "Invalid History entry");
-      MachineLocation MLoc;
-      if (Begin->getNumOperands() == 3) {
-        if (Begin->getOperand(0).isReg() && Begin->getOperand(1).isImm())
-          MLoc.set(Begin->getOperand(0).getReg(), Begin->getOperand(1).getImm());
-      } else
-        MLoc = Asm->getDebugValueLocation(Begin);
 
-      // FIXME: emitDebugLoc only understands registers.
-      if (!MLoc.getReg())
+      // Check if DBG_VALUE is truncating a range.
+      if (Begin->getNumOperands() > 1 && Begin->getOperand(0).isReg()
+          && !Begin->getOperand(0).getReg())
         continue;
 
       // Compute the range for a register location.
@@ -1466,7 +1435,25 @@ DwarfDebug::collectVariableInfo(const MachineFunction *MF,
       }
 
       // The value is valid until the next DBG_VALUE or clobber.
-      DotDebugLocEntries.push_back(DotDebugLocEntry(FLabel, SLabel, MLoc, Var));
+      MachineLocation MLoc;
+      if (Begin->getNumOperands() == 3) {
+        if (Begin->getOperand(0).isReg() && Begin->getOperand(1).isImm()) {
+          MLoc.set(Begin->getOperand(0).getReg(), 
+                   Begin->getOperand(1).getImm());
+          DotDebugLocEntries.
+            push_back(DotDebugLocEntry(FLabel, SLabel, MLoc, Var));
+        }
+        // FIXME: Handle isFPImm also.
+        else if (Begin->getOperand(0).isImm()) {
+          DotDebugLocEntries.
+            push_back(DotDebugLocEntry(FLabel, SLabel, 
+                                       Begin->getOperand(0).getImm()));
+        }
+      } else {
+        MLoc = Asm->getDebugValueLocation(Begin);
+        DotDebugLocEntries.
+          push_back(DotDebugLocEntry(FLabel, SLabel, MLoc, Var));
+      }
     }
     DotDebugLocEntries.push_back(DotDebugLocEntry());
   }
@@ -1503,12 +1490,17 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) {
   if (!MI->isDebugValue()) {
     DebugLoc DL = MI->getDebugLoc();
     if (DL != PrevInstLoc && (!DL.isUnknown() || UnknownLocations)) {
+      unsigned Flags = DWARF2_FLAG_IS_STMT;
       PrevInstLoc = DL;
+      if (DL == PrologEndLoc) {
+        Flags |= DWARF2_FLAG_PROLOGUE_END;
+        PrologEndLoc = DebugLoc();
+      }
       if (!DL.isUnknown()) {
         const MDNode *Scope = DL.getScope(Asm->MF->getFunction()->getContext());
-        recordSourceLine(DL.getLine(), DL.getCol(), Scope);
+        recordSourceLine(DL.getLine(), DL.getCol(), Scope, Flags);
       } else
-        recordSourceLine(0, 0, 0);
+        recordSourceLine(0, 0, 0, 0);
     }
   }
 
@@ -1820,41 +1812,22 @@ void DwarfDebug::identifyScopeMarkers() {
   }
 }
 
-/// FindFirstDebugLoc - Find the first debug location in the function. This
-/// is intended to be an approximation for the source position of the
-/// beginning of the function.
-static DebugLoc FindFirstDebugLoc(const MachineFunction *MF) {
-  for (MachineFunction::const_iterator I = MF->begin(), E = MF->end();
-       I != E; ++I)
-    for (MachineBasicBlock::const_iterator MBBI = I->begin(), MBBE = I->end();
-         MBBI != MBBE; ++MBBI) {
-      DebugLoc DL = MBBI->getDebugLoc();
-      if (!DL.isUnknown())
-        return DL;
-    }
-  return DebugLoc();
+/// getScopeNode - Get MDNode for DebugLoc's scope.
+static MDNode *getScopeNode(DebugLoc DL, const LLVMContext &Ctx) {
+  if (MDNode *InlinedAt = DL.getInlinedAt(Ctx))
+    return getScopeNode(DebugLoc::getFromDILocation(InlinedAt), Ctx);
+  return DL.getScope(Ctx);
 }
 
-#ifndef NDEBUG
-/// CheckLineNumbers - Count basicblocks whose instructions do not have any
-/// line number information.
-static void CheckLineNumbers(const MachineFunction *MF) {
-  for (MachineFunction::const_iterator I = MF->begin(), E = MF->end();
-       I != E; ++I) {
-    bool FoundLineNo = false;
-    for (MachineBasicBlock::const_iterator II = I->begin(), IE = I->end();
-         II != IE; ++II) {
-      const MachineInstr *MI = II;
-      if (!MI->getDebugLoc().isUnknown()) {
-        FoundLineNo = true;
-        break;
-      }
-    }
-    if (!FoundLineNo && I->size())
-      ++BlocksWithoutLineNo;      
-  }
+/// getFnDebugLoc - Walk up the scope chain of given debug loc and find
+/// line number  info for the function.
+static DebugLoc getFnDebugLoc(DebugLoc DL, const LLVMContext &Ctx) {
+  const MDNode *Scope = getScopeNode(DL, Ctx);
+  DISubprogram SP = getDISubprogram(Scope);
+  if (SP.Verify()) 
+    return DebugLoc::get(SP.getLineNumber(), 0, SP);
+  return DebugLoc();
 }
-#endif
 
 /// beginFunction - Gather pre-function debug information.  Assumes being
 /// emitted immediately after the function entry point.
@@ -1862,44 +1835,16 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
   if (!MMI->hasDebugInfo()) return;
   if (!extractScopeInformation()) return;
 
-#ifndef NDEBUG
-  CheckLineNumbers(MF);
-#endif
-
   FunctionBeginSym = Asm->GetTempSymbol("func_begin",
                                         Asm->getFunctionNumber());
   // Assumes in correct section after the entry point.
   Asm->OutStreamer.EmitLabel(FunctionBeginSym);
 
-  // Emit label for the implicitly defined dbg.stoppoint at the start of the
-  // function.
-  DebugLoc FDL = FindFirstDebugLoc(MF);
-  if (FDL.isUnknown()) return;
-
-  const MDNode *Scope = FDL.getScope(MF->getFunction()->getContext());
-  const MDNode *TheScope = 0;
-
-  DISubprogram SP = getDISubprogram(Scope);
-  unsigned Line, Col;
-  if (SP.Verify()) {
-    Line = SP.getLineNumber();
-    Col = 0;
-    TheScope = SP;
-  } else {
-    Line = FDL.getLine();
-    Col = FDL.getCol();
-    TheScope = Scope;
-  }
-
-  recordSourceLine(Line, Col, TheScope);
-
   assert(UserVariables.empty() && DbgValues.empty() && "Maps weren't cleaned");
 
   /// ProcessedArgs - Collection of arguments already processed.
   SmallPtrSet<const MDNode *, 8> ProcessedArgs;
-
   const TargetRegisterInfo *TRI = Asm->TM.getRegisterInfo();
-
   /// LiveUserVar - Map physreg numbers to the MDNode they contain.
   std::vector<const MDNode*> LiveUserVar(TRI->getNumRegs());
 
@@ -1965,6 +1910,11 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
         if (!MI->isLabel())
           AtBlockEntry = false;
 
+        // First known non DBG_VALUE location marks beginning of function
+        // body.
+        if (PrologEndLoc.isUnknown() && !MI->getDebugLoc().isUnknown())
+          PrologEndLoc = MI->getDebugLoc();
+
         // Check if the instruction clobbers any registers with debug vars.
         for (MachineInstr::const_mop_iterator MOI = MI->operands_begin(),
                MOE = MI->operands_end(); MOI != MOE; ++MOI) {
@@ -2033,6 +1983,15 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
 
   PrevInstLoc = DebugLoc();
   PrevLabel = FunctionBeginSym;
+
+  // Record beginning of function.
+  if (!PrologEndLoc.isUnknown()) {
+    DebugLoc FnStartDL = getFnDebugLoc(PrologEndLoc,
+                                       MF->getFunction()->getContext());
+    recordSourceLine(FnStartDL.getLine(), FnStartDL.getCol(),
+                     FnStartDL.getScope(MF->getFunction()->getContext()),
+                     DWARF2_FLAG_IS_STMT);
+  }
 }
 
 /// endFunction - Gather and emit post-function debug information.
@@ -2147,7 +2106,8 @@ DbgScope *DwarfDebug::findDbgScope(const MachineInstr *MInsn) {
 /// recordSourceLine - Register a source line with debug info. Returns the
 /// unique label that was emitted and which provides correspondence to
 /// the source line list.
-void DwarfDebug::recordSourceLine(unsigned Line, unsigned Col, const MDNode *S){
+void DwarfDebug::recordSourceLine(unsigned Line, unsigned Col, const MDNode *S,
+                                  unsigned Flags) {
   StringRef Fn;
   StringRef Dir;
   unsigned Src = 1;
@@ -2175,7 +2135,7 @@ void DwarfDebug::recordSourceLine(unsigned Line, unsigned Col, const MDNode *S){
 
     Src = GetOrCreateSourceID(Fn, Dir);
   }
-  Asm->OutStreamer.EmitDwarfLocDirective(Src, Line, Col, DWARF2_FLAG_IS_STMT,
+  Asm->OutStreamer.EmitDwarfLocDirective(Src, Line, Col, Flags,
                                          0, 0, Fn);
 }
 
@@ -2264,11 +2224,6 @@ void DwarfDebug::EmitSectionLabels() {
   const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
 
   // Dwarf sections base addresses.
-  if (Asm->MAI->doesDwarfRequireFrameSection()) {
-    DwarfFrameSectionSym =
-      EmitSectionSym(Asm, TLOF.getDwarfFrameSection(), "section_debug_frame");
-   }
-
   DwarfInfoSectionSym =
     EmitSectionSym(Asm, TLOF.getDwarfInfoSection(), "section_info");
   DwarfAbbrevSectionSym =
@@ -2476,91 +2431,6 @@ void DwarfDebug::emitEndOfLineMatrix(unsigned SectionEnd) {
   Asm->EmitInt8(1);
 }
 
-/// emitCommonDebugFrame - Emit common frame info into a debug frame section.
-///
-void DwarfDebug::emitCommonDebugFrame() {
-  if (!Asm->MAI->doesDwarfRequireFrameSection())
-    return;
-
-  int stackGrowth = Asm->getTargetData().getPointerSize();
-  if (Asm->TM.getFrameLowering()->getStackGrowthDirection() ==
-      TargetFrameLowering::StackGrowsDown)
-    stackGrowth *= -1;
-
-  // Start the dwarf frame section.
-  Asm->OutStreamer.SwitchSection(
-                              Asm->getObjFileLowering().getDwarfFrameSection());
-
-  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("debug_frame_common"));
-  Asm->OutStreamer.AddComment("Length of Common Information Entry");
-  Asm->EmitLabelDifference(Asm->GetTempSymbol("debug_frame_common_end"),
-                           Asm->GetTempSymbol("debug_frame_common_begin"), 4);
-
-  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("debug_frame_common_begin"));
-  Asm->OutStreamer.AddComment("CIE Identifier Tag");
-  Asm->EmitInt32((int)dwarf::DW_CIE_ID);
-  Asm->OutStreamer.AddComment("CIE Version");
-  Asm->EmitInt8(dwarf::DW_CIE_VERSION);
-  Asm->OutStreamer.AddComment("CIE Augmentation");
-  Asm->OutStreamer.EmitIntValue(0, 1, /*addrspace*/0); // nul terminator.
-  Asm->EmitULEB128(1, "CIE Code Alignment Factor");
-  Asm->EmitSLEB128(stackGrowth, "CIE Data Alignment Factor");
-  Asm->OutStreamer.AddComment("CIE RA Column");
-  const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo();
-  const TargetFrameLowering *TFI = Asm->TM.getFrameLowering();
-  Asm->EmitInt8(RI->getDwarfRegNum(RI->getRARegister(), false));
-
-  std::vector<MachineMove> Moves;
-  TFI->getInitialFrameState(Moves);
-
-  Asm->EmitFrameMoves(Moves, 0, false);
-
-  Asm->EmitAlignment(2);
-  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("debug_frame_common_end"));
-}
-
-/// emitFunctionDebugFrame - Emit per function frame info into a debug frame
-/// section.
-void DwarfDebug::
-emitFunctionDebugFrame(const FunctionDebugFrameInfo &DebugFrameInfo) {
-  if (!Asm->MAI->doesDwarfRequireFrameSection())
-    return;
-
-  // Start the dwarf frame section.
-  Asm->OutStreamer.SwitchSection(
-                              Asm->getObjFileLowering().getDwarfFrameSection());
-
-  Asm->OutStreamer.AddComment("Length of Frame Information Entry");
-  MCSymbol *DebugFrameBegin =
-    Asm->GetTempSymbol("debug_frame_begin", DebugFrameInfo.Number);
-  MCSymbol *DebugFrameEnd =
-    Asm->GetTempSymbol("debug_frame_end", DebugFrameInfo.Number);
-  Asm->EmitLabelDifference(DebugFrameEnd, DebugFrameBegin, 4);
-
-  Asm->OutStreamer.EmitLabel(DebugFrameBegin);
-
-  Asm->OutStreamer.AddComment("FDE CIE offset");
-  Asm->EmitSectionOffset(Asm->GetTempSymbol("debug_frame_common"),
-                         DwarfFrameSectionSym);
-
-  Asm->OutStreamer.AddComment("FDE initial location");
-  MCSymbol *FuncBeginSym =
-    Asm->GetTempSymbol("func_begin", DebugFrameInfo.Number);
-  Asm->OutStreamer.EmitSymbolValue(FuncBeginSym,
-                                   Asm->getTargetData().getPointerSize(),
-                                   0/*AddrSpace*/);
-
-
-  Asm->OutStreamer.AddComment("FDE address range");
-  Asm->EmitLabelDifference(Asm->GetTempSymbol("func_end",DebugFrameInfo.Number),
-                           FuncBeginSym, Asm->getTargetData().getPointerSize());
-
-  Asm->EmitFrameMoves(DebugFrameInfo.Moves, FuncBeginSym, false);
-
-  Asm->EmitAlignment(2);
-  Asm->OutStreamer.EmitLabel(DebugFrameEnd);
-}
-
 /// emitDebugPubNames - Emit visible names into a debug pubnames section.
 ///
 void DwarfDebug::emitDebugPubNames() {
@@ -2722,20 +2592,44 @@ void DwarfDebug::emitDebugLoc() {
       Asm->OutStreamer.EmitSymbolValue(Entry.Begin, Size, 0);
       Asm->OutStreamer.EmitSymbolValue(Entry.End, Size, 0);
       DIVariable DV(Entry.Variable);
-      if (DV.hasComplexAddress()) {
+      Asm->OutStreamer.AddComment("Loc expr size");
+      MCSymbol *begin = Asm->OutStreamer.getContext().CreateTempSymbol();
+      MCSymbol *end = Asm->OutStreamer.getContext().CreateTempSymbol();
+      Asm->EmitLabelDifference(end, begin, 2);
+      Asm->OutStreamer.EmitLabel(begin);
+      if (Entry.isConstant()) {
+        DIBasicType BTy(DV.getType());
+        if (BTy.Verify() &&
+            (BTy.getEncoding()  == dwarf::DW_ATE_signed 
+             || BTy.getEncoding() == dwarf::DW_ATE_signed_char)) {
+          Asm->OutStreamer.AddComment("DW_OP_consts");
+          Asm->EmitInt8(dwarf::DW_OP_consts);
+          Asm->EmitSLEB128(Entry.getConstant());
+        } else {
+          Asm->OutStreamer.AddComment("DW_OP_constu");
+          Asm->EmitInt8(dwarf::DW_OP_constu);
+          Asm->EmitULEB128(Entry.getConstant());
+        }
+      } else if (DV.hasComplexAddress()) {
         unsigned N = DV.getNumAddrElements();
         unsigned i = 0;
-        Asm->OutStreamer.AddComment("Loc expr size");
         if (N >= 2 && DV.getAddrElement(0) == DIBuilder::OpPlus) {
-          // If first address element is OpPlus then emit
-          // DW_OP_breg + Offset instead of DW_OP_reg + Offset.
-          MachineLocation Loc(Entry.Loc.getReg(), DV.getAddrElement(1));
-          Asm->EmitInt16(Asm->getDwarfRegOpSize(Loc) + N - 2);
-          Asm->EmitDwarfRegOp(Loc);
-//          Asm->EmitULEB128(DV.getAddrElement(1));
-          i = 2;
+          if (Entry.Loc.getOffset()) {
+            i = 2;
+            Asm->EmitDwarfRegOp(Entry.Loc);
+            Asm->OutStreamer.AddComment("DW_OP_deref");
+            Asm->EmitInt8(dwarf::DW_OP_deref);
+            Asm->OutStreamer.AddComment("DW_OP_plus_uconst");
+            Asm->EmitInt8(dwarf::DW_OP_plus_uconst);
+            Asm->EmitSLEB128(DV.getAddrElement(1));
+          } else {
+            // If first address element is OpPlus then emit
+            // DW_OP_breg + Offset instead of DW_OP_reg + Offset.
+            MachineLocation Loc(Entry.Loc.getReg(), DV.getAddrElement(1));
+            Asm->EmitDwarfRegOp(Loc);
+            i = 2;
+          }
         } else {
-          Asm->EmitInt16(Asm->getDwarfRegOpSize(Entry.Loc) + N);
           Asm->EmitDwarfRegOp(Entry.Loc);
         }
 
@@ -2750,10 +2644,10 @@ void DwarfDebug::emitDebugLoc() {
           else llvm_unreachable("unknown Opcode found in complex address");
         }
       } else {
-        Asm->OutStreamer.AddComment("Loc expr size");
-        Asm->EmitInt16(Asm->getDwarfRegOpSize(Entry.Loc));
+        // Regular entry.
         Asm->EmitDwarfRegOp(Entry.Loc);
       }
+      Asm->OutStreamer.EmitLabel(end);
     }
   }
 }
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 25f2675..abda2e6 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -68,10 +68,19 @@ typedef struct DotDebugLocEntry {
   MachineLocation Loc;
   const MDNode *Variable;
   bool Merged;
-  DotDebugLocEntry() : Begin(0), End(0), Variable(0), Merged(false) {}
+  bool Constant;
+  int64_t iConstant;
+  DotDebugLocEntry() 
+    : Begin(0), End(0), Variable(0), Merged(false), 
+      Constant(false), iConstant(0) {}
   DotDebugLocEntry(const MCSymbol *B, const MCSymbol *E, MachineLocation &L,
                    const MDNode *V) 
-    : Begin(B), End(E), Loc(L), Variable(V), Merged(false) {}
+    : Begin(B), End(E), Loc(L), Variable(V), Merged(false), 
+      Constant(false), iConstant(0) {}
+  DotDebugLocEntry(const MCSymbol *B, const MCSymbol *E, int64_t i)
+    : Begin(B), End(E), Variable(0), Merged(false), 
+      Constant(true), iConstant(i) {}
+
   /// Empty entries are also used as a trigger to emit temp label. Such
   /// labels are referenced is used to find debug_loc offset for a given DIE.
   bool isEmpty() { return Begin == 0 && End == 0; }
@@ -82,6 +91,8 @@ typedef struct DotDebugLocEntry {
     Next->Begin = Begin;
     Merged = true;
   }
+  bool isConstant() { return Constant; }
+  int64_t getConstant() { return iConstant; }
 } DotDebugLocEntry;
 
 //===----------------------------------------------------------------------===//
@@ -253,6 +264,10 @@ class DwarfDebug {
   DebugLoc PrevInstLoc;
   MCSymbol *PrevLabel;
 
+  /// PrologEndLoc - This location indicates end of function prologue and
+  /// beginning of function body.
+  DebugLoc PrologEndLoc;
+
   struct FunctionDebugFrameInfo {
     unsigned Number;
     std::vector<MachineMove> Moves;
@@ -269,7 +284,7 @@ class DwarfDebug {
   // Section Symbols: these are assembler temporary labels that are emitted at
   // the beginning of each supported dwarf section.  These are used to form
   // section offsets and are created by EmitSectionLabels.
-  MCSymbol *DwarfFrameSectionSym, *DwarfInfoSectionSym, *DwarfAbbrevSectionSym;
+  MCSymbol *DwarfInfoSectionSym, *DwarfAbbrevSectionSym;
   MCSymbol *DwarfStrSectionSym, *TextSectionSym, *DwarfDebugRangeSectionSym;
   MCSymbol *DwarfDebugLocSectionSym;
   MCSymbol *FunctionBeginSym, *FunctionEndSym;
@@ -338,14 +353,6 @@ private:
   ///
   void emitEndOfLineMatrix(unsigned SectionEnd);
 
-  /// emitCommonDebugFrame - Emit common frame info into a debug frame section.
-  ///
-  void emitCommonDebugFrame();
-
-  /// emitFunctionDebugFrame - Emit per function frame info into a debug frame
-  /// section.
-  void emitFunctionDebugFrame(const FunctionDebugFrameInfo &DebugFrameInfo);
-
   /// emitDebugPubNames - Emit visible names into a debug pubnames section.
   ///
   void emitDebugPubNames();
@@ -410,7 +417,8 @@ private:
   /// recordSourceLine - Register a source line with debug info. Returns the
   /// unique label that was emitted and which provides correspondence to
   /// the source line list.
-  void recordSourceLine(unsigned Line, unsigned Col, const MDNode *Scope);
+  void recordSourceLine(unsigned Line, unsigned Col, const MDNode *Scope,
+                        unsigned Flags);
   
   /// recordVariableFrameIndex - Record a variable's index.
   void recordVariableFrameIndex(const DbgVariable *V, int Index);
diff --git a/lib/CodeGen/AsmPrinter/DwarfException.h b/lib/CodeGen/AsmPrinter/DwarfException.h
index f111641..b5f86ab 100644
--- a/lib/CodeGen/AsmPrinter/DwarfException.h
+++ b/lib/CodeGen/AsmPrinter/DwarfException.h
@@ -15,6 +15,7 @@
 #define LLVM_CODEGEN_ASMPRINTER_DWARFEXCEPTION_H
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/AsmPrinter.h"
 #include <vector>
 
 namespace llvm {
@@ -152,6 +153,8 @@ class DwarfCFIException : public DwarfException {
   /// should be emitted.
   bool shouldEmitMoves;
 
+  AsmPrinter::CFIMoveType moveTypeModule;
+
 public:
   //===--------------------------------------------------------------------===//
   // Main entry points.
@@ -171,7 +174,7 @@ public:
   virtual void EndFunction();
 };
 
-class DwarfTableException : public DwarfException {
+class ARMException : public DwarfException {
   /// shouldEmitTable - Per-function flag to indicate if EH tables should
   /// be emitted.
   bool shouldEmitTable;
@@ -183,48 +186,12 @@ class DwarfTableException : public DwarfException {
   /// shouldEmitTableModule - Per-module flag to indicate if EH tables
   /// should be emitted.
   bool shouldEmitTableModule;
-
-  /// shouldEmitMovesModule - Per-module flag to indicate if frame moves
-  /// should be emitted.
-  bool shouldEmitMovesModule;
-
-  struct FunctionEHFrameInfo {
-    MCSymbol *FunctionEHSym;  // L_foo.eh
-    unsigned Number;
-    unsigned PersonalityIndex;
-    bool adjustsStack;
-    bool hasLandingPads;
-    std::vector<MachineMove> Moves;
-    const Function *function;
-
-    FunctionEHFrameInfo(MCSymbol *EHSym, unsigned Num, unsigned P,
-                        bool hC, bool hL,
-                        const std::vector<MachineMove> &M,
-                        const Function *f):
-      FunctionEHSym(EHSym), Number(Num), PersonalityIndex(P),
-      adjustsStack(hC), hasLandingPads(hL), Moves(M), function (f) { }
-  };
-
-  std::vector<FunctionEHFrameInfo> EHFrames;
-
-  /// UsesLSDA - Indicates whether an FDE that uses the CIE at the given index
-  /// uses an LSDA. If so, then we need to encode that information in the CIE's
-  /// augmentation.
-  DenseMap<unsigned, bool> UsesLSDA;
-
-  /// EmitCIE - Emit a Common Information Entry (CIE). This holds information
-  /// that is shared among many Frame Description Entries.  There is at least
-  /// one CIE in every non-empty .debug_frame section.
-  void EmitCIE(const Function *Personality, unsigned Index);
-
-  /// EmitFDE - Emit the Frame Description Entry (FDE) for the function.
-  void EmitFDE(const FunctionEHFrameInfo &EHFrameInfo);
 public:
   //===--------------------------------------------------------------------===//
   // Main entry points.
   //
-  DwarfTableException(AsmPrinter *A);
-  virtual ~DwarfTableException();
+  ARMException(AsmPrinter *A);
+  virtual ~ARMException();
 
   /// EndModule - Emit all exception information that should come after the
   /// content.
@@ -238,25 +205,25 @@ public:
   virtual void EndFunction();
 };
 
+class Win64Exception : public DwarfException {
+  /// shouldEmitPersonality - Per-function flag to indicate if personality
+  /// info should be emitted.
+  bool shouldEmitPersonality;
 
-class ARMException : public DwarfException {
-  /// shouldEmitTable - Per-function flag to indicate if EH tables should
-  /// be emitted.
-  bool shouldEmitTable;
+  /// shouldEmitLSDA - Per-function flag to indicate if the LSDA
+  /// should be emitted.
+  bool shouldEmitLSDA;
 
   /// shouldEmitMoves - Per-function flag to indicate if frame moves info
   /// should be emitted.
   bool shouldEmitMoves;
 
-  /// shouldEmitTableModule - Per-module flag to indicate if EH tables
-  /// should be emitted.
-  bool shouldEmitTableModule;
 public:
   //===--------------------------------------------------------------------===//
   // Main entry points.
   //
-  ARMException(AsmPrinter *A);
-  virtual ~ARMException();
+  Win64Exception(AsmPrinter *A);
+  virtual ~Win64Exception();
 
   /// EndModule - Emit all exception information that should come after the
   /// content.
diff --git a/lib/CodeGen/AsmPrinter/DwarfTableException.cpp b/lib/CodeGen/AsmPrinter/DwarfTableException.cpp
deleted file mode 100644
index b50d8bd..0000000
--- a/lib/CodeGen/AsmPrinter/DwarfTableException.cpp
+++ /dev/null
@@ -1,349 +0,0 @@
-//===-- CodeGen/AsmPrinter/DwarfTableException.cpp - Dwarf Exception Impl --==//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains support for writing DWARF exception info into asm files.
-// The implementation emits all the necessary tables "by hands".
-//
-//===----------------------------------------------------------------------===//
-
-#include "DwarfException.h"
-#include "llvm/Module.h"
-#include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineLocation.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCSection.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Support/Dwarf.h"
-#include "llvm/Support/FormattedStream.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/Twine.h"
-using namespace llvm;
-
-DwarfTableException::DwarfTableException(AsmPrinter *A)
-  :  DwarfException(A),
-     shouldEmitTable(false), shouldEmitMoves(false),
-     shouldEmitTableModule(false), shouldEmitMovesModule(false) {}
-
-DwarfTableException::~DwarfTableException() {}
-
-/// EmitCIE - Emit a Common Information Entry (CIE). This holds information that
-/// is shared among many Frame Description Entries.  There is at least one CIE
-/// in every non-empty .debug_frame section.
-void DwarfTableException::EmitCIE(const Function *PersonalityFn, unsigned Index) {
-  // Size and sign of stack growth.
-  int stackGrowth = Asm->getTargetData().getPointerSize();
-  if (Asm->TM.getFrameLowering()->getStackGrowthDirection() ==
-      TargetFrameLowering::StackGrowsDown)
-    stackGrowth *= -1;
-
-  const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
-
-  // Begin eh frame section.
-  Asm->OutStreamer.SwitchSection(TLOF.getEHFrameSection());
-
-  MCSymbol *EHFrameSym;
-  if (TLOF.isFunctionEHFrameSymbolPrivate())
-    EHFrameSym = Asm->GetTempSymbol("EH_frame", Index);
-  else
-    EHFrameSym = Asm->OutContext.GetOrCreateSymbol(Twine("EH_frame") +
-                                                   Twine(Index));
-  Asm->OutStreamer.EmitLabel(EHFrameSym);
-
-  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("section_eh_frame", Index));
-
-  // Define base labels.
-  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_frame_common", Index));
-
-  // Define the eh frame length.
-  Asm->OutStreamer.AddComment("Length of Common Information Entry");
-  Asm->EmitLabelDifference(Asm->GetTempSymbol("eh_frame_common_end", Index),
-                           Asm->GetTempSymbol("eh_frame_common_begin", Index),
-                           4);
-
-  // EH frame header.
-  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_frame_common_begin",Index));
-  Asm->OutStreamer.AddComment("CIE Identifier Tag");
-  Asm->OutStreamer.EmitIntValue(0, 4/*size*/, 0/*addrspace*/);
-  Asm->OutStreamer.AddComment("DW_CIE_VERSION");
-  Asm->OutStreamer.EmitIntValue(dwarf::DW_CIE_VERSION, 1/*size*/, 0/*addr*/);
-
-  // The personality presence indicates that language specific information will
-  // show up in the eh frame.  Find out how we are supposed to lower the
-  // personality function reference:
-
-  unsigned LSDAEncoding = TLOF.getLSDAEncoding();
-  unsigned FDEEncoding = TLOF.getFDEEncoding(false);
-  unsigned PerEncoding = TLOF.getPersonalityEncoding();
-
-  char Augmentation[6] = { 0 };
-  unsigned AugmentationSize = 0;
-  char *APtr = Augmentation + 1;
-
-  if (PersonalityFn) {
-    // There is a personality function.
-    *APtr++ = 'P';
-    AugmentationSize += 1 + Asm->GetSizeOfEncodedValue(PerEncoding);
-  }
-
-  if (UsesLSDA[Index]) {
-    // An LSDA pointer is in the FDE augmentation.
-    *APtr++ = 'L';
-    ++AugmentationSize;
-  }
-
-  if (FDEEncoding != dwarf::DW_EH_PE_absptr) {
-    // A non-default pointer encoding for the FDE.
-    *APtr++ = 'R';
-    ++AugmentationSize;
-  }
-
-  if (APtr != Augmentation + 1)
-    Augmentation[0] = 'z';
-
-  Asm->OutStreamer.AddComment("CIE Augmentation");
-  Asm->OutStreamer.EmitBytes(StringRef(Augmentation, strlen(Augmentation)+1),0);
-
-  // Round out reader.
-  Asm->EmitULEB128(1, "CIE Code Alignment Factor");
-  Asm->EmitSLEB128(stackGrowth, "CIE Data Alignment Factor");
-  Asm->OutStreamer.AddComment("CIE Return Address Column");
-
-  const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo();
-  const TargetFrameLowering *TFI = Asm->TM.getFrameLowering();
-  Asm->EmitInt8(RI->getDwarfRegNum(RI->getRARegister(), true));
-
-  if (Augmentation[0]) {
-    Asm->EmitULEB128(AugmentationSize, "Augmentation Size");
-
-    // If there is a personality, we need to indicate the function's location.
-    if (PersonalityFn) {
-      Asm->EmitEncodingByte(PerEncoding, "Personality");
-      Asm->OutStreamer.AddComment("Personality");
-      Asm->EmitReference(PersonalityFn, PerEncoding);
-    }
-    if (UsesLSDA[Index])
-      Asm->EmitEncodingByte(LSDAEncoding, "LSDA");
-    if (FDEEncoding != dwarf::DW_EH_PE_absptr)
-      Asm->EmitEncodingByte(FDEEncoding, "FDE");
-  }
-
-  // Indicate locations of general callee saved registers in frame.
-  std::vector<MachineMove> Moves;
-  TFI->getInitialFrameState(Moves);
-  Asm->EmitFrameMoves(Moves, 0, true);
-
-  // On Darwin the linker honors the alignment of eh_frame, which means it must
-  // be 8-byte on 64-bit targets to match what gcc does.  Otherwise you get
-  // holes which confuse readers of eh_frame.
-  Asm->EmitAlignment(Asm->getTargetData().getPointerSize() == 4 ? 2 : 3);
-  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_frame_common_end", Index));
-}
-
-/// EmitFDE - Emit the Frame Description Entry (FDE) for the function.
-void DwarfTableException::EmitFDE(const FunctionEHFrameInfo &EHFrameInfo) {
-  assert(!EHFrameInfo.function->hasAvailableExternallyLinkage() &&
-         "Should not emit 'available externally' functions at all");
-
-  const Function *TheFunc = EHFrameInfo.function;
-  const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
-
-  unsigned LSDAEncoding = TLOF.getLSDAEncoding();
-  unsigned FDEEncoding = TLOF.getFDEEncoding(false);
-
-  Asm->OutStreamer.SwitchSection(TLOF.getEHFrameSection());
-
-  // Externally visible entry into the functions eh frame info. If the
-  // corresponding function is static, this should not be externally visible.
-  if (!TheFunc->hasLocalLinkage() && TLOF.isFunctionEHSymbolGlobal())
-    Asm->OutStreamer.EmitSymbolAttribute(EHFrameInfo.FunctionEHSym,MCSA_Global);
-
-  // If corresponding function is weak definition, this should be too.
-  if (TheFunc->isWeakForLinker() && Asm->MAI->getWeakDefDirective())
-    Asm->OutStreamer.EmitSymbolAttribute(EHFrameInfo.FunctionEHSym,
-                                         MCSA_WeakDefinition);
-
-  // If corresponding function is hidden, this should be too.
-  if (TheFunc->hasHiddenVisibility())
-    if (MCSymbolAttr HiddenAttr = Asm->MAI->getHiddenVisibilityAttr())
-      Asm->OutStreamer.EmitSymbolAttribute(EHFrameInfo.FunctionEHSym,
-                                           HiddenAttr);
-
-  // If there are no calls then you can't unwind.  This may mean we can omit the
-  // EH Frame, but some environments do not handle weak absolute symbols. If
-  // UnwindTablesMandatory is set we cannot do this optimization; the unwind
-  // info is to be available for non-EH uses.
-  if (!EHFrameInfo.adjustsStack && !UnwindTablesMandatory &&
-      (!TheFunc->isWeakForLinker() ||
-       !Asm->MAI->getWeakDefDirective() ||
-       TLOF.getSupportsWeakOmittedEHFrame())) {
-    Asm->OutStreamer.EmitAssignment(EHFrameInfo.FunctionEHSym,
-                                    MCConstantExpr::Create(0, Asm->OutContext));
-    // This name has no connection to the function, so it might get
-    // dead-stripped when the function is not, erroneously.  Prohibit
-    // dead-stripping unconditionally.
-    if (Asm->MAI->hasNoDeadStrip())
-      Asm->OutStreamer.EmitSymbolAttribute(EHFrameInfo.FunctionEHSym,
-                                           MCSA_NoDeadStrip);
-  } else {
-    Asm->OutStreamer.EmitLabel(EHFrameInfo.FunctionEHSym);
-
-    // EH frame header.
-    Asm->OutStreamer.AddComment("Length of Frame Information Entry");
-    Asm->EmitLabelDifference(
-                Asm->GetTempSymbol("eh_frame_end", EHFrameInfo.Number),
-                Asm->GetTempSymbol("eh_frame_begin", EHFrameInfo.Number), 4);
-
-    Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_frame_begin",
-                                                  EHFrameInfo.Number));
-
-    Asm->OutStreamer.AddComment("FDE CIE offset");
-    Asm->EmitLabelDifference(
-                       Asm->GetTempSymbol("eh_frame_begin", EHFrameInfo.Number),
-                       Asm->GetTempSymbol("eh_frame_common",
-                                          EHFrameInfo.PersonalityIndex), 4);
-
-    MCSymbol *EHFuncBeginSym =
-      Asm->GetTempSymbol("eh_func_begin", EHFrameInfo.Number);
-
-    Asm->OutStreamer.AddComment("FDE initial location");
-    Asm->EmitReference(EHFuncBeginSym, FDEEncoding);
-
-    Asm->OutStreamer.AddComment("FDE address range");
-    Asm->EmitLabelDifference(Asm->GetTempSymbol("eh_func_end",
-                                                EHFrameInfo.Number),
-                             EHFuncBeginSym,
-                             Asm->GetSizeOfEncodedValue(FDEEncoding));
-
-    // If there is a personality and landing pads then point to the language
-    // specific data area in the exception table.
-    if (MMI->getPersonalities()[0] != NULL) {
-      unsigned Size = Asm->GetSizeOfEncodedValue(LSDAEncoding);
-
-      Asm->EmitULEB128(Size, "Augmentation size");
-      Asm->OutStreamer.AddComment("Language Specific Data Area");
-      if (EHFrameInfo.hasLandingPads)
-        Asm->EmitReference(Asm->GetTempSymbol("exception", EHFrameInfo.Number),
-                           LSDAEncoding);
-      else
-        Asm->OutStreamer.EmitIntValue(0, Size/*size*/, 0/*addrspace*/);
-
-    } else {
-      Asm->EmitULEB128(0, "Augmentation size");
-    }
-
-    // Indicate locations of function specific callee saved registers in frame.
-    Asm->EmitFrameMoves(EHFrameInfo.Moves, EHFuncBeginSym, true);
-
-    // On Darwin the linker honors the alignment of eh_frame, which means it
-    // must be 8-byte on 64-bit targets to match what gcc does.  Otherwise you
-    // get holes which confuse readers of eh_frame.
-    Asm->EmitAlignment(Asm->getTargetData().getPointerSize() == 4 ? 2 : 3);
-    Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_frame_end",
-                                                  EHFrameInfo.Number));
-
-    // If the function is marked used, this table should be also.  We cannot
-    // make the mark unconditional in this case, since retaining the table also
-    // retains the function in this case, and there is code around that depends
-    // on unused functions (calling undefined externals) being dead-stripped to
-    // link correctly.  Yes, there really is.
-    if (MMI->isUsedFunction(EHFrameInfo.function))
-      if (Asm->MAI->hasNoDeadStrip())
-        Asm->OutStreamer.EmitSymbolAttribute(EHFrameInfo.FunctionEHSym,
-                                             MCSA_NoDeadStrip);
-  }
-  Asm->OutStreamer.AddBlankLine();
-}
-
-/// EndModule - Emit all exception information that should come after the
-/// content.
-void DwarfTableException::EndModule() {
-  if (!Asm->MAI->isExceptionHandlingDwarf())
-    return;
-
-  if (!shouldEmitMovesModule && !shouldEmitTableModule)
-    return;
-
-  const std::vector<const Function*> &Personalities = MMI->getPersonalities();
-
-  for (unsigned I = 0, E = Personalities.size(); I < E; ++I)
-    EmitCIE(Personalities[I], I);
-
-  for (std::vector<FunctionEHFrameInfo>::iterator
-         I = EHFrames.begin(), E = EHFrames.end(); I != E; ++I)
-    EmitFDE(*I);
-}
-
-/// BeginFunction - Gather pre-function exception information. Assumes it's
-/// being emitted immediately after the function entry point.
-void DwarfTableException::BeginFunction(const MachineFunction *MF) {
-  shouldEmitTable = shouldEmitMoves = false;
-
-  // If any landing pads survive, we need an EH table.
-  shouldEmitTable = !MMI->getLandingPads().empty();
-
-  // See if we need frame move info.
-  shouldEmitMoves =
-    !Asm->MF->getFunction()->doesNotThrow() || UnwindTablesMandatory;
-
-  if (shouldEmitMoves || shouldEmitTable)
-    // Assumes in correct section after the entry point.
-    Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_begin",
-                                                  Asm->getFunctionNumber()));
-
-  shouldEmitTableModule |= shouldEmitTable;
-  shouldEmitMovesModule |= shouldEmitMoves;
-}
-
-/// EndFunction - Gather and emit post-function exception information.
-///
-void DwarfTableException::EndFunction() {
-  if (!shouldEmitMoves && !shouldEmitTable) return;
-
-  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_end",
-                                                Asm->getFunctionNumber()));
-
-  // Record if this personality index uses a landing pad.
-  bool HasLandingPad = !MMI->getLandingPads().empty();
-  UsesLSDA[MMI->getPersonalityIndex()] |= HasLandingPad;
-
-  // Map all labels and get rid of any dead landing pads.
-  MMI->TidyLandingPads();
-
-  if (HasLandingPad)
-    EmitExceptionTable();
-
-  const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
-  MCSymbol *FunctionEHSym =
-    Asm->GetSymbolWithGlobalValueBase(Asm->MF->getFunction(), ".eh",
-                                      TLOF.isFunctionEHFrameSymbolPrivate());
-
-  // Save EH frame information
-  EHFrames.
-    push_back(FunctionEHFrameInfo(FunctionEHSym,
-                                  Asm->getFunctionNumber(),
-                                  MMI->getPersonalityIndex(),
-                                  Asm->MF->getFrameInfo()->adjustsStack(),
-                                  !MMI->getLandingPads().empty(),
-                                  MMI->getFrameMoves(),
-                                  Asm->MF->getFunction()));
-}
diff --git a/lib/CodeGen/AsmPrinter/Win64Exception.cpp b/lib/CodeGen/AsmPrinter/Win64Exception.cpp
new file mode 100644
index 0000000..c2ad5eb
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/Win64Exception.cpp
@@ -0,0 +1,116 @@
+//===-- CodeGen/AsmPrinter/Win64Exception.cpp - Dwarf Exception Impl ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing Win64 exception info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfException.h"
+#include "llvm/Module.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
+using namespace llvm;
+
+Win64Exception::Win64Exception(AsmPrinter *A)
+  : DwarfException(A),
+    shouldEmitPersonality(false), shouldEmitLSDA(false), shouldEmitMoves(false)
+    {}
+
+Win64Exception::~Win64Exception() {}
+
+/// EndModule - Emit all exception information that should come after the
+/// content.
+void Win64Exception::EndModule() {
+}
+
+/// BeginFunction - Gather pre-function exception information. Assumes it's
+/// being emitted immediately after the function entry point.
+void Win64Exception::BeginFunction(const MachineFunction *MF) {
+  shouldEmitMoves = shouldEmitPersonality = shouldEmitLSDA = false;
+
+  // If any landing pads survive, we need an EH table.
+  bool hasLandingPads = !MMI->getLandingPads().empty();
+
+  shouldEmitMoves = Asm->needsSEHMoves();
+
+  const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+  unsigned PerEncoding = TLOF.getPersonalityEncoding();
+  const Function *Per = MMI->getPersonalities()[MMI->getPersonalityIndex()];
+
+  shouldEmitPersonality = hasLandingPads &&
+    PerEncoding != dwarf::DW_EH_PE_omit && Per;
+
+  unsigned LSDAEncoding = TLOF.getLSDAEncoding();
+  shouldEmitLSDA = shouldEmitPersonality &&
+    LSDAEncoding != dwarf::DW_EH_PE_omit;
+
+  if (!shouldEmitPersonality && !shouldEmitMoves)
+    return;
+
+  Asm->OutStreamer.EmitWin64EHStartProc(Asm->CurrentFnSym);
+
+  if (!shouldEmitPersonality)
+    return;
+
+  MCSymbol *GCCHandlerSym =
+    Asm->GetExternalSymbolSymbol("_GCC_specific_handler");
+  Asm->OutStreamer.EmitWin64EHHandler(GCCHandlerSym, true, true);
+
+  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_begin",
+                                                Asm->getFunctionNumber()));
+}
+
+/// EndFunction - Gather and emit post-function exception information.
+///
+void Win64Exception::EndFunction() {
+  if (!shouldEmitPersonality && !shouldEmitMoves)
+    return;
+
+  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_end",
+                                                Asm->getFunctionNumber()));
+
+  // Map all labels and get rid of any dead landing pads.
+  MMI->TidyLandingPads();
+
+  if (shouldEmitPersonality) {
+    const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+    const Function *Per = MMI->getPersonalities()[MMI->getPersonalityIndex()];
+    const MCSymbol *Sym = TLOF.getCFIPersonalitySymbol(Per, Asm->Mang, MMI);
+
+    Asm->OutStreamer.PushSection();
+    Asm->OutStreamer.EmitWin64EHHandlerData();
+    Asm->OutStreamer.EmitValue(MCSymbolRefExpr::Create(Sym, Asm->OutContext),
+                               4);
+    EmitExceptionTable();
+    Asm->OutStreamer.PopSection();
+  }
+  Asm->OutStreamer.EmitWin64EHEndProc();
+}
diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp
index 7704340..719cd26 100644
--- a/lib/CodeGen/BranchFolding.cpp
+++ b/lib/CodeGen/BranchFolding.cpp
@@ -41,6 +41,7 @@ using namespace llvm;
 STATISTIC(NumDeadBlocks, "Number of dead blocks removed");
 STATISTIC(NumBranchOpts, "Number of branches optimized");
 STATISTIC(NumTailMerge , "Number of block tails merged");
+STATISTIC(NumHoist     , "Number of times common instructions are hoisted");
 
 static cl::opt<cl::boolOrDefault> FlagEnableTailMerge("enable-tail-merge",
                               cl::init(cl::BOU_UNSET), cl::Hidden);
@@ -65,7 +66,7 @@ namespace {
   public:
     static char ID;
     explicit BranchFolderPass(bool defaultEnableTailMerge)
-      : MachineFunctionPass(ID), BranchFolder(defaultEnableTailMerge) {}
+      : MachineFunctionPass(ID), BranchFolder(defaultEnableTailMerge, true) {}
 
     virtual bool runOnMachineFunction(MachineFunction &MF);
     virtual const char *getPassName() const { return "Control Flow Optimizer"; }
@@ -86,12 +87,14 @@ bool BranchFolderPass::runOnMachineFunction(MachineFunction &MF) {
 }
 
 
-BranchFolder::BranchFolder(bool defaultEnableTailMerge) {
+BranchFolder::BranchFolder(bool defaultEnableTailMerge, bool CommonHoist) {
   switch (FlagEnableTailMerge) {
   case cl::BOU_UNSET: EnableTailMerge = defaultEnableTailMerge; break;
   case cl::BOU_TRUE: EnableTailMerge = true; break;
   case cl::BOU_FALSE: EnableTailMerge = false; break;
   }
+
+  EnableHoistCommonCode = CommonHoist;
 }
 
 /// RemoveDeadBlock - Remove the specified dead machine basic block from the
@@ -186,9 +189,10 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF,
 
   bool MadeChangeThisIteration = true;
   while (MadeChangeThisIteration) {
-    MadeChangeThisIteration = false;
-    MadeChangeThisIteration |= TailMergeBlocks(MF);
-    MadeChangeThisIteration |= OptimizeBranches(MF);
+    MadeChangeThisIteration    = TailMergeBlocks(MF);
+    MadeChangeThisIteration   |= OptimizeBranches(MF);
+    if (EnableHoistCommonCode)
+      MadeChangeThisIteration |= HoistCommonCode(MF);
     MadeChange |= MadeChangeThisIteration;
   }
 
@@ -910,7 +914,8 @@ bool BranchFolder::OptimizeBranches(MachineFunction &MF) {
   // Make sure blocks are numbered in order
   MF.RenumberBlocks();
 
-  for (MachineFunction::iterator I = ++MF.begin(), E = MF.end(); I != E; ) {
+  for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end();
+       I != E; ) {
     MachineBasicBlock *MBB = I++;
     MadeChange |= OptimizeBlock(MBB);
 
@@ -1051,6 +1056,22 @@ ReoptimizeBlock:
         !MBB->hasAddressTaken() && !MBB->isLandingPad()) {
       DEBUG(dbgs() << "\nMerging into block: " << PrevBB
                    << "From MBB: " << *MBB);
+      // Remove redundant DBG_VALUEs first.
+      if (PrevBB.begin() != PrevBB.end()) {
+        MachineBasicBlock::iterator PrevBBIter = PrevBB.end();
+        --PrevBBIter;
+        MachineBasicBlock::iterator MBBIter = MBB->begin();
+        // Check if DBG_VALUE at the end of PrevBB is identical to the 
+        // DBG_VALUE at the beginning of MBB.
+        while (PrevBBIter != PrevBB.begin() && MBBIter != MBB->end()
+               && PrevBBIter->isDebugValue() && MBBIter->isDebugValue()) {
+          if (!MBBIter->isIdenticalTo(PrevBBIter))
+            break;
+          MachineInstr *DuplicateDbg = MBBIter;
+          ++MBBIter; -- PrevBBIter;
+          DuplicateDbg->eraseFromParent();
+        }
+      }
       PrevBB.splice(PrevBB.end(), MBB, MBB->begin(), MBB->end());
       PrevBB.removeSuccessor(PrevBB.succ_begin());;
       assert(PrevBB.succ_empty());
@@ -1339,3 +1360,282 @@ ReoptimizeBlock:
 
   return MadeChange;
 }
+
+//===----------------------------------------------------------------------===//
+//  Hoist Common Code
+//===----------------------------------------------------------------------===//
+
+/// HoistCommonCode - Hoist common instruction sequences at the start of basic
+/// blocks to their common predecessor.
+bool BranchFolder::HoistCommonCode(MachineFunction &MF) {
+  bool MadeChange = false;
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ) {
+    MachineBasicBlock *MBB = I++;
+    MadeChange |= HoistCommonCodeInSuccs(MBB);
+  }
+
+  return MadeChange;
+}
+
+/// findFalseBlock - BB has a fallthrough. Find its 'false' successor given
+/// its 'true' successor.
+static MachineBasicBlock *findFalseBlock(MachineBasicBlock *BB,
+                                         MachineBasicBlock *TrueBB) {
+  for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
+         E = BB->succ_end(); SI != E; ++SI) {
+    MachineBasicBlock *SuccBB = *SI;
+    if (SuccBB != TrueBB)
+      return SuccBB;
+  }
+  return NULL;
+}
+
+/// findHoistingInsertPosAndDeps - Find the location to move common instructions
+/// in successors to. The location is ususally just before the terminator,
+/// however if the terminator is a conditional branch and its previous
+/// instruction is the flag setting instruction, the previous instruction is
+/// the preferred location. This function also gathers uses and defs of the
+/// instructions from the insertion point to the end of the block. The data is
+/// used by HoistCommonCodeInSuccs to ensure safety.
+static
+MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB,
+                                                  const TargetInstrInfo *TII,
+                                                  const TargetRegisterInfo *TRI,
+                                                  SmallSet<unsigned,4> &Uses,
+                                                  SmallSet<unsigned,4> &Defs) {
+  MachineBasicBlock::iterator Loc = MBB->getFirstTerminator();
+  if (!TII->isUnpredicatedTerminator(Loc))
+    return MBB->end();
+
+  for (unsigned i = 0, e = Loc->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = Loc->getOperand(i);
+    if (!MO.isReg())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg)
+      continue;
+    if (MO.isUse()) {
+      Uses.insert(Reg);
+      for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS)
+        Uses.insert(*AS);
+    } else if (!MO.isDead())
+      // Don't try to hoist code in the rare case the terminator defines a
+      // register that is later used.
+      return MBB->end();
+  }
+
+  if (Uses.empty())
+    return Loc;
+  if (Loc == MBB->begin())
+    return MBB->end();
+
+  // The terminator is probably a conditional branch, try not to separate the
+  // branch from condition setting instruction.
+  MachineBasicBlock::iterator PI = Loc;
+  --PI;
+  while (PI != MBB->begin() && Loc->isDebugValue())
+    --PI;
+
+  bool IsDef = false;
+  for (unsigned i = 0, e = PI->getNumOperands(); !IsDef && i != e; ++i) {
+    const MachineOperand &MO = PI->getOperand(i);
+    if (!MO.isReg() || MO.isUse())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg)
+      continue;
+    if (Uses.count(Reg))
+      IsDef = true;
+  }
+  if (!IsDef)
+    // The condition setting instruction is not just before the conditional
+    // branch.
+    return Loc;
+
+  // Be conservative, don't insert instruction above something that may have
+  // side-effects. And since it's potentially bad to separate flag setting
+  // instruction from the conditional branch, just abort the optimization
+  // completely.
+  // Also avoid moving code above predicated instruction since it's hard to
+  // reason about register liveness with predicated instruction.
+  bool DontMoveAcrossStore = true;
+  if (!PI->isSafeToMove(TII, 0, DontMoveAcrossStore) ||
+      TII->isPredicated(PI))
+    return MBB->end();
+
+
+  // Find out what registers are live. Note this routine is ignoring other live
+  // registers which are only used by instructions in successor blocks.
+  for (unsigned i = 0, e = PI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = PI->getOperand(i);
+    if (!MO.isReg())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg)
+      continue;
+    if (MO.isUse()) {
+      Uses.insert(Reg);
+      for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS)
+        Uses.insert(*AS);
+    } else {
+      if (Uses.count(Reg)) {
+        Uses.erase(Reg);
+        for (const unsigned *SR = TRI->getSubRegisters(Reg); *SR; ++SR)
+          Uses.erase(*SR); // Use getSubRegisters to be conservative
+      }
+      Defs.insert(Reg);
+      for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS)
+        Defs.insert(*AS);
+    }
+  }
+
+  return PI;
+}
+
+/// HoistCommonCodeInSuccs - If the successors of MBB has common instruction
+/// sequence at the start of the function, move the instructions before MBB
+/// terminator if it's legal.
+bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
+  MachineBasicBlock *TBB = 0, *FBB = 0;
+  SmallVector<MachineOperand, 4> Cond;
+  if (TII->AnalyzeBranch(*MBB, TBB, FBB, Cond, true) || !TBB || Cond.empty())
+    return false;
+
+  if (!FBB) FBB = findFalseBlock(MBB, TBB);
+  if (!FBB)
+    // Malformed bcc? True and false blocks are the same?
+    return false;
+
+  // Restrict the optimization to cases where MBB is the only predecessor,
+  // it is an obvious win.
+  if (TBB->pred_size() > 1 || FBB->pred_size() > 1)
+    return false;
+
+  // Find a suitable position to hoist the common instructions to. Also figure
+  // out which registers are used or defined by instructions from the insertion
+  // point to the end of the block.
+  SmallSet<unsigned, 4> Uses, Defs;
+  MachineBasicBlock::iterator Loc =
+    findHoistingInsertPosAndDeps(MBB, TII, TRI, Uses, Defs);
+  if (Loc == MBB->end())
+    return false;
+
+  bool HasDups = false;
+  SmallVector<unsigned, 4> LocalDefs;
+  SmallSet<unsigned, 4> LocalDefsSet;
+  MachineBasicBlock::iterator TIB = TBB->begin();
+  MachineBasicBlock::iterator FIB = FBB->begin();
+  MachineBasicBlock::iterator TIE = TBB->end();
+  MachineBasicBlock::iterator FIE = FBB->end();
+  while (TIB != TIE && FIB != FIE) {
+    // Skip dbg_value instructions. These do not count.
+    if (TIB->isDebugValue()) {
+      while (TIB != TIE && TIB->isDebugValue())
+        ++TIB;
+      if (TIB == TIE)
+        break;
+    }
+    if (FIB->isDebugValue()) {
+      while (FIB != FIE && FIB->isDebugValue())
+        ++FIB;
+      if (FIB == FIE)
+        break;
+    }
+    if (!TIB->isIdenticalTo(FIB, MachineInstr::CheckKillDead))
+      break;
+
+    if (TII->isPredicated(TIB))
+      // Hard to reason about register liveness with predicated instruction.
+      break;
+
+    bool IsSafe = true;
+    for (unsigned i = 0, e = TIB->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = TIB->getOperand(i);
+      if (!MO.isReg())
+        continue;
+      unsigned Reg = MO.getReg();
+      if (!Reg)
+        continue;
+      if (MO.isDef()) {
+        if (Uses.count(Reg)) {
+          // Avoid clobbering a register that's used by the instruction at
+          // the point of insertion.
+          IsSafe = false;
+          break;
+        }
+
+        if (Defs.count(Reg) && !MO.isDead()) {
+          // Don't hoist the instruction if the def would be clobber by the
+          // instruction at the point insertion. FIXME: This is overly
+          // conservative. It should be possible to hoist the instructions
+          // in BB2 in the following example:
+          // BB1:
+          // r1, eflag = op1 r2, r3
+          // brcc eflag
+          //
+          // BB2:
+          // r1 = op2, ...
+          //    = op3, r1<kill>
+          IsSafe = false;
+          break;
+        }
+      } else if (!LocalDefsSet.count(Reg)) {
+        if (Defs.count(Reg)) {
+          // Use is defined by the instruction at the point of insertion.
+          IsSafe = false;
+          break;
+        }
+      }
+    }
+    if (!IsSafe)
+      break;
+
+    bool DontMoveAcrossStore = true;
+    if (!TIB->isSafeToMove(TII, 0, DontMoveAcrossStore))
+      break;
+
+    // Track local defs so we can update liveins.
+    for (unsigned i = 0, e = TIB->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = TIB->getOperand(i);
+      if (!MO.isReg())
+        continue;
+      unsigned Reg = MO.getReg();
+      if (!Reg)
+        continue;
+      if (MO.isDef()) {
+        if (!MO.isDead()) {
+          LocalDefs.push_back(Reg);
+          LocalDefsSet.insert(Reg);
+          for (const unsigned *SR = TRI->getSubRegisters(Reg); *SR; ++SR)
+            LocalDefsSet.insert(*SR);
+        }
+      } else if (MO.isKill() && LocalDefsSet.count(Reg)) {
+        LocalDefsSet.erase(Reg);
+        for (const unsigned *SR = TRI->getSubRegisters(Reg); *SR; ++SR)
+          LocalDefsSet.erase(*SR);
+      }
+    }
+
+    HasDups = true;;
+    ++TIB;
+    ++FIB;
+  }
+
+  if (!HasDups)
+    return false;
+
+  MBB->splice(Loc, TBB, TBB->begin(), TIB);
+  FBB->erase(FBB->begin(), FIB);
+
+  // Update livein's.
+  for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) {
+    unsigned Def = LocalDefs[i];
+    if (LocalDefsSet.count(Def)) {
+      TBB->addLiveIn(Def);
+      FBB->addLiveIn(Def);
+    }
+  }
+
+  ++NumHoist;
+  return true;
+}
diff --git a/lib/CodeGen/BranchFolding.h b/lib/CodeGen/BranchFolding.h
index 15dfa7f..4daf4ec 100644
--- a/lib/CodeGen/BranchFolding.h
+++ b/lib/CodeGen/BranchFolding.h
@@ -19,11 +19,10 @@ namespace llvm {
   class RegScavenger;
   class TargetInstrInfo;
   class TargetRegisterInfo;
-  template<typename T> class SmallVectorImpl;
 
   class BranchFolder {
   public:
-    explicit BranchFolder(bool defaultEnableTailMerge);
+    explicit BranchFolder(bool defaultEnableTailMerge, bool CommonHoist);
 
     bool OptimizeFunction(MachineFunction &MF,
                           const TargetInstrInfo *tii,
@@ -85,6 +84,7 @@ namespace llvm {
     std::vector<SameTailElt> SameTails;
 
     bool EnableTailMerge;
+    bool EnableHoistCommonCode;
     const TargetInstrInfo *TII;
     const TargetRegisterInfo *TRI;
     MachineModuleInfo *MMI;
@@ -110,6 +110,9 @@ namespace llvm {
     bool OptimizeBlock(MachineBasicBlock *MBB);
     void RemoveDeadBlock(MachineBasicBlock *MBB);
     bool OptimizeImpDefsBlock(MachineBasicBlock *MBB);
+
+    bool HoistCommonCode(MachineFunction &MF);
+    bool HoistCommonCodeInSuccs(MachineBasicBlock *MBB);
   };
 }
 
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
index 2ca3859..c726d92 100644
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@@ -67,6 +67,7 @@ add_llvm_library(LLVMCodeGen
   RegAllocGreedy.cpp
   RegAllocLinearScan.cpp
   RegAllocPBQP.cpp
+  RegisterClassInfo.cpp
   RegisterCoalescer.cpp
   RegisterScavenging.cpp
   RenderMachineFunction.cpp
diff --git a/lib/CodeGen/CalcSpillWeights.cpp b/lib/CodeGen/CalcSpillWeights.cpp
index e5894b8..5d722ee 100644
--- a/lib/CodeGen/CalcSpillWeights.cpp
+++ b/lib/CodeGen/CalcSpillWeights.cpp
@@ -165,7 +165,7 @@ void VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) {
     return;
 
   // Mark li as unspillable if all live ranges are tiny.
-  if (li.isZeroLength()) {
+  if (li.isZeroLength(LIS.getSlotIndexes())) {
     li.markNotSpillable();
     return;
   }
diff --git a/lib/CodeGen/CallingConvLower.cpp b/lib/CodeGen/CallingConvLower.cpp
index bfb6ba1..14eb054 100644
--- a/lib/CodeGen/CallingConvLower.cpp
+++ b/lib/CodeGen/CallingConvLower.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
@@ -22,21 +23,22 @@
 #include "llvm/Target/TargetLowering.h"
 using namespace llvm;
 
-CCState::CCState(CallingConv::ID CC, bool isVarArg, const TargetMachine &tm,
-                 SmallVector<CCValAssign, 16> &locs, LLVMContext &C)
-  : CallingConv(CC), IsVarArg(isVarArg), TM(tm),
+CCState::CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &mf,
+                 const TargetMachine &tm, SmallVector<CCValAssign, 16> &locs,
+                 LLVMContext &C)
+  : CallingConv(CC), IsVarArg(isVarArg), MF(mf), TM(tm),
     TRI(*TM.getRegisterInfo()), Locs(locs), Context(C),
-    CallOrPrologue(Invalid) {
+    CallOrPrologue(Unknown) {
   // No stack is used.
   StackOffset = 0;
-  
+
   clearFirstByValReg();
   UsedRegs.resize((TRI.getNumRegs()+31)/32);
 }
 
-// HandleByVal - Allocate a stack slot large enough to pass an argument by
-// value. The size and alignment information of the argument is encoded in its
-// parameter attribute.
+// HandleByVal - Allocate space on the stack large enough to pass an argument
+// by value. The size and alignment information of the argument is encoded in
+// its parameter attribute.
 void CCState::HandleByVal(unsigned ValNo, MVT ValVT,
                           MVT LocVT, CCValAssign::LocInfo LocInfo,
                           int MinSize, int MinAlign,
@@ -47,7 +49,9 @@ void CCState::HandleByVal(unsigned ValNo, MVT ValVT,
     Size = MinSize;
   if (MinAlign > (int)Align)
     Align = MinAlign;
-  TM.getTargetLowering()->HandleByVal(const_cast<CCState*>(this), Size);
+  if (MF.getFrameInfo()->getMaxAlignment() < Align)
+    MF.getFrameInfo()->setMaxAlignment(Align);
+  TM.getTargetLowering()->HandleByVal(this, Size);
   unsigned Offset = AllocateStack(Size, Align);
   addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
 }
diff --git a/lib/CodeGen/CriticalAntiDepBreaker.cpp b/lib/CodeGen/CriticalAntiDepBreaker.cpp
index f79598d..51d984f 100644
--- a/lib/CodeGen/CriticalAntiDepBreaker.cpp
+++ b/lib/CodeGen/CriticalAntiDepBreaker.cpp
@@ -421,7 +421,8 @@ unsigned CriticalAntiDepBreaker::
 BreakAntiDependencies(const std::vector<SUnit>& SUnits,
                       MachineBasicBlock::iterator Begin,
                       MachineBasicBlock::iterator End,
-                      unsigned InsertPosIndex) {
+                      unsigned InsertPosIndex,
+                      DbgValueVector &DbgValues) {
   // The code below assumes that there is at least one instruction,
   // so just duck out immediately if the block is empty.
   if (SUnits.empty()) return 0;
@@ -628,14 +629,10 @@ BreakAntiDependencies(const std::vector<SUnit>& SUnits,
           // as well.
           const SUnit *SU = MISUnitMap[Q->second->getParent()];
           if (!SU) continue;
-          for (unsigned i = 0, e = SU->DbgInstrList.size() ; i < e ; ++i) {
-            MachineInstr *DI = SU->DbgInstrList[i];
-            assert (DI->getNumOperands()==3 && DI->getOperand(0).isReg() &&
-                    DI->getOperand(0).getReg()
-                    && "Non register dbg_value attached to SUnit!");
-            if (DI->getOperand(0).getReg() == AntiDepReg)
-              DI->getOperand(0).setReg(NewReg);
-          }
+          for (DbgValueVector::iterator DVI = DbgValues.begin(),
+                 DVE = DbgValues.end(); DVI != DVE; ++DVI)
+            if (DVI->second == Q->second->getParent())
+              UpdateDbgValue(DVI->first, AntiDepReg, NewReg);
         }
 
         // We just went back in time and modified history; the
diff --git a/lib/CodeGen/CriticalAntiDepBreaker.h b/lib/CodeGen/CriticalAntiDepBreaker.h
index 0daaef2..5bbb8f5 100644
--- a/lib/CodeGen/CriticalAntiDepBreaker.h
+++ b/lib/CodeGen/CriticalAntiDepBreaker.h
@@ -79,7 +79,8 @@ class TargetRegisterInfo;
     unsigned BreakAntiDependencies(const std::vector<SUnit>& SUnits,
                                    MachineBasicBlock::iterator Begin,
                                    MachineBasicBlock::iterator End,
-                                   unsigned InsertPosIndex);
+                                   unsigned InsertPosIndex,
+                                   DbgValueVector &DbgValues);
 
     /// Observe - Update liveness information to account for the current
     /// instruction, which will not be scheduled.
diff --git a/lib/CodeGen/DwarfEHPrepare.cpp b/lib/CodeGen/DwarfEHPrepare.cpp
index 34b1a39..22c5465 100644
--- a/lib/CodeGen/DwarfEHPrepare.cpp
+++ b/lib/CodeGen/DwarfEHPrepare.cpp
@@ -30,6 +30,7 @@ using namespace llvm;
 
 STATISTIC(NumLandingPadsSplit,     "Number of landing pads split");
 STATISTIC(NumUnwindsLowered,       "Number of unwind instructions lowered");
+STATISTIC(NumResumesLowered,       "Number of eh.resume calls lowered");
 STATISTIC(NumExceptionValuesMoved, "Number of eh.exception calls moved");
 
 namespace {
@@ -63,7 +64,7 @@ namespace {
     BBSet LandingPads;
 
     bool NormalizeLandingPads();
-    bool LowerUnwinds();
+    bool LowerUnwindsAndResumes();
     bool MoveExceptionValueCalls();
 
     Instruction *CreateExceptionValueCall(BasicBlock *BB);
@@ -251,10 +252,7 @@ bool DwarfEHPrepare::HandleURoRInvokes() {
 
   if (!URoR) {
     URoR = F->getParent()->getFunction("_Unwind_Resume_or_Rethrow");
-    if (!URoR) {
-      URoR = F->getParent()->getFunction("_Unwind_SjLj_Resume");
-      if (!URoR) return CleanupSelectors(CatchAllSels);
-    }
+    if (!URoR) return CleanupSelectors(CatchAllSels);
   }
 
   SmallPtrSet<InvokeInst*, 32> URoRInvokes;
@@ -480,20 +478,25 @@ bool DwarfEHPrepare::NormalizeLandingPads() {
 /// rethrowing any previously caught exception.  This will crash horribly
 /// at runtime if there is no such exception: using unwind to throw a new
 /// exception is currently not supported.
-bool DwarfEHPrepare::LowerUnwinds() {
-  SmallVector<TerminatorInst*, 16> UnwindInsts;
-
-  for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
-    TerminatorInst *TI = I->getTerminator();
-    if (isa<UnwindInst>(TI))
-      UnwindInsts.push_back(TI);
+bool DwarfEHPrepare::LowerUnwindsAndResumes() {
+  SmallVector<Instruction*, 16> ResumeInsts;
+
+  for (Function::iterator fi = F->begin(), fe = F->end(); fi != fe; ++fi) {
+    for (BasicBlock::iterator bi = fi->begin(), be = fi->end(); bi != be; ++bi){
+      if (isa<UnwindInst>(bi))
+        ResumeInsts.push_back(bi);
+      else if (CallInst *call = dyn_cast<CallInst>(bi))
+        if (Function *fn = dyn_cast<Function>(call->getCalledValue()))
+          if (fn->getName() == "llvm.eh.resume")
+            ResumeInsts.push_back(bi);
+    }
   }
 
-  if (UnwindInsts.empty()) return false;
+  if (ResumeInsts.empty()) return false;
 
   // Find the rewind function if we didn't already.
   if (!RewindFunction) {
-    LLVMContext &Ctx = UnwindInsts[0]->getContext();
+    LLVMContext &Ctx = ResumeInsts[0]->getContext();
     std::vector<const Type*>
       Params(1, Type::getInt8PtrTy(Ctx));
     FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx),
@@ -504,24 +507,36 @@ bool DwarfEHPrepare::LowerUnwinds() {
 
   bool Changed = false;
 
-  for (SmallVectorImpl<TerminatorInst*>::iterator
-         I = UnwindInsts.begin(), E = UnwindInsts.end(); I != E; ++I) {
-    TerminatorInst *TI = *I;
+  for (SmallVectorImpl<Instruction*>::iterator
+         I = ResumeInsts.begin(), E = ResumeInsts.end(); I != E; ++I) {
+    Instruction *RI = *I;
 
-    // Replace the unwind instruction with a call to _Unwind_Resume (or the
-    // appropriate target equivalent) followed by an UnreachableInst.
+    // Replace the resuming instruction with a call to _Unwind_Resume (or the
+    // appropriate target equivalent).
+
+    llvm::Value *ExnValue;
+    if (isa<UnwindInst>(RI))
+      ExnValue = CreateExceptionValueCall(RI->getParent());
+    else
+      ExnValue = cast<CallInst>(RI)->getArgOperand(0);
 
     // Create the call...
-    CallInst *CI = CallInst::Create(RewindFunction,
-                                    CreateExceptionValueCall(TI->getParent()),
-                                    "", TI);
+    CallInst *CI = CallInst::Create(RewindFunction, ExnValue, "", RI);
     CI->setCallingConv(TLI->getLibcallCallingConv(RTLIB::UNWIND_RESUME));
-    // ...followed by an UnreachableInst.
-    new UnreachableInst(TI->getContext(), TI);
 
-    // Nuke the unwind instruction.
-    TI->eraseFromParent();
-    ++NumUnwindsLowered;
+    // ...followed by an UnreachableInst, if it was an unwind.
+    // Calls to llvm.eh.resume are typically already followed by this.
+    if (isa<UnwindInst>(RI))
+      new UnreachableInst(RI->getContext(), RI);
+
+    if (isa<UnwindInst>(RI))
+      ++NumUnwindsLowered;
+    else
+      ++NumResumesLowered;
+
+    // Nuke the resume instruction.
+    RI->eraseFromParent();
+
     Changed = true;
   }
 
@@ -657,8 +672,8 @@ bool DwarfEHPrepare::runOnFunction(Function &Fn) {
   // basic block where an invoke unwind edge ends).
   Changed |= NormalizeLandingPads();
 
-  // Turn unwind instructions into libcalls.
-  Changed |= LowerUnwinds();
+  // Turn unwind instructions and eh.resume calls into libcalls.
+  Changed |= LowerUnwindsAndResumes();
 
   // TODO: Move eh.selector calls to landing pads and combine them.
 
diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp
index 790200b..8b2c981 100644
--- a/lib/CodeGen/IfConversion.cpp
+++ b/lib/CodeGen/IfConversion.cpp
@@ -265,7 +265,7 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
   if (!TII) return false;
 
   // Tail merge tend to expose more if-conversion opportunities.
-  BranchFolder BF(true);
+  BranchFolder BF(true, false);
   bool BFChange = BF.OptimizeFunction(MF, TII,
                                    MF.getTarget().getRegisterInfo(),
                                    getAnalysisIfAvailable<MachineModuleInfo>());
@@ -399,7 +399,7 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
   BBAnalysis.clear();
 
   if (MadeChange && IfCvtBranchFold) {
-    BranchFolder BF(false);
+    BranchFolder BF(false, false);
     BF.OptimizeFunction(MF, TII,
                         MF.getTarget().getRegisterInfo(),
                         getAnalysisIfAvailable<MachineModuleInfo>());
diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp
index b1a33a6..19ae333 100644
--- a/lib/CodeGen/InlineSpiller.cpp
+++ b/lib/CodeGen/InlineSpiller.cpp
@@ -16,6 +16,7 @@
 #include "Spiller.h"
 #include "LiveRangeEdit.h"
 #include "VirtRegMap.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/LiveStackAnalysis.h"
@@ -31,6 +32,18 @@
 
 using namespace llvm;
 
+STATISTIC(NumSpilledRanges,   "Number of spilled live ranges");
+STATISTIC(NumSnippets,        "Number of snippets included in spills");
+STATISTIC(NumSpills,          "Number of spills inserted");
+STATISTIC(NumReloads,         "Number of reloads inserted");
+STATISTIC(NumFolded,          "Number of folded stack accesses");
+STATISTIC(NumFoldedLoads,     "Number of folded loads");
+STATISTIC(NumRemats,          "Number of rematerialized defs for spilling");
+STATISTIC(NumOmitReloadSpill, "Number of omitted spills after reloads");
+STATISTIC(NumHoistLocal,      "Number of locally hoisted spills");
+STATISTIC(NumHoistGlobal,     "Number of globally hoisted spills");
+STATISTIC(NumRedundantSpills, "Number of redundant spills identified");
+
 namespace {
 class InlineSpiller : public Spiller {
   MachineFunctionPass &Pass;
@@ -247,10 +260,11 @@ void InlineSpiller::collectRegsToSpill() {
     if (!isSnippet(SnipLI))
       continue;
     SnippetCopies.insert(MI);
-    if (!isRegToSpill(SnipReg))
-      RegsToSpill.push_back(SnipReg);
-
+    if (isRegToSpill(SnipReg))
+      continue;
+    RegsToSpill.push_back(SnipReg);
     DEBUG(dbgs() << "\talso spill snippet " << SnipLI << '\n');
+    ++NumSnippets;
   }
 }
 
@@ -469,9 +483,10 @@ bool InlineSpiller::hoistSpill(LiveInterval &SpillLI, MachineInstr *CopyMI) {
                << *StackInt << '\n');
 
   // Already spilled everywhere.
-  if (SVI.AllDefsAreReloads)
+  if (SVI.AllDefsAreReloads) {
+    ++NumOmitReloadSpill;
     return true;
-
+  }
   // We are going to spill SVI.SpillVNI immediately after its def, so clear out
   // any later spills of the same value.
   eliminateRedundantSpills(SibLI, SVI.SpillVNI);
@@ -493,6 +508,11 @@ bool InlineSpiller::hoistSpill(LiveInterval &SpillLI, MachineInstr *CopyMI) {
   LIS.InsertMachineInstrInMaps(MII);
   VRM.addSpillSlotUse(StackSlot, MII);
   DEBUG(dbgs() << "\thoisted: " << SVI.SpillVNI->def << '\t' << *MII);
+
+  if (MBB == CopyMI->getParent())
+    ++NumHoistLocal;
+  else
+    ++NumHoistGlobal;
   return true;
 }
 
@@ -547,6 +567,7 @@ void InlineSpiller::eliminateRedundantSpills(LiveInterval &SLI, VNInfo *VNI) {
         // eliminateDeadDefs won't normally remove stores, so switch opcode.
         MI->setDesc(TII.get(TargetOpcode::KILL));
         DeadDefs.push_back(MI);
+        ++NumRedundantSpills;
       }
     }
   } while (!WorkList.empty());
@@ -642,6 +663,7 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg,
   if (RM.OrigMI->getDesc().canFoldAsLoad() &&
       foldMemoryOperand(MI, Ops, RM.OrigMI)) {
     Edit->markRematerialized(RM.ParentVNI);
+    ++NumFoldedLoads;
     return true;
   }
 
@@ -668,6 +690,7 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg,
   VNInfo *DefVNI = NewLI.getNextValue(DefIdx, 0, LIS.getVNInfoAllocator());
   NewLI.addRange(LiveRange(DefIdx, UseIdx.getDefIndex(), DefVNI));
   DEBUG(dbgs() << "\tinterval: " << NewLI << '\n');
+  ++NumRemats;
   return true;
 }
 
@@ -794,6 +817,7 @@ bool InlineSpiller::foldMemoryOperand(MachineBasicBlock::iterator MI,
     VRM.addSpillSlotUse(StackSlot, FoldMI);
   MI->eraseFromParent();
   DEBUG(dbgs() << "\tfolded: " << *FoldMI);
+  ++NumFolded;
   return true;
 }
 
@@ -811,6 +835,7 @@ void InlineSpiller::insertReload(LiveInterval &NewLI,
   VNInfo *LoadVNI = NewLI.getNextValue(LoadIdx, 0,
                                        LIS.getVNInfoAllocator());
   NewLI.addRange(LiveRange(LoadIdx, Idx, LoadVNI));
+  ++NumReloads;
 }
 
 /// insertSpill - Insert a spill of NewLI.reg after MI.
@@ -825,10 +850,12 @@ void InlineSpiller::insertSpill(LiveInterval &NewLI, const LiveInterval &OldLI,
   DEBUG(dbgs() << "\tspilled: " << StoreIdx << '\t' << *MI);
   VNInfo *StoreVNI = NewLI.getNextValue(Idx, 0, LIS.getVNInfoAllocator());
   NewLI.addRange(LiveRange(Idx, StoreIdx, StoreVNI));
+  ++NumSpills;
 }
 
 /// spillAroundUses - insert spill code around each use of Reg.
 void InlineSpiller::spillAroundUses(unsigned Reg) {
+  DEBUG(dbgs() << "spillAroundUses " << PrintReg(Reg) << '\n');
   LiveInterval &OldLI = LIS.getInterval(Reg);
 
   // Iterate over instructions using Reg.
@@ -876,6 +903,12 @@ void InlineSpiller::spillAroundUses(unsigned Reg) {
     // Check for a sibling copy.
     unsigned SibReg = isFullCopyOf(MI, Reg);
     if (SibReg && isSibling(SibReg)) {
+      // This may actually be a copy between snippets.
+      if (isRegToSpill(SibReg)) {
+        DEBUG(dbgs() << "Found new snippet copy: " << *MI);
+        SnippetCopies.insert(MI);
+        continue;
+      }
       if (Writes) {
         // Hoist the spill of a sib-reg copy.
         if (hoistSpill(OldLI, MI)) {
@@ -957,13 +990,15 @@ void InlineSpiller::spillAll() {
   }
 
   // Finally delete the SnippetCopies.
-  for (MachineRegisterInfo::reg_iterator RI = MRI.reg_begin(Edit->getReg());
-       MachineInstr *MI = RI.skipInstruction();) {
-    assert(SnippetCopies.count(MI) && "Remaining use wasn't a snippet copy");
-    // FIXME: Do this with a LiveRangeEdit callback.
-    VRM.RemoveMachineInstrFromMaps(MI);
-    LIS.RemoveMachineInstrFromMaps(MI);
-    MI->eraseFromParent();
+  for (unsigned i = 0, e = RegsToSpill.size(); i != e; ++i) {
+    for (MachineRegisterInfo::reg_iterator RI = MRI.reg_begin(RegsToSpill[i]);
+         MachineInstr *MI = RI.skipInstruction();) {
+      assert(SnippetCopies.count(MI) && "Remaining use wasn't a snippet copy");
+      // FIXME: Do this with a LiveRangeEdit callback.
+      VRM.RemoveMachineInstrFromMaps(MI);
+      LIS.RemoveMachineInstrFromMaps(MI);
+      MI->eraseFromParent();
+    }
   }
 
   // Delete all spilled registers.
@@ -972,6 +1007,7 @@ void InlineSpiller::spillAll() {
 }
 
 void InlineSpiller::spill(LiveRangeEdit &edit) {
+  ++NumSpilledRanges;
   Edit = &edit;
   assert(!TargetRegisterInfo::isStackSlot(edit.getReg())
          && "Trying to spill a stack slot.");
diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp
index e1dad2e..589d0a9 100644
--- a/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/lib/CodeGen/LLVMTargetMachine.cpp
@@ -13,6 +13,7 @@
 
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/PassManager.h"
+#include "llvm/Analysis/Passes.h"
 #include "llvm/Analysis/Verifier.h"
 #include "llvm/Assembly/PrintModulePass.h"
 #include "llvm/CodeGen/AsmPrinter.h"
@@ -32,7 +33,6 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/StandardPasses.h"
 using namespace llvm;
 
 namespace llvm {
@@ -292,7 +292,11 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM,
   // Standard LLVM-Level Passes.
 
   // Basic AliasAnalysis support.
-  createStandardAliasAnalysisPasses(&PM);
+  // Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that
+  // BasicAliasAnalysis wins if they disagree. This is intended to help
+  // support "obvious" type-punning idioms.
+  PM.add(createTypeBasedAliasAnalysisPass());
+  PM.add(createBasicAliasAnalysisPass());
 
   // Before running any passes, run the verifier to determine if the input
   // coming from the front-end and/or optimizer is valid.
@@ -324,8 +328,8 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM,
     PM.add(createSjLjEHPass(getTargetLowering()));
     // FALLTHROUGH
   case ExceptionHandling::DwarfCFI:
-  case ExceptionHandling::DwarfTable:
   case ExceptionHandling::ARM:
+  case ExceptionHandling::Win64:
     PM.add(createDwarfEHPass(this));
     break;
   case ExceptionHandling::None:
diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp
index 8b21483..292928f 100644
--- a/lib/CodeGen/LiveDebugVariables.cpp
+++ b/lib/CodeGen/LiveDebugVariables.cpp
@@ -101,9 +101,13 @@ class UserValue {
   void insertDebugValue(MachineBasicBlock *MBB, SlotIndex Idx, unsigned LocNo,
                         LiveIntervals &LIS, const TargetInstrInfo &TII);
 
+  /// splitLocation - Replace OldLocNo ranges with NewRegs ranges where NewRegs
+  /// is live. Returns true if any changes were made.
+  bool splitLocation(unsigned OldLocNo, ArrayRef<LiveInterval*> NewRegs);
+
 public:
   /// UserValue - Create a new UserValue.
-  UserValue(const MDNode *var, unsigned o, DebugLoc L, 
+  UserValue(const MDNode *var, unsigned o, DebugLoc L,
             LocMap::Allocator &alloc)
     : variable(var), offset(o), dl(L), leader(this), next(0), locInts(alloc)
   {}
@@ -215,6 +219,10 @@ public:
   void renameRegister(unsigned OldReg, unsigned NewReg, unsigned SubIdx,
                       const TargetRegisterInfo *TRI);
 
+  /// splitRegister - Replace OldReg ranges with NewRegs ranges where NewRegs is
+  /// live. Returns true if any changes were made.
+  bool splitRegister(unsigned OldLocNo, ArrayRef<LiveInterval*> NewRegs);
+
   /// rewriteLocations - Rewrite virtual register locations according to the
   /// provided virtual register map.
   void rewriteLocations(VirtRegMap &VRM, const TargetRegisterInfo &TRI);
@@ -228,7 +236,7 @@ public:
   /// Only first one needs DebugLoc to identify variable's lexical scope
   /// in source file.
   DebugLoc findDebugLoc();
-  void print(raw_ostream&, const TargetRegisterInfo*);
+  void print(raw_ostream&, const TargetMachine*);
 };
 } // namespace
 
@@ -293,6 +301,9 @@ public:
   /// renameRegister - Replace all references to OldReg with NewReg:SubIdx.
   void renameRegister(unsigned OldReg, unsigned NewReg, unsigned SubIdx);
 
+  /// splitRegister -  Replace all references to OldReg with NewRegs.
+  void splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs);
+
   /// emitDebugVariables - Recreate DBG_VALUE instruction from data structures.
   void emitDebugValues(VirtRegMap *VRM);
 
@@ -300,7 +311,7 @@ public:
 };
 } // namespace
 
-void UserValue::print(raw_ostream &OS, const TargetRegisterInfo *TRI) {
+void UserValue::print(raw_ostream &OS, const TargetMachine *TM) {
   if (const MDString *MDS = dyn_cast<MDString>(variable->getOperand(2)))
     OS << "!\"" << MDS->getString() << "\"\t";
   if (offset)
@@ -312,15 +323,17 @@ void UserValue::print(raw_ostream &OS, const TargetRegisterInfo *TRI) {
     else
       OS << I.value();
   }
-  for (unsigned i = 0, e = locations.size(); i != e; ++i)
-    OS << " Loc" << i << '=' << locations[i];
+  for (unsigned i = 0, e = locations.size(); i != e; ++i) {
+    OS << " Loc" << i << '=';
+    locations[i].print(OS, TM);
+  }
   OS << '\n';
 }
 
 void LDVImpl::print(raw_ostream &OS) {
   OS << "********** DEBUG VARIABLES **********\n";
   for (unsigned i = 0, e = userValues.size(); i != e; ++i)
-    userValues[i]->print(OS, TRI);
+    userValues[i]->print(OS, &MF->getTarget());
 }
 
 void UserValue::coalesceLocation(unsigned LocNo) {
@@ -677,6 +690,143 @@ renameRegister(unsigned OldReg, unsigned NewReg, unsigned SubIdx) {
     static_cast<LDVImpl*>(pImpl)->renameRegister(OldReg, NewReg, SubIdx);
 }
 
+//===----------------------------------------------------------------------===//
+//                           Live Range Splitting
+//===----------------------------------------------------------------------===//
+
+bool
+UserValue::splitLocation(unsigned OldLocNo, ArrayRef<LiveInterval*> NewRegs) {
+  DEBUG({
+    dbgs() << "Splitting Loc" << OldLocNo << '\t';
+    print(dbgs(), 0);
+  });
+  bool DidChange = false;
+  LocMap::iterator LocMapI;
+  LocMapI.setMap(locInts);
+  for (unsigned i = 0; i != NewRegs.size(); ++i) {
+    LiveInterval *LI = NewRegs[i];
+    if (LI->empty())
+      continue;
+
+    // Don't allocate the new LocNo until it is needed.
+    unsigned NewLocNo = ~0u;
+
+    // Iterate over the overlaps between locInts and LI.
+    LocMapI.find(LI->beginIndex());
+    if (!LocMapI.valid())
+      continue;
+    LiveInterval::iterator LII = LI->advanceTo(LI->begin(), LocMapI.start());
+    LiveInterval::iterator LIE = LI->end();
+    while (LocMapI.valid() && LII != LIE) {
+      // At this point, we know that LocMapI.stop() > LII->start.
+      LII = LI->advanceTo(LII, LocMapI.start());
+      if (LII == LIE)
+        break;
+
+      // Now LII->end > LocMapI.start(). Do we have an overlap?
+      if (LocMapI.value() == OldLocNo && LII->start < LocMapI.stop()) {
+        // Overlapping correct location. Allocate NewLocNo now.
+        if (NewLocNo == ~0u) {
+          MachineOperand MO = MachineOperand::CreateReg(LI->reg, false);
+          MO.setSubReg(locations[OldLocNo].getSubReg());
+          NewLocNo = getLocationNo(MO);
+          DidChange = true;
+        }
+
+        SlotIndex LStart = LocMapI.start();
+        SlotIndex LStop  = LocMapI.stop();
+
+        // Trim LocMapI down to the LII overlap.
+        if (LStart < LII->start)
+          LocMapI.setStartUnchecked(LII->start);
+        if (LStop > LII->end)
+          LocMapI.setStopUnchecked(LII->end);
+
+        // Change the value in the overlap. This may trigger coalescing.
+        LocMapI.setValue(NewLocNo);
+
+        // Re-insert any removed OldLocNo ranges.
+        if (LStart < LocMapI.start()) {
+          LocMapI.insert(LStart, LocMapI.start(), OldLocNo);
+          ++LocMapI;
+          assert(LocMapI.valid() && "Unexpected coalescing");
+        }
+        if (LStop > LocMapI.stop()) {
+          ++LocMapI;
+          LocMapI.insert(LII->end, LStop, OldLocNo);
+          --LocMapI;
+        }
+      }
+
+      // Advance to the next overlap.
+      if (LII->end < LocMapI.stop()) {
+        if (++LII == LIE)
+          break;
+        LocMapI.advanceTo(LII->start);
+      } else {
+        ++LocMapI;
+        if (!LocMapI.valid())
+          break;
+        LII = LI->advanceTo(LII, LocMapI.start());
+      }
+    }
+  }
+
+  // Finally, remove any remaining OldLocNo intervals and OldLocNo itself.
+  locations.erase(locations.begin() + OldLocNo);
+  LocMapI.goToBegin();
+  while (LocMapI.valid()) {
+    unsigned v = LocMapI.value();
+    if (v == OldLocNo) {
+      DEBUG(dbgs() << "Erasing [" << LocMapI.start() << ';'
+                   << LocMapI.stop() << ")\n");
+      LocMapI.erase();
+    } else {
+      if (v > OldLocNo)
+        LocMapI.setValueUnchecked(v-1);
+      ++LocMapI;
+    }
+  }
+
+  DEBUG({dbgs() << "Split result: \t"; print(dbgs(), 0);});
+  return DidChange;
+}
+
+bool
+UserValue::splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs) {
+  bool DidChange = false;
+  // Split locations referring to OldReg. Iterate backwards so splitLocation can
+  // safely erase unuused locations.
+  for (unsigned i = locations.size(); i ; --i) {
+    unsigned LocNo = i-1;
+    const MachineOperand *Loc = &locations[LocNo];
+    if (!Loc->isReg() || Loc->getReg() != OldReg)
+      continue;
+    DidChange |= splitLocation(LocNo, NewRegs);
+  }
+  return DidChange;
+}
+
+void LDVImpl::splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs) {
+  bool DidChange = false;
+  for (UserValue *UV = lookupVirtReg(OldReg); UV; UV = UV->getNext())
+    DidChange |= UV->splitRegister(OldReg, NewRegs);
+
+  if (!DidChange)
+    return;
+
+  // Map all of the new virtual registers.
+  UserValue *UV = lookupVirtReg(OldReg);
+  for (unsigned i = 0; i != NewRegs.size(); ++i)
+    mapVirtReg(NewRegs[i]->reg, UV);
+}
+
+void LiveDebugVariables::
+splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs) {
+  if (pImpl)
+    static_cast<LDVImpl*>(pImpl)->splitRegister(OldReg, NewRegs);
+}
+
 void
 UserValue::rewriteLocations(VirtRegMap &VRM, const TargetRegisterInfo &TRI) {
   // Iterate over locations in reverse makes it easier to handle coalescing.
@@ -690,6 +840,9 @@ UserValue::rewriteLocations(VirtRegMap &VRM, const TargetRegisterInfo &TRI) {
     unsigned VirtReg = Loc.getReg();
     if (VRM.isAssignedReg(VirtReg) &&
         TargetRegisterInfo::isPhysicalRegister(VRM.getPhys(VirtReg))) {
+      // This can create a %noreg operand in rare cases when the sub-register
+      // index is no longer available. That means the user value is in a
+      // non-existent sub-register, and %noreg is exactly what we want.
       Loc.substPhysReg(VRM.getPhys(VirtReg), TRI);
     } else if (VRM.getStackSlot(VirtReg) != VirtRegMap::NO_STACK_SLOT &&
                VRM.isSpillSlotUsed(VRM.getStackSlot(VirtReg))) {
@@ -701,7 +854,6 @@ UserValue::rewriteLocations(VirtRegMap &VRM, const TargetRegisterInfo &TRI) {
     }
     coalesceLocation(LocNo);
   }
-  DEBUG(print(dbgs(), &TRI));
 }
 
 /// findInsertLocation - Find an iterator for inserting a DBG_VALUE
@@ -793,6 +945,7 @@ void LDVImpl::emitDebugValues(VirtRegMap *VRM) {
   DEBUG(dbgs() << "********** EMITTING LIVE DEBUG VARIABLES **********\n");
   const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
   for (unsigned i = 0, e = userValues.size(); i != e; ++i) {
+    DEBUG(userValues[i]->print(dbgs(), &MF->getTarget()));
     userValues[i]->rewriteLocations(*VRM, *TRI);
     userValues[i]->emitDebugValues(VRM, *LIS, *TII);
   }
diff --git a/lib/CodeGen/LiveDebugVariables.h b/lib/CodeGen/LiveDebugVariables.h
index a6e40a1..3ce3c39 100644
--- a/lib/CodeGen/LiveDebugVariables.h
+++ b/lib/CodeGen/LiveDebugVariables.h
@@ -21,10 +21,12 @@
 #ifndef LLVM_CODEGEN_LIVEDEBUGVARIABLES_H
 #define LLVM_CODEGEN_LIVEDEBUGVARIABLES_H
 
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 
 namespace llvm {
 
+class LiveInterval;
 class VirtRegMap;
 
 class LiveDebugVariables : public MachineFunctionPass {
@@ -42,6 +44,11 @@ public:
   ///               register.
   void renameRegister(unsigned OldReg, unsigned NewReg, unsigned SubIdx);
 
+  /// splitRegister - Move any user variables in OldReg to the live ranges in
+  /// NewRegs where they are live. Mark the values as unavailable where no new
+  /// register is live.
+  void splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs);
+
   /// emitDebugValues - Emit new DBG_VALUE instructions reflecting the changes
   /// that happened during register allocation.
   /// @param VRM Rename virtual registers according to map.
diff --git a/lib/CodeGen/LiveRangeEdit.cpp b/lib/CodeGen/LiveRangeEdit.cpp
index f8a3dbb..052abad 100644
--- a/lib/CodeGen/LiveRangeEdit.cpp
+++ b/lib/CodeGen/LiveRangeEdit.cpp
@@ -15,6 +15,7 @@
 #include "LiveRangeEdit.h"
 #include "VirtRegMap.h"
 #include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/CalcSpillWeights.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -24,6 +25,10 @@
 
 using namespace llvm;
 
+STATISTIC(NumDCEDeleted,     "Number of instructions deleted by DCE");
+STATISTIC(NumDCEFoldedLoads, "Number of single use loads folded after DCE");
+STATISTIC(NumFracRanges,     "Number of live ranges fractured by DCE");
+
 LiveInterval &LiveRangeEdit::createFrom(unsigned OldReg,
                                         LiveIntervals &LIS,
                                         VirtRegMap &VRM) {
@@ -199,6 +204,7 @@ bool LiveRangeEdit::foldAsLoad(LiveInterval *LI,
   UseMI->eraseFromParent();
   DefMI->addRegisterDead(LI->reg, 0);
   Dead.push_back(DefMI);
+  ++NumDCEFoldedLoads;
   return true;
 }
 
@@ -269,6 +275,7 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead,
         delegate_->LRE_WillEraseInstruction(MI);
       LIS.RemoveMachineInstrFromMaps(MI);
       MI->eraseFromParent();
+      ++NumDCEDeleted;
     }
 
     if (ToShrink.empty())
@@ -290,6 +297,7 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead,
     unsigned NumComp = ConEQ.Classify(LI);
     if (NumComp <= 1)
       continue;
+    ++NumFracRanges;
     DEBUG(dbgs() << NumComp << " components: " << *LI << '\n');
     SmallVector<LiveInterval*, 8> Dups(1, LI);
     for (unsigned i = 1; i != NumComp; ++i) {
diff --git a/lib/CodeGen/LiveRangeEdit.h b/lib/CodeGen/LiveRangeEdit.h
index 14d227e..db6740c 100644
--- a/lib/CodeGen/LiveRangeEdit.h
+++ b/lib/CodeGen/LiveRangeEdit.h
@@ -18,8 +18,9 @@
 #ifndef LLVM_CODEGEN_LIVERANGEEDIT_H
 #define LLVM_CODEGEN_LIVERANGEEDIT_H
 
-#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/CodeGen/LiveInterval.h"
 
 namespace llvm {
 
@@ -113,6 +114,10 @@ public:
   bool empty() const { return size() == 0; }
   LiveInterval *get(unsigned idx) const { return newRegs_[idx+firstNew_]; }
 
+  ArrayRef<LiveInterval*> regs() const {
+    return ArrayRef<LiveInterval*>(newRegs_).slice(firstNew_);
+  }
+
   /// FIXME: Temporary accessors until we can get rid of
   /// LiveIntervals::AddIntervalsForSpills
   SmallVectorImpl<LiveInterval*> *getNewVRegs() { return &newRegs_; }
diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp
index 57f3e34..68946a2 100644
--- a/lib/CodeGen/MachineBasicBlock.cpp
+++ b/lib/CodeGen/MachineBasicBlock.cpp
@@ -485,6 +485,30 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) {
         << " -- BB#" << NMBB->getNumber()
         << " -- BB#" << Succ->getNumber() << '\n');
 
+  // On some targets like Mips, branches may kill virtual registers. Make sure
+  // that LiveVariables is properly updated after updateTerminator replaces the
+  // terminators.
+  LiveVariables *LV = P->getAnalysisIfAvailable<LiveVariables>();
+
+  // Collect a list of virtual registers killed by the terminators.
+  SmallVector<unsigned, 4> KilledRegs;
+  if (LV)
+    for (iterator I = getFirstTerminator(), E = end(); I != E; ++I) {
+      MachineInstr *MI = I;
+      for (MachineInstr::mop_iterator OI = MI->operands_begin(),
+           OE = MI->operands_end(); OI != OE; ++OI) {
+        if (!OI->isReg() || !OI->isUse() || !OI->isKill() || OI->isUndef())
+          continue;
+        unsigned Reg = OI->getReg();
+        if (TargetRegisterInfo::isVirtualRegister(Reg) &&
+            LV->getVarInfo(Reg).removeKill(MI)) {
+          KilledRegs.push_back(Reg);
+          DEBUG(dbgs() << "Removing terminator kill: " << *MI);
+          OI->setIsKill(false);
+        }
+      }
+    }
+
   ReplaceUsesOfBlockWith(Succ, NMBB);
   updateTerminator();
 
@@ -502,9 +526,22 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) {
       if (i->getOperand(ni+1).getMBB() == this)
         i->getOperand(ni+1).setMBB(NMBB);
 
-  if (LiveVariables *LV =
-        P->getAnalysisIfAvailable<LiveVariables>())
+  // Update LiveVariables.
+  if (LV) {
+    // Restore kills of virtual registers that were killed by the terminators.
+    while (!KilledRegs.empty()) {
+      unsigned Reg = KilledRegs.pop_back_val();
+      for (iterator I = end(), E = begin(); I != E;) {
+        if (!(--I)->addRegisterKilled(Reg, NULL, /* addIfNotFound= */ false))
+          continue;
+        LV->getVarInfo(Reg).Kills.push_back(I);
+        DEBUG(dbgs() << "Restored terminator kill: " << *I);
+        break;
+      }
+    }
+    // Update relevant live-through information.
     LV->addNewBlock(NMBB, this, Succ);
+  }
 
   if (MachineDominatorTree *MDT =
       P->getAnalysisIfAvailable<MachineDominatorTree>()) {
diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp
index d81e4a1..50750a5 100644
--- a/lib/CodeGen/MachineFunction.cpp
+++ b/lib/CodeGen/MachineFunction.cpp
@@ -65,7 +65,11 @@ MachineFunction::MachineFunction(const Function *F, const TargetMachine &TM,
     FrameInfo->setMaxAlignment(Attribute::getStackAlignmentFromAttrs(
         Fn->getAttributes().getFnAttributes()));
   ConstantPool = new (Allocator) MachineConstantPool(TM.getTargetData());
-  Alignment = TM.getTargetLowering()->getFunctionAlignment(F);
+  Alignment = TM.getTargetLowering()->getMinFunctionAlignment();
+  // FIXME: Shouldn't use pref alignment if explicit alignment is set on Fn.
+  if (!Fn->hasFnAttr(Attribute::OptimizeForSize))
+    Alignment = std::max(Alignment,
+                         TM.getTargetLowering()->getPrefFunctionAlignment());
   FunctionNumber = FunctionNum;
   JumpTableInfo = 0;
 }
@@ -300,31 +304,19 @@ void MachineFunction::print(raw_ostream &OS, SlotIndexes *Indexes) const {
     OS << "Function Live Ins: ";
     for (MachineRegisterInfo::livein_iterator
          I = RegInfo->livein_begin(), E = RegInfo->livein_end(); I != E; ++I) {
-      if (TRI)
-        OS << "%" << TRI->getName(I->first);
-      else
-        OS << " %physreg" << I->first;
-      
+      OS << PrintReg(I->first, TRI);
       if (I->second)
-        OS << " in reg%" << I->second;
-
+        OS << " in " << PrintReg(I->second, TRI);
       if (llvm::next(I) != E)
         OS << ", ";
     }
     OS << '\n';
   }
   if (RegInfo && !RegInfo->liveout_empty()) {
-    OS << "Function Live Outs: ";
+    OS << "Function Live Outs:";
     for (MachineRegisterInfo::liveout_iterator
-         I = RegInfo->liveout_begin(), E = RegInfo->liveout_end(); I != E; ++I){
-      if (TRI)
-        OS << '%' << TRI->getName(*I);
-      else
-        OS << "%physreg" << *I;
-
-      if (llvm::next(I) != E)
-        OS << " ";
-    }
+         I = RegInfo->liveout_begin(), E = RegInfo->liveout_end(); I != E; ++I)
+      OS << ' ' << PrintReg(*I, TRI);
     OS << '\n';
   }
   
diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp
index 71df6f8..36b0b83 100644
--- a/lib/CodeGen/MachineInstr.cpp
+++ b/lib/CodeGen/MachineInstr.cpp
@@ -125,7 +125,8 @@ void MachineOperand::substPhysReg(unsigned Reg, const TargetRegisterInfo &TRI) {
   assert(TargetRegisterInfo::isPhysicalRegister(Reg));
   if (getSubReg()) {
     Reg = TRI.getSubReg(Reg, getSubReg());
-    assert(Reg && "Invalid SubReg for physical register");
+    // Note that getSubReg() may return 0 if the sub-register doesn't exist.
+    // That won't happen in legal code.
     setSubReg(0);
   }
   setReg(Reg);
@@ -763,19 +764,35 @@ bool MachineInstr::isIdenticalTo(const MachineInstr *Other,
   for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = getOperand(i);
     const MachineOperand &OMO = Other->getOperand(i);
+    if (!MO.isReg()) {
+      if (!MO.isIdenticalTo(OMO))
+        return false;
+      continue;
+    }
+
     // Clients may or may not want to ignore defs when testing for equality.
     // For example, machine CSE pass only cares about finding common
     // subexpressions, so it's safe to ignore virtual register defs.
-    if (Check != CheckDefs && MO.isReg() && MO.isDef()) {
+    if (MO.isDef()) {
       if (Check == IgnoreDefs)
         continue;
-      // Check == IgnoreVRegDefs
-      if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()) ||
-          TargetRegisterInfo::isPhysicalRegister(OMO.getReg()))
-        if (MO.getReg() != OMO.getReg())
+      else if (Check == IgnoreVRegDefs) {
+        if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()) ||
+            TargetRegisterInfo::isPhysicalRegister(OMO.getReg()))
+          if (MO.getReg() != OMO.getReg())
+            return false;
+      } else {
+        if (!MO.isIdenticalTo(OMO))
           return false;
-    } else if (!MO.isIdenticalTo(OMO))
-      return false;
+        if (Check == CheckKillDead && MO.isDead() != OMO.isDead())
+          return false;
+      }
+    } else {
+      if (!MO.isIdenticalTo(OMO))
+        return false;
+      if (Check == CheckKillDead && MO.isKill() != OMO.isKill())
+        return false;
+    }
   }
   return true;
 }
diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp
index 7244d5f..08ff5bb 100644
--- a/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/lib/CodeGen/MachineRegisterInfo.cpp
@@ -79,6 +79,8 @@ MachineRegisterInfo::constrainRegClass(unsigned Reg,
 unsigned
 MachineRegisterInfo::createVirtualRegister(const TargetRegisterClass *RegClass){
   assert(RegClass && "Cannot create register without RegClass!");
+  assert(RegClass->isAllocatable() &&
+         "Virtual register RegClass must be allocatable.");
 
   // New virtual register number.
   unsigned Reg = TargetRegisterInfo::index2VirtReg(getNumVirtRegs());
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
index f95f411..471463b 100644
--- a/lib/CodeGen/MachineVerifier.cpp
+++ b/lib/CodeGen/MachineVerifier.cpp
@@ -23,6 +23,7 @@
 // the verifier errors.
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Instructions.h"
 #include "llvm/Function.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/LiveVariables.h"
@@ -32,6 +33,7 @@
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetInstrInfo.h"
@@ -394,7 +396,13 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
     if ((*I)->isLandingPad())
       LandingPadSuccs.insert(*I);
   }
-  if (LandingPadSuccs.size() > 1)
+
+  const MCAsmInfo *AsmInfo = TM->getMCAsmInfo();
+  const BasicBlock *BB = MBB->getBasicBlock();
+  if (LandingPadSuccs.size() > 1 &&
+      !(AsmInfo &&
+        AsmInfo->getExceptionHandlingType() == ExceptionHandling::SjLj &&
+        BB && isa<SwitchInst>(BB->getTerminator())))
     report("MBB has more than one landing pad successor", MBB);
 
   // Call AnalyzeBranch. If it succeeds, there several more conditions to check.
@@ -402,11 +410,6 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
   SmallVector<MachineOperand, 4> Cond;
   if (!TII->AnalyzeBranch(*const_cast<MachineBasicBlock *>(MBB),
                           TBB, FBB, Cond)) {
-    // If the block branches directly to a landing pad successor, pretend that
-    // the landing pad is a normal block.
-    LandingPadSuccs.erase(TBB);
-    LandingPadSuccs.erase(FBB);
-
     // Ok, AnalyzeBranch thinks it knows what's going on with this block. Let's
     // check whether its answers match up with reality.
     if (!TBB && !FBB) {
@@ -741,7 +744,7 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
           RC = SRC;
         }
         if (const TargetRegisterClass *DRC = TOI.getRegClass(TRI)) {
-          if (RC != DRC && !RC->hasSuperClass(DRC)) {
+          if (!RC->hasSuperClassEq(DRC)) {
             report("Illegal virtual register for instruction", MO, MONum);
             *OS << "Expected a " << DRC->getName() << " register, but got a "
                 << RC->getName() << " register\n";
diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp
index 60c24b7..ba8501f 100644
--- a/lib/CodeGen/PostRASchedulerList.cpp
+++ b/lib/CodeGen/PostRASchedulerList.cpp
@@ -304,7 +304,7 @@ void SchedulePostRATDList::Schedule() {
   if (AntiDepBreak != NULL) {
     unsigned Broken =
       AntiDepBreak->BreakAntiDependencies(SUnits, Begin, InsertPos,
-                                          InsertPosIndex);
+                                          InsertPosIndex, DbgValues);
 
     if (Broken != 0) {
       // We made changes. Update the dependency graph.
@@ -540,10 +540,16 @@ void SchedulePostRATDList::ReleaseSucc(SUnit *SU, SDep *SuccEdge) {
 #endif
   --SuccSU->NumPredsLeft;
 
-  // Compute how many cycles it will be before this actually becomes
-  // available.  This is the max of the start time of all predecessors plus
-  // their latencies.
-  SuccSU->setDepthToAtLeast(SU->getDepth() + SuccEdge->getLatency());
+  // Standard scheduler algorithms will recompute the depth of the successor
+  // here as such:
+  //   SuccSU->setDepthToAtLeast(SU->getDepth() + SuccEdge->getLatency());
+  //
+  // However, we lazily compute node depth instead. Note that
+  // ScheduleNodeTopDown has already updated the depth of this node which causes
+  // all descendents to be marked dirty. Setting the successor depth explicitly
+  // here would cause depth to be recomputed for all its ancestors. If the
+  // successor is not yet ready (because of a transitively redundant edge) then
+  // this causes depth computation to be quadratic in the size of the DAG.
 
   // If all the node's predecessors are scheduled, this node is ready
   // to be scheduled. Ignore the special ExitSU node.
@@ -655,6 +661,12 @@ void SchedulePostRATDList::ListScheduleTopDown() {
       ScheduleNodeTopDown(FoundSUnit, CurCycle);
       HazardRec->EmitInstruction(FoundSUnit);
       CycleHasInsts = true;
+      if (HazardRec->atIssueLimit()) {
+        DEBUG(dbgs() << "*** Max instructions per cycle " << CurCycle << '\n');
+        HazardRec->AdvanceCycle();
+        ++CurCycle;
+        CycleHasInsts = false;
+      }
     } else {
       if (CycleHasInsts) {
         DEBUG(dbgs() << "*** Finished cycle " << CurCycle << '\n');
diff --git a/lib/CodeGen/RegAllocBase.h b/lib/CodeGen/RegAllocBase.h
index f431d5a..03164211 100644
--- a/lib/CodeGen/RegAllocBase.h
+++ b/lib/CodeGen/RegAllocBase.h
@@ -39,6 +39,7 @@
 
 #include "llvm/ADT/OwningPtr.h"
 #include "LiveIntervalUnion.h"
+#include "RegisterClassInfo.h"
 
 namespace llvm {
 
@@ -91,6 +92,7 @@ protected:
   MachineRegisterInfo *MRI;
   VirtRegMap *VRM;
   LiveIntervals *LIS;
+  RegisterClassInfo RegClassInfo;
   LiveUnionArray PhysReg2LiveUnion;
 
   // Current queries, one per physreg. They must be reinitialized each time we
@@ -113,6 +115,10 @@ protected:
     return Queries[PhysReg];
   }
 
+  // Invalidate all cached information about virtual registers - live ranges may
+  // have changed.
+  void invalidateVirtRegs() { ++UserTag; }
+
   // The top-level driver. The output is a VirtRegMap that us updated with
   // physical register assignments.
   //
diff --git a/lib/CodeGen/RegAllocBasic.cpp b/lib/CodeGen/RegAllocBasic.cpp
index d92d80f..1d77b29 100644
--- a/lib/CodeGen/RegAllocBasic.cpp
+++ b/lib/CodeGen/RegAllocBasic.cpp
@@ -13,10 +13,10 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "regalloc"
+#include "RegAllocBase.h"
 #include "LiveDebugVariables.h"
 #include "LiveIntervalUnion.h"
 #include "LiveRangeEdit.h"
-#include "RegAllocBase.h"
 #include "RenderMachineFunction.h"
 #include "Spiller.h"
 #include "VirtRegMap.h"
@@ -85,7 +85,6 @@ class RABasic : public MachineFunctionPass, public RegAllocBase
 {
   // context
   MachineFunction *MF;
-  BitVector ReservedRegs;
 
   // analyses
   LiveStacks *LS;
@@ -235,6 +234,8 @@ void RegAllocBase::init(VirtRegMap &vrm, LiveIntervals &lis) {
   MRI = &vrm.getRegInfo();
   VRM = &vrm;
   LIS = &lis;
+  RegClassInfo.runOnMachineFunction(vrm.getMachineFunction());
+
   const unsigned NumRegs = TRI->getNumRegs();
   if (NumRegs != PhysReg2LiveUnion.numRegs()) {
     PhysReg2LiveUnion.init(UnionAllocator, NumRegs);
@@ -309,7 +310,7 @@ void RegAllocBase::allocatePhysRegs() {
     }
 
     // Invalidate all interference queries, live ranges could have changed.
-    ++UserTag;
+    invalidateVirtRegs();
 
     // selectOrSplit requests the allocator to return an available physical
     // register if possible and populate a list of new live intervals that
@@ -321,6 +322,23 @@ void RegAllocBase::allocatePhysRegs() {
     VirtRegVec SplitVRegs;
     unsigned AvailablePhysReg = selectOrSplit(*VirtReg, SplitVRegs);
 
+    if (AvailablePhysReg == ~0u) {
+      // selectOrSplit failed to find a register!
+      std::string msg;
+      raw_string_ostream Msg(msg);
+      Msg << "Ran out of registers during register allocation!"
+             "\nCannot allocate: " << *VirtReg;
+      for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(VirtReg->reg);
+      MachineInstr *MI = I.skipInstruction();) {
+        if (!MI->isInlineAsm())
+          continue;
+        Msg << "\nPlease check your inline asm statement for "
+          "invalid constraints:\n";
+        MI->print(Msg, &VRM->getMachineFunction().getTarget());
+      }
+      report_fatal_error(Msg.str());
+    }
+
     if (AvailablePhysReg)
       assign(*VirtReg, AvailablePhysReg);
 
@@ -462,14 +480,11 @@ unsigned RABasic::selectOrSplit(LiveInterval &VirtReg,
   SmallVector<unsigned, 8> PhysRegSpillCands;
 
   // Check for an available register in this class.
-  const TargetRegisterClass *TRC = MRI->getRegClass(VirtReg.reg);
-
-  for (TargetRegisterClass::iterator I = TRC->allocation_order_begin(*MF),
-         E = TRC->allocation_order_end(*MF);
-       I != E; ++I) {
-
+  ArrayRef<unsigned> Order =
+    RegClassInfo.getOrder(MRI->getRegClass(VirtReg.reg));
+  for (ArrayRef<unsigned>::iterator I = Order.begin(), E = Order.end(); I != E;
+       ++I) {
     unsigned PhysReg = *I;
-    if (ReservedRegs.test(PhysReg)) continue;
 
     // Check interference and as a side effect, intialize queries for this
     // VirtReg and its aliases.
@@ -498,8 +513,11 @@ unsigned RABasic::selectOrSplit(LiveInterval &VirtReg,
     // Tell the caller to allocate to this newly freed physical register.
     return *PhysRegI;
   }
+
   // No other spill candidates were found, so spill the current VirtReg.
   DEBUG(dbgs() << "spilling: " << VirtReg << '\n');
+  if (!VirtReg.isSpillable())
+    return ~0u;
   LiveRangeEdit LRE(VirtReg, SplitVRegs);
   spiller().spill(LRE);
 
@@ -517,9 +535,6 @@ bool RABasic::runOnMachineFunction(MachineFunction &mf) {
   DEBUG(RMF = &getAnalysis<RenderMachineFunction>());
 
   RegAllocBase::init(getAnalysis<VirtRegMap>(), getAnalysis<LiveIntervals>());
-
-  ReservedRegs = TRI->getReservedRegs(*MF);
-
   SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM));
 
   allocatePhysRegs();
diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp
index b2fd6e0..9765203 100644
--- a/lib/CodeGen/RegAllocFast.cpp
+++ b/lib/CodeGen/RegAllocFast.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "regalloc"
+#include "RegisterClassInfo.h"
 #include "llvm/BasicBlock.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -58,6 +59,7 @@ namespace {
     MachineRegisterInfo *MRI;
     const TargetRegisterInfo *TRI;
     const TargetInstrInfo *TII;
+    RegisterClassInfo RegClassInfo;
 
     // Basic block currently being allocated.
     MachineBasicBlock *MBB;
@@ -113,9 +115,6 @@ namespace {
     // instruction, and so cannot be allocated.
     BitVector UsedInInstr;
 
-    // Allocatable - vector of allocatable physical registers.
-    BitVector Allocatable;
-
     // SkippedInstrs - Descriptors of instructions whose clobber list was
     // ignored because all registers were spilled. It is still necessary to
     // mark all the clobbered registers as used by the function.
@@ -483,7 +482,7 @@ void RAFast::allocVirtReg(MachineInstr *MI, LiveRegEntry &LRE, unsigned Hint) {
 
   // Ignore invalid hints.
   if (Hint && (!TargetRegisterInfo::isPhysicalRegister(Hint) ||
-               !RC->contains(Hint) || !Allocatable.test(Hint)))
+               !RC->contains(Hint) || !RegClassInfo.isAllocatable(Hint)))
     Hint = 0;
 
   // Take hint when possible.
@@ -499,14 +498,12 @@ void RAFast::allocVirtReg(MachineInstr *MI, LiveRegEntry &LRE, unsigned Hint) {
     }
   }
 
-  TargetRegisterClass::iterator AOB = RC->allocation_order_begin(*MF);
-  TargetRegisterClass::iterator AOE = RC->allocation_order_end(*MF);
+  ArrayRef<unsigned> AO = RegClassInfo.getOrder(RC);
 
   // First try to find a completely free register.
-  for (TargetRegisterClass::iterator I = AOB; I != AOE; ++I) {
+  for (ArrayRef<unsigned>::iterator I = AO.begin(), E = AO.end(); I != E; ++I) {
     unsigned PhysReg = *I;
-    if (PhysRegState[PhysReg] == regFree && !UsedInInstr.test(PhysReg) &&
-        Allocatable.test(PhysReg))
+    if (PhysRegState[PhysReg] == regFree && !UsedInInstr.test(PhysReg))
       return assignVirtToPhysReg(LRE, PhysReg);
   }
 
@@ -514,11 +511,7 @@ void RAFast::allocVirtReg(MachineInstr *MI, LiveRegEntry &LRE, unsigned Hint) {
                << RC->getName() << "\n");
 
   unsigned BestReg = 0, BestCost = spillImpossible;
-  for (TargetRegisterClass::iterator I = AOB; I != AOE; ++I) {
-    if (!Allocatable.test(*I)) {
-      DEBUG(dbgs() << "\tRegister " << *I << " is not allocatable.\n");
-      continue;
-    }
+  for (ArrayRef<unsigned>::iterator I = AO.begin(), E = AO.end(); I != E; ++I) {
     unsigned Cost = calcSpillCost(*I);
     DEBUG(dbgs() << "\tRegister: " << *I << "\n");
     DEBUG(dbgs() << "\tCost: " << Cost << "\n");
@@ -772,7 +765,7 @@ void RAFast::AllocateBasicBlock() {
   // Add live-in registers as live.
   for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(),
          E = MBB->livein_end(); I != E; ++I)
-    if (Allocatable.test(*I))
+    if (RegClassInfo.isAllocatable(*I))
       definePhysReg(MII, *I, regReserved);
 
   SmallVector<unsigned, 8> VirtDead;
@@ -903,7 +896,7 @@ void RAFast::AllocateBasicBlock() {
         }
         continue;
       }
-      if (!Allocatable.test(Reg)) continue;
+      if (!RegClassInfo.isAllocatable(Reg)) continue;
       if (MO.isUse()) {
         usePhysReg(MO);
       } else if (MO.isEarlyClobber()) {
@@ -992,7 +985,7 @@ void RAFast::AllocateBasicBlock() {
       unsigned Reg = MO.getReg();
 
       if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
-        if (!Allocatable.test(Reg)) continue;
+        if (!RegClassInfo.isAllocatable(Reg)) continue;
         definePhysReg(MI, Reg, (MO.isImplicit() || MO.isDead()) ?
                                regFree : regReserved);
         continue;
@@ -1048,9 +1041,8 @@ bool RAFast::runOnMachineFunction(MachineFunction &Fn) {
   TM = &Fn.getTarget();
   TRI = TM->getRegisterInfo();
   TII = TM->getInstrInfo();
-
+  RegClassInfo.runOnMachineFunction(Fn);
   UsedInInstr.resize(TRI->getNumRegs());
-  Allocatable = TRI->getAllocatableSet(*MF);
 
   // initialize the virtual->physical register map to have a 'null'
   // mapping for all virtual registers
diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp
index 7c461d8..8d06325 100644
--- a/lib/CodeGen/RegAllocGreedy.cpp
+++ b/lib/CodeGen/RegAllocGreedy.cpp
@@ -62,7 +62,6 @@ class RAGreedy : public MachineFunctionPass,
 
   // context
   MachineFunction *MF;
-  BitVector ReservedRegs;
 
   // analyses
   SlotIndexes *Indexes;
@@ -72,6 +71,7 @@ class RAGreedy : public MachineFunctionPass,
   MachineLoopRanges *LoopRanges;
   EdgeBundles *Bundles;
   SpillPlacement *SpillPlacer;
+  LiveDebugVariables *DebugVars;
 
   // state
   std::auto_ptr<Spiller> SpillerInstance;
@@ -99,6 +99,8 @@ class RAGreedy : public MachineFunctionPass,
     RS_Spill     ///< Produced by spilling.
   };
 
+  static const char *const StageName[];
+
   IndexedMap<unsigned char, VirtReg2IndexFunctor> LRStage;
 
   LiveRangeStage getStage(const LiveInterval &VirtReg) const {
@@ -115,6 +117,15 @@ class RAGreedy : public MachineFunctionPass,
     }
   }
 
+  // Eviction. Sometimes an assigned live range can be evicted without
+  // conditions, but other times it must be split after being evicted to avoid
+  // infinite loops.
+  enum CanEvict {
+    CE_Never,    ///< Can never evict.
+    CE_Always,   ///< Can always evict.
+    CE_WithSplit ///< Can evict only if range is also split or spilled.
+  };
+
   // splitting state.
   std::auto_ptr<SplitAnalysis> SA;
   std::auto_ptr<SplitEditor> SE;
@@ -143,10 +154,6 @@ class RAGreedy : public MachineFunctionPass,
   /// class.
   SmallVector<GlobalSplitCandidate, 32> GlobalCand;
 
-  /// For every instruction in SA->UseSlots, store the previous non-copy
-  /// instruction.
-  SmallVector<SlotIndex, 8> PrevSlot;
-
 public:
   RAGreedy();
 
@@ -183,9 +190,7 @@ private:
   void splitAroundRegion(LiveInterval&, GlobalSplitCandidate&,
                          SmallVectorImpl<LiveInterval*>&);
   void calcGapWeights(unsigned, SmallVectorImpl<float>&);
-  SlotIndex getPrevMappedIndex(const MachineInstr*);
-  void calcPrevSlots();
-  unsigned nextSplitPoint(unsigned);
+  CanEvict canEvict(LiveInterval &A, LiveInterval &B);
   bool canEvictInterference(LiveInterval&, unsigned, float&);
 
   unsigned tryAssign(LiveInterval&, AllocationOrder&,
@@ -203,6 +208,17 @@ private:
 
 char RAGreedy::ID = 0;
 
+#ifndef NDEBUG
+const char *const RAGreedy::StageName[] = {
+  "RS_New",
+  "RS_First",
+  "RS_Second",
+  "RS_Global",
+  "RS_Local",
+  "RS_Spill"
+};
+#endif
+
 // Hysteresis to use when comparing floats.
 // This helps stabilize decisions based on float comparisons.
 const float Hysteresis = 0.98f;
@@ -377,6 +393,20 @@ unsigned RAGreedy::tryAssign(LiveInterval &VirtReg,
 //                         Interference eviction
 //===----------------------------------------------------------------------===//
 
+/// canEvict - determine if A can evict the assigned live range B. The eviction
+/// policy defined by this function together with the allocation order defined
+/// by enqueue() decides which registers ultimately end up being split and
+/// spilled.
+///
+/// This function must define a non-circular relation when it returns CE_Always,
+/// otherwise infinite eviction loops are possible. When evicting a <= RS_Second
+/// range, it is possible to return CE_WithSplit which forces the evicted
+/// register to be split or spilled before it can evict anything again. That
+/// guarantees progress.
+RAGreedy::CanEvict RAGreedy::canEvict(LiveInterval &A, LiveInterval &B) {
+  return A.weight > B.weight ? CE_Always : CE_Never;
+}
+
 /// canEvict - Return true if all interferences between VirtReg and PhysReg can
 /// be evicted.
 /// Return false if any interference is heavier than MaxWeight.
@@ -397,6 +427,16 @@ bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, unsigned PhysReg,
         return false;
       if (Intf->weight >= MaxWeight)
         return false;
+      switch (canEvict(VirtReg, *Intf)) {
+      case CE_Always:
+        break;
+      case CE_Never:
+        return false;
+      case CE_WithSplit:
+        if (getStage(*Intf) > RS_Second)
+          return false;
+        break;
+      }
       Weight = std::max(Weight, Intf->weight);
     }
   }
@@ -415,7 +455,7 @@ unsigned RAGreedy::tryEvict(LiveInterval &VirtReg,
   NamedRegionTimer T("Evict", TimerGroupName, TimePassesIsEnabled);
 
   // Keep track of the lightest single interference seen so far.
-  float BestWeight = VirtReg.weight;
+  float BestWeight = HUGE_VALF;
   unsigned BestPhys = 0;
 
   Order.rewind();
@@ -456,6 +496,11 @@ unsigned RAGreedy::tryEvict(LiveInterval &VirtReg,
       unassign(*Intf, VRM->getPhys(Intf->reg));
       ++NumEvicted;
       NewVRegs.push_back(Intf);
+      // Prevent looping by forcing the evicted ranges to be split before they
+      // can evict anything else.
+      if (getStage(*Intf) < RS_Second &&
+          canEvict(VirtReg, *Intf) == CE_WithSplit)
+        LRStage[Intf->reg] = RS_Second;
     }
   }
   return BestPhys;
@@ -499,7 +544,7 @@ bool RAGreedy::addSplitConstraints(InterferenceCache::Cursor Intf,
         BC.Entry = SpillPlacement::MustSpill, ++Ins;
       else if (Intf.first() < BI.FirstUse)
         BC.Entry = SpillPlacement::PrefSpill, ++Ins;
-      else if (Intf.first() < (BI.LiveThrough ? BI.LastUse : BI.Kill))
+      else if (Intf.first() < BI.LastUse)
         ++Ins;
     }
 
@@ -509,7 +554,7 @@ bool RAGreedy::addSplitConstraints(InterferenceCache::Cursor Intf,
         BC.Exit = SpillPlacement::MustSpill, ++Ins;
       else if (Intf.last() > BI.LastUse)
         BC.Exit = SpillPlacement::PrefSpill, ++Ins;
-      else if (Intf.last() > (BI.LiveThrough ? BI.FirstUse : BI.Def))
+      else if (Intf.last() > BI.FirstUse)
         ++Ins;
     }
 
@@ -758,7 +803,7 @@ void RAGreedy::splitAroundRegion(LiveInterval &VirtReg,
       DEBUG(dbgs() << ", no interference");
       if (!BI.LiveThrough) {
         DEBUG(dbgs() << ", not live-through.\n");
-        SE->useIntv(SE->enterIntvBefore(BI.Def), Stop);
+        SE->useIntv(SE->enterIntvBefore(BI.FirstUse), Stop);
         continue;
       }
       if (!RegIn) {
@@ -775,10 +820,10 @@ void RAGreedy::splitAroundRegion(LiveInterval &VirtReg,
     // Block has interference.
     DEBUG(dbgs() << ", interference to " << Intf.last());
 
-    if (!BI.LiveThrough && Intf.last() <= BI.Def) {
+    if (!BI.LiveThrough && Intf.last() <= BI.FirstUse) {
       // The interference doesn't reach the outgoing segment.
-      DEBUG(dbgs() << " doesn't affect def from " << BI.Def << '\n');
-      SE->useIntv(BI.Def, Stop);
+      DEBUG(dbgs() << " doesn't affect def from " << BI.FirstUse << '\n');
+      SE->useIntv(BI.FirstUse, Stop);
       continue;
     }
 
@@ -834,7 +879,7 @@ void RAGreedy::splitAroundRegion(LiveInterval &VirtReg,
       DEBUG(dbgs() << ", no interference");
       if (!BI.LiveThrough) {
         DEBUG(dbgs() << ", killed in block.\n");
-        SE->useIntv(Start, SE->leaveIntvAfter(BI.Kill));
+        SE->useIntv(Start, SE->leaveIntvAfter(BI.LastUse));
         continue;
       }
       if (!RegOut) {
@@ -867,10 +912,10 @@ void RAGreedy::splitAroundRegion(LiveInterval &VirtReg,
     // Block has interference.
     DEBUG(dbgs() << ", interference from " << Intf.first());
 
-    if (!BI.LiveThrough && Intf.first() >= BI.Kill) {
+    if (!BI.LiveThrough && Intf.first() >= BI.LastUse) {
       // The interference doesn't reach the outgoing segment.
-      DEBUG(dbgs() << " doesn't affect kill at " << BI.Kill << '\n');
-      SE->useIntv(Start, BI.Kill);
+      DEBUG(dbgs() << " doesn't affect kill at " << BI.LastUse << '\n');
+      SE->useIntv(Start, BI.LastUse);
       continue;
     }
 
@@ -920,8 +965,10 @@ void RAGreedy::splitAroundRegion(LiveInterval &VirtReg,
 
   SmallVector<unsigned, 8> IntvMap;
   SE->finish(&IntvMap);
+  DebugVars->splitRegister(VirtReg.reg, LREdit.regs());
+
   LRStage.resize(MRI->getNumVirtRegs());
-  unsigned OrigBlocks = SA->getNumThroughBlocks() + SA->getUseBlocks().size();
+  unsigned OrigBlocks = SA->getNumLiveBlocks();
 
   // Sort out the new intervals created by splitting. We get four kinds:
   // - Remainder intervals should not be split again.
@@ -1083,47 +1130,6 @@ void RAGreedy::calcGapWeights(unsigned PhysReg,
   }
 }
 
-/// getPrevMappedIndex - Return the slot index of the last non-copy instruction
-/// before MI that has a slot index. If MI is the first mapped instruction in
-/// its block, return the block start index instead.
-///
-SlotIndex RAGreedy::getPrevMappedIndex(const MachineInstr *MI) {
-  assert(MI && "Missing MachineInstr");
-  const MachineBasicBlock *MBB = MI->getParent();
-  MachineBasicBlock::const_iterator B = MBB->begin(), I = MI;
-  while (I != B)
-    if (!(--I)->isDebugValue() && !I->isCopy())
-      return Indexes->getInstructionIndex(I);
-  return Indexes->getMBBStartIdx(MBB);
-}
-
-/// calcPrevSlots - Fill in the PrevSlot array with the index of the previous
-/// real non-copy instruction for each instruction in SA->UseSlots.
-///
-void RAGreedy::calcPrevSlots() {
-  const SmallVectorImpl<SlotIndex> &Uses = SA->UseSlots;
-  PrevSlot.clear();
-  PrevSlot.reserve(Uses.size());
-  for (unsigned i = 0, e = Uses.size(); i != e; ++i) {
-    const MachineInstr *MI = Indexes->getInstructionFromIndex(Uses[i]);
-    PrevSlot.push_back(getPrevMappedIndex(MI).getDefIndex());
-  }
-}
-
-/// nextSplitPoint - Find the next index into SA->UseSlots > i such that it may
-/// be beneficial to split before UseSlots[i].
-///
-/// 0 is always a valid split point
-unsigned RAGreedy::nextSplitPoint(unsigned i) {
-  const SmallVectorImpl<SlotIndex> &Uses = SA->UseSlots;
-  const unsigned Size = Uses.size();
-  assert(i != Size && "No split points after the end");
-  // Allow split before i when Uses[i] is not adjacent to the previous use.
-  while (++i != Size && PrevSlot[i].getBaseIndex() <= Uses[i-1].getBaseIndex())
-    ;
-  return i;
-}
-
 /// tryLocalSplit - Try to split VirtReg into smaller intervals inside its only
 /// basic block.
 ///
@@ -1151,11 +1157,27 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
     dbgs() << '\n';
   });
 
-  // For every use, find the previous mapped non-copy instruction.
-  // We use this to detect valid split points, and to estimate new interval
-  // sizes.
-  calcPrevSlots();
+  // Since we allow local split results to be split again, there is a risk of
+  // creating infinite loops. It is tempting to require that the new live
+  // ranges have less instructions than the original. That would guarantee
+  // convergence, but it is too strict. A live range with 3 instructions can be
+  // split 2+3 (including the COPY), and we want to allow that.
+  //
+  // Instead we use these rules:
+  //
+  // 1. Allow any split for ranges with getStage() < RS_Local. (Except for the
+  //    noop split, of course).
+  // 2. Require progress be made for ranges with getStage() >= RS_Local. All
+  //    the new ranges must have fewer instructions than before the split.
+  // 3. New ranges with the same number of instructions are marked RS_Local,
+  //    smaller ranges are marked RS_New.
+  //
+  // These rules allow a 3 -> 2+3 split once, which we need. They also prevent
+  // excessive splitting and infinite loops.
+  //
+  bool ProgressRequired = getStage(VirtReg) >= RS_Local;
 
+  // Best split candidate.
   unsigned BestBefore = NumGaps;
   unsigned BestAfter = 0;
   float BestDiff = 0;
@@ -1173,13 +1195,11 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
     // The new spill weight must be larger than any gap interference.
 
     // We will split before Uses[SplitBefore] and after Uses[SplitAfter].
-    unsigned SplitBefore = 0, SplitAfter = nextSplitPoint(1) - 1;
+    unsigned SplitBefore = 0, SplitAfter = 1;
 
     // MaxGap should always be max(GapWeight[SplitBefore..SplitAfter-1]).
     // It is the spill weight that needs to be evicted.
     float MaxGap = GapWeight[0];
-    for (unsigned i = 1; i != SplitAfter; ++i)
-      MaxGap = std::max(MaxGap, GapWeight[i]);
 
     for (;;) {
       // Live before/after split?
@@ -1197,32 +1217,22 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
       }
       // Should the interval be extended or shrunk?
       bool Shrink = true;
-      if (MaxGap < HUGE_VALF) {
-        // Estimate the new spill weight.
-        //
-        // Each instruction reads and writes the register, except the first
-        // instr doesn't read when !FirstLive, and the last instr doesn't write
-        // when !LastLive.
-        //
-        // We will be inserting copies before and after, so the total number of
-        // reads and writes is 2 * EstUses.
-        //
-        const unsigned EstUses = 2*(SplitAfter - SplitBefore) +
-                                 2*(LiveBefore + LiveAfter);
 
-        // Try to guess the size of the new interval. This should be trivial,
-        // but the slot index of an inserted copy can be a lot smaller than the
-        // instruction it is inserted before if there are many dead indexes
-        // between them.
-        //
-        // We measure the distance from the instruction before SplitBefore to
-        // get a conservative estimate.
-        //
-        // The final distance can still be different if inserting copies
-        // triggers a slot index renumbering.
+      // How many gaps would the new range have?
+      unsigned NewGaps = LiveBefore + SplitAfter - SplitBefore + LiveAfter;
+
+      // Legally, without causing looping?
+      bool Legal = !ProgressRequired || NewGaps < NumGaps;
+
+      if (Legal && MaxGap < HUGE_VALF) {
+        // Estimate the new spill weight. Each instruction reads or writes the
+        // register. Conservatively assume there are no read-modify-write
+        // instructions.
         //
-        const float EstWeight = normalizeSpillWeight(blockFreq * EstUses,
-                              PrevSlot[SplitBefore].distance(Uses[SplitAfter]));
+        // Try to guess the size of the new interval.
+        const float EstWeight = normalizeSpillWeight(blockFreq * (NewGaps + 1),
+                                 Uses[SplitBefore].distance(Uses[SplitAfter]) +
+                                 (LiveBefore + LiveAfter)*SlotIndex::InstrDist);
         // Would this split be possible to allocate?
         // Never allocate all gaps, we wouldn't be making progress.
         DEBUG(dbgs() << " w=" << EstWeight);
@@ -1240,8 +1250,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
 
       // Try to shrink.
       if (Shrink) {
-        SplitBefore = nextSplitPoint(SplitBefore);
-        if (SplitBefore < SplitAfter) {
+        if (++SplitBefore < SplitAfter) {
           DEBUG(dbgs() << " shrink\n");
           // Recompute the max when necessary.
           if (GapWeight[SplitBefore - 1] >= MaxGap) {
@@ -1261,10 +1270,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
       }
 
       DEBUG(dbgs() << " extend\n");
-      for (unsigned e = nextSplitPoint(SplitAfter + 1) - 1;
-           SplitAfter != e; ++SplitAfter)
-        MaxGap = std::max(MaxGap, GapWeight[SplitAfter]);
-          continue;
+      MaxGap = std::max(MaxGap, GapWeight[SplitAfter++]);
     }
   }
 
@@ -1283,8 +1289,27 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
   SlotIndex SegStart = SE->enterIntvBefore(Uses[BestBefore]);
   SlotIndex SegStop  = SE->leaveIntvAfter(Uses[BestAfter]);
   SE->useIntv(SegStart, SegStop);
-  SE->finish();
-  setStage(NewVRegs.begin(), NewVRegs.end(), RS_Local);
+  SmallVector<unsigned, 8> IntvMap;
+  SE->finish(&IntvMap);
+  DebugVars->splitRegister(VirtReg.reg, LREdit.regs());
+
+  // If the new range has the same number of instructions as before, mark it as
+  // RS_Local so the next split will be forced to make progress. Otherwise,
+  // leave the new intervals as RS_New so they can compete.
+  bool LiveBefore = BestBefore != 0 || BI.LiveIn;
+  bool LiveAfter = BestAfter != NumGaps || BI.LiveOut;
+  unsigned NewGaps = LiveBefore + BestAfter - BestBefore + LiveAfter;
+  if (NewGaps >= NumGaps) {
+    DEBUG(dbgs() << "Tagging non-progress ranges: ");
+    assert(!ProgressRequired && "Didn't make progress when it was required.");
+    LRStage.resize(MRI->getNumVirtRegs());
+    for (unsigned i = 0, e = IntvMap.size(); i != e; ++i)
+      if (IntvMap[i] == 1) {
+        LRStage[LREdit.get(i)->reg] = RS_Local;
+        DEBUG(dbgs() << PrintReg(LREdit.get(i)->reg));
+      }
+    DEBUG(dbgs() << '\n');
+  }
   ++NumLocalSplits;
 
   return 0;
@@ -1315,6 +1340,17 @@ unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order,
 
   SA->analyze(&VirtReg);
 
+  // FIXME: SplitAnalysis may repair broken live ranges coming from the
+  // coalescer. That may cause the range to become allocatable which means that
+  // tryRegionSplit won't be making progress. This check should be replaced with
+  // an assertion when the coalescer is fixed.
+  if (SA->didRepairRange()) {
+    // VirtReg has changed, so all cached queries are invalid.
+    invalidateVirtRegs();
+    if (unsigned PhysReg = tryAssign(VirtReg, Order, NewVRegs))
+      return PhysReg;
+  }
+
   // First try to split around a region spanning multiple blocks.
   unsigned PhysReg = tryRegionSplit(VirtReg, Order, NewVRegs);
   if (PhysReg || !NewVRegs.empty())
@@ -1343,19 +1379,25 @@ unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order,
 unsigned RAGreedy::selectOrSplit(LiveInterval &VirtReg,
                                  SmallVectorImpl<LiveInterval*> &NewVRegs) {
   // First try assigning a free register.
-  AllocationOrder Order(VirtReg.reg, *VRM, ReservedRegs);
+  AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo);
   if (unsigned PhysReg = tryAssign(VirtReg, Order, NewVRegs))
     return PhysReg;
 
-  if (unsigned PhysReg = tryEvict(VirtReg, Order, NewVRegs))
-    return PhysReg;
+  LiveRangeStage Stage = getStage(VirtReg);
+  DEBUG(dbgs() << StageName[Stage] << '\n');
+
+  // Try to evict a less worthy live range, but only for ranges from the primary
+  // queue. The RS_Second ranges already failed to do this, and they should not
+  // get a second chance until they have been split.
+  if (Stage != RS_Second)
+    if (unsigned PhysReg = tryEvict(VirtReg, Order, NewVRegs))
+      return PhysReg;
 
   assert(NewVRegs.empty() && "Cannot append to existing NewVRegs");
 
   // The first time we see a live range, don't try to split or spill.
   // Wait until the second time, when all smaller ranges have been allocated.
   // This gives a better picture of the interference to split around.
-  LiveRangeStage Stage = getStage(VirtReg);
   if (Stage == RS_First) {
     LRStage[VirtReg.reg] = RS_Second;
     DEBUG(dbgs() << "wait for second round\n");
@@ -1363,7 +1405,10 @@ unsigned RAGreedy::selectOrSplit(LiveInterval &VirtReg,
     return 0;
   }
 
-  assert(Stage < RS_Spill && "Cannot allocate after spilling");
+  // If we couldn't allocate a register from spilling, there is probably some
+  // invalid inline assembly. The base class wil report it.
+  if (Stage >= RS_Spill)
+    return ~0u;
 
   // Try splitting VirtReg or interferences.
   unsigned PhysReg = trySplit(VirtReg, Order, NewVRegs);
@@ -1396,12 +1441,12 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
   RegAllocBase::init(getAnalysis<VirtRegMap>(), getAnalysis<LiveIntervals>());
   Indexes = &getAnalysis<SlotIndexes>();
   DomTree = &getAnalysis<MachineDominatorTree>();
-  ReservedRegs = TRI->getReservedRegs(*MF);
   SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM));
   Loops = &getAnalysis<MachineLoopInfo>();
   LoopRanges = &getAnalysis<MachineLoopRanges>();
   Bundles = &getAnalysis<EdgeBundles>();
   SpillPlacer = &getAnalysis<SpillPlacement>();
+  DebugVars = &getAnalysis<LiveDebugVariables>();
 
   SA.reset(new SplitAnalysis(*VRM, *LIS, *Loops));
   SE.reset(new SplitEditor(*SA, *LIS, *VRM, *DomTree));
@@ -1420,7 +1465,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
   }
 
   // Write out new DBG_VALUE instructions.
-  getAnalysis<LiveDebugVariables>().emitDebugValues(VRM);
+  DebugVars->emitDebugValues(VRM);
 
   // The pass output is in VirtRegMap. Release all the transient data.
   releaseMemory();
diff --git a/lib/CodeGen/RegisterClassInfo.cpp b/lib/CodeGen/RegisterClassInfo.cpp
new file mode 100644
index 0000000..75b0c90
--- /dev/null
+++ b/lib/CodeGen/RegisterClassInfo.cpp
@@ -0,0 +1,114 @@
+//===-- RegisterClassInfo.cpp - Dynamic Register Class Info ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the RegisterClassInfo class which provides dynamic
+// information about target register classes. Callee saved and reserved
+// registers depends on calling conventions and other dynamic information, so
+// some things cannot be determined statically.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "regalloc"
+#include "RegisterClassInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Target/TargetMachine.h"
+
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+RegisterClassInfo::RegisterClassInfo() : Tag(0), MF(0), TRI(0), CalleeSaved(0)
+{}
+
+void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) {
+  bool Update = false;
+  MF = &mf;
+
+  // Allocate new array the first time we see a new target.
+  if (MF->getTarget().getRegisterInfo() != TRI) {
+    TRI = MF->getTarget().getRegisterInfo();
+    RegClass.reset(new RCInfo[TRI->getNumRegClasses()]);
+    Update = true;
+  }
+
+  // Does this MF have different CSRs?
+  const unsigned *CSR = TRI->getCalleeSavedRegs(MF);
+  if (Update || CSR != CalleeSaved) {
+    // Build a CSRNum map. Every CSR alias gets an entry pointing to the last
+    // overlapping CSR.
+    CSRNum.clear();
+    CSRNum.resize(TRI->getNumRegs(), 0);
+    for (unsigned N = 0; unsigned Reg = CSR[N]; ++N)
+      for (const unsigned *AS = TRI->getOverlaps(Reg);
+           unsigned Alias = *AS; ++AS)
+        CSRNum[Alias] = N + 1; // 0 means no CSR, 1 means CalleeSaved[0], ...
+    Update = true;
+  }
+  CalleeSaved = CSR;
+
+  // Different reserved registers?
+  BitVector RR = TRI->getReservedRegs(*MF);
+  if (RR != Reserved)
+    Update = true;
+  Reserved = RR;
+
+  // Invalidate cached information from previous function.
+  if (Update)
+    ++Tag;
+}
+
+/// compute - Compute the preferred allocation order for RC with reserved
+/// registers filtered out. Volatile registers come first followed by CSR
+/// aliases ordered according to the CSR order specified by the target.
+void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {
+  RCInfo &RCI = RegClass[RC->getID()];
+
+  // Raw register count, including all reserved regs.
+  unsigned NumRegs = RC->getNumRegs();
+
+  if (!RCI.Order)
+    RCI.Order.reset(new unsigned[NumRegs]);
+
+  unsigned N = 0;
+  SmallVector<unsigned, 16> CSRAlias;
+
+  // FIXME: Once targets reserve registers instead of removing them from the
+  // allocation order, we can simply use begin/end here.
+  TargetRegisterClass::iterator AOB = RC->allocation_order_begin(*MF);
+  TargetRegisterClass::iterator AOE = RC->allocation_order_end(*MF);
+
+  for (TargetRegisterClass::iterator I = AOB; I != AOE; ++I) {
+    unsigned PhysReg = *I;
+    // Remove reserved registers from the allocation order.
+    if (Reserved.test(PhysReg))
+      continue;
+    if (CSRNum[PhysReg])
+      // PhysReg aliases a CSR, save it for later.
+      CSRAlias.push_back(PhysReg);
+    else
+      RCI.Order[N++] = PhysReg;
+  }
+  RCI.NumRegs = N + CSRAlias.size();
+  assert (RCI.NumRegs <= NumRegs && "Allocation order larger than regclass");
+
+  // CSR aliases go after the volatile registers, preserve the target's order.
+  std::copy(CSRAlias.begin(), CSRAlias.end(), &RCI.Order[N]);
+
+  DEBUG({
+    dbgs() << "AllocationOrder(" << RC->getName() << ") = [";
+    for (unsigned I = 0; I != N; ++I)
+      dbgs() << ' ' << PrintReg(RCI.Order[I], TRI);
+    dbgs() << " ]\n";
+  });
+
+  // RCI is now up-to-date.
+  RCI.Tag = Tag;
+}
+
diff --git a/lib/CodeGen/RegisterClassInfo.h b/lib/CodeGen/RegisterClassInfo.h
new file mode 100644
index 0000000..6f7d9c9
--- /dev/null
+++ b/lib/CodeGen/RegisterClassInfo.h
@@ -0,0 +1,121 @@
+//===-- RegisterClassInfo.h - Dynamic Register Class Info -*- C++ -*-------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the RegisterClassInfo class which provides dynamic
+// information about target register classes. Callee saved and reserved
+// registers depends on calling conventions and other dynamic information, so
+// some things cannot be determined statically.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_REGISTERCLASSINFO_H
+#define LLVM_CODEGEN_REGISTERCLASSINFO_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+namespace llvm {
+
+class RegisterClassInfo {
+  struct RCInfo {
+    unsigned Tag;
+    unsigned NumRegs;
+    OwningArrayPtr<unsigned> Order;
+
+    RCInfo() : Tag(0), NumRegs(0) {}
+    operator ArrayRef<unsigned>() const {
+      return ArrayRef<unsigned>(Order.get(), NumRegs);
+    }
+  };
+
+  // Brief cached information for each register class.
+  OwningArrayPtr<RCInfo> RegClass;
+
+  // Tag changes whenever cached information needs to be recomputed. An RCInfo
+  // entry is valid when its tag matches.
+  unsigned Tag;
+
+  const MachineFunction *MF;
+  const TargetRegisterInfo *TRI;
+
+  // Callee saved registers of last MF. Assumed to be valid until the next
+  // runOnFunction() call.
+  const unsigned *CalleeSaved;
+
+  // Map register number to CalleeSaved index + 1;
+  SmallVector<uint8_t, 4> CSRNum;
+
+  // Reserved registers in the current MF.
+  BitVector Reserved;
+
+  // Compute all information about RC.
+  void compute(const TargetRegisterClass *RC) const;
+
+  // Return an up-to-date RCInfo for RC.
+  const RCInfo &get(const TargetRegisterClass *RC) const {
+    const RCInfo &RCI = RegClass[RC->getID()];
+    if (Tag != RCI.Tag)
+      compute(RC);
+    return RCI;
+  }
+
+public:
+  RegisterClassInfo();
+
+  /// runOnFunction - Prepare to answer questions about MF. This must be called
+  /// before any other methods are used.
+  void runOnMachineFunction(const MachineFunction &MF);
+
+  /// getNumAllocatableRegs - Returns the number of actually allocatable
+  /// registers in RC in the current function.
+  unsigned getNumAllocatableRegs(const TargetRegisterClass *RC) const {
+    return get(RC).NumRegs;
+  }
+
+  /// getOrder - Returns the preferred allocation order for RC. The order
+  /// contains no reserved registers, and registers that alias callee saved
+  /// registers come last.
+  ArrayRef<unsigned> getOrder(const TargetRegisterClass *RC) const {
+    return get(RC);
+  }
+
+  /// getLastCalleeSavedAlias - Returns the last callee saved register that
+  /// overlaps PhysReg, or 0 if Reg doesn't overlap a CSR.
+  unsigned getLastCalleeSavedAlias(unsigned PhysReg) const {
+    assert(TargetRegisterInfo::isPhysicalRegister(PhysReg));
+    if (unsigned N = CSRNum[PhysReg])
+      return CalleeSaved[N-1];
+    return 0;
+  }
+
+  /// isReserved - Returns true when PhysReg is a reserved register.
+  ///
+  /// Reserved registers may belong to an allocatable register class, but the
+  /// target has explicitly requested that they are not used.
+  ///
+  bool isReserved(unsigned PhysReg) const {
+    return Reserved.test(PhysReg);
+  }
+
+  /// isAllocatable - Returns true when PhysReg belongs to an allocatable
+  /// register class and it hasn't been reserved.
+  ///
+  /// Allocatable registers may show up in the allocation order of some virtual
+  /// register, so a register allocator needs to track its liveness and
+  /// availability.
+  bool isAllocatable(unsigned PhysReg) const {
+    return TRI->get(PhysReg).inAllocatableClass && !isReserved(PhysReg);
+  }
+};
+} // end namespace llvm
+
+#endif
+
diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp
index ebfe533..9e9a145 100644
--- a/lib/CodeGen/RegisterScavenging.cpp
+++ b/lib/CodeGen/RegisterScavenging.cpp
@@ -154,13 +154,16 @@ void RegScavenger::forward() {
   BitVector DeadRegs(NumPhysRegs);
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = MI->getOperand(i);
-    if (!MO.isReg() || MO.isUndef())
+    if (!MO.isReg())
       continue;
     unsigned Reg = MO.getReg();
     if (!Reg || isReserved(Reg))
       continue;
 
     if (MO.isUse()) {
+      // Ignore undef uses.
+      if (MO.isUndef())
+        continue;
       // Two-address operands implicitly kill.
       if (!isPred && (MO.isKill() || MI->isRegTiedToDefOperand(i)))
         addRegWithSubRegs(KillRegs, Reg);
@@ -178,12 +181,14 @@ void RegScavenger::forward() {
   // Verify uses and defs.
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = MI->getOperand(i);
-    if (!MO.isReg() || MO.isUndef())
+    if (!MO.isReg())
       continue;
     unsigned Reg = MO.getReg();
     if (!Reg || isReserved(Reg))
       continue;
     if (MO.isUse()) {
+      if (MO.isUndef())
+        continue;
       if (!isUsed(Reg)) {
         // Check if it's partial live: e.g.
         // D0 = insert_subreg D0<undef>, S0
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index 67c209e..2363df4 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -35,8 +35,9 @@ ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
                                      const MachineDominatorTree &mdt)
   : ScheduleDAG(mf), MLI(mli), MDT(mdt), MFI(mf.getFrameInfo()),
     InstrItins(mf.getTarget().getInstrItineraryData()),
-    Defs(TRI->getNumRegs()), Uses(TRI->getNumRegs()), LoopRegs(MLI, MDT) {
-  DbgValueVec.clear();
+    Defs(TRI->getNumRegs()), Uses(TRI->getNumRegs()), 
+    LoopRegs(MLI, MDT), FirstDbgValue(0) {
+  DbgValues.clear();
 }
 
 /// Run - perform scheduling.
@@ -120,7 +121,7 @@ static const Value *getUnderlyingObjectForInstr(const MachineInstr *MI,
     // such aliases.
     if (PSV->isAliased(MFI))
       return 0;
-    
+
     MayAlias = PSV->mayAlias(MFI);
     return V;
   }
@@ -174,7 +175,7 @@ void ScheduleDAGInstrs::AddSchedBarrierDeps() {
     for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
            SE = BB->succ_end(); SI != SE; ++SI)
       for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(),
-             E = (*SI)->livein_end(); I != E; ++I) {    
+             E = (*SI)->livein_end(); I != E; ++I) {
         unsigned Reg = *I;
         if (Seen.insert(Reg))
           Uses[Reg].push_back(&ExitSU);
@@ -200,11 +201,6 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) {
   std::map<const Value *, SUnit *> AliasMemDefs, NonAliasMemDefs;
   std::map<const Value *, std::vector<SUnit *> > AliasMemUses, NonAliasMemUses;
 
-  // Keep track of dangling debug references to registers.
-  std::vector<std::pair<MachineInstr*, unsigned> >
-    DanglingDebugValue(TRI->getNumRegs(),
-    std::make_pair(static_cast<MachineInstr*>(0), 0));
-
   // Check to see if the scheduler cares about latencies.
   bool UnitLatencies = ForceUnitLatencies();
 
@@ -214,26 +210,32 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) {
 
   // Remove any stale debug info; sometimes BuildSchedGraph is called again
   // without emitting the info from the previous call.
-  DbgValueVec.clear();
+  DbgValues.clear();
+  FirstDbgValue = NULL;
 
   // Model data dependencies between instructions being scheduled and the
   // ExitSU.
   AddSchedBarrierDeps();
 
+  for (int i = 0, e = TRI->getNumRegs(); i != e; ++i) {
+    assert(Defs[i].empty() && "Only BuildGraph should push/pop Defs");
+  }
+
   // Walk the list of instructions, from bottom moving up.
+  MachineInstr *PrevMI = NULL;
   for (MachineBasicBlock::iterator MII = InsertPos, MIE = Begin;
        MII != MIE; --MII) {
     MachineInstr *MI = prior(MII);
-    // DBG_VALUE does not have SUnit's built, so just remember these for later
-    // reinsertion.
+    if (MI && PrevMI) {
+      DbgValues.push_back(std::make_pair(PrevMI, MI));
+      PrevMI = NULL;
+    }
+
     if (MI->isDebugValue()) {
-      if (MI->getNumOperands()==3 && MI->getOperand(0).isReg() &&
-          MI->getOperand(0).getReg())
-        DanglingDebugValue[MI->getOperand(0).getReg()] =
-             std::make_pair(MI, DbgValueVec.size());
-      DbgValueVec.push_back(MI);
+      PrevMI = MI;
       continue;
     }
+
     const TargetInstrDesc &TID = MI->getDesc();
     assert(!TID.isTerminator() && !MI->isLabel() &&
            "Cannot schedule terminators or labels!");
@@ -257,13 +259,8 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) {
 
       assert(TRI->isPhysicalRegister(Reg) && "Virtual register encountered!");
 
-      if (MO.isDef() && DanglingDebugValue[Reg].first!=0) {
-        SU->DbgInstrList.push_back(DanglingDebugValue[Reg].first);
-        DbgValueVec[DanglingDebugValue[Reg].second] = 0;
-        DanglingDebugValue[Reg] = std::make_pair((MachineInstr*)0, 0);
-      }
-
       std::vector<SUnit *> &UseList = Uses[Reg];
+      // Defs are push in the order they are visited and never reordered.
       std::vector<SUnit *> &DefList = Defs[Reg];
       // Optionally add output and anti dependencies. For anti
       // dependencies we use a latency of 0 because for a multi-issue
@@ -283,9 +280,9 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) {
           DefSU->addPred(SDep(SU, Kind, AOLatency, /*Reg=*/Reg));
       }
       for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) {
-        std::vector<SUnit *> &DefList = Defs[*Alias];
-        for (unsigned i = 0, e = DefList.size(); i != e; ++i) {
-          SUnit *DefSU = DefList[i];
+        std::vector<SUnit *> &MemDefList = Defs[*Alias];
+        for (unsigned i = 0, e = MemDefList.size(); i != e; ++i) {
+          SUnit *DefSU = MemDefList[i];
           if (DefSU == &ExitSU)
             continue;
           if (DefSU != SU &&
@@ -393,6 +390,16 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) {
         UseList.clear();
         if (!MO.isDead())
           DefList.clear();
+
+        // Calls will not be reordered because of chain dependencies (see
+        // below). Since call operands are dead, calls may continue to be added
+        // to the DefList making dependence checking quadratic in the size of
+        // the block. Instead, we leave only one call at the back of the
+        // DefList.
+        if (SU->isCall) {
+          while (!DefList.empty() && DefList.back()->isCall)
+            DefList.pop_back();
+        }
         DefList.push_back(SU);
       } else {
         UseList.push_back(SU);
@@ -411,11 +418,11 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) {
 #define STORE_LOAD_LATENCY 1
     unsigned TrueMemOrderLatency = 0;
     if (TID.isCall() || MI->hasUnmodeledSideEffects() ||
-        (MI->hasVolatileMemoryRef() && 
+        (MI->hasVolatileMemoryRef() &&
          (!TID.mayLoad() || !MI->isInvariantLoad(AA)))) {
       // Be conservative with these and add dependencies on all memory
       // references, even those that are known to not alias.
-      for (std::map<const Value *, SUnit *>::iterator I = 
+      for (std::map<const Value *, SUnit *>::iterator I =
              NonAliasMemDefs.begin(), E = NonAliasMemDefs.end(); I != E; ++I) {
         I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
       }
@@ -458,9 +465,9 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) {
         // A store to a specific PseudoSourceValue. Add precise dependencies.
         // Record the def in MemDefs, first adding a dep if there is
         // an existing def.
-        std::map<const Value *, SUnit *>::iterator I = 
+        std::map<const Value *, SUnit *>::iterator I =
           ((MayAlias) ? AliasMemDefs.find(V) : NonAliasMemDefs.find(V));
-        std::map<const Value *, SUnit *>::iterator IE = 
+        std::map<const Value *, SUnit *>::iterator IE =
           ((MayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end());
         if (I != IE) {
           I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0, /*Reg=*/0,
@@ -513,39 +520,41 @@ void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) {
       if (MI->isInvariantLoad(AA)) {
         // Invariant load, no chain dependencies needed!
       } else {
-        if (const Value *V = 
+        if (const Value *V =
             getUnderlyingObjectForInstr(MI, MFI, MayAlias)) {
           // A load from a specific PseudoSourceValue. Add precise dependencies.
-          std::map<const Value *, SUnit *>::iterator I = 
+          std::map<const Value *, SUnit *>::iterator I =
             ((MayAlias) ? AliasMemDefs.find(V) : NonAliasMemDefs.find(V));
-          std::map<const Value *, SUnit *>::iterator IE = 
+          std::map<const Value *, SUnit *>::iterator IE =
             ((MayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end());
           if (I != IE)
             I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0, /*Reg=*/0,
                                     /*isNormalMemory=*/true));
           if (MayAlias)
             AliasMemUses[V].push_back(SU);
-          else 
+          else
             NonAliasMemUses[V].push_back(SU);
         } else {
           // A load with no underlying object. Depend on all
           // potentially aliasing stores.
-          for (std::map<const Value *, SUnit *>::iterator I = 
+          for (std::map<const Value *, SUnit *>::iterator I =
                  AliasMemDefs.begin(), E = AliasMemDefs.end(); I != E; ++I)
             I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
-          
+
           PendingLoads.push_back(SU);
           MayAlias = true;
         }
-        
+
         // Add dependencies on alias and barrier chains, if needed.
         if (MayAlias && AliasChain)
           AliasChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
         if (BarrierChain)
           BarrierChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
-      } 
+      }
     }
   }
+  if (PrevMI)
+    FirstDbgValue = PrevMI;
 
   for (int i = 0, e = TRI->getNumRegs(); i != e; ++i) {
     Defs[i].clear();
@@ -572,11 +581,11 @@ void ScheduleDAGInstrs::ComputeLatency(SUnit *SU) {
   }
 }
 
-void ScheduleDAGInstrs::ComputeOperandLatency(SUnit *Def, SUnit *Use, 
+void ScheduleDAGInstrs::ComputeOperandLatency(SUnit *Def, SUnit *Use,
                                               SDep& dep) const {
   if (!InstrItins || InstrItins->isEmpty())
     return;
-  
+
   // For a data dependency with a known register...
   if ((dep.getKind() != SDep::Data) || (dep.getReg() == 0))
     return;
@@ -655,39 +664,33 @@ MachineBasicBlock *ScheduleDAGInstrs::EmitSchedule() {
     BB->remove(I);
   }
 
-  // First reinsert any remaining debug_values; these are either constants,
-  // or refer to live-in registers.  The beginning of the block is the right
-  // place for the latter.  The former might reasonably be placed elsewhere
-  // using some kind of ordering algorithm, but right now it doesn't matter.
-  for (int i = DbgValueVec.size()-1; i>=0; --i)
-    if (DbgValueVec[i])
-      BB->insert(InsertPos, DbgValueVec[i]);
+  // If first instruction was a DBG_VALUE then put it back.
+  if (FirstDbgValue)
+    BB->insert(InsertPos, FirstDbgValue);
 
   // Then re-insert them according to the given schedule.
   for (unsigned i = 0, e = Sequence.size(); i != e; i++) {
-    SUnit *SU = Sequence[i];
-    if (!SU) {
+    if (SUnit *SU = Sequence[i])
+      BB->insert(InsertPos, SU->getInstr());
+    else
       // Null SUnit* is a noop.
       EmitNoop();
-      continue;
-    }
-
-    BB->insert(InsertPos, SU->getInstr());
-    for (unsigned i = 0, e = SU->DbgInstrList.size() ; i < e ; ++i)
-      BB->insert(InsertPos, SU->DbgInstrList[i]);
   }
 
   // Update the Begin iterator, as the first instruction in the block
   // may have been scheduled later.
-  if (!DbgValueVec.empty()) {
-    for (int i = DbgValueVec.size()-1; i>=0; --i)
-      if (DbgValueVec[i]!=0) {
-        Begin = DbgValueVec[DbgValueVec.size()-1];
-        break;
-      }
-  } else if (!Sequence.empty())
+  if (!Sequence.empty())
     Begin = Sequence[0]->getInstr();
 
-  DbgValueVec.clear();
+  // Reinsert any remaining debug_values.
+  for (std::vector<std::pair<MachineInstr *, MachineInstr *> >::iterator
+         DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) {
+    std::pair<MachineInstr *, MachineInstr *> P = *prior(DI);
+    MachineInstr *DbgValue = P.first;
+    MachineInstr *OrigPrivMI = P.second;
+    BB->insertAfter(OrigPrivMI, DbgValue);
+  }
+  DbgValues.clear();
+  FirstDbgValue = NULL;
   return BB;
 }
diff --git a/lib/CodeGen/ScheduleDAGInstrs.h b/lib/CodeGen/ScheduleDAGInstrs.h
index c878287..8a4ea85 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.h
+++ b/lib/CodeGen/ScheduleDAGInstrs.h
@@ -110,10 +110,6 @@ namespace llvm {
     std::vector<std::vector<SUnit *> > Defs;
     std::vector<std::vector<SUnit *> > Uses;
  
-    /// DbgValueVec - Remember DBG_VALUEs that refer to a particular
-    /// register.
-    std::vector<MachineInstr *>DbgValueVec;
-
     /// PendingLoads - Remember where unknown loads are after the most recent
     /// unknown store, as we iterate. As with Defs and Uses, this is here
     /// to minimize construction/destruction.
@@ -128,6 +124,14 @@ namespace llvm {
     ///
     SmallSet<unsigned, 8> LoopLiveInRegs;
 
+  protected:
+
+    /// DbgValues - Remember instruction that preceeds DBG_VALUE.
+    typedef std::vector<std::pair<MachineInstr *, MachineInstr *> > 
+      DbgValueVector;
+    DbgValueVector DbgValues;
+    MachineInstr *FirstDbgValue;
+
   public:
     MachineBasicBlock::iterator Begin;    // The beginning of the range to
                                           // be scheduled. The range extends
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index f427511..e843f5f 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -165,6 +165,8 @@ namespace {
     SDValue visitMULHS(SDNode *N);
     SDValue visitSMUL_LOHI(SDNode *N);
     SDValue visitUMUL_LOHI(SDNode *N);
+    SDValue visitSMULO(SDNode *N);
+    SDValue visitUMULO(SDNode *N);
     SDValue visitSDIVREM(SDNode *N);
     SDValue visitUDIVREM(SDNode *N);
     SDValue visitAND(SDNode *N);
@@ -529,7 +531,8 @@ SDValue DAGCombiner::ReassociateOps(unsigned Opc, DebugLoc DL,
                                    cast<ConstantSDNode>(N0.getOperand(1)),
                                    cast<ConstantSDNode>(N1));
       return DAG.getNode(Opc, DL, VT, N0.getOperand(0), OpNode);
-    } else if (N0.hasOneUse()) {
+    }
+    if (N0.hasOneUse()) {
       // reassoc. (op (op x, c1), y) -> (op (op x, y), c1) iff x+c1 has one use
       SDValue OpNode = DAG.getNode(Opc, N0.getDebugLoc(), VT,
                                    N0.getOperand(0), N1);
@@ -546,7 +549,8 @@ SDValue DAGCombiner::ReassociateOps(unsigned Opc, DebugLoc DL,
                                    cast<ConstantSDNode>(N1.getOperand(1)),
                                    cast<ConstantSDNode>(N0));
       return DAG.getNode(Opc, DL, VT, N1.getOperand(0), OpNode);
-    } else if (N1.hasOneUse()) {
+    }
+    if (N1.hasOneUse()) {
       // reassoc. (op y, (op x, c1)) -> (op (op x, y), c1) iff x+c1 has one use
       SDValue OpNode = DAG.getNode(Opc, N0.getDebugLoc(), VT,
                                    N1.getOperand(0), N0);
@@ -990,6 +994,9 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
           dbgs() << "\nWith: ";
           RV.getNode()->dump(&DAG);
           dbgs() << '\n');
+    
+    // Transfer debug value.
+    DAG.TransferDbgValues(SDValue(N, 0), RV);
     WorkListRemover DeadNodes(*this);
     if (N->getNumValues() == RV.getNode()->getNumValues())
       DAG.ReplaceAllUsesWith(N, RV.getNode(), &DeadNodes);
@@ -1045,6 +1052,8 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::MULHS:              return visitMULHS(N);
   case ISD::SMUL_LOHI:          return visitSMUL_LOHI(N);
   case ISD::UMUL_LOHI:          return visitUMUL_LOHI(N);
+  case ISD::SMULO:              return visitSMULO(N);
+  case ISD::UMULO:              return visitUMULO(N);
   case ISD::SDIVREM:            return visitSDIVREM(N);
   case ISD::UDIVREM:            return visitUDIVREM(N);
   case ISD::AND:                return visitAND(N);
@@ -1566,7 +1575,8 @@ static SDValue tryFoldToZero(DebugLoc DL, const TargetLowering &TLI, EVT VT,
                              SelectionDAG &DAG, bool LegalOperations) {
   if (!VT.isVector()) {
     return DAG.getConstant(0, VT);
-  } else if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) {
+  }
+  if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) {
     // Produce a vector of zeros.
     SDValue El = DAG.getConstant(0, VT.getVectorElementType());
     std::vector<SDValue> Ops(VT.getVectorNumElements(), El);
@@ -2174,6 +2184,26 @@ SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) {
   return SDValue();
 }
 
+SDValue DAGCombiner::visitSMULO(SDNode *N) {
+  // (smulo x, 2) -> (saddo x, x)
+  if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+    if (C2->getAPIntValue() == 2)
+      return DAG.getNode(ISD::SADDO, N->getDebugLoc(), N->getVTList(),
+                         N->getOperand(0), N->getOperand(0));
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitUMULO(SDNode *N) {
+  // (umulo x, 2) -> (uaddo x, x)
+  if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+    if (C2->getAPIntValue() == 2)
+      return DAG.getNode(ISD::UADDO, N->getDebugLoc(), N->getVTList(),
+                         N->getOperand(0), N->getOperand(0));
+
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitSDIVREM(SDNode *N) {
   SDValue Res = SimplifyNodeWithTwoResults(N, ISD::SDIV, ISD::SREM);
   if (Res.getNode()) return Res;
@@ -3062,26 +3092,27 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
     }
   }
 
-  // fold (shl (srl x, c1), c2) -> (shl (and x, (shl -1, c1)), (sub c2, c1)) or
-  //                               (srl (and x, (shl -1, c1)), (sub c1, c2))
+  // fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1), MASK) or
+  //                               (and (srl x, (sub c1, c2), MASK)
   if (N1C && N0.getOpcode() == ISD::SRL &&
       N0.getOperand(1).getOpcode() == ISD::Constant) {
     uint64_t c1 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
     if (c1 < VT.getSizeInBits()) {
       uint64_t c2 = N1C->getZExtValue();
-      SDValue HiBitsMask =
-        DAG.getConstant(APInt::getHighBitsSet(VT.getSizeInBits(),
-                                              VT.getSizeInBits() - c1),
-                        VT);
-      SDValue Mask = DAG.getNode(ISD::AND, N0.getDebugLoc(), VT,
-                                 N0.getOperand(0),
-                                 HiBitsMask);
-      if (c2 > c1)
-        return DAG.getNode(ISD::SHL, N->getDebugLoc(), VT, Mask,
-                           DAG.getConstant(c2-c1, N1.getValueType()));
-      else
-        return DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, Mask,
-                           DAG.getConstant(c1-c2, N1.getValueType()));
+      APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
+                                         VT.getSizeInBits() - c1);
+      SDValue Shift;
+      if (c2 > c1) {
+        Mask = Mask.shl(c2-c1);
+        Shift = DAG.getNode(ISD::SHL, N->getDebugLoc(), VT, N0.getOperand(0),
+                            DAG.getConstant(c2-c1, N1.getValueType()));
+      } else {
+        Mask = Mask.lshr(c1-c2);
+        Shift = DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, N0.getOperand(0),
+                            DAG.getConstant(c1-c2, N1.getValueType()));
+      }
+      return DAG.getNode(ISD::AND, N0.getDebugLoc(), VT, Shift,
+                         DAG.getConstant(Mask, VT));
     }
   }
   // fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1))
@@ -4014,7 +4045,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
       EVT EltVT = VT.getVectorElementType();
       SmallVector<SDValue,8> OneOps(VT.getVectorNumElements(),
                                     DAG.getConstant(1, EltVT));
-      if (VT.getSizeInBits() == N0VT.getSizeInBits()) {
+      if (VT.getSizeInBits() == N0VT.getSizeInBits())
         // We know that the # elements of the results is the same as the
         // # elements of the compare (and the # elements of the compare result
         // for that matter).  Check to see that they are the same size.  If so,
@@ -4026,25 +4057,24 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
                                  cast<CondCodeSDNode>(N0.getOperand(2))->get()),
                            DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), VT,
                                        &OneOps[0], OneOps.size()));
-      } else {
-        // If the desired elements are smaller or larger than the source
-        // elements we can use a matching integer vector type and then
-        // truncate/sign extend
-        EVT MatchingElementType =
-          EVT::getIntegerVT(*DAG.getContext(),
-                            N0VT.getScalarType().getSizeInBits());
-        EVT MatchingVectorType =
-          EVT::getVectorVT(*DAG.getContext(), MatchingElementType,
-                           N0VT.getVectorNumElements());
-        SDValue VsetCC =
-          DAG.getVSetCC(N->getDebugLoc(), MatchingVectorType, N0.getOperand(0),
-                        N0.getOperand(1),
-                        cast<CondCodeSDNode>(N0.getOperand(2))->get());
-        return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
-                           DAG.getSExtOrTrunc(VsetCC, N->getDebugLoc(), VT),
-                           DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), VT,
-                                       &OneOps[0], OneOps.size()));
-      }
+
+      // If the desired elements are smaller or larger than the source
+      // elements we can use a matching integer vector type and then
+      // truncate/sign extend
+      EVT MatchingElementType =
+        EVT::getIntegerVT(*DAG.getContext(),
+                          N0VT.getScalarType().getSizeInBits());
+      EVT MatchingVectorType =
+        EVT::getVectorVT(*DAG.getContext(), MatchingElementType,
+                         N0VT.getVectorNumElements());
+      SDValue VsetCC =
+        DAG.getVSetCC(N->getDebugLoc(), MatchingVectorType, N0.getOperand(0),
+                      N0.getOperand(1),
+                      cast<CondCodeSDNode>(N0.getOperand(2))->get());
+      return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+                         DAG.getSExtOrTrunc(VsetCC, N->getDebugLoc(), VT),
+                         DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), VT,
+                                     &OneOps[0], OneOps.size()));
     }
 
     // zext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
@@ -6494,18 +6524,18 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
   // (vextract (scalar_to_vector val, 0) -> val
   SDValue InVec = N->getOperand(0);
 
- if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
-   // Check if the result type doesn't match the inserted element type. A
-   // SCALAR_TO_VECTOR may truncate the inserted element and the
-   // EXTRACT_VECTOR_ELT may widen the extracted vector.
-   SDValue InOp = InVec.getOperand(0);
-   EVT NVT = N->getValueType(0);
-   if (InOp.getValueType() != NVT) {
-     assert(InOp.getValueType().isInteger() && NVT.isInteger());
-     return DAG.getSExtOrTrunc(InOp, InVec.getDebugLoc(), NVT);
-   }
-   return InOp;
- }
+  if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+    // Check if the result type doesn't match the inserted element type. A
+    // SCALAR_TO_VECTOR may truncate the inserted element and the
+    // EXTRACT_VECTOR_ELT may widen the extracted vector.
+    SDValue InOp = InVec.getOperand(0);
+    EVT NVT = N->getValueType(0);
+    if (InOp.getValueType() != NVT) {
+      assert(InOp.getValueType().isInteger() && NVT.isInteger());
+      return DAG.getSExtOrTrunc(InOp, InVec.getDebugLoc(), NVT);
+    }
+    return InOp;
+  }
 
   // Perform only after legalization to ensure build_vector / vector_shuffle
   // optimizations have already been done.
@@ -6566,7 +6596,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
       }
     }
 
-    if (!LN0 || !LN0->hasOneUse() || LN0->isVolatile())
+    if (!LN0 || !LN0->hasNUsesOfValue(1,0) || LN0->isVolatile())
       return SDValue();
 
     // If Idx was -1 above, Elt is going to be -1, so just return undef.
@@ -7505,18 +7535,17 @@ bool DAGCombiner::FindAliasInfo(SDNode *N,
     SrcValueAlign = LD->getOriginalAlignment();
     TBAAInfo = LD->getTBAAInfo();
     return true;
-  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+  }
+  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
     Ptr = ST->getBasePtr();
     Size = ST->getMemoryVT().getSizeInBits() >> 3;
     SrcValue = ST->getSrcValue();
     SrcValueOffset = ST->getSrcValueOffset();
     SrcValueAlign = ST->getOriginalAlignment();
     TBAAInfo = ST->getTBAAInfo();
-  } else {
-    llvm_unreachable("FindAliasInfo expected a memory operand");
+    return false;
   }
-
-  return false;
+  llvm_unreachable("FindAliasInfo expected a memory operand");
 }
 
 /// GatherAllAliases - Walk up chain skipping non-aliasing memory nodes,
@@ -7629,13 +7658,13 @@ SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) {
   // Accumulate all the aliases to this node.
   GatherAllAliases(N, OldChain, Aliases);
 
-  if (Aliases.size() == 0) {
-    // If no operands then chain to entry token.
+  // If no operands then chain to entry token.
+  if (Aliases.size() == 0)
     return DAG.getEntryNode();
-  } else if (Aliases.size() == 1) {
-    // If a single operand then chain to it.  We don't need to revisit it.
+
+  // If a single operand then chain to it.  We don't need to revisit it.
+  if (Aliases.size() == 1)
     return Aliases[0];
-  }
 
   // Construct a custom tailored token factor.
   return DAG.getNode(ISD::TokenFactor, N->getDebugLoc(), MVT::Other,
diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp
index 3af9482..797f174 100644
--- a/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -44,6 +44,7 @@
 #include "llvm/Instructions.h"
 #include "llvm/IntrinsicInst.h"
 #include "llvm/Operator.h"
+#include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/FastISel.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -110,8 +111,8 @@ unsigned FastISel::getRegForValue(const Value *V) {
   // of whether FastISel can handle them.
   MVT VT = RealVT.getSimpleVT();
   if (!TLI.isTypeLegal(VT)) {
-    // Promote MVT::i1 to a legal type though, because it's common and easy.
-    if (VT == MVT::i1)
+    // Handle integer promotions, though, because they're common and easy.
+    if (VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16)
       VT = TLI.getTypeToTransformTo(V->getContext(), VT).getSimpleVT();
     else
       return 0;
@@ -234,10 +235,10 @@ unsigned FastISel::lookUpRegForValue(const Value *V) {
 /// NOTE: This is only necessary because we might select a block that uses
 /// a value before we select the block that defines the value.  It might be
 /// possible to fix this by selecting blocks in reverse postorder.
-unsigned FastISel::UpdateValueMap(const Value *I, unsigned Reg) {
+void FastISel::UpdateValueMap(const Value *I, unsigned Reg, unsigned NumRegs) {
   if (!isa<Instruction>(I)) {
     LocalValueMap[I] = Reg;
-    return Reg;
+    return;
   }
 
   unsigned &AssignedReg = FuncInfo.ValueMap[I];
@@ -246,12 +247,11 @@ unsigned FastISel::UpdateValueMap(const Value *I, unsigned Reg) {
     AssignedReg = Reg;
   else if (Reg != AssignedReg) {
     // Arrange for uses of AssignedReg to be replaced by uses of Reg.
-    FuncInfo.RegFixups[AssignedReg] = Reg;
+    for (unsigned i = 0; i < NumRegs; i++)
+      FuncInfo.RegFixups[AssignedReg+i] = Reg+i;
 
     AssignedReg = Reg;
   }
-
-  return AssignedReg;
 }
 
 std::pair<unsigned, bool> FastISel::getRegForGEPIndex(const Value *Idx) {
@@ -628,6 +628,16 @@ bool FastISel::SelectCall(const User *I) {
 
     return true;
   }
+  case Intrinsic::objectsize: {
+    ConstantInt *CI = cast<ConstantInt>(Call->getArgOperand(1));
+    unsigned long long Res = CI->isZero() ? -1ULL : 0;
+    Constant *ResCI = ConstantInt::get(Call->getType(), Res);
+    unsigned ResultReg = getRegForValue(ResCI);
+    if (ResultReg == 0)
+      return false;
+    UpdateValueMap(Call, ResultReg);
+    return true;
+  }
   }
 
   // An arbitrary call. Bail.
@@ -643,21 +653,13 @@ bool FastISel::SelectCast(const User *I, unsigned Opcode) {
     // Unhandled type. Halt "fast" selection and bail.
     return false;
 
-  // Check if the destination type is legal. Or as a special case,
-  // it may be i1 if we're doing a truncate because that's
-  // easy and somewhat common.
+  // Check if the destination type is legal.
   if (!TLI.isTypeLegal(DstVT))
-    if (DstVT != MVT::i1 || Opcode != ISD::TRUNCATE)
-      // Unhandled type. Halt "fast" selection and bail.
-      return false;
+    return false;
 
-  // Check if the source operand is legal. Or as a special case,
-  // it may be i1 if we're doing zero-extension because that's
-  // easy and somewhat common.
+  // Check if the source operand is legal.
   if (!TLI.isTypeLegal(SrcVT))
-    if (SrcVT != MVT::i1 || Opcode != ISD::ZERO_EXTEND)
-      // Unhandled type. Halt "fast" selection and bail.
-      return false;
+    return false;
 
   unsigned InputReg = getRegForValue(I->getOperand(0));
   if (!InputReg)
@@ -666,18 +668,6 @@ bool FastISel::SelectCast(const User *I, unsigned Opcode) {
 
   bool InputRegIsKill = hasTrivialKill(I->getOperand(0));
 
-  // If the operand is i1, arrange for the high bits in the register to be zero.
-  if (SrcVT == MVT::i1) {
-   SrcVT = TLI.getTypeToTransformTo(I->getContext(), SrcVT);
-   InputReg = FastEmitZExtFromI1(SrcVT.getSimpleVT(), InputReg, InputRegIsKill);
-   if (!InputReg)
-     return false;
-   InputRegIsKill = true;
-  }
-  // If the result is i1, truncate to the target's type for i1 first.
-  if (DstVT == MVT::i1)
-    DstVT = TLI.getTypeToTransformTo(I->getContext(), DstVT);
-
   unsigned ResultReg = FastEmit_r(SrcVT.getSimpleVT(),
                                   DstVT.getSimpleVT(),
                                   Opcode,
@@ -829,6 +819,47 @@ FastISel::SelectFNeg(const User *I) {
 }
 
 bool
+FastISel::SelectExtractValue(const User *U) {
+  const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(U);
+  if (!EVI)
+    return false;
+
+  // Make sure we only try to handle extracts with a legal result.  But also
+  // allow i1 because it's easy.
+  EVT RealVT = TLI.getValueType(EVI->getType(), /*AllowUnknown=*/true);
+  if (!RealVT.isSimple())
+    return false;
+  MVT VT = RealVT.getSimpleVT();
+  if (!TLI.isTypeLegal(VT) && VT != MVT::i1)
+    return false;
+
+  const Value *Op0 = EVI->getOperand(0);
+  const Type *AggTy = Op0->getType();
+
+  // Get the base result register.
+  unsigned ResultReg;
+  DenseMap<const Value *, unsigned>::iterator I = FuncInfo.ValueMap.find(Op0);
+  if (I != FuncInfo.ValueMap.end())
+    ResultReg = I->second;
+  else if (isa<Instruction>(Op0))
+    ResultReg = FuncInfo.InitializeRegForValue(Op0);
+  else
+    return false; // fast-isel can't handle aggregate constants at the moment
+
+  // Get the actual result register, which is an offset from the base register.
+  unsigned VTIndex = ComputeLinearIndex(AggTy, EVI->idx_begin(), EVI->idx_end());
+
+  SmallVector<EVT, 4> AggValueVTs;
+  ComputeValueVTs(TLI, AggTy, AggValueVTs);
+
+  for (unsigned i = 0; i < VTIndex; i++)
+    ResultReg += TLI.getNumRegisters(FuncInfo.Fn->getContext(), AggValueVTs[i]);
+
+  UpdateValueMap(EVI, ResultReg);
+  return true;
+}
+
+bool
 FastISel::SelectOperator(const User *I, unsigned Opcode) {
   switch (Opcode) {
   case Instruction::Add:
@@ -932,6 +963,9 @@ FastISel::SelectOperator(const User *I, unsigned Opcode) {
     return true;
   }
 
+  case Instruction::ExtractValue:
+    return SelectExtractValue(I);
+
   case Instruction::PHI:
     llvm_unreachable("FastISel shouldn't visit PHI nodes!");
 
@@ -1097,6 +1131,30 @@ unsigned FastISel::FastEmitInst_rr(unsigned MachineInstOpcode,
   return ResultReg;
 }
 
+unsigned FastISel::FastEmitInst_rrr(unsigned MachineInstOpcode,
+                                   const TargetRegisterClass *RC,
+                                   unsigned Op0, bool Op0IsKill,
+                                   unsigned Op1, bool Op1IsKill,
+                                   unsigned Op2, bool Op2IsKill) {
+  unsigned ResultReg = createResultReg(RC);
+  const TargetInstrDesc &II = TII.get(MachineInstOpcode);
+
+  if (II.getNumDefs() >= 1)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
+      .addReg(Op0, Op0IsKill * RegState::Kill)
+      .addReg(Op1, Op1IsKill * RegState::Kill)
+      .addReg(Op2, Op2IsKill * RegState::Kill);
+  else {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
+      .addReg(Op0, Op0IsKill * RegState::Kill)
+      .addReg(Op1, Op1IsKill * RegState::Kill)
+      .addReg(Op2, Op2IsKill * RegState::Kill);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            ResultReg).addReg(II.ImplicitDefs[0]);
+  }
+  return ResultReg;
+}
+
 unsigned FastISel::FastEmitInst_ri(unsigned MachineInstOpcode,
                                    const TargetRegisterClass *RC,
                                    unsigned Op0, bool Op0IsKill,
diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index d8a5770..d518b5d 100644
--- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -54,25 +54,6 @@ static bool isUsedOutsideOfDefiningBlock(const Instruction *I) {
   return false;
 }
 
-/// isOnlyUsedInEntryBlock - If the specified argument is only used in the
-/// entry block, return true.  This includes arguments used by switches, since
-/// the switch may expand into multiple basic blocks.
-static bool isOnlyUsedInEntryBlock(const Argument *A, bool EnableFastISel) {
-  // With FastISel active, we may be splitting blocks, so force creation
-  // of virtual registers for all non-dead arguments.
-  if (EnableFastISel)
-    return A->use_empty();
-
-  const BasicBlock *Entry = A->getParent()->begin();
-  for (Value::const_use_iterator UI = A->use_begin(), E = A->use_end();
-       UI != E; ++UI) {
-    const User *U = *UI;
-    if (cast<Instruction>(U)->getParent() != Entry || isa<SwitchInst>(U))
-      return false;  // Use not in entry block.
-  }
-  return true;
-}
-
 FunctionLoweringInfo::FunctionLoweringInfo(const TargetLowering &tli)
   : TLI(tli) {
 }
@@ -86,16 +67,10 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf) {
   SmallVector<ISD::OutputArg, 4> Outs;
   GetReturnInfo(Fn->getReturnType(),
                 Fn->getAttributes().getRetAttributes(), Outs, TLI);
-  CanLowerReturn = TLI.CanLowerReturn(Fn->getCallingConv(), Fn->isVarArg(),
+  CanLowerReturn = TLI.CanLowerReturn(Fn->getCallingConv(), *MF,
+				      Fn->isVarArg(),
                                       Outs, Fn->getContext());
 
-  // Create a vreg for each argument register that is not dead and is used
-  // outside of the entry block for the function.
-  for (Function::const_arg_iterator AI = Fn->arg_begin(), E = Fn->arg_end();
-       AI != E; ++AI)
-    if (!isOnlyUsedInEntryBlock(AI, EnableFastISel))
-      InitializeRegForValue(AI);
-
   // Initialize the mapping of values to registers.  This is only set up for
   // instruction values that are used outside of the block that defines
   // them.
@@ -181,6 +156,10 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf) {
          const PHINode *PN = dyn_cast<PHINode>(I); ++I) {
       if (PN->use_empty()) continue;
 
+      // Skip empty types
+      if (PN->getType()->isEmptyTy())
+        continue;
+
       DebugLoc DL = PN->getDebugLoc();
       unsigned PHIReg = ValueMap[PN];
       assert(PHIReg && "PHI node does not have an assigned virtual register!");
@@ -343,7 +322,7 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) {
       APInt Zero(BitWidth, 0);
       DestLOI.KnownZero = Zero;
       DestLOI.KnownOne = Zero;
-      return;      
+      return;
     }
 
     if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
@@ -375,18 +354,18 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) {
 /// setByValArgumentFrameIndex - Record frame index for the byval
 /// argument. This overrides previous frame index entry for this argument,
 /// if any.
-void FunctionLoweringInfo::setByValArgumentFrameIndex(const Argument *A, 
+void FunctionLoweringInfo::setByValArgumentFrameIndex(const Argument *A,
                                                       int FI) {
   assert (A->hasByValAttr() && "Argument does not have byval attribute!");
   ByValArgFrameIndexMap[A] = FI;
 }
-  
+
 /// getByValArgumentFrameIndex - Get frame index for the byval argument.
 /// If the argument does not have any assigned frame index then 0 is
 /// returned.
 int FunctionLoweringInfo::getByValArgumentFrameIndex(const Argument *A) {
   assert (A->hasByValAttr() && "Argument does not have byval attribute!");
-  DenseMap<const Argument *, int>::iterator I = 
+  DenseMap<const Argument *, int>::iterator I =
     ByValArgFrameIndexMap.find(A);
   if (I != ByValArgFrameIndexMap.end())
     return I->second;
diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index e309def..cb49a80 100644
--- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -283,7 +283,7 @@ InstrEmitter::AddRegisterOperand(MachineInstr *MI, SDValue Op,
       DstRC = II->OpInfo[IIOpNum].getRegClass(TRI);
     assert((DstRC || (TID.isVariadic() && IIOpNum >= TID.getNumOperands())) &&
            "Don't have operand info for this instruction!");
-    if (DstRC && SrcRC != DstRC && !SrcRC->hasSuperClass(DstRC)) {
+    if (DstRC && !SrcRC->hasSuperClassEq(DstRC)) {
       unsigned NewVReg = MRI->createVirtualRegister(DstRC);
       BuildMI(*MBB, InsertPos, Op.getNode()->getDebugLoc(),
               TII->get(TargetOpcode::COPY), NewVReg).addReg(VReg);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 2b6c56e..62d777c 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -14,23 +14,16 @@
 #include "llvm/Analysis/DebugInfo.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetData.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetOptions.h"
 #include "llvm/CallingConv.h"
 #include "llvm/Constants.h"
 #include "llvm/DerivedTypes.h"
-#include "llvm/Function.h"
-#include "llvm/GlobalVariable.h"
 #include "llvm/LLVMContext.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
@@ -57,7 +50,6 @@ class SelectionDAGLegalize {
   const TargetMachine &TM;
   const TargetLowering &TLI;
   SelectionDAG &DAG;
-  CodeGenOpt::Level OptLevel;
 
   // Libcall insertion helpers.
 
@@ -93,13 +85,13 @@ class SelectionDAGLegalize {
   }
 
 public:
-  SelectionDAGLegalize(SelectionDAG &DAG, CodeGenOpt::Level ol);
+  explicit SelectionDAGLegalize(SelectionDAG &DAG);
 
   /// getTypeAction - Return how we should legalize values of this type, either
   /// it is already legal or we need to expand it into multiple registers of
   /// smaller integer type, or we need to promote it to a larger type.
   LegalizeAction getTypeAction(EVT VT) const {
-    return (LegalizeAction)ValueTypeActions.getTypeAction(VT);
+    return (LegalizeAction)TLI.getTypeAction(*DAG.getContext(), VT);
   }
 
   /// isTypeLegal - Return true if this type is legal on this target.
@@ -226,10 +218,9 @@ SelectionDAGLegalize::ShuffleWithNarrowerEltType(EVT NVT, EVT VT,  DebugLoc dl,
   return DAG.getVectorShuffle(NVT, dl, N1, N2, &NewMask[0]);
 }
 
-SelectionDAGLegalize::SelectionDAGLegalize(SelectionDAG &dag,
-                                           CodeGenOpt::Level ol)
+SelectionDAGLegalize::SelectionDAGLegalize(SelectionDAG &dag)
   : TM(dag.getTarget()), TLI(dag.getTargetLoweringInfo()),
-    DAG(dag), OptLevel(ol),
+    DAG(dag),
     ValueTypeActions(TLI.getValueTypeActions()) {
   assert(MVT::LAST_VALUETYPE <= MVT::MAX_ALLOWED_VALUETYPE &&
          "Too many value types for ValueTypeActions to hold!");
@@ -324,6 +315,7 @@ static SDNode *FindCallStartFromCallEnd(SDNode *Node) {
     case ISD::CALLSEQ_START:
       if (!nested)
         return Node;
+      Node = Node->getOperand(0).getNode();
       nested--;
       break;
     case ISD::CALLSEQ_END:
@@ -331,7 +323,7 @@ static SDNode *FindCallStartFromCallEnd(SDNode *Node) {
       break;
     }
   }
-  return 0;
+  return (Node->getOpcode() == ISD::CALLSEQ_START) ? Node : 0;
 }
 
 /// LegalizeAllNodesNotLeadingTo - Recursively walk the uses of N, looking to
@@ -440,68 +432,67 @@ SDValue ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG,
       SDValue Result = DAG.getNode(ISD::BITCAST, dl, intVT, Val);
       return DAG.getStore(Chain, dl, Result, Ptr, ST->getPointerInfo(),
                           ST->isVolatile(), ST->isNonTemporal(), Alignment);
-    } else {
-      // Do a (aligned) store to a stack slot, then copy from the stack slot
-      // to the final destination using (unaligned) integer loads and stores.
-      EVT StoredVT = ST->getMemoryVT();
-      EVT RegVT =
-        TLI.getRegisterType(*DAG.getContext(),
-                            EVT::getIntegerVT(*DAG.getContext(),
-                                              StoredVT.getSizeInBits()));
-      unsigned StoredBytes = StoredVT.getSizeInBits() / 8;
-      unsigned RegBytes = RegVT.getSizeInBits() / 8;
-      unsigned NumRegs = (StoredBytes + RegBytes - 1) / RegBytes;
-
-      // Make sure the stack slot is also aligned for the register type.
-      SDValue StackPtr = DAG.CreateStackTemporary(StoredVT, RegVT);
-
-      // Perform the original store, only redirected to the stack slot.
-      SDValue Store = DAG.getTruncStore(Chain, dl,
-                                        Val, StackPtr, MachinePointerInfo(),
-                                        StoredVT, false, false, 0);
-      SDValue Increment = DAG.getConstant(RegBytes, TLI.getPointerTy());
-      SmallVector<SDValue, 8> Stores;
-      unsigned Offset = 0;
-
-      // Do all but one copies using the full register width.
-      for (unsigned i = 1; i < NumRegs; i++) {
-        // Load one integer register's worth from the stack slot.
-        SDValue Load = DAG.getLoad(RegVT, dl, Store, StackPtr,
-                                   MachinePointerInfo(),
-                                   false, false, 0);
-        // Store it to the final location.  Remember the store.
-        Stores.push_back(DAG.getStore(Load.getValue(1), dl, Load, Ptr,
-                                    ST->getPointerInfo().getWithOffset(Offset),
-                                      ST->isVolatile(), ST->isNonTemporal(),
-                                      MinAlign(ST->getAlignment(), Offset)));
-        // Increment the pointers.
-        Offset += RegBytes;
-        StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
-                               Increment);
-        Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
-      }
+    }
+    // Do a (aligned) store to a stack slot, then copy from the stack slot
+    // to the final destination using (unaligned) integer loads and stores.
+    EVT StoredVT = ST->getMemoryVT();
+    EVT RegVT =
+      TLI.getRegisterType(*DAG.getContext(),
+                          EVT::getIntegerVT(*DAG.getContext(),
+                                            StoredVT.getSizeInBits()));
+    unsigned StoredBytes = StoredVT.getSizeInBits() / 8;
+    unsigned RegBytes = RegVT.getSizeInBits() / 8;
+    unsigned NumRegs = (StoredBytes + RegBytes - 1) / RegBytes;
 
-      // The last store may be partial.  Do a truncating store.  On big-endian
-      // machines this requires an extending load from the stack slot to ensure
-      // that the bits are in the right place.
-      EVT MemVT = EVT::getIntegerVT(*DAG.getContext(),
-                                    8 * (StoredBytes - Offset));
+    // Make sure the stack slot is also aligned for the register type.
+    SDValue StackPtr = DAG.CreateStackTemporary(StoredVT, RegVT);
 
-      // Load from the stack slot.
-      SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, RegVT, Store, StackPtr,
-                                    MachinePointerInfo(),
-                                    MemVT, false, false, 0);
+    // Perform the original store, only redirected to the stack slot.
+    SDValue Store = DAG.getTruncStore(Chain, dl,
+                                      Val, StackPtr, MachinePointerInfo(),
+                                      StoredVT, false, false, 0);
+    SDValue Increment = DAG.getConstant(RegBytes, TLI.getPointerTy());
+    SmallVector<SDValue, 8> Stores;
+    unsigned Offset = 0;
 
-      Stores.push_back(DAG.getTruncStore(Load.getValue(1), dl, Load, Ptr,
-                                         ST->getPointerInfo()
-                                           .getWithOffset(Offset),
-                                         MemVT, ST->isVolatile(),
-                                         ST->isNonTemporal(),
-                                         MinAlign(ST->getAlignment(), Offset)));
-      // The order of the stores doesn't matter - say it with a TokenFactor.
-      return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Stores[0],
-                         Stores.size());
+    // Do all but one copies using the full register width.
+    for (unsigned i = 1; i < NumRegs; i++) {
+      // Load one integer register's worth from the stack slot.
+      SDValue Load = DAG.getLoad(RegVT, dl, Store, StackPtr,
+                                 MachinePointerInfo(),
+                                 false, false, 0);
+      // Store it to the final location.  Remember the store.
+      Stores.push_back(DAG.getStore(Load.getValue(1), dl, Load, Ptr,
+                                  ST->getPointerInfo().getWithOffset(Offset),
+                                    ST->isVolatile(), ST->isNonTemporal(),
+                                    MinAlign(ST->getAlignment(), Offset)));
+      // Increment the pointers.
+      Offset += RegBytes;
+      StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
+                             Increment);
+      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
     }
+
+    // The last store may be partial.  Do a truncating store.  On big-endian
+    // machines this requires an extending load from the stack slot to ensure
+    // that the bits are in the right place.
+    EVT MemVT = EVT::getIntegerVT(*DAG.getContext(),
+                                  8 * (StoredBytes - Offset));
+
+    // Load from the stack slot.
+    SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, RegVT, Store, StackPtr,
+                                  MachinePointerInfo(),
+                                  MemVT, false, false, 0);
+
+    Stores.push_back(DAG.getTruncStore(Load.getValue(1), dl, Load, Ptr,
+                                       ST->getPointerInfo()
+                                         .getWithOffset(Offset),
+                                       MemVT, ST->isVolatile(),
+                                       ST->isNonTemporal(),
+                                       MinAlign(ST->getAlignment(), Offset)));
+    // The order of the stores doesn't matter - say it with a TokenFactor.
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Stores[0],
+                       Stores.size());
   }
   assert(ST->getMemoryVT().isInteger() &&
          !ST->getMemoryVT().isVector() &&
@@ -1093,8 +1084,9 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) {
     {
       SDNode *myCALLSEQ_BEGIN = FindCallStartFromCallEnd(Node);
 
-      // If the CALLSEQ_START node hasn't been legalized first, legalize it.  This
-      // will cause this node to be legalized as well as handling libcalls right.
+      // If the CALLSEQ_START node hasn't been legalized first, legalize it.
+      // This will cause this node to be legalized as well as handling libcalls
+      // right.
       if (getLastCALLSEQ().getNode() != Node) {
         LegalizeOp(SDValue(myCALLSEQ_BEGIN, 0));
         DenseMap<SDValue, SDValue>::iterator I = LegalizedNodes.find(Op);
@@ -2059,14 +2051,14 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
   return CallInfo.first;
 }
 
-/// ExpandLibCall - Generate a libcall taking the given operands as arguments 
+/// ExpandLibCall - Generate a libcall taking the given operands as arguments
 /// and returning a result of type RetVT.
 SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT,
                                             const SDValue *Ops, unsigned NumOps,
                                             bool isSigned, DebugLoc dl) {
   TargetLowering::ArgListTy Args;
   Args.reserve(NumOps);
-  
+
   TargetLowering::ArgListEntry Entry;
   for (unsigned i = 0; i != NumOps; ++i) {
     Entry.Node = Ops[i];
@@ -2077,14 +2069,14 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT,
   }
   SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
                                          TLI.getPointerTy());
-  
+
   const Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
   std::pair<SDValue,SDValue> CallInfo =
   TLI.LowerCallTo(DAG.getEntryNode(), RetTy, isSigned, !isSigned, false,
                   false, 0, TLI.getLibcallCallingConv(LC), false,
                   /*isReturnValueUsed=*/true,
                   Callee, Args, DAG, dl);
-  
+
   // Legalize the call sequence, starting with the chain.  This will advance
   // the LastCALLSEQ_END to the legalized version of the CALLSEQ_END node that
   // was added by LowerCallTo (guaranteeing proper serialization of calls).
@@ -3432,8 +3424,8 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node,
       else if (WideVT == MVT::i128)
         LC = RTLIB::MUL_I128;
       assert(LC != RTLIB::UNKNOWN_LIBCALL && "Cannot expand this operation!");
-      
-      // The high part is obtained by SRA'ing all but one of the bits of low 
+
+      // The high part is obtained by SRA'ing all but one of the bits of low
       // part.
       unsigned LoSize = VT.getSizeInBits();
       SDValue HiLHS = DAG.getNode(ISD::SRA, dl, VT, RHS,
@@ -3452,7 +3444,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node,
       TopHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, Ret,
                             DAG.getIntPtrConstant(1));
     }
-    
+
     if (isSigned) {
       Tmp1 = DAG.getConstant(VT.getSizeInBits() - 1,
                              TLI.getShiftAmountTy(BottomHalf.getValueType()));
@@ -3534,9 +3526,13 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node,
                          Tmp2.getOperand(0), Tmp2.getOperand(1),
                          Node->getOperand(2));
     } else {
+      // We test only the i1 bit.  Skip the AND if UNDEF.
+      Tmp3 = (Tmp2.getOpcode() == ISD::UNDEF) ? Tmp2 :
+        DAG.getNode(ISD::AND, dl, Tmp2.getValueType(), Tmp2,
+                    DAG.getConstant(1, Tmp2.getValueType()));
       Tmp1 = DAG.getNode(ISD::BR_CC, dl, MVT::Other, Tmp1,
-                         DAG.getCondCode(ISD::SETNE), Tmp2,
-                         DAG.getConstant(0, Tmp2.getValueType()),
+                         DAG.getCondCode(ISD::SETNE), Tmp3,
+                         DAG.getConstant(0, Tmp3.getValueType()),
                          Node->getOperand(2));
     }
     Results.push_back(Tmp1);
@@ -3746,9 +3742,8 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node,
 
 // SelectionDAG::Legalize - This is the entry point for the file.
 //
-void SelectionDAG::Legalize(CodeGenOpt::Level OptLevel) {
+void SelectionDAG::Legalize() {
   /// run - This is the main entry point to this class.
   ///
-  SelectionDAGLegalize(*this, OptLevel).LegalizeDAG();
+  SelectionDAGLegalize(*this).LegalizeDAG();
 }
-
diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 935aab0..b8da57f 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -73,6 +73,17 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
   case ISD::UNDEF:       Res = PromoteIntRes_UNDEF(N); break;
   case ISD::VAARG:       Res = PromoteIntRes_VAARG(N); break;
 
+  case ISD::EXTRACT_SUBVECTOR:
+                         Res = PromoteIntRes_EXTRACT_SUBVECTOR(N); break;
+  case ISD::VECTOR_SHUFFLE:
+                         Res = PromoteIntRes_VECTOR_SHUFFLE(N); break;
+  case ISD::INSERT_VECTOR_ELT:
+                         Res = PromoteIntRes_INSERT_VECTOR_ELT(N); break;
+  case ISD::BUILD_VECTOR:
+                         Res = PromoteIntRes_BUILD_VECTOR(N); break;
+  case ISD::SCALAR_TO_VECTOR:
+                         Res = PromoteIntRes_SCALAR_TO_VECTOR(N); break;
+
   case ISD::SIGN_EXTEND:
   case ISD::ZERO_EXTEND:
   case ISD::ANY_EXTEND:  Res = PromoteIntRes_INT_EXTEND(N); break;
@@ -174,24 +185,28 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITCAST(SDNode *N) {
   default:
     assert(false && "Unknown type action!");
     break;
-  case Legal:
+  case TargetLowering::TypeLegal:
     break;
-  case PromoteInteger:
+  case TargetLowering::TypePromoteInteger:
     if (NOutVT.bitsEq(NInVT))
       // The input promotes to the same size.  Convert the promoted value.
       return DAG.getNode(ISD::BITCAST, dl, NOutVT, GetPromotedInteger(InOp));
+    if (NInVT.isVector())
+      // Promote vector element via memory load/store.
+      return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT,
+                         CreateStackStoreLoad(InOp, OutVT));
     break;
-  case SoftenFloat:
+  case TargetLowering::TypeSoftenFloat:
     // Promote the integer operand by hand.
     return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, GetSoftenedFloat(InOp));
-  case ExpandInteger:
-  case ExpandFloat:
+  case TargetLowering::TypeExpandInteger:
+  case TargetLowering::TypeExpandFloat:
     break;
-  case ScalarizeVector:
+  case TargetLowering::TypeScalarizeVector:
     // Convert the element to an integer and promote it by hand.
     return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT,
                        BitConvertToInteger(GetScalarizedVector(InOp)));
-  case SplitVector: {
+  case TargetLowering::TypeSplitVector: {
     // For example, i32 = BITCAST v2i16 on alpha.  Convert the split
     // pieces of the input into integers and reassemble in the final type.
     SDValue Lo, Hi;
@@ -208,7 +223,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITCAST(SDNode *N) {
                        JoinIntegers(Lo, Hi));
     return DAG.getNode(ISD::BITCAST, dl, NOutVT, InOp);
   }
-  case WidenVector:
+  case TargetLowering::TypeWidenVector:
     if (OutVT.bitsEq(NInVT))
       // The input is widened to the same size.  Convert to the widened value.
       return DAG.getNode(ISD::BITCAST, dl, OutVT, GetWidenedVector(InOp));
@@ -342,7 +357,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_INT_EXTEND(SDNode *N) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   DebugLoc dl = N->getDebugLoc();
 
-  if (getTypeAction(N->getOperand(0).getValueType()) == PromoteInteger) {
+  if (getTypeAction(N->getOperand(0).getValueType())
+      == TargetLowering::TypePromoteInteger) {
     SDValue Res = GetPromotedInteger(N->getOperand(0));
     assert(Res.getValueType().bitsLE(NVT) && "Extension doesn't make sense!");
 
@@ -507,11 +523,11 @@ SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) {
 
   switch (getTypeAction(N->getOperand(0).getValueType())) {
   default: llvm_unreachable("Unknown type action!");
-  case Legal:
-  case ExpandInteger:
+  case TargetLowering::TypeLegal:
+  case TargetLowering::TypeExpandInteger:
     Res = N->getOperand(0);
     break;
-  case PromoteInteger:
+  case TargetLowering::TypePromoteInteger:
     Res = GetPromotedInteger(N->getOperand(0));
     break;
   }
@@ -557,9 +573,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_XMULO(SDNode *N, unsigned ResNo) {
   DebugLoc DL = N->getDebugLoc();
   EVT SmallVT = LHS.getValueType();
 
-  // To determine if the result overflowed in a larger type, we extend the input
-  // to the larger type, do the multiply, then check the high bits of the result
-  // to see if the overflow happened.
+  // To determine if the result overflowed in a larger type, we extend the
+  // input to the larger type, do the multiply, then check the high bits of
+  // the result to see if the overflow happened.
   if (N->getOpcode() == ISD::SMULO) {
     LHS = SExtPromotedInteger(LHS);
     RHS = SExtPromotedInteger(RHS);
@@ -569,8 +585,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_XMULO(SDNode *N, unsigned ResNo) {
   }
   SDValue Mul = DAG.getNode(ISD::MUL, DL, LHS.getValueType(), LHS, RHS);
 
-  // Overflow occurred iff the high part of the result does not zero/sign-extend
-  // the low part.
+  // Overflow occurred iff the high part of the result does not
+  // zero/sign-extend the low part.
   SDValue Overflow;
   if (N->getOpcode() == ISD::UMULO) {
     // Unsigned overflow occurred iff the high part is non-zero.
@@ -672,6 +688,8 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
   case ISD::BRCOND:       Res = PromoteIntOp_BRCOND(N, OpNo); break;
   case ISD::BUILD_PAIR:   Res = PromoteIntOp_BUILD_PAIR(N); break;
   case ISD::BUILD_VECTOR: Res = PromoteIntOp_BUILD_VECTOR(N); break;
+  case ISD::CONCAT_VECTORS: Res = PromoteIntOp_CONCAT_VECTORS(N); break;
+  case ISD::EXTRACT_VECTOR_ELT: Res = PromoteIntOp_EXTRACT_VECTOR_ELT(N); break;
   case ISD::CONVERT_RNDSAT:
                           Res = PromoteIntOp_CONVERT_RNDSAT(N); break;
   case ISD::INSERT_VECTOR_ELT:
@@ -1513,7 +1531,8 @@ void DAGTypeLegalizer::ExpandIntRes_ANY_EXTEND(SDNode *N,
   } else {
     // For example, extension of an i48 to an i64.  The operand type necessarily
     // promotes to the result type, so will end up being expanded too.
-    assert(getTypeAction(Op.getValueType()) == PromoteInteger &&
+    assert(getTypeAction(Op.getValueType()) ==
+           TargetLowering::TypePromoteInteger &&
            "Only know how to promote this result!");
     SDValue Res = GetPromotedInteger(Op);
     assert(Res.getValueType() == N->getValueType(0) &&
@@ -2030,7 +2049,8 @@ void DAGTypeLegalizer::ExpandIntRes_SIGN_EXTEND(SDNode *N,
   } else {
     // For example, extension of an i48 to an i64.  The operand type necessarily
     // promotes to the result type, so will end up being expanded too.
-    assert(getTypeAction(Op.getValueType()) == PromoteInteger &&
+    assert(getTypeAction(Op.getValueType()) ==
+           TargetLowering::TypePromoteInteger &&
            "Only know how to promote this result!");
     SDValue Res = GetPromotedInteger(Op);
     assert(Res.getValueType() == N->getValueType(0) &&
@@ -2178,7 +2198,8 @@ void DAGTypeLegalizer::ExpandIntRes_ZERO_EXTEND(SDNode *N,
   } else {
     // For example, extension of an i48 to an i64.  The operand type necessarily
     // promotes to the result type, so will end up being expanded too.
-    assert(getTypeAction(Op.getValueType()) == PromoteInteger &&
+    assert(getTypeAction(Op.getValueType()) ==
+           TargetLowering::TypePromoteInteger &&
            "Only know how to promote this result!");
     SDValue Res = GetPromotedInteger(Op);
     assert(Res.getValueType() == N->getValueType(0) &&
@@ -2613,3 +2634,158 @@ SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) {
          "Don't know how to expand this UINT_TO_FP!");
   return MakeLibCall(LC, DstVT, &Op, 1, true, dl);
 }
+
+SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N) {
+  SDValue InOp0 = N->getOperand(0);
+  EVT InVT = InOp0.getValueType();
+  EVT NInVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT);
+
+  EVT OutVT = N->getValueType(0);
+  EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+  assert(NOutVT.isVector() && "This type must be promoted to a vector type");
+  unsigned OutNumElems = N->getValueType(0).getVectorNumElements();
+  EVT NOutVTElem = NOutVT.getVectorElementType();
+
+  DebugLoc dl = N->getDebugLoc();
+  SDValue BaseIdx = N->getOperand(1);
+
+  SmallVector<SDValue, 8> Ops;
+  for (unsigned i = 0; i != OutNumElems; ++i) {
+
+    // Extract the element from the original vector.
+    SDValue Index = DAG.getNode(ISD::ADD, dl, BaseIdx.getValueType(),
+      BaseIdx, DAG.getIntPtrConstant(i));
+    SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+      InVT.getVectorElementType(), N->getOperand(0), Index);
+
+    SDValue Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVTElem, Ext);
+    // Insert the converted element to the new vector.
+    Ops.push_back(Op);
+  }
+
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, &Ops[0], Ops.size());
+}
+
+
+SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_SHUFFLE(SDNode *N) {
+
+  ShuffleVectorSDNode *SV = cast<ShuffleVectorSDNode>(N);
+  EVT VT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  unsigned NumElts = VT.getVectorNumElements();
+  SmallVector<int, 8> NewMask;
+  for (unsigned i = 0; i != NumElts; ++i) {
+    NewMask.push_back(SV->getMaskElt(i));
+  }
+
+  SDValue V0 = GetPromotedInteger(N->getOperand(0));
+  SDValue V1 = GetPromotedInteger(N->getOperand(1));
+  EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+
+  return DAG.getVectorShuffle(OutVT, dl, V0,V1, &NewMask[0]);
+}
+
+
+SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_VECTOR(SDNode *N) {
+
+  SDValue InOp0 = N->getOperand(0);
+  EVT InVT = InOp0.getValueType();
+  EVT NInVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT);
+
+  EVT OutVT = N->getValueType(0);
+  EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+  assert(NOutVT.isVector() && "This type must be promoted to a vector type");
+  unsigned NumElems = N->getNumOperands();
+  EVT NOutVTElem = NOutVT.getVectorElementType();
+
+  DebugLoc dl = N->getDebugLoc();
+
+  SmallVector<SDValue, 8> Ops;
+  for (unsigned i = 0; i != NumElems; ++i) {
+    SDValue Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVTElem, N->getOperand(i));
+    Ops.push_back(Op);
+  }
+
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, &Ops[0], Ops.size());
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N) {
+
+  DebugLoc dl = N->getDebugLoc();
+
+  SDValue InOp0 = N->getOperand(0);
+  EVT InVT = InOp0.getValueType();
+  EVT NInVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT);
+  assert(!InVT.isVector() && "Input must not be a scalar");
+
+  EVT OutVT = N->getValueType(0);
+  EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+  assert(NOutVT.isVector() && "This type must be promoted to a vector type");
+  EVT NOutVTElem = NOutVT.getVectorElementType();
+
+  SDValue Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVTElem, N->getOperand(0));
+
+  return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NOutVT, Op);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N) {
+
+  SDValue InOp0 = N->getOperand(0);
+  EVT InVT = InOp0.getValueType();
+  EVT InElVT = InVT.getVectorElementType();
+  EVT NInVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT);
+
+  EVT OutVT = N->getValueType(0);
+  EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+  assert(NOutVT.isVector() && "This type must be promoted to a vector type");
+
+  EVT NOutVTElem = NOutVT.getVectorElementType();
+
+  DebugLoc dl = N->getDebugLoc();
+
+  SDValue ConvertedVector = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, InOp0);
+
+  SDValue ConvElem = DAG.getNode(ISD::ANY_EXTEND, dl,
+    NOutVTElem, N->getOperand(1));
+  return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,NOutVT,
+    ConvertedVector, ConvElem, N->getOperand(2));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  SDValue V0 = GetPromotedInteger(N->getOperand(0));
+  SDValue V1 = N->getOperand(1);
+  SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+    V0->getValueType(0).getScalarType(), V0, V1);
+
+  return DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), Ext);
+
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_CONCAT_VECTORS(SDNode *N) {
+
+  DebugLoc dl = N->getDebugLoc();
+
+  EVT RetSclrTy = N->getValueType(0).getVectorElementType();
+
+  SmallVector<SDValue, 8> NewOps;
+
+  // For each incoming vector
+  for (unsigned VecIdx = 0, E = N->getNumOperands(); VecIdx!= E; ++VecIdx) {
+    SDValue Incoming = GetPromotedInteger(N->getOperand(VecIdx));
+    EVT SclrTy = Incoming->getValueType(0).getVectorElementType();
+    unsigned NumElem = Incoming->getValueType(0).getVectorNumElements();
+
+    for (unsigned i=0; i<NumElem; ++i) {
+      // Extract element from incoming vector
+      SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SclrTy,
+      Incoming, DAG.getIntPtrConstant(i));
+      SDValue Tr = DAG.getNode(ISD::TRUNCATE, dl, RetSclrTy, Ex);
+      NewOps.push_back(Tr);
+    }
+  }
+
+  return DAG.getNode(ISD::BUILD_VECTOR, dl,  N->getValueType(0),
+    &NewOps[0], NewOps.size());
+  }
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index cedda7e..ba658b0 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -224,38 +224,38 @@ bool DAGTypeLegalizer::run() {
       switch (getTypeAction(ResultVT)) {
       default:
         assert(false && "Unknown action!");
-      case Legal:
+      case TargetLowering::TypeLegal:
         break;
       // The following calls must take care of *all* of the node's results,
       // not just the illegal result they were passed (this includes results
       // with a legal type).  Results can be remapped using ReplaceValueWith,
       // or their promoted/expanded/etc values registered in PromotedIntegers,
       // ExpandedIntegers etc.
-      case PromoteInteger:
+      case TargetLowering::TypePromoteInteger:
         PromoteIntegerResult(N, i);
         Changed = true;
         goto NodeDone;
-      case ExpandInteger:
+      case TargetLowering::TypeExpandInteger:
         ExpandIntegerResult(N, i);
         Changed = true;
         goto NodeDone;
-      case SoftenFloat:
+      case TargetLowering::TypeSoftenFloat:
         SoftenFloatResult(N, i);
         Changed = true;
         goto NodeDone;
-      case ExpandFloat:
+      case TargetLowering::TypeExpandFloat:
         ExpandFloatResult(N, i);
         Changed = true;
         goto NodeDone;
-      case ScalarizeVector:
+      case TargetLowering::TypeScalarizeVector:
         ScalarizeVectorResult(N, i);
         Changed = true;
         goto NodeDone;
-      case SplitVector:
+      case TargetLowering::TypeSplitVector:
         SplitVectorResult(N, i);
         Changed = true;
         goto NodeDone;
-      case WidenVector:
+      case TargetLowering::TypeWidenVector:
         WidenVectorResult(N, i);
         Changed = true;
         goto NodeDone;
@@ -277,36 +277,36 @@ ScanOperands:
       switch (getTypeAction(OpVT)) {
       default:
         assert(false && "Unknown action!");
-      case Legal:
+      case TargetLowering::TypeLegal:
         continue;
       // The following calls must either replace all of the node's results
       // using ReplaceValueWith, and return "false"; or update the node's
       // operands in place, and return "true".
-      case PromoteInteger:
+      case TargetLowering::TypePromoteInteger:
         NeedsReanalyzing = PromoteIntegerOperand(N, i);
         Changed = true;
         break;
-      case ExpandInteger:
+      case TargetLowering::TypeExpandInteger:
         NeedsReanalyzing = ExpandIntegerOperand(N, i);
         Changed = true;
         break;
-      case SoftenFloat:
+      case TargetLowering::TypeSoftenFloat:
         NeedsReanalyzing = SoftenFloatOperand(N, i);
         Changed = true;
         break;
-      case ExpandFloat:
+      case TargetLowering::TypeExpandFloat:
         NeedsReanalyzing = ExpandFloatOperand(N, i);
         Changed = true;
         break;
-      case ScalarizeVector:
+      case TargetLowering::TypeScalarizeVector:
         NeedsReanalyzing = ScalarizeVectorOperand(N, i);
         Changed = true;
         break;
-      case SplitVector:
+      case TargetLowering::TypeSplitVector:
         NeedsReanalyzing = SplitVectorOperand(N, i);
         Changed = true;
         break;
-      case WidenVector:
+      case TargetLowering::TypeWidenVector:
         NeedsReanalyzing = WidenVectorOperand(N, i);
         Changed = true;
         break;
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 5409b88..b2f966b 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -57,16 +57,6 @@ public:
     // 1+ - This is a node which has this many unprocessed operands.
   };
 private:
-  enum LegalizeAction {
-    Legal,           // The target natively supports this type.
-    PromoteInteger,  // Replace this integer type with a larger one.
-    ExpandInteger,   // Split this integer type into two of half the size.
-    SoftenFloat,     // Convert this float type to a same size integer type.
-    ExpandFloat,     // Split this float type into two of half the size.
-    ScalarizeVector, // Replace this one-element vector with its element type.
-    SplitVector,     // Split this vector type into two of half the size.
-    WidenVector      // This vector type should be widened into a larger vector.
-  };
 
   /// ValueTypeActions - This is a bitvector that contains two bits for each
   /// simple value type, where the two bits correspond to the LegalizeAction
@@ -74,41 +64,13 @@ private:
   TargetLowering::ValueTypeActionImpl ValueTypeActions;
 
   /// getTypeAction - Return how we should legalize values of this type.
-  LegalizeAction getTypeAction(EVT VT) const {
-    switch (ValueTypeActions.getTypeAction(VT)) {
-    default:
-      assert(false && "Unknown legalize action!");
-    case TargetLowering::Legal:
-      return Legal;
-    case TargetLowering::Promote:
-      // Promote can mean
-      //   1) For integers, use a larger integer type (e.g. i8 -> i32).
-      //   2) For vectors, use a wider vector type (e.g. v3i32 -> v4i32).
-      if (!VT.isVector())
-        return PromoteInteger;
-      return WidenVector;
-    case TargetLowering::Expand:
-      // Expand can mean
-      // 1) split scalar in half, 2) convert a float to an integer,
-      // 3) scalarize a single-element vector, 4) split a vector in two.
-      if (!VT.isVector()) {
-        if (VT.isInteger())
-          return ExpandInteger;
-        if (VT.getSizeInBits() ==
-                TLI.getTypeToTransformTo(*DAG.getContext(), VT).getSizeInBits())
-          return SoftenFloat;
-        return ExpandFloat;
-      }
-
-      if (VT.getVectorNumElements() == 1)
-        return ScalarizeVector;
-      return SplitVector;
-    }
+  TargetLowering::LegalizeTypeAction getTypeAction(EVT VT) const {
+    return TLI.getTypeAction(*DAG.getContext(), VT);
   }
 
   /// isTypeLegal - Return true if this type is legal on this target.
   bool isTypeLegal(EVT VT) const {
-    return ValueTypeActions.getTypeAction(VT) == TargetLowering::Legal;
+    return TLI.getTypeAction(*DAG.getContext(), VT) == TargetLowering::TypeLegal;
   }
 
   /// IgnoreNodeResults - Pretend all of this node's results are legal.
@@ -248,6 +210,11 @@ private:
   SDValue PromoteIntRes_AssertZext(SDNode *N);
   SDValue PromoteIntRes_Atomic1(AtomicSDNode *N);
   SDValue PromoteIntRes_Atomic2(AtomicSDNode *N);
+  SDValue PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N);
+  SDValue PromoteIntRes_VECTOR_SHUFFLE(SDNode *N);
+  SDValue PromoteIntRes_BUILD_VECTOR(SDNode *N);
+  SDValue PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N);
+  SDValue PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N);
   SDValue PromoteIntRes_BITCAST(SDNode *N);
   SDValue PromoteIntRes_BSWAP(SDNode *N);
   SDValue PromoteIntRes_BUILD_PAIR(SDNode *N);
@@ -289,6 +256,9 @@ private:
   SDValue PromoteIntOp_BUILD_VECTOR(SDNode *N);
   SDValue PromoteIntOp_CONVERT_RNDSAT(SDNode *N);
   SDValue PromoteIntOp_INSERT_VECTOR_ELT(SDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_EXTRACT_ELEMENT(SDNode *N);
+  SDValue PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N);
+  SDValue PromoteIntOp_CONCAT_VECTORS(SDNode *N);
   SDValue PromoteIntOp_MEMBARRIER(SDNode *N);
   SDValue PromoteIntOp_SCALAR_TO_VECTOR(SDNode *N);
   SDValue PromoteIntOp_SELECT(SDNode *N, unsigned OpNo);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index a75ae87..85ea6b6 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -43,36 +43,36 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
   switch (getTypeAction(InVT)) {
     default:
       assert(false && "Unknown type action!");
-    case Legal:
-    case PromoteInteger:
+    case TargetLowering::TypeLegal:
+    case TargetLowering::TypePromoteInteger:
       break;
-    case SoftenFloat:
+    case TargetLowering::TypeSoftenFloat:
       // Convert the integer operand instead.
       SplitInteger(GetSoftenedFloat(InOp), Lo, Hi);
       Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
       Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
       return;
-    case ExpandInteger:
-    case ExpandFloat:
+    case TargetLowering::TypeExpandInteger:
+    case TargetLowering::TypeExpandFloat:
       // Convert the expanded pieces of the input.
       GetExpandedOp(InOp, Lo, Hi);
       Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
       Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
       return;
-    case SplitVector:
+    case TargetLowering::TypeSplitVector:
       GetSplitVector(InOp, Lo, Hi);
       if (TLI.isBigEndian())
         std::swap(Lo, Hi);
       Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
       Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
       return;
-    case ScalarizeVector:
+    case TargetLowering::TypeScalarizeVector:
       // Convert the element instead.
       SplitInteger(BitConvertToInteger(GetScalarizedVector(InOp)), Lo, Hi);
       Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
       Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
       return;
-    case WidenVector: {
+    case TargetLowering::TypeWidenVector: {
       assert(!(InVT.getVectorNumElements() & 1) && "Unsupported BITCAST");
       InOp = GetWidenedVector(InOp);
       EVT InNVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(),
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 0b4dd35..9595f69 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -526,13 +526,13 @@ void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo,
   switch (getTypeAction(InVT)) {
   default:
     assert(false && "Unknown type action!");
-  case Legal:
-  case PromoteInteger:
-  case SoftenFloat:
-  case ScalarizeVector:
+  case TargetLowering::TypeLegal:
+  case TargetLowering::TypePromoteInteger:
+  case TargetLowering::TypeSoftenFloat:
+  case TargetLowering::TypeScalarizeVector:
     break;
-  case ExpandInteger:
-  case ExpandFloat:
+  case TargetLowering::TypeExpandInteger:
+  case TargetLowering::TypeExpandFloat:
     // A scalar to vector conversion, where the scalar needs expansion.
     // If the vector is being split in two then we can just convert the
     // expanded pieces.
@@ -545,7 +545,7 @@ void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo,
       return;
     }
     break;
-  case SplitVector:
+  case TargetLowering::TypeSplitVector:
     // If the input is a vector that needs to be split, convert each split
     // piece of the input now.
     GetSplitVector(InOp, Lo, Hi);
@@ -774,7 +774,7 @@ void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo,
   EVT InVT = N->getOperand(0).getValueType();
   switch (getTypeAction(InVT)) {
   default: llvm_unreachable("Unexpected type action!");
-  case Legal: {
+  case TargetLowering::TypeLegal: {
     EVT InNVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(),
                                  LoVT.getVectorNumElements());
     Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, N->getOperand(0),
@@ -783,10 +783,21 @@ void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo,
                      DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
     break;
   }
-  case SplitVector:
+  case TargetLowering::TypePromoteInteger: {
+    SDValue InOp = GetPromotedInteger(N->getOperand(0));
+    EVT InNVT = EVT::getVectorVT(*DAG.getContext(),
+                                 InOp.getValueType().getVectorElementType(),
+                                 LoVT.getVectorNumElements());
+    Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp,
+                     DAG.getIntPtrConstant(0));
+    Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp,
+                     DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
+    break;
+  }
+  case TargetLowering::TypeSplitVector:
     GetSplitVector(N->getOperand(0), Lo, Hi);
     break;
-  case WidenVector: {
+  case TargetLowering::TypeWidenVector: {
     // If the result needs to be split and the input needs to be widened,
     // the two types must have different lengths. Use the widened result
     // and extract from it to do the split.
@@ -1439,7 +1450,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
   unsigned Opcode = N->getOpcode();
   unsigned InVTNumElts = InVT.getVectorNumElements();
 
-  if (getTypeAction(InVT) == WidenVector) {
+  if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) {
     InOp = GetWidenedVector(N->getOperand(0));
     InVT = InOp.getValueType();
     InVTNumElts = InVT.getVectorNumElements();
@@ -1515,7 +1526,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Shift(SDNode *N) {
   SDValue ShOp = N->getOperand(1);
 
   EVT ShVT = ShOp.getValueType();
-  if (getTypeAction(ShVT) == WidenVector) {
+  if (getTypeAction(ShVT) == TargetLowering::TypeWidenVector) {
     ShOp = GetWidenedVector(ShOp);
     ShVT = ShOp.getValueType();
   }
@@ -1557,9 +1568,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) {
   default:
     assert(false && "Unknown type action!");
     break;
-  case Legal:
+  case TargetLowering::TypeLegal:
     break;
-  case PromoteInteger:
+  case TargetLowering::TypePromoteInteger:
     // If the InOp is promoted to the same size, convert it.  Otherwise,
     // fall out of the switch and widen the promoted input.
     InOp = GetPromotedInteger(InOp);
@@ -1567,13 +1578,13 @@ SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) {
     if (WidenVT.bitsEq(InVT))
       return DAG.getNode(ISD::BITCAST, dl, WidenVT, InOp);
     break;
-  case SoftenFloat:
-  case ExpandInteger:
-  case ExpandFloat:
-  case ScalarizeVector:
-  case SplitVector:
+  case TargetLowering::TypeSoftenFloat:
+  case TargetLowering::TypeExpandInteger:
+  case TargetLowering::TypeExpandFloat:
+  case TargetLowering::TypeScalarizeVector:
+  case TargetLowering::TypeSplitVector:
     break;
-  case WidenVector:
+  case TargetLowering::TypeWidenVector:
     // If the InOp is widened to the same size, convert it.  Otherwise, fall
     // out of the switch and widen the widened input.
     InOp = GetWidenedVector(InOp);
@@ -1653,7 +1664,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {
   unsigned NumOperands = N->getNumOperands();
 
   bool InputWidened = false; // Indicates we need to widen the input.
-  if (getTypeAction(InVT) != WidenVector) {
+  if (getTypeAction(InVT) != TargetLowering::TypeWidenVector) {
     if (WidenVT.getVectorNumElements() % InVT.getVectorNumElements() == 0) {
       // Add undef vectors to widen to correct length.
       unsigned NumConcat = WidenVT.getVectorNumElements() /
@@ -1732,7 +1743,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONVERT_RNDSAT(SDNode *N) {
   ISD::CvtCode CvtCode = cast<CvtRndSatSDNode>(N)->getCvtCode();
 
   unsigned InVTNumElts = InVT.getVectorNumElements();
-  if (getTypeAction(InVT) == WidenVector) {
+  if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) {
     InOp = GetWidenedVector(InOp);
     InVT = InOp.getValueType();
     InVTNumElts = InVT.getVectorNumElements();
@@ -1800,7 +1811,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
   SDValue  Idx  = N->getOperand(1);
   DebugLoc dl = N->getDebugLoc();
 
-  if (getTypeAction(InOp.getValueType()) == WidenVector)
+  if (getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector)
     InOp = GetWidenedVector(InOp);
 
   EVT InVT = InOp.getValueType();
@@ -1882,7 +1893,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) {
     EVT CondEltVT = CondVT.getVectorElementType();
     EVT CondWidenVT =  EVT::getVectorVT(*DAG.getContext(),
                                         CondEltVT, WidenNumElts);
-    if (getTypeAction(CondVT) == WidenVector)
+    if (getTypeAction(CondVT) == TargetLowering::TypeWidenVector)
       Cond1 = GetWidenedVector(Cond1);
 
     if (Cond1.getValueType() != CondWidenVT)
@@ -2026,7 +2037,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) {
   DebugLoc dl = N->getDebugLoc();
   unsigned NumElts = VT.getVectorNumElements();
   SDValue InOp = N->getOperand(0);
-  if (getTypeAction(InOp.getValueType()) == WidenVector)
+  if (getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector)
     InOp = GetWidenedVector(InOp);
   EVT InVT = InOp.getValueType();
   EVT InEltVT = InVT.getVectorElementType();
@@ -2081,7 +2092,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_CONCAT_VECTORS(SDNode *N) {
   unsigned NumOperands = N->getNumOperands();
   for (unsigned i=0; i < NumOperands; ++i) {
     SDValue InOp = N->getOperand(i);
-    if (getTypeAction(InOp.getValueType()) == WidenVector)
+    if (getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector)
       InOp = GetWidenedVector(InOp);
     for (unsigned j=0; j < NumInElts; ++j)
       Ops[Idx++] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
index 88bd450..8d61a89 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -1008,14 +1008,15 @@ static void CheckForLiveRegDef(SUnit *SU, unsigned Reg,
   for (const unsigned *AliasI = TRI->getOverlaps(Reg); *AliasI; ++AliasI) {
 
     // Check if Ref is live.
-    if (!LiveRegDefs[Reg]) continue;
+    if (!LiveRegDefs[*AliasI]) continue;
 
     // Allow multiple uses of the same def.
-    if (LiveRegDefs[Reg] == SU) continue;
+    if (LiveRegDefs[*AliasI] == SU) continue;
 
     // Add Reg to the set of interfering live regs.
-    if (RegAdded.insert(Reg))
-      LRegs.push_back(Reg);
+    if (RegAdded.insert(*AliasI)) {
+      LRegs.push_back(*AliasI);
+    }
   }
 }
 
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index c2711c8..68eeb60 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2050,14 +2050,15 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask,
     break;
 
   default:
-    // Allow the target to implement this method for its nodes.
-    if (Op.getOpcode() >= ISD::BUILTIN_OP_END) {
+    if (Op.getOpcode() < ISD::BUILTIN_OP_END)
+      break;
+    // Fallthrough
   case ISD::INTRINSIC_WO_CHAIN:
   case ISD::INTRINSIC_W_CHAIN:
   case ISD::INTRINSIC_VOID:
-      TLI.computeMaskedBitsForTargetNode(Op, Mask, KnownZero, KnownOne, *this,
-                                         Depth);
-    }
+    // Allow the target to implement this method for its nodes.
+    TLI.computeMaskedBitsForTargetNode(Op, Mask, KnownZero, KnownOne, *this,
+                                       Depth);
     return;
   }
 }
@@ -2322,6 +2323,13 @@ bool SelectionDAG::isKnownNeverZero(SDValue Op) const {
     return !C->isZero();
 
   // TODO: Recognize more cases here.
+  switch (Op.getOpcode()) {
+  default: break;
+  case ISD::OR:
+    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
+      return !C->isNullValue();
+    break;
+  }
 
   return false;
 }
@@ -2339,16 +2347,6 @@ bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const {
   return false;
 }
 
-bool SelectionDAG::isVerifiedDebugInfoDesc(SDValue Op) const {
-  GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op);
-  if (!GA) return false;
-  if (GA->getOffset() != 0) return false;
-  const GlobalVariable *GV = dyn_cast<GlobalVariable>(GA->getGlobal());
-  if (!GV) return false;
-  return MF->getMMI().hasDebugInfo();
-}
-
-
 /// getNode - Gets or creates the specified node.
 ///
 SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT) {
@@ -6304,7 +6302,7 @@ SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) {
         Operands[j] = getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                               OperandEltVT,
                               Operand,
-                              getConstant(i, MVT::i32));
+                              getConstant(i, TLI.getPointerTy()));
       } else {
         // A scalar operand; just use it as is.
         Operands[j] = Operand;
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index b02a7b6..7a8a975 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -280,9 +280,36 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, DebugLoc DL,
     }
 
     // Vector/Vector bitcast.
-    return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
+    if (ValueVT.getSizeInBits() == PartVT.getSizeInBits())
+      return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
+
+    assert(PartVT.getVectorNumElements() == ValueVT.getVectorNumElements() &&
+      "Cannot handle this kind of promotion");
+    // Promoted vector extract
+    unsigned NumElts = ValueVT.getVectorNumElements();
+    SmallVector<SDValue, 8> NewOps;
+    for (unsigned i = 0; i < NumElts; ++i) {
+      SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
+        PartVT.getScalarType(), Val ,DAG.getIntPtrConstant(i));
+      SDValue Cast;
+
+      bool Smaller = ValueVT.bitsLE(PartVT);
+
+      Cast = DAG.getNode((Smaller ? ISD::TRUNCATE : ISD::ANY_EXTEND),
+                         DL, ValueVT.getScalarType(), Ext);
+
+      NewOps.push_back(Cast);
+    }
+    return DAG.getNode(ISD::BUILD_VECTOR, DL, ValueVT,
+      &NewOps[0], NewOps.size());
   }
 
+  // Trivial bitcast if the types are the same size and the destination
+  // vector type is legal.
+  if (PartVT.getSizeInBits() == ValueVT.getSizeInBits() &&
+      TLI.isTypeLegal(ValueVT))
+    return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
+
   assert(ValueVT.getVectorElementType() == PartVT &&
          ValueVT.getVectorNumElements() == 1 &&
          "Only trivial scalar-to-vector conversions should get here!");
@@ -446,7 +473,24 @@ static void getCopyToPartsVector(SelectionDAG &DAG, DebugLoc DL,
 
       //SDValue UndefElts = DAG.getUNDEF(VectorTy);
       //Val = DAG.getNode(ISD::CONCAT_VECTORS, DL, PartVT, Val, UndefElts);
-    } else {
+    } else if (PartVT.isVector() &&
+               PartVT.getVectorElementType().bitsGE(
+                 ValueVT.getVectorElementType())&&
+               PartVT.getVectorNumElements() == ValueVT.getVectorNumElements()) {
+
+      // Promoted vector extract
+      unsigned NumElts = ValueVT.getVectorNumElements();
+      SmallVector<SDValue, 8> NewOps;
+      for (unsigned i = 0; i < NumElts; ++i) {
+        SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
+                       ValueVT.getScalarType(), Val ,DAG.getIntPtrConstant(i));
+        SDValue Cast = DAG.getNode(ISD::ANY_EXTEND,
+                       DL, PartVT.getScalarType(), Ext);
+        NewOps.push_back(Cast);
+      }
+      Val = DAG.getNode(ISD::BUILD_VECTOR, DL, PartVT,
+                        &NewOps[0], NewOps.size());
+    } else{
       // Vector -> scalar conversion.
       assert(ValueVT.getVectorElementType() == PartVT &&
              ValueVT.getVectorNumElements() == 1 &&
@@ -783,11 +827,20 @@ void SelectionDAGBuilder::clear() {
   UnusedArgNodeMap.clear();
   PendingLoads.clear();
   PendingExports.clear();
-  DanglingDebugInfoMap.clear();
   CurDebugLoc = DebugLoc();
   HasTailCall = false;
 }
 
+/// clearDanglingDebugInfo - Clear the dangling debug information
+/// map. This function is seperated from the clear so that debug
+/// information that is dangling in a basic block can be properly
+/// resolved in a different basic block. This allows the
+/// SelectionDAG to resolve dangling debug information attached
+/// to PHI nodes.
+void SelectionDAGBuilder::clearDanglingDebugInfo() {
+  DanglingDebugInfoMap.clear();
+}
+
 /// getRoot - Return the current virtual root of the Selection DAG,
 /// flushing any PendingLoad items. This must be done before emitting
 /// a store or any other node that may need to be ordered after any
@@ -1175,6 +1228,10 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
 /// created for it, emit nodes to copy the value into the virtual
 /// registers.
 void SelectionDAGBuilder::CopyToExportRegsIfNeeded(const Value *V) {
+  // Skip empty types
+  if (V->getType()->isEmptyTy())
+    return;
+
   DenseMap<const Value *, unsigned>::iterator VMI = FuncInfo.ValueMap.find(V);
   if (VMI != FuncInfo.ValueMap.end()) {
     assert(!V->use_empty() && "Unused value assigned virtual registers!");
@@ -2810,16 +2867,18 @@ void SelectionDAGBuilder::visitInsertValue(const InsertValueInst &I) {
   SmallVector<SDValue, 4> Values(NumAggValues);
 
   SDValue Agg = getValue(Op0);
-  SDValue Val = getValue(Op1);
   unsigned i = 0;
   // Copy the beginning value(s) from the original aggregate.
   for (; i != LinearIndex; ++i)
     Values[i] = IntoUndef ? DAG.getUNDEF(AggValueVTs[i]) :
                 SDValue(Agg.getNode(), Agg.getResNo() + i);
   // Copy values from the inserted value(s).
-  for (; i != LinearIndex + NumValValues; ++i)
-    Values[i] = FromUndef ? DAG.getUNDEF(AggValueVTs[i]) :
-                SDValue(Val.getNode(), Val.getResNo() + i - LinearIndex);
+  if (NumValValues) {
+    SDValue Val = getValue(Op1);
+    for (; i != LinearIndex + NumValValues; ++i)
+      Values[i] = FromUndef ? DAG.getUNDEF(AggValueVTs[i]) :
+                  SDValue(Val.getNode(), Val.getResNo() + i - LinearIndex);
+  }
   // Copy remaining value(s) from the original aggregate.
   for (; i != NumAggValues; ++i)
     Values[i] = IntoUndef ? DAG.getUNDEF(AggValueVTs[i]) :
@@ -2842,6 +2901,13 @@ void SelectionDAGBuilder::visitExtractValue(const ExtractValueInst &I) {
   ComputeValueVTs(TLI, ValTy, ValValueVTs);
 
   unsigned NumValValues = ValValueVTs.size();
+
+  // Ignore a extractvalue that produces an empty object
+  if (!NumValValues) {
+    setValue(&I, DAG.getUNDEF(MVT(MVT::Other)));
+    return;
+  }
+
   SmallVector<SDValue, 4> Values(NumValValues);
 
   SDValue Agg = getValue(Op0);
@@ -4013,6 +4079,24 @@ static SDValue ExpandPowI(DebugLoc DL, SDValue LHS, SDValue RHS,
   return DAG.getNode(ISD::FPOWI, DL, LHS.getValueType(), LHS, RHS);
 }
 
+// getTruncatedArgReg - Find underlying register used for an truncated
+// argument.
+static unsigned getTruncatedArgReg(const SDValue &N) {
+  if (N.getOpcode() != ISD::TRUNCATE)
+    return 0;
+
+  const SDValue &Ext = N.getOperand(0);
+  if (Ext.getOpcode() == ISD::AssertZext || Ext.getOpcode() == ISD::AssertSext){
+    const SDValue &CFR = Ext.getOperand(0);
+    if (CFR.getOpcode() == ISD::CopyFromReg)
+      return cast<RegisterSDNode>(CFR.getOperand(1))->getReg();
+    else
+      if (CFR.getOpcode() == ISD::TRUNCATE)
+        return getTruncatedArgReg(CFR);
+  }
+  return 0;
+}
+
 /// EmitFuncArgumentDbgValue - If the DbgValueInst is a dbg_value of a function
 /// argument, create the corresponding DBG_VALUE machine instruction for it now.
 /// At the end of instruction selection, they will be inserted to the entry BB.
@@ -4044,9 +4128,12 @@ SelectionDAGBuilder::EmitFuncArgumentDbgValue(const Value *V, MDNode *Variable,
       Reg = 0;
   }
 
-  if (N.getNode() && N.getOpcode() == ISD::CopyFromReg) {
-    Reg = cast<RegisterSDNode>(N.getOperand(1))->getReg();
-    if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+  if (N.getNode()) {
+    if (N.getOpcode() == ISD::CopyFromReg)
+      Reg = cast<RegisterSDNode>(N.getOperand(1))->getReg();
+    else
+      Reg = getTruncatedArgReg(N);
+    if (Reg && TargetRegisterInfo::isVirtualRegister(Reg)) {
       MachineRegisterInfo &RegInfo = MF.getRegInfo();
       unsigned PR = RegInfo.getLiveInPhysReg(Reg);
       if (PR)
@@ -4208,9 +4295,9 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
           SDV = DAG.getDbgValue(Variable, FINode->getIndex(),
                                 0, dl, SDNodeOrder);
         else {
-          // Can't do anything with other non-AI cases yet.  This might be a
-          // parameter of a callee function that got inlined, for example.
-          DEBUG(dbgs() << "Dropping debug info for " << DI);
+          // Address is an argument, so try to emit its dbg value using
+          // virtual register info from the FuncInfo.ValueMap.
+          EmitFuncArgumentDbgValue(Address, Variable, 0, N);
           return 0;
         }
       } else if (AI)
@@ -4403,7 +4490,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
   }
   case Intrinsic::eh_sjlj_dispatch_setup: {
     DAG.setRoot(DAG.getNode(ISD::EH_SJLJ_DISPATCHSETUP, dl, MVT::Other,
-                            getRoot()));
+                            getRoot(), getValue(I.getArgOperand(0))));
     return 0;
   }
 
@@ -4797,7 +4884,9 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
                 Outs, TLI, &Offsets);
 
   bool CanLowerReturn = TLI.CanLowerReturn(CS.getCallingConv(),
-                        FTy->isVarArg(), Outs, FTy->getContext());
+					   DAG.getMachineFunction(),
+					   FTy->isVarArg(), Outs,
+					   FTy->getContext());
 
   SDValue DemoteStackSlot;
   int DemoteStackIdx = -100;
@@ -4827,8 +4916,14 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
 
   for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
        i != e; ++i) {
-    SDValue ArgNode = getValue(*i);
-    Entry.Node = ArgNode; Entry.Ty = (*i)->getType();
+    const Value *V = *i;
+
+    // Skip empty types
+    if (V->getType()->isEmptyTy())
+      continue;
+
+    SDValue ArgNode = getValue(V);
+    Entry.Node = ArgNode; Entry.Ty = V->getType();
 
     unsigned attrInd = i - CS.arg_begin() + 1;
     Entry.isSExt  = CS.paramHasAttr(attrInd, Attribute::SExt);
@@ -5268,6 +5363,7 @@ public:
 
     const llvm::Type *OpTy = CallOperandVal->getType();
 
+    // FIXME: code duplicated from TargetLowering::ParseConstraints().
     // If this is an indirect operand, the operand is a pointer to the
     // accessed type.
     if (isIndirect) {
@@ -5277,6 +5373,11 @@ public:
       OpTy = PtrTy->getElementType();
     }
 
+    // Look for vector wrapped in a struct. e.g. { <16 x i8> }.
+    if (const StructType *STy = dyn_cast<StructType>(OpTy))
+      if (STy->getNumElements() == 1)
+        OpTy = STy->getElementType(0);
+
     // If OpTy is not a single value, it may be a struct/union that we
     // can tile with integers.
     if (!OpTy->isSingleValueType() && OpTy->isSized()) {
@@ -5555,7 +5656,9 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
 
   std::set<unsigned> OutputRegs, InputRegs;
 
-  TargetLowering::AsmOperandInfoVector TargetConstraints = TLI.ParseConstraints(CS);
+  TargetLowering::AsmOperandInfoVector
+    TargetConstraints = TLI.ParseConstraints(CS);
+
   bool hasMemory = false;
 
   unsigned ArgNo = 0;   // ArgNo - The argument of the CallInst.
@@ -5614,7 +5717,8 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
       hasMemory = true;
     else {
       for (unsigned j = 0, ee = OpInfo.Codes.size(); j != ee; ++j) {
-        TargetLowering::ConstraintType CType = TLI.getConstraintType(OpInfo.Codes[j]);
+        TargetLowering::ConstraintType
+          CType = TLI.getConstraintType(OpInfo.Codes[j]);
         if (CType == TargetLowering::C_Memory) {
           hasMemory = true;
           break;
@@ -5664,12 +5768,17 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
     // need to to provide an address for the memory input.
     if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
         !OpInfo.isIndirect) {
-      assert((OpInfo.isMultipleAlternative || (OpInfo.Type == InlineAsm::isInput)) &&
+      assert((OpInfo.isMultipleAlternative ||
+              (OpInfo.Type == InlineAsm::isInput)) &&
              "Can only indirectify direct input operands!");
 
       // Memory operands really want the address of the value.  If we don't have
       // an indirect input, put it in the constpool if we can, otherwise spill
       // it to a stack slot.
+      // TODO: This isn't quite right. We need to handle these according to
+      // the addressing mode that the constraint wants. Also, this may take
+      // an additional register for the computation and we don't want that
+      // either.
 
       // If the operand is a float, integer, or vector constant, spill to a
       // constant pool entry to get its address.
@@ -5871,7 +5980,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
 
       if (OpInfo.ConstraintType == TargetLowering::C_Other) {
         std::vector<SDValue> Ops;
-        TLI.LowerAsmOperandForConstraint(InOperandVal, OpInfo.ConstraintCode[0],
+        TLI.LowerAsmOperandForConstraint(InOperandVal, OpInfo.ConstraintCode,
                                          Ops, DAG);
         if (Ops.empty())
           report_fatal_error("Invalid operand for inline asm constraint '" +
@@ -6080,14 +6189,15 @@ TargetLowering::LowerCallTo(SDValue Chain, const Type *RetTy,
         Flags.setByVal();
         const PointerType *Ty = cast<PointerType>(Args[i].Ty);
         const Type *ElementTy = Ty->getElementType();
-        unsigned FrameAlign = getByValTypeAlignment(ElementTy);
-        unsigned FrameSize  = getTargetData()->getTypeAllocSize(ElementTy);
+        Flags.setByValSize(getTargetData()->getTypeAllocSize(ElementTy));
         // For ByVal, alignment should come from FE.  BE will guess if this
         // info is not there but there are cases it cannot get right.
+        unsigned FrameAlign;
         if (Args[i].Alignment)
           FrameAlign = Args[i].Alignment;
+        else
+          FrameAlign = getByValTypeAlignment(ElementTy);
         Flags.setByValAlign(FrameAlign);
-        Flags.setByValSize(FrameSize);
       }
       if (Args[i].isNest)
         Flags.setNest();
@@ -6232,6 +6342,25 @@ SelectionDAGBuilder::CopyValueToVirtualRegister(const Value *V, unsigned Reg) {
 
 #include "llvm/CodeGen/SelectionDAGISel.h"
 
+/// isOnlyUsedInEntryBlock - If the specified argument is only used in the
+/// entry block, return true.  This includes arguments used by switches, since
+/// the switch may expand into multiple basic blocks.
+static bool isOnlyUsedInEntryBlock(const Argument *A) {
+  // With FastISel active, we may be splitting blocks, so force creation
+  // of virtual registers for all non-dead arguments.
+  if (EnableFastISel)
+    return A->use_empty();
+
+  const BasicBlock *Entry = A->getParent()->begin();
+  for (Value::const_use_iterator UI = A->use_begin(), E = A->use_end();
+       UI != E; ++UI) {
+    const User *U = *UI;
+    if (cast<Instruction>(U)->getParent() != Entry || isa<SwitchInst>(U))
+      return false;  // Use not in entry block.
+  }
+  return true;
+}
+
 void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) {
   // If this is the entry block, emit arguments.
   const Function &F = *LLVMBB->getParent();
@@ -6286,14 +6415,15 @@ void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) {
         Flags.setByVal();
         const PointerType *Ty = cast<PointerType>(I->getType());
         const Type *ElementTy = Ty->getElementType();
-        unsigned FrameAlign = TLI.getByValTypeAlignment(ElementTy);
-        unsigned FrameSize  = TD->getTypeAllocSize(ElementTy);
+        Flags.setByValSize(TD->getTypeAllocSize(ElementTy));
         // For ByVal, alignment should be passed from FE.  BE will guess if
         // this info is not there but there are cases it cannot get right.
+        unsigned FrameAlign;
         if (F.getParamAlignment(Idx))
           FrameAlign = F.getParamAlignment(Idx);
+        else
+          FrameAlign = TLI.getByValTypeAlignment(ElementTy);
         Flags.setByValAlign(FrameAlign);
-        Flags.setByValSize(FrameSize);
       }
       if (F.paramHasAttr(Idx, Attribute::Nest))
         Flags.setNest();
@@ -6375,8 +6505,8 @@ void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) {
     if (I->use_empty() && NumValues)
       SDB->setUnusedArgValue(I, InVals[i]);
 
-    for (unsigned Value = 0; Value != NumValues; ++Value) {
-      EVT VT = ValueVTs[Value];
+    for (unsigned Val = 0; Val != NumValues; ++Val) {
+      EVT VT = ValueVTs[Val];
       EVT PartVT = TLI.getRegisterType(*CurDAG->getContext(), VT);
       unsigned NumParts = TLI.getNumRegisters(*CurDAG->getContext(), VT);
 
@@ -6395,21 +6525,35 @@ void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) {
       i += NumParts;
     }
 
+    // We don't need to do anything else for unused arguments.
+    if (ArgValues.empty())
+      continue;
+
     // Note down frame index for byval arguments.
-    if (I->hasByValAttr() && !ArgValues.empty())
+    if (I->hasByValAttr())
       if (FrameIndexSDNode *FI =
           dyn_cast<FrameIndexSDNode>(ArgValues[0].getNode()))
         FuncInfo->setByValArgumentFrameIndex(I, FI->getIndex());
 
-    if (!I->use_empty()) {
-      SDValue Res;
-      if (!ArgValues.empty())
-        Res = DAG.getMergeValues(&ArgValues[0], NumValues,
-                                 SDB->getCurDebugLoc());
-      SDB->setValue(I, Res);
-
-      // If this argument is live outside of the entry block, insert a copy from
-      // wherever we got it to the vreg that other BB's will reference it as.
+    SDValue Res = DAG.getMergeValues(&ArgValues[0], NumValues,
+                                     SDB->getCurDebugLoc());
+    SDB->setValue(I, Res);
+
+    // If this argument is live outside of the entry block, insert a copy from
+    // wherever we got it to the vreg that other BB's will reference it as.
+    if (!EnableFastISel && Res.getOpcode() == ISD::CopyFromReg) {
+      // If we can, though, try to skip creating an unnecessary vreg.
+      // FIXME: This isn't very clean... it would be nice to make this more
+      // general.  It's also subtly incompatible with the hacks FastISel
+      // uses with vregs.
+      unsigned Reg = cast<RegisterSDNode>(Res.getOperand(1))->getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        FuncInfo->ValueMap[I] = Reg;
+        continue;
+      }
+    }
+    if (!isOnlyUsedInEntryBlock(I)) {
+      FuncInfo->InitializeRegForValue(I);
       SDB->CopyToExportRegsIfNeeded(I);
     }
   }
@@ -6455,6 +6599,10 @@ SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
       // Ignore dead phi's.
       if (PN->use_empty()) continue;
 
+      // Skip empty types
+      if (PN->getType()->isEmptyTy())
+        continue;
+
       unsigned Reg;
       const Value *PHIOp = PN->getIncomingValueForBlock(LLVMBB);
 
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index a689b76..8376d41 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -332,6 +332,14 @@ public:
   /// consumed.
   void clear();
 
+  /// clearDanglingDebugInfo - Clear the dangling debug information
+  /// map. This function is seperated from the clear so that debug
+  /// information that is dangling in a basic block can be properly
+  /// resolved in a different basic block. This allows the
+  /// SelectionDAG to resolve dangling debug information attached
+  /// to PHI nodes.
+  void clearDanglingDebugInfo();
+
   /// getRoot - Return the current virtual root of the Selection DAG,
   /// flushing any PendingLoad items. This must be done before emitting
   /// a store or any other node that may need to be ordered after any
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index fdf3767..771b008 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -55,17 +55,11 @@
 using namespace llvm;
 
 STATISTIC(NumFastIselFailures, "Number of instructions fast isel failed on");
+STATISTIC(NumFastIselSuccess, "Number of instructions fast isel selected");
 STATISTIC(NumFastIselBlocks, "Number of blocks selected entirely by fast isel");
 STATISTIC(NumDAGBlocks, "Number of blocks selected using DAG");
 STATISTIC(NumDAGIselRetries,"Number of times dag isel has to try another path");
 
-#ifndef NDEBUG
-STATISTIC(NumBBWithOutOfOrderLineInfo,
-          "Number of blocks with out of order line number info");
-STATISTIC(NumMBBWithOutOfOrderLineInfo,
-          "Number of machine blocks with out of order line number info");
-#endif
-
 static cl::opt<bool>
 EnableFastISelVerbose("fast-isel-verbose", cl::Hidden,
           cl::desc("Enable verbose messages in the \"fast\" "
@@ -208,40 +202,6 @@ void SelectionDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
-/// FunctionCallsSetJmp - Return true if the function has a call to setjmp or
-/// other function that gcc recognizes as "returning twice". This is used to
-/// limit code-gen optimizations on the machine function.
-///
-/// FIXME: Remove after <rdar://problem/8031714> is fixed.
-static bool FunctionCallsSetJmp(const Function *F) {
-  const Module *M = F->getParent();
-  static const char *ReturnsTwiceFns[] = {
-    "_setjmp",
-    "setjmp",
-    "sigsetjmp",
-    "setjmp_syscall",
-    "savectx",
-    "qsetjmp",
-    "vfork",
-    "getcontext"
-  };
-#define NUM_RETURNS_TWICE_FNS sizeof(ReturnsTwiceFns) / sizeof(const char *)
-
-  for (unsigned I = 0; I < NUM_RETURNS_TWICE_FNS; ++I)
-    if (const Function *Callee = M->getFunction(ReturnsTwiceFns[I])) {
-      if (!Callee->use_empty())
-        for (Value::const_use_iterator
-               I = Callee->use_begin(), E = Callee->use_end();
-             I != E; ++I)
-          if (const CallInst *CI = dyn_cast<CallInst>(*I))
-            if (CI->getParent()->getParent() == F)
-              return true;
-    }
-
-  return false;
-#undef NUM_RETURNS_TWICE_FNS
-}
-
 /// SplitCriticalSideEffectEdges - Look for critical edges with a PHI value that
 /// may trap on it.  In this case we have to split the edge so that the path
 /// through the predecessor block that doesn't go to the phi block doesn't
@@ -392,7 +352,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
   }
 
   // Determine if there is a call to setjmp in the machine function.
-  MF->setCallsSetJmp(FunctionCallsSetJmp(&Fn));
+  MF->setCallsSetJmp(Fn.callsFunctionThatReturnsTwice());
 
   // Replace forward-declared registers with the registers containing
   // the desired value.
@@ -570,7 +530,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
 
   {
     NamedRegionTimer T("DAG Legalization", GroupName, TimePassesIsEnabled);
-    CurDAG->Legalize(OptLevel);
+    CurDAG->Legalize();
   }
 
   DEBUG(dbgs() << "Legalized selection DAG: BB#" << BlockNumber
@@ -819,48 +779,6 @@ bool SelectionDAGISel::TryToFoldFastISelLoad(const LoadInst *LI,
   return FastIS->TryToFoldLoad(User, RI.getOperandNo(), LI);
 }
 
-#ifndef NDEBUG
-/// CheckLineNumbers - Check if basic block instructions follow source order
-/// or not.
-static void CheckLineNumbers(const BasicBlock *BB) {
-  unsigned Line = 0;
-  unsigned Col = 0;
-  for (BasicBlock::const_iterator BI = BB->begin(),
-         BE = BB->end(); BI != BE; ++BI) {
-    const DebugLoc DL = BI->getDebugLoc();
-    if (DL.isUnknown()) continue;
-    unsigned L = DL.getLine();
-    unsigned C = DL.getCol();
-    if (L < Line || (L == Line && C < Col)) {
-      ++NumBBWithOutOfOrderLineInfo;
-      return;
-    }
-    Line = L;
-    Col = C;
-  }
-}
-
-/// CheckLineNumbers - Check if machine basic block instructions follow source
-/// order or not.
-static void CheckLineNumbers(const MachineBasicBlock *MBB) {
-  unsigned Line = 0;
-  unsigned Col = 0;
-  for (MachineBasicBlock::const_iterator MBI = MBB->begin(),
-         MBE = MBB->end(); MBI != MBE; ++MBI) {
-    const DebugLoc DL = MBI->getDebugLoc();
-    if (DL.isUnknown()) continue;
-    unsigned L = DL.getLine();
-    unsigned C = DL.getCol();
-    if (L < Line || (L == Line && C < Col)) {
-      ++NumMBBWithOutOfOrderLineInfo;
-      return;
-    }
-    Line = L;
-    Col = C;
-  }
-}
-#endif
-
 /// isFoldedOrDeadInstruction - Return true if the specified instruction is
 /// side-effect free and is either dead or folded into a generated instruction.
 /// Return false if it needs to be emitted.
@@ -883,9 +801,6 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
   for (ReversePostOrderTraversal<const Function*>::rpo_iterator
        I = RPOT.begin(), E = RPOT.end(); I != E; ++I) {
     const BasicBlock *LLVMBB = *I;
-#ifndef NDEBUG
-    CheckLineNumbers(LLVMBB);
-#endif
 
     if (OptLevel != CodeGenOpt::None) {
       bool AllPredsVisited = true;
@@ -961,6 +876,7 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
 
         // Try to select the instruction with FastISel.
         if (FastIS->SelectInstruction(Inst)) {
+          ++NumFastIselSuccess;
           // If fast isel succeeded, skip over all the folded instructions, and
           // then see if there is a load right before the selected instructions.
           // Try to fold the load if so.
@@ -1004,9 +920,14 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
           continue;
         }
 
-        // Otherwise, give up on FastISel for the rest of the block.
-        // For now, be a little lenient about non-branch terminators.
-        if (!isa<TerminatorInst>(Inst) || isa<BranchInst>(Inst)) {
+        if (isa<TerminatorInst>(Inst) && !isa<BranchInst>(Inst)) {
+          // Don't abort, and use a different message for terminator misses.
+          ++NumFastIselFailures;
+          if (EnableFastISelVerbose || EnableFastISelAbort) {
+            dbgs() << "FastISel missed terminator: ";
+            Inst->dump();
+          }
+        } else {
           ++NumFastIselFailures;
           if (EnableFastISelVerbose || EnableFastISelAbort) {
             dbgs() << "FastISel miss: ";
@@ -1041,11 +962,7 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
   }
 
   delete FastIS;
-#ifndef NDEBUG
-  for (MachineFunction::const_iterator MBI = MF->begin(), MBE = MF->end();
-       MBI != MBE; ++MBI)
-    CheckLineNumbers(MBI);
-#endif
+  SDB->clearDanglingDebugInfo();
 }
 
 void
@@ -2677,11 +2594,45 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
       // instructions that access memory and for ComplexPatterns that match
       // loads.
       if (EmitNodeInfo & OPFL_MemRefs) {
+        // Only attach load or store memory operands if the generated
+        // instruction may load or store.
+        const TargetInstrDesc &TID = TM.getInstrInfo()->get(TargetOpc);
+        bool mayLoad = TID.mayLoad();
+        bool mayStore = TID.mayStore();
+
+        unsigned NumMemRefs = 0;
+        for (SmallVector<MachineMemOperand*, 2>::const_iterator I =
+             MatchedMemRefs.begin(), E = MatchedMemRefs.end(); I != E; ++I) {
+          if ((*I)->isLoad()) {
+            if (mayLoad)
+              ++NumMemRefs;
+          } else if ((*I)->isStore()) {
+            if (mayStore)
+              ++NumMemRefs;
+          } else {
+            ++NumMemRefs;
+          }
+        }
+
         MachineSDNode::mmo_iterator MemRefs =
-          MF->allocateMemRefsArray(MatchedMemRefs.size());
-        std::copy(MatchedMemRefs.begin(), MatchedMemRefs.end(), MemRefs);
+          MF->allocateMemRefsArray(NumMemRefs);
+
+        MachineSDNode::mmo_iterator MemRefsPos = MemRefs;
+        for (SmallVector<MachineMemOperand*, 2>::const_iterator I =
+             MatchedMemRefs.begin(), E = MatchedMemRefs.end(); I != E; ++I) {
+          if ((*I)->isLoad()) {
+            if (mayLoad)
+              *MemRefsPos++ = *I;
+          } else if ((*I)->isStore()) {
+            if (mayStore)
+              *MemRefsPos++ = *I;
+          } else {
+            *MemRefsPos++ = *I;
+          }
+        }
+
         cast<MachineSDNode>(Res)
-          ->setMemRefs(MemRefs, MemRefs + MatchedMemRefs.size());
+          ->setMemRefs(MemRefs, MemRefs + NumMemRefs);
       }
 
       DEBUG(errs() << "  "
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 15606af..cf6069a 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -26,11 +26,19 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include <cctype>
 using namespace llvm;
 
+/// We are in the process of implementing a new TypeLegalization action
+/// - the promotion of vector elements. This feature is disabled by default
+/// and only enabled using this flag.
+static cl::opt<bool>
+AllowPromoteIntElem("promote-elements", cl::Hidden,
+  cl::desc("Allow promotion of integer vector element types"));
+
 namespace llvm {
 TLSModel::Model getTLSModel(const GlobalValue *GV, Reloc::Model reloc) {
   bool isLocal = GV->hasLocalLinkage();
@@ -528,7 +536,8 @@ static void InitCmpLibcallCCs(ISD::CondCode *CCs) {
 /// NOTE: The constructor takes ownership of TLOF.
 TargetLowering::TargetLowering(const TargetMachine &tm,
                                const TargetLoweringObjectFile *tlof)
-  : TM(tm), TD(TM.getTargetData()), TLOF(*tlof) {
+  : TM(tm), TD(TM.getTargetData()), TLOF(*tlof),
+  mayPromoteElements(AllowPromoteIntElem) {
   // All operations default to being supported.
   memset(OpActions, 0, sizeof(OpActions));
   memset(LoadExtActions, 0, sizeof(LoadExtActions));
@@ -596,6 +605,8 @@ TargetLowering::TargetLowering(const TargetMachine &tm,
   SchedPreferenceInfo = Sched::Latency;
   JumpBufSize = 0;
   JumpBufAlignment = 0;
+  MinFunctionAlignment = 0;
+  PrefFunctionAlignment = 0;
   PrefLoopAlignment = 0;
   MinStackArgumentAlignment = 1;
   ShouldFoldAtomicFences = false;
@@ -747,7 +758,7 @@ void TargetLowering::computeRegisterProperties() {
     NumRegistersForVT[ExpandedReg] = 2*NumRegistersForVT[ExpandedReg-1];
     RegisterTypeForVT[ExpandedReg] = (MVT::SimpleValueType)LargestIntReg;
     TransformToType[ExpandedReg] = (MVT::SimpleValueType)(ExpandedReg - 1);
-    ValueTypeActions.setTypeAction(ExpandedVT, Expand);
+    ValueTypeActions.setTypeAction(ExpandedVT, TypeExpandInteger);
   }
 
   // Inspect all of the ValueType's smaller than the largest integer
@@ -761,7 +772,7 @@ void TargetLowering::computeRegisterProperties() {
     } else {
       RegisterTypeForVT[IntReg] = TransformToType[IntReg] =
         (MVT::SimpleValueType)LegalIntReg;
-      ValueTypeActions.setTypeAction(IVT, Promote);
+      ValueTypeActions.setTypeAction(IVT, TypePromoteInteger);
     }
   }
 
@@ -770,7 +781,7 @@ void TargetLowering::computeRegisterProperties() {
     NumRegistersForVT[MVT::ppcf128] = 2*NumRegistersForVT[MVT::f64];
     RegisterTypeForVT[MVT::ppcf128] = MVT::f64;
     TransformToType[MVT::ppcf128] = MVT::f64;
-    ValueTypeActions.setTypeAction(MVT::ppcf128, Expand);
+    ValueTypeActions.setTypeAction(MVT::ppcf128, TypeExpandFloat);
   }
 
   // Decide how to handle f64. If the target does not have native f64 support,
@@ -779,7 +790,7 @@ void TargetLowering::computeRegisterProperties() {
     NumRegistersForVT[MVT::f64] = NumRegistersForVT[MVT::i64];
     RegisterTypeForVT[MVT::f64] = RegisterTypeForVT[MVT::i64];
     TransformToType[MVT::f64] = MVT::i64;
-    ValueTypeActions.setTypeAction(MVT::f64, Expand);
+    ValueTypeActions.setTypeAction(MVT::f64, TypeSoftenFloat);
   }
 
   // Decide how to handle f32. If the target does not have native support for
@@ -789,12 +800,12 @@ void TargetLowering::computeRegisterProperties() {
       NumRegistersForVT[MVT::f32] = NumRegistersForVT[MVT::f64];
       RegisterTypeForVT[MVT::f32] = RegisterTypeForVT[MVT::f64];
       TransformToType[MVT::f32] = MVT::f64;
-      ValueTypeActions.setTypeAction(MVT::f32, Promote);
+      ValueTypeActions.setTypeAction(MVT::f32, TypePromoteInteger);
     } else {
       NumRegistersForVT[MVT::f32] = NumRegistersForVT[MVT::i32];
       RegisterTypeForVT[MVT::f32] = RegisterTypeForVT[MVT::i32];
       TransformToType[MVT::f32] = MVT::i32;
-      ValueTypeActions.setTypeAction(MVT::f32, Expand);
+      ValueTypeActions.setTypeAction(MVT::f32, TypeSoftenFloat);
     }
   }
 
@@ -810,6 +821,30 @@ void TargetLowering::computeRegisterProperties() {
     unsigned NElts = VT.getVectorNumElements();
     if (NElts != 1) {
       bool IsLegalWiderType = false;
+      // If we allow the promotion of vector elements using a flag,
+      // then return TypePromoteInteger on vector elements.
+      // First try to promote the elements of integer vectors. If no legal
+      // promotion was found, fallback to the widen-vector method.
+      if (mayPromoteElements)
+      for (unsigned nVT = i+1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
+        EVT SVT = (MVT::SimpleValueType)nVT;
+        // Promote vectors of integers to vectors with the same number
+        // of elements, with a wider element type.
+        if (SVT.getVectorElementType().getSizeInBits() > EltVT.getSizeInBits()
+            && SVT.getVectorNumElements() == NElts &&
+            isTypeLegal(SVT) && SVT.getScalarType().isInteger()) {
+          TransformToType[i] = SVT;
+          RegisterTypeForVT[i] = SVT;
+          NumRegistersForVT[i] = 1;
+          ValueTypeActions.setTypeAction(VT, TypePromoteInteger);
+          IsLegalWiderType = true;
+          break;
+        }
+      }
+
+      if (IsLegalWiderType) continue;
+
+      // Try to widen the vector.
       for (unsigned nVT = i+1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
         EVT SVT = (MVT::SimpleValueType)nVT;
         if (SVT.getVectorElementType() == EltVT &&
@@ -818,7 +853,7 @@ void TargetLowering::computeRegisterProperties() {
           TransformToType[i] = SVT;
           RegisterTypeForVT[i] = SVT;
           NumRegistersForVT[i] = 1;
-          ValueTypeActions.setTypeAction(VT, Promote);
+          ValueTypeActions.setTypeAction(VT, TypeWidenVector);
           IsLegalWiderType = true;
           break;
         }
@@ -838,10 +873,12 @@ void TargetLowering::computeRegisterProperties() {
     if (NVT == VT) {
       // Type is already a power of 2.  The default action is to split.
       TransformToType[i] = MVT::Other;
-      ValueTypeActions.setTypeAction(VT, Expand);
+      unsigned NumElts = VT.getVectorNumElements();
+      ValueTypeActions.setTypeAction(VT,
+            NumElts > 1 ? TypeSplitVector : TypeScalarizeVector);
     } else {
       TransformToType[i] = NVT;
-      ValueTypeActions.setTypeAction(VT, Promote);
+      ValueTypeActions.setTypeAction(VT, TypeWidenVector);
     }
   }
 
@@ -890,7 +927,7 @@ unsigned TargetLowering::getVectorTypeBreakdown(LLVMContext &Context, EVT VT,
   // If there is a wider vector type with the same element type as this one,
   // we should widen to that legal vector type.  This handles things like
   // <2 x float> -> <4 x float>.
-  if (NumElts != 1 && getTypeAction(VT) == Promote) {
+  if (NumElts != 1 && getTypeAction(Context, VT) == TypeWidenVector) {
     RegisterVT = getTypeToTransformTo(Context, VT);
     if (isTypeLegal(RegisterVT)) {
       IntermediateVT = RegisterVT;
@@ -1723,26 +1760,28 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
     break;
   }
   case ISD::BITCAST:
-#if 0
-    // If this is an FP->Int bitcast and if the sign bit is the only thing that
-    // is demanded, turn this into a FGETSIGN.
-    if (NewMask == EVT::getIntegerVTSignBit(Op.getValueType()) &&
-        MVT::isFloatingPoint(Op.getOperand(0).getValueType()) &&
-        !MVT::isVector(Op.getOperand(0).getValueType())) {
-      // Only do this xform if FGETSIGN is valid or if before legalize.
-      if (!TLO.AfterLegalize ||
-          isOperationLegal(ISD::FGETSIGN, Op.getValueType())) {
+    // If this is an FP->Int bitcast and if the sign bit is the only
+    // thing demanded, turn this into a FGETSIGN.
+    if (NewMask == APInt::getSignBit(Op.getValueType().getSizeInBits()) &&
+        Op.getOperand(0).getValueType().isFloatingPoint() &&
+        !Op.getOperand(0).getValueType().isVector()) {
+      bool OpVTLegal = isOperationLegalOrCustom(ISD::FGETSIGN, Op.getValueType());
+      bool i32Legal  = isOperationLegalOrCustom(ISD::FGETSIGN, MVT::i32);
+      if ((OpVTLegal || i32Legal) && Op.getValueType().isSimple()) {
+        EVT Ty = OpVTLegal ? Op.getValueType() : MVT::i32;
         // Make a FGETSIGN + SHL to move the sign bit into the appropriate
         // place.  We expect the SHL to be eliminated by other optimizations.
-        SDValue Sign = TLO.DAG.getNode(ISD::FGETSIGN, Op.getValueType(),
-                                         Op.getOperand(0));
+        SDValue Sign = TLO.DAG.getNode(ISD::FGETSIGN, dl, Ty, Op.getOperand(0));
+        unsigned OpVTSizeInBits = Op.getValueType().getSizeInBits();
+        if (!OpVTLegal && OpVTSizeInBits > 32)
+          Sign = TLO.DAG.getNode(ISD::ZERO_EXTEND, dl, Op.getValueType(), Sign);
         unsigned ShVal = Op.getValueType().getSizeInBits()-1;
-        SDValue ShAmt = TLO.DAG.getConstant(ShVal, getShiftAmountTy());
-        return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, Op.getValueType(),
+        SDValue ShAmt = TLO.DAG.getConstant(ShVal, Op.getValueType());
+        return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, dl,
+                                                 Op.getValueType(),
                                                  Sign, ShAmt));
       }
     }
-#endif
     break;
   case ISD::ADD:
   case ISD::MUL:
@@ -2619,9 +2658,13 @@ const char *TargetLowering::LowerXConstraint(EVT ConstraintVT) const{
 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
 /// vector.  If it is invalid, don't add anything to Ops.
 void TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
-                                                  char ConstraintLetter,
+                                                  std::string &Constraint,
                                                   std::vector<SDValue> &Ops,
                                                   SelectionDAG &DAG) const {
+  
+  if (Constraint.length() > 1) return;
+  
+  char ConstraintLetter = Constraint[0];
   switch (ConstraintLetter) {
   default: break;
   case 'X':     // Allows any operand; labels (basic block) use this.
@@ -2810,6 +2853,12 @@ TargetLowering::AsmOperandInfoVector TargetLowering::ParseConstraints(
           report_fatal_error("Indirect operand for inline asm not a pointer!");
         OpTy = PtrTy->getElementType();
       }
+      
+      // Look for vector wrapped in a struct. e.g. { <16 x i8> }.
+      if (const StructType *STy = dyn_cast<StructType>(OpTy))
+        if (STy->getNumElements() == 1)
+          OpTy = STy->getElementType(0);
+
       // If OpTy is not a single value, it may be a struct/union that we
       // can tile with integers.
       if (!OpTy->isSingleValueType() && OpTy->isSized()) {
@@ -3054,7 +3103,7 @@ static void ChooseConstraint(TargetLowering::AsmOperandInfo &OpInfo,
       assert(OpInfo.Codes[i].size() == 1 &&
              "Unhandled multi-letter 'other' constraint");
       std::vector<SDValue> ResultOps;
-      TLI.LowerAsmOperandForConstraint(Op, OpInfo.Codes[i][0],
+      TLI.LowerAsmOperandForConstraint(Op, OpInfo.Codes[i],
                                        ResultOps, *DAG);
       if (!ResultOps.empty()) {
         BestType = CType;
diff --git a/lib/CodeGen/SimpleRegisterCoalescing.cpp b/lib/CodeGen/SimpleRegisterCoalescing.cpp
index 35b8e14..221bec5 100644
--- a/lib/CodeGen/SimpleRegisterCoalescing.cpp
+++ b/lib/CodeGen/SimpleRegisterCoalescing.cpp
@@ -47,7 +47,6 @@ STATISTIC(numExtends  , "Number of copies extended");
 STATISTIC(NumReMats   , "Number of instructions re-materialized");
 STATISTIC(numPeep     , "Number of identity moves eliminated after coalescing");
 STATISTIC(numAborts   , "Number of times interval joining aborted");
-STATISTIC(numDeadValNo, "Number of valno def marked dead");
 
 char SimpleRegisterCoalescing::ID = 0;
 static cl::opt<bool>
@@ -61,9 +60,9 @@ DisableCrossClassJoin("disable-cross-class-join",
                cl::init(false), cl::Hidden);
 
 static cl::opt<bool>
-DisablePhysicalJoin("disable-physical-join",
-               cl::desc("Avoid coalescing physical register copies"),
-               cl::init(false), cl::Hidden);
+EnablePhysicalJoin("join-physregs",
+                   cl::desc("Join physical register copies"),
+                   cl::init(false), cl::Hidden);
 
 static cl::opt<bool>
 VerifyCoalescing("verify-coalescing",
@@ -253,7 +252,12 @@ bool SimpleRegisterCoalescing::AdjustCopiesBackFrom(const CoalescerPair &CP,
 
   // Okay, merge "B1" into the same value number as "B0".
   if (BValNo != ValLR->valno) {
+    // If B1 is killed by a PHI, then the merged live range must also be killed
+    // by the same PHI, as B0 and B1 can not overlap.
+    bool HasPHIKill = BValNo->hasPHIKill();
     IntB.MergeValueNumberInto(BValNo, ValLR->valno);
+    if (HasPHIKill)
+      ValLR->valno->setHasPHIKill(true);
   }
   DEBUG({
       dbgs() << "   result = ";
@@ -272,7 +276,7 @@ bool SimpleRegisterCoalescing::AdjustCopiesBackFrom(const CoalescerPair &CP,
   // merge, find the last use and trim the live range. That will also add the
   // isKill marker.
   if (ALR->end == CopyIdx)
-    TrimLiveIntervalToLastUse(CopyUseIdx, CopyMI->getParent(), IntA, ALR);
+    li_->shrinkToUses(&IntA);
 
   ++numExtends;
   return true;
@@ -426,6 +430,10 @@ bool SimpleRegisterCoalescing::RemoveCopyByCommutingDef(const CoalescerPair &CP,
   MachineInstr *NewMI = tii_->commuteInstruction(DefMI);
   if (!NewMI)
     return false;
+  if (TargetRegisterInfo::isVirtualRegister(IntA.reg) &&
+      TargetRegisterInfo::isVirtualRegister(IntB.reg) &&
+      !mri_->constrainRegClass(IntB.reg, mri_->getRegClass(IntA.reg)))
+    return false;
   if (NewMI != DefMI) {
     li_->ReplaceMachineInstrInMaps(DefMI, NewMI);
     MBB->insert(DefMI, NewMI);
@@ -503,98 +511,6 @@ bool SimpleRegisterCoalescing::RemoveCopyByCommutingDef(const CoalescerPair &CP,
   return true;
 }
 
-/// isSameOrFallThroughBB - Return true if MBB == SuccMBB or MBB simply
-/// fallthoughs to SuccMBB.
-static bool isSameOrFallThroughBB(MachineBasicBlock *MBB,
-                                  MachineBasicBlock *SuccMBB,
-                                  const TargetInstrInfo *tii_) {
-  if (MBB == SuccMBB)
-    return true;
-  MachineBasicBlock *TBB = 0, *FBB = 0;
-  SmallVector<MachineOperand, 4> Cond;
-  return !tii_->AnalyzeBranch(*MBB, TBB, FBB, Cond) && !TBB && !FBB &&
-    MBB->isSuccessor(SuccMBB);
-}
-
-/// removeRange - Wrapper for LiveInterval::removeRange. This removes a range
-/// from a physical register live interval as well as from the live intervals
-/// of its sub-registers.
-static void removeRange(LiveInterval &li,
-                        SlotIndex Start, SlotIndex End,
-                        LiveIntervals *li_, const TargetRegisterInfo *tri_) {
-  li.removeRange(Start, End, true);
-  if (TargetRegisterInfo::isPhysicalRegister(li.reg)) {
-    for (const unsigned* SR = tri_->getSubRegisters(li.reg); *SR; ++SR) {
-      if (!li_->hasInterval(*SR))
-        continue;
-      LiveInterval &sli = li_->getInterval(*SR);
-      SlotIndex RemoveStart = Start;
-      SlotIndex RemoveEnd = Start;
-
-      while (RemoveEnd != End) {
-        LiveInterval::iterator LR = sli.FindLiveRangeContaining(RemoveStart);
-        if (LR == sli.end())
-          break;
-        RemoveEnd = (LR->end < End) ? LR->end : End;
-        sli.removeRange(RemoveStart, RemoveEnd, true);
-        RemoveStart = RemoveEnd;
-      }
-    }
-  }
-}
-
-/// TrimLiveIntervalToLastUse - If there is a last use in the same basic block
-/// as the copy instruction, trim the live interval to the last use and return
-/// true.
-bool
-SimpleRegisterCoalescing::TrimLiveIntervalToLastUse(SlotIndex CopyIdx,
-                                                    MachineBasicBlock *CopyMBB,
-                                                    LiveInterval &li,
-                                                    const LiveRange *LR) {
-  SlotIndex MBBStart = li_->getMBBStartIdx(CopyMBB);
-  SlotIndex LastUseIdx;
-  MachineOperand *LastUse =
-    lastRegisterUse(LR->start, CopyIdx.getPrevSlot(), li.reg, LastUseIdx);
-  if (LastUse) {
-    MachineInstr *LastUseMI = LastUse->getParent();
-    if (!isSameOrFallThroughBB(LastUseMI->getParent(), CopyMBB, tii_)) {
-      // r1024 = op
-      // ...
-      // BB1:
-      //       = r1024
-      //
-      // BB2:
-      // r1025<dead> = r1024<kill>
-      if (MBBStart < LR->end)
-        removeRange(li, MBBStart, LR->end, li_, tri_);
-      return true;
-    }
-
-    // There are uses before the copy, just shorten the live range to the end
-    // of last use.
-    LastUse->setIsKill();
-    removeRange(li, LastUseIdx.getDefIndex(), LR->end, li_, tri_);
-    if (LastUseMI->isCopy()) {
-      MachineOperand &DefMO = LastUseMI->getOperand(0);
-      if (DefMO.getReg() == li.reg && !DefMO.getSubReg())
-        DefMO.setIsDead();
-    }
-    return true;
-  }
-
-  // Is it livein?
-  if (LR->start <= MBBStart && LR->end > MBBStart) {
-    if (LR->start == li_->getZeroIndex()) {
-      assert(TargetRegisterInfo::isPhysicalRegister(li.reg));
-      // Live-in to the function but dead. Remove it from entry live-in set.
-      mf_->begin()->removeLiveIn(li.reg);
-    }
-    // FIXME: Shorten intervals in BBs that reaches this BB.
-  }
-
-  return false;
-}
-
 /// ReMaterializeTrivialDef - If the source of a copy is defined by a trivial
 /// computation, replace the copy by rematerialize the definition.
 bool SimpleRegisterCoalescing::ReMaterializeTrivialDef(LiveInterval &SrcInt,
@@ -781,26 +697,6 @@ static bool removeIntervalIfEmpty(LiveInterval &li, LiveIntervals *li_,
   return false;
 }
 
-/// ShortenDeadCopyLiveRange - Shorten a live range defined by a dead copy.
-/// Return true if live interval is removed.
-bool SimpleRegisterCoalescing::ShortenDeadCopyLiveRange(LiveInterval &li,
-                                                        MachineInstr *CopyMI) {
-  SlotIndex CopyIdx = li_->getInstructionIndex(CopyMI);
-  LiveInterval::iterator MLR =
-    li.FindLiveRangeContaining(CopyIdx.getDefIndex());
-  if (MLR == li.end())
-    return false;  // Already removed by ShortenDeadCopySrcLiveRange.
-  SlotIndex RemoveStart = MLR->start;
-  SlotIndex RemoveEnd = MLR->end;
-  SlotIndex DefIdx = CopyIdx.getDefIndex();
-  // Remove the liverange that's defined by this.
-  if (RemoveStart == DefIdx && RemoveEnd == DefIdx.getStoreIndex()) {
-    removeRange(li, RemoveStart, RemoveEnd, li_, tri_);
-    return removeIntervalIfEmpty(li, li_, tri_);
-  }
-  return false;
-}
-
 /// RemoveDeadDef - If a def of a live interval is now determined dead, remove
 /// the val# it defines. If the live interval becomes empty, remove it as well.
 bool SimpleRegisterCoalescing::RemoveDeadDef(LiveInterval &li,
@@ -834,84 +730,6 @@ void SimpleRegisterCoalescing::RemoveCopyFlag(unsigned DstReg,
   }
 }
 
-/// PropagateDeadness - Propagate the dead marker to the instruction which
-/// defines the val#.
-static void PropagateDeadness(LiveInterval &li, MachineInstr *CopyMI,
-                              SlotIndex &LRStart, LiveIntervals *li_,
-                              const TargetRegisterInfo* tri_) {
-  MachineInstr *DefMI =
-    li_->getInstructionFromIndex(LRStart.getDefIndex());
-  if (DefMI && DefMI != CopyMI) {
-    int DeadIdx = DefMI->findRegisterDefOperandIdx(li.reg);
-    if (DeadIdx != -1)
-      DefMI->getOperand(DeadIdx).setIsDead();
-    else
-      DefMI->addOperand(MachineOperand::CreateReg(li.reg,
-                   /*def*/true, /*implicit*/true, /*kill*/false, /*dead*/true));
-    LRStart = LRStart.getNextSlot();
-  }
-}
-
-/// ShortenDeadCopySrcLiveRange - Shorten a live range as it's artificially
-/// extended by a dead copy. Mark the last use (if any) of the val# as kill as
-/// ends the live range there. If there isn't another use, then this live range
-/// is dead. Return true if live interval is removed.
-bool
-SimpleRegisterCoalescing::ShortenDeadCopySrcLiveRange(LiveInterval &li,
-                                                      MachineInstr *CopyMI) {
-  SlotIndex CopyIdx = li_->getInstructionIndex(CopyMI);
-  if (CopyIdx == SlotIndex()) {
-    // FIXME: special case: function live in. It can be a general case if the
-    // first instruction index starts at > 0 value.
-    assert(TargetRegisterInfo::isPhysicalRegister(li.reg));
-    // Live-in to the function but dead. Remove it from entry live-in set.
-    if (mf_->begin()->isLiveIn(li.reg))
-      mf_->begin()->removeLiveIn(li.reg);
-    if (const LiveRange *LR = li.getLiveRangeContaining(CopyIdx))
-      removeRange(li, LR->start, LR->end, li_, tri_);
-    return removeIntervalIfEmpty(li, li_, tri_);
-  }
-
-  LiveInterval::iterator LR =
-    li.FindLiveRangeContaining(CopyIdx.getPrevIndex().getStoreIndex());
-  if (LR == li.end())
-    // Livein but defined by a phi.
-    return false;
-
-  SlotIndex RemoveStart = LR->start;
-  SlotIndex RemoveEnd = CopyIdx.getStoreIndex();
-  if (LR->end > RemoveEnd)
-    // More uses past this copy? Nothing to do.
-    return false;
-
-  // If there is a last use in the same bb, we can't remove the live range.
-  // Shorten the live interval and return.
-  MachineBasicBlock *CopyMBB = CopyMI->getParent();
-  if (TrimLiveIntervalToLastUse(CopyIdx, CopyMBB, li, LR))
-    return false;
-
-  // There are other kills of the val#. Nothing to do.
-  if (!li.isOnlyLROfValNo(LR))
-    return false;
-
-  MachineBasicBlock *StartMBB = li_->getMBBFromIndex(RemoveStart);
-  if (!isSameOrFallThroughBB(StartMBB, CopyMBB, tii_))
-    // If the live range starts in another mbb and the copy mbb is not a fall
-    // through mbb, then we can only cut the range from the beginning of the
-    // copy mbb.
-    RemoveStart = li_->getMBBStartIdx(CopyMBB).getNextIndex().getBaseIndex();
-
-  if (LR->valno->def == RemoveStart) {
-    // If the def MI defines the val# and this copy is the only kill of the
-    // val#, then propagate the dead marker.
-    PropagateDeadness(li, CopyMI, RemoveStart, li_, tri_);
-    ++numDeadValNo;
-  }
-
-  removeRange(li, RemoveStart, RemoveEnd, li_, tri_);
-  return removeIntervalIfEmpty(li, li_, tri_);
-}
-
 /// shouldJoinPhys - Return true if a copy involving a physreg should be joined.
 /// We need to be careful about coalescing a source physical register with a
 /// virtual register. Once the coalescing is done, it cannot be broken and these
@@ -927,7 +745,7 @@ bool SimpleRegisterCoalescing::shouldJoinPhys(CoalescerPair &CP) {
   if (!Allocatable && CP.isFlipped() && JoinVInt.containsOneValue())
     return true;
 
-  if (DisablePhysicalJoin) {
+  if (!EnablePhysicalJoin) {
     DEBUG(dbgs() << "\tPhysreg joins disabled.\n");
     return false;
   }
@@ -954,7 +772,7 @@ bool SimpleRegisterCoalescing::shouldJoinPhys(CoalescerPair &CP) {
   //        CodeGen/X86/phys_subreg_coalesce-3.ll needs it.
   if (!CP.isPartial()) {
     const TargetRegisterClass *RC = mri_->getRegClass(CP.getSrcReg());
-    unsigned Threshold = allocatableRCRegs_[RC].count() * 2;
+    unsigned Threshold = RegClassInfo.getNumAllocatableRegs(RC) * 2;
     unsigned Length = li_->getApproximateInstructionCount(JoinVInt);
     if (Length > Threshold) {
       ++numAborts;
@@ -973,7 +791,7 @@ SimpleRegisterCoalescing::isWinToJoinCrossClass(unsigned SrcReg,
                                              const TargetRegisterClass *SrcRC,
                                              const TargetRegisterClass *DstRC,
                                              const TargetRegisterClass *NewRC) {
-  unsigned NewRCCount = allocatableRCRegs_[NewRC].count();
+  unsigned NewRCCount = RegClassInfo.getNumAllocatableRegs(NewRC);
   // This heuristics is good enough in practice, but it's obviously not *right*.
   // 4 is a magic number that works well enough for x86, ARM, etc. It filter
   // out all but the most restrictive register classes.
@@ -987,8 +805,14 @@ SimpleRegisterCoalescing::isWinToJoinCrossClass(unsigned SrcReg,
   LiveInterval &DstInt = li_->getInterval(DstReg);
   unsigned SrcSize = li_->getApproximateInstructionCount(SrcInt);
   unsigned DstSize = li_->getApproximateInstructionCount(DstInt);
-  if (SrcSize <= NewRCCount && DstSize <= NewRCCount)
+
+  // Coalesce aggressively if the intervals are small compared to the number of
+  // registers in the new class. The number 4 is fairly arbitrary, chosen to be
+  // less aggressive than the 8 used for the whole function size.
+  const unsigned ThresSize = 4 * NewRCCount;
+  if (SrcSize <= ThresSize && DstSize <= ThresSize)
     return true;
+
   // Estimate *register use density*. If it doubles or more, abort.
   unsigned SrcUses = std::distance(mri_->use_nodbg_begin(SrcReg),
                                    mri_->use_nodbg_end());
@@ -996,13 +820,13 @@ SimpleRegisterCoalescing::isWinToJoinCrossClass(unsigned SrcReg,
                                    mri_->use_nodbg_end());
   unsigned NewUses = SrcUses + DstUses;
   unsigned NewSize = SrcSize + DstSize;
-  if (SrcRC != NewRC && SrcSize > NewRCCount) {
-    unsigned SrcRCCount = allocatableRCRegs_[SrcRC].count();
+  if (SrcRC != NewRC && SrcSize > ThresSize) {
+    unsigned SrcRCCount = RegClassInfo.getNumAllocatableRegs(SrcRC);
     if (NewUses*SrcSize*SrcRCCount > 2*SrcUses*NewSize*NewRCCount)
       return false;
   }
-  if (DstRC != NewRC && DstSize > NewRCCount) {
-    unsigned DstRCCount = allocatableRCRegs_[DstRC].count();
+  if (DstRC != NewRC && DstSize > ThresSize) {
+    unsigned DstRCCount = RegClassInfo.getNumAllocatableRegs(DstRC);
     if (NewUses*DstSize*DstRCCount > 2*DstUses*NewSize*NewRCCount)
       return false;
   }
@@ -1032,6 +856,7 @@ bool SimpleRegisterCoalescing::JoinCopy(CopyRec &TheCopy, bool &Again) {
 
   // If they are already joined we continue.
   if (CP.getSrcReg() == CP.getDstReg()) {
+    markAsJoined(CopyMI);
     DEBUG(dbgs() << "\tCopy already coalesced.\n");
     return false;  // Not coalescable.
   }
@@ -1551,81 +1376,6 @@ void SimpleRegisterCoalescing::joinIntervals() {
   }
 }
 
-/// Return true if the two specified registers belong to different register
-/// classes.  The registers may be either phys or virt regs.
-bool
-SimpleRegisterCoalescing::differingRegisterClasses(unsigned RegA,
-                                                   unsigned RegB) const {
-  // Get the register classes for the first reg.
-  if (TargetRegisterInfo::isPhysicalRegister(RegA)) {
-    assert(TargetRegisterInfo::isVirtualRegister(RegB) &&
-           "Shouldn't consider two physregs!");
-    return !mri_->getRegClass(RegB)->contains(RegA);
-  }
-
-  // Compare against the regclass for the second reg.
-  const TargetRegisterClass *RegClassA = mri_->getRegClass(RegA);
-  if (TargetRegisterInfo::isVirtualRegister(RegB)) {
-    const TargetRegisterClass *RegClassB = mri_->getRegClass(RegB);
-    return RegClassA != RegClassB;
-  }
-  return !RegClassA->contains(RegB);
-}
-
-/// lastRegisterUse - Returns the last (non-debug) use of the specific register
-/// between cycles Start and End or NULL if there are no uses.
-MachineOperand *
-SimpleRegisterCoalescing::lastRegisterUse(SlotIndex Start,
-                                          SlotIndex End,
-                                          unsigned Reg,
-                                          SlotIndex &UseIdx) const{
-  UseIdx = SlotIndex();
-  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
-    MachineOperand *LastUse = NULL;
-    for (MachineRegisterInfo::use_nodbg_iterator I = mri_->use_nodbg_begin(Reg),
-           E = mri_->use_nodbg_end(); I != E; ++I) {
-      MachineOperand &Use = I.getOperand();
-      MachineInstr *UseMI = Use.getParent();
-      if (UseMI->isIdentityCopy())
-        continue;
-      SlotIndex Idx = li_->getInstructionIndex(UseMI);
-      if (Idx >= Start && Idx < End && (!UseIdx.isValid() || Idx >= UseIdx)) {
-        LastUse = &Use;
-        UseIdx = Idx.getUseIndex();
-      }
-    }
-    return LastUse;
-  }
-
-  SlotIndex s = Start;
-  SlotIndex e = End.getPrevSlot().getBaseIndex();
-  while (e >= s) {
-    // Skip deleted instructions
-    MachineInstr *MI = li_->getInstructionFromIndex(e);
-    while (e != SlotIndex() && e.getPrevIndex() >= s && !MI) {
-      e = e.getPrevIndex();
-      MI = li_->getInstructionFromIndex(e);
-    }
-    if (e < s || MI == NULL)
-      return NULL;
-
-    // Ignore identity copies.
-    if (!MI->isIdentityCopy())
-      for (unsigned i = 0, NumOps = MI->getNumOperands(); i != NumOps; ++i) {
-        MachineOperand &Use = MI->getOperand(i);
-        if (Use.isReg() && Use.isUse() && Use.getReg() &&
-            tri_->regsOverlap(Use.getReg(), Reg)) {
-          UseIdx = e.getUseIndex();
-          return &Use;
-        }
-      }
-
-    e = e.getPrevIndex();
-  }
-
-  return NULL;
-}
-
 void SimpleRegisterCoalescing::releaseMemory() {
   JoinedCopies.clear();
   ReMatCopies.clear();
@@ -1650,10 +1400,7 @@ bool SimpleRegisterCoalescing::runOnMachineFunction(MachineFunction &fn) {
   if (VerifyCoalescing)
     mf_->verify(this, "Before register coalescing");
 
-  for (TargetRegisterInfo::regclass_iterator I = tri_->regclass_begin(),
-         E = tri_->regclass_end(); I != E; ++I)
-    allocatableRCRegs_.insert(std::make_pair(*I,
-                                             tri_->getAllocatableSet(fn, *I)));
+  RegClassInfo.runOnMachineFunction(fn);
 
   // Join (coalesce) intervals if requested.
   if (EnableJoining) {
@@ -1690,13 +1437,11 @@ bool SimpleRegisterCoalescing::runOnMachineFunction(MachineFunction &fn) {
           // or else the scavenger may complain. LowerSubregs will
           // delete them later.
           DoDelete = false;
-        
+
         if (MI->allDefsAreDead()) {
-          if (li_->hasInterval(SrcReg)) {
-            LiveInterval &li = li_->getInterval(SrcReg);
-            if (!ShortenDeadCopySrcLiveRange(li, MI))
-              ShortenDeadCopyLiveRange(li, MI);
-          }
+          if (TargetRegisterInfo::isVirtualRegister(SrcReg) &&
+              li_->hasInterval(SrcReg))
+            li_->shrinkToUses(&li_->getInterval(SrcReg));
           DoDelete = true;
         }
         if (!DoDelete) {
@@ -1748,24 +1493,6 @@ bool SimpleRegisterCoalescing::runOnMachineFunction(MachineFunction &fn) {
           DeadDefs.clear();
       }
 
-      // If the move will be an identity move delete it
-      if (MI->isIdentityCopy()) {
-        unsigned SrcReg = MI->getOperand(1).getReg();
-        if (li_->hasInterval(SrcReg)) {
-          LiveInterval &RegInt = li_->getInterval(SrcReg);
-          // If def of this move instruction is dead, remove its live range
-          // from the destination register's live interval.
-          if (MI->allDefsAreDead()) {
-            if (!ShortenDeadCopySrcLiveRange(RegInt, MI))
-              ShortenDeadCopyLiveRange(RegInt, MI);
-          }
-        }
-        li_->RemoveMachineInstrFromMaps(MI);
-        mii = mbbi->erase(mii);
-        ++numPeep;
-        continue;
-      }
-
       ++mii;
 
       // Check for now unnecessary kill flags.
diff --git a/lib/CodeGen/SimpleRegisterCoalescing.h b/lib/CodeGen/SimpleRegisterCoalescing.h
index 65cf542..92f6c64 100644
--- a/lib/CodeGen/SimpleRegisterCoalescing.h
+++ b/lib/CodeGen/SimpleRegisterCoalescing.h
@@ -17,7 +17,7 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/RegisterCoalescer.h"
-#include "llvm/ADT/BitVector.h"
+#include "RegisterClassInfo.h"
 
 namespace llvm {
   class SimpleRegisterCoalescing;
@@ -47,8 +47,7 @@ namespace llvm {
     LiveDebugVariables *ldv_;
     const MachineLoopInfo* loopInfo;
     AliasAnalysis *AA;
-    
-    DenseMap<const TargetRegisterClass*, BitVector> allocatableRCRegs_;
+    RegisterClassInfo RegClassInfo;
 
     /// JoinedCopies - Keep track of copies eliminated due to coalescing.
     ///
@@ -103,10 +102,6 @@ namespace llvm {
     /// use this information below to update aliases.
     bool JoinIntervals(CoalescerPair &CP);
 
-    /// Return true if the two specified registers belong to different register
-    /// classes.  The registers may be either phys or virt regs.
-    bool differingRegisterClasses(unsigned RegA, unsigned RegB) const;
-
     /// AdjustCopiesBackFrom - We found a non-trivially-coalescable copy. If
     /// the source value number is defined by a copy from the destination reg
     /// see if we can merge these two destination reg valno# into a single
@@ -124,13 +119,6 @@ namespace llvm {
     /// can transform the copy into a noop by commuting the definition.
     bool RemoveCopyByCommutingDef(const CoalescerPair &CP,MachineInstr *CopyMI);
 
-    /// TrimLiveIntervalToLastUse - If there is a last use in the same basic
-    /// block as the copy instruction, trim the ive interval to the last use
-    /// and return true.
-    bool TrimLiveIntervalToLastUse(SlotIndex CopyIdx,
-                                   MachineBasicBlock *CopyMBB,
-                                   LiveInterval &li, const LiveRange *LR);
-
     /// ReMaterializeTrivialDef - If the source of a copy is defined by a trivial
     /// computation, replace the copy by rematerialize the definition.
     /// If PreserveSrcInt is true, make sure SrcInt is valid after the call.
@@ -156,16 +144,6 @@ namespace llvm {
     /// subregister.
     void UpdateRegDefsUses(const CoalescerPair &CP);
 
-    /// ShortenDeadCopyLiveRange - Shorten a live range defined by a dead copy.
-    /// Return true if live interval is removed.
-    bool ShortenDeadCopyLiveRange(LiveInterval &li, MachineInstr *CopyMI);
-
-    /// ShortenDeadCopyLiveRange - Shorten a live range as it's artificially
-    /// extended by a dead copy. Mark the last use (if any) of the val# as kill
-    /// as ends the live range there. If there isn't another use, then this
-    /// live range is dead. Return true if live interval is removed.
-    bool ShortenDeadCopySrcLiveRange(LiveInterval &li, MachineInstr *CopyMI);
-
     /// RemoveDeadDef - If a def of a live interval is now determined dead,
     /// remove the val# it defines. If the live interval becomes empty, remove
     /// it as well.
@@ -175,11 +153,6 @@ namespace llvm {
     /// VNInfo copy flag for DstReg and all aliases.
     void RemoveCopyFlag(unsigned DstReg, const MachineInstr *CopyMI);
 
-    /// lastRegisterUse - Returns the last use of the specific register between
-    /// cycles Start and End or NULL if there are no uses.
-    MachineOperand *lastRegisterUse(SlotIndex Start, SlotIndex End,
-                                    unsigned Reg, SlotIndex &LastUseIdx) const;
-
     /// markAsJoined - Remember that CopyMI has already been joined.
     void markAsJoined(MachineInstr *CopyMI);
   };
diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp
index 43904a7..92970e4 100644
--- a/lib/CodeGen/SjLjEHPrepare.cpp
+++ b/lib/CodeGen/SjLjEHPrepare.cpp
@@ -520,7 +520,7 @@ bool SjLjEHPass::insertSjLjEHSupport(Function &F) {
 
   // Add a call to dispatch_setup after the setjmp call. This is expanded to any
   // target-specific setup that needs to be done.
-  CallInst::Create(DispatchSetupFn, "", EntryBB->getTerminator());
+  CallInst::Create(DispatchSetupFn, DispatchVal, "", EntryBB->getTerminator());
 
   // check the return value of the setjmp. non-zero goes to dispatcher.
   Value *IsNormal = new ICmpInst(EntryBB->getTerminator(),
diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp
index ac9d72b..bf27cc8 100644
--- a/lib/CodeGen/SplitKit.cpp
+++ b/lib/CodeGen/SplitKit.cpp
@@ -30,6 +30,9 @@ using namespace llvm;
 
 STATISTIC(NumFinished, "Number of splits finished");
 STATISTIC(NumSimple,   "Number of splits that were simple");
+STATISTIC(NumCopies,   "Number of copies inserted for splitting");
+STATISTIC(NumRemats,   "Number of rematerialized defs for splitting");
+STATISTIC(NumRepairs,  "Number of invalid live ranges repaired");
 
 //===----------------------------------------------------------------------===//
 //                                 Split Analysis
@@ -51,6 +54,7 @@ void SplitAnalysis::clear() {
   UseBlocks.clear();
   ThroughBlocks.clear();
   CurLI = 0;
+  DidRepairRange = false;
 }
 
 SlotIndex SplitAnalysis::computeLastSplitPoint(unsigned Num) {
@@ -119,6 +123,8 @@ void SplitAnalysis::analyzeUses() {
   if (!calcLiveBlockInfo()) {
     // FIXME: calcLiveBlockInfo found inconsistencies in the live range.
     // I am looking at you, SimpleRegisterCoalescing!
+    DidRepairRange = true;
+    ++NumRepairs;
     DEBUG(dbgs() << "*** Fixing inconsistent live interval! ***\n");
     const_cast<LiveIntervals&>(LIS)
       .shrinkToUses(const_cast<LiveInterval*>(CurLI));
@@ -139,7 +145,7 @@ void SplitAnalysis::analyzeUses() {
 /// where CurLI is live.
 bool SplitAnalysis::calcLiveBlockInfo() {
   ThroughBlocks.resize(MF.getNumBlockIDs());
-  NumThroughBlocks = 0;
+  NumThroughBlocks = NumGapBlocks = 0;
   if (CurLI->empty())
     return true;
 
@@ -158,55 +164,63 @@ bool SplitAnalysis::calcLiveBlockInfo() {
     SlotIndex Start, Stop;
     tie(Start, Stop) = LIS.getSlotIndexes()->getMBBRange(BI.MBB);
 
-    // LVI is the first live segment overlapping MBB.
-    BI.LiveIn = LVI->start <= Start;
-    if (!BI.LiveIn)
-      BI.Def = LVI->start;
-
-    // Find the first and last uses in the block.
-    bool Uses = UseI != UseE && *UseI < Stop;
-    if (Uses) {
+    // If the block contains no uses, the range must be live through. At one
+    // point, SimpleRegisterCoalescing could create dangling ranges that ended
+    // mid-block.
+    if (UseI == UseE || *UseI >= Stop) {
+      ++NumThroughBlocks;
+      ThroughBlocks.set(BI.MBB->getNumber());
+      // The range shouldn't end mid-block if there are no uses. This shouldn't
+      // happen.
+      if (LVI->end < Stop)
+        return false;
+    } else {
+      // This block has uses. Find the first and last uses in the block.
       BI.FirstUse = *UseI;
       assert(BI.FirstUse >= Start);
       do ++UseI;
       while (UseI != UseE && *UseI < Stop);
       BI.LastUse = UseI[-1];
       assert(BI.LastUse < Stop);
-    }
 
-    // Look for gaps in the live range.
-    bool hasGap = false;
-    BI.LiveOut = true;
-    while (LVI->end < Stop) {
-      SlotIndex LastStop = LVI->end;
-      if (++LVI == LVE || LVI->start >= Stop) {
-        BI.Kill = LastStop;
-        BI.LiveOut = false;
-        break;
-      }
-      if (LastStop < LVI->start) {
-        hasGap = true;
-        BI.Kill = LastStop;
-        BI.Def = LVI->start;
+      // LVI is the first live segment overlapping MBB.
+      BI.LiveIn = LVI->start <= Start;
+
+      // Look for gaps in the live range.
+      BI.LiveOut = true;
+      while (LVI->end < Stop) {
+        SlotIndex LastStop = LVI->end;
+        if (++LVI == LVE || LVI->start >= Stop) {
+          BI.LiveOut = false;
+          BI.LastUse = LastStop;
+          break;
+        }
+        if (LastStop < LVI->start) {
+          // There is a gap in the live range. Create duplicate entries for the
+          // live-in snippet and the live-out snippet.
+          ++NumGapBlocks;
+
+          // Push the Live-in part.
+          BI.LiveThrough = false;
+          BI.LiveOut = false;
+          UseBlocks.push_back(BI);
+          UseBlocks.back().LastUse = LastStop;
+
+          // Set up BI for the live-out part.
+          BI.LiveIn = false;
+          BI.LiveOut = true;
+          BI.FirstUse = LVI->start;
+        }
       }
-    }
 
-    // Don't set LiveThrough when the block has a gap.
-    BI.LiveThrough = !hasGap && BI.LiveIn && BI.LiveOut;
-    if (Uses)
+      // Don't set LiveThrough when the block has a gap.
+      BI.LiveThrough = BI.LiveIn && BI.LiveOut;
       UseBlocks.push_back(BI);
-    else {
-      ++NumThroughBlocks;
-      ThroughBlocks.set(BI.MBB->getNumber());
-    }
-    // FIXME: This should never happen. The live range stops or starts without a
-    // corresponding use. An earlier pass did something wrong.
-    if (!BI.LiveThrough && !Uses)
-      return false;
 
-    // LVI is now at LVE or LVI->end >= Stop.
-    if (LVI == LVE)
-      break;
+      // LVI is now at LVE or LVI->end >= Stop.
+      if (LVI == LVE)
+        break;
+    }
 
     // Live segment ends exactly at Stop. Move to the next segment.
     if (LVI->end == Stop && ++LVI == LVE)
@@ -218,6 +232,8 @@ bool SplitAnalysis::calcLiveBlockInfo() {
     else
       MFI = LIS.getMBBFromIndex(LVI->start);
   }
+
+  assert(getNumLiveBlocks() == countLiveBlocks(CurLI) && "Bad block count");
   return true;
 }
 
@@ -587,12 +603,14 @@ VNInfo *SplitEditor::defFromParent(unsigned RegIdx,
   LiveRangeEdit::Remat RM(ParentVNI);
   if (Edit->canRematerializeAt(RM, UseIdx, true, LIS)) {
     Def = Edit->rematerializeAt(MBB, I, LI->reg, RM, LIS, TII, TRI, Late);
+    ++NumRemats;
   } else {
     // Can't remat, just insert a copy from parent.
     CopyMI = BuildMI(MBB, I, DebugLoc(), TII.get(TargetOpcode::COPY), LI->reg)
                .addReg(Edit->getReg());
     Def = LIS.getSlotIndexes()->insertMachineInstrInMaps(CopyMI, Late)
             .getDefIndex();
+    ++NumCopies;
   }
 
   // Define the value in Reg.
diff --git a/lib/CodeGen/SplitKit.h b/lib/CodeGen/SplitKit.h
index 2ae760a..7174c0b 100644
--- a/lib/CodeGen/SplitKit.h
+++ b/lib/CodeGen/SplitKit.h
@@ -63,17 +63,22 @@ public:
   ///  1. |   o---x   | Internal to block. Variable is only live in this block.
   ///  2. |---x       | Live-in, kill.
   ///  3. |       o---| Def, live-out.
-  ///  4. |---x   o---| Live-in, kill, def, live-out.
+  ///  4. |---x   o---| Live-in, kill, def, live-out. Counted by NumGapBlocks.
   ///  5. |---o---o---| Live-through with uses or defs.
-  ///  6. |-----------| Live-through without uses. Transparent.
+  ///  6. |-----------| Live-through without uses. Counted by NumThroughBlocks.
+  ///
+  /// Two BlockInfo entries are created for template 4. One for the live-in
+  /// segment, and one for the live-out segment. These entries look as if the
+  /// block were split in the middle where the live range isn't live.
+  ///
+  /// Live-through blocks without any uses don't get BlockInfo entries. They
+  /// are simply listed in ThroughBlocks instead.
   ///
   struct BlockInfo {
     MachineBasicBlock *MBB;
     SlotIndex FirstUse;   ///< First instr using current reg.
     SlotIndex LastUse;    ///< Last instr using current reg.
-    SlotIndex Kill;       ///< Interval end point inside block.
-    SlotIndex Def;        ///< Interval start point inside block.
-    bool LiveThrough;     ///< Live in whole block (Templ 5. or 6. above).
+    bool LiveThrough;     ///< Live in whole block (Templ 5. above).
     bool LiveIn;          ///< Current reg is live in.
     bool LiveOut;         ///< Current reg is live out.
   };
@@ -91,12 +96,19 @@ private:
   /// UseBlocks - Blocks where CurLI has uses.
   SmallVector<BlockInfo, 8> UseBlocks;
 
+  /// NumGapBlocks - Number of duplicate entries in UseBlocks for blocks where
+  /// the live range has a gap.
+  unsigned NumGapBlocks;
+
   /// ThroughBlocks - Block numbers where CurLI is live through without uses.
   BitVector ThroughBlocks;
 
   /// NumThroughBlocks - Number of live-through blocks.
   unsigned NumThroughBlocks;
 
+  /// DidRepairRange - analyze was forced to shrinkToUses().
+  bool DidRepairRange;
+
   SlotIndex computeLastSplitPoint(unsigned Num);
 
   // Sumarize statistics by counting instructions using CurLI.
@@ -113,6 +125,11 @@ public:
   /// split.
   void analyze(const LiveInterval *li);
 
+  /// didRepairRange() - Returns true if CurLI was invalid and has been repaired
+  /// by analyze(). This really shouldn't happen, but sometimes the coalescer
+  /// can create live ranges that end in mid-air.
+  bool didRepairRange() const { return DidRepairRange; }
+
   /// clear - clear all data structures so SplitAnalysis is ready to analyze a
   /// new interval.
   void clear();
@@ -139,7 +156,7 @@ public:
 
   /// getUseBlocks - Return an array of BlockInfo objects for the basic blocks
   /// where CurLI has uses.
-  ArrayRef<BlockInfo> getUseBlocks() { return UseBlocks; }
+  ArrayRef<BlockInfo> getUseBlocks() const { return UseBlocks; }
 
   /// getNumThroughBlocks - Return the number of through blocks.
   unsigned getNumThroughBlocks() const { return NumThroughBlocks; }
@@ -150,9 +167,14 @@ public:
   /// getThroughBlocks - Return the set of through blocks.
   const BitVector &getThroughBlocks() const { return ThroughBlocks; }
 
-  /// countLiveBlocks - Return the number of blocks where li is live.
-  /// This is guaranteed to return the same number as getNumThroughBlocks() +
-  /// getUseBlocks().size() after calling analyze(li).
+  /// getNumLiveBlocks - Return the number of blocks where CurLI is live.
+  unsigned getNumLiveBlocks() const {
+    return getUseBlocks().size() - NumGapBlocks + getNumThroughBlocks();
+  }
+
+  /// countLiveBlocks - Return the number of blocks where li is live. This is
+  /// guaranteed to return the same number as getNumLiveBlocks() after calling
+  /// analyze(li).
   unsigned countLiveBlocks(const LiveInterval *li) const;
 
   typedef SmallPtrSet<const MachineBasicBlock*, 16> BlockPtrSet;
diff --git a/lib/CodeGen/TailDuplication.cpp b/lib/CodeGen/TailDuplication.cpp
index 04d3d31..e8eab8f 100644
--- a/lib/CodeGen/TailDuplication.cpp
+++ b/lib/CodeGen/TailDuplication.cpp
@@ -34,6 +34,7 @@ STATISTIC(NumTails     , "Number of tails duplicated");
 STATISTIC(NumTailDups  , "Number of tail duplicated blocks");
 STATISTIC(NumInstrDups , "Additional instructions due to tail duplication");
 STATISTIC(NumDeadBlocks, "Number of dead blocks removed");
+STATISTIC(NumAddedPHIs , "Number of phis added");
 
 // Heuristic for tail duplication.
 static cl::opt<unsigned>
@@ -80,16 +81,21 @@ namespace {
     void ProcessPHI(MachineInstr *MI, MachineBasicBlock *TailBB,
                     MachineBasicBlock *PredBB,
                     DenseMap<unsigned, unsigned> &LocalVRMap,
-                    SmallVector<std::pair<unsigned,unsigned>, 4> &Copies);
+                    SmallVector<std::pair<unsigned,unsigned>, 4> &Copies,
+                    const DenseSet<unsigned> &UsedByPhi,
+                    bool Remove);
     void DuplicateInstruction(MachineInstr *MI,
                               MachineBasicBlock *TailBB,
                               MachineBasicBlock *PredBB,
                               MachineFunction &MF,
-                              DenseMap<unsigned, unsigned> &LocalVRMap);
+                              DenseMap<unsigned, unsigned> &LocalVRMap,
+                              const DenseSet<unsigned> &UsedByPhi);
     void UpdateSuccessorsPHIs(MachineBasicBlock *FromBB, bool isDead,
                               SmallVector<MachineBasicBlock*, 8> &TDBBs,
                               SmallSetVector<MachineBasicBlock*, 8> &Succs);
     bool TailDuplicateBlocks(MachineFunction &MF);
+    bool shouldTailDuplicate(const MachineFunction &MF,
+                             MachineBasicBlock &TailBB);
     bool TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF,
                        SmallVector<MachineBasicBlock*, 8> &TDBBs,
                        SmallVector<MachineInstr*, 16> &Copies);
@@ -146,11 +152,11 @@ static void VerifyPHIs(MachineFunction &MF, bool CheckExtra) {
       for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2) {
         MachineBasicBlock *PHIBB = MI->getOperand(i+1).getMBB();
         if (CheckExtra && !Preds.count(PHIBB)) {
-          // This is not a hard error.
           dbgs() << "Warning: malformed PHI in BB#" << MBB->getNumber()
                  << ": " << *MI;
           dbgs() << "  extra input from predecessor BB#"
                  << PHIBB->getNumber() << '\n';
+          llvm_unreachable(0);
         }
         if (PHIBB->getNumber() < 0) {
           dbgs() << "Malformed PHI in BB#" << MBB->getNumber() << ": " << *MI;
@@ -183,10 +189,6 @@ bool TailDuplicatePass::TailDuplicateBlocks(MachineFunction &MF) {
     if (NumTails == TailDupLimit)
       break;
 
-    // Only duplicate blocks that end with unconditional branches.
-    if (MBB->canFallThrough())
-      continue;
-
     // Save the successors list.
     SmallSetVector<MachineBasicBlock*, 8> Succs(MBB->succ_begin(),
                                                 MBB->succ_end());
@@ -240,7 +242,7 @@ bool TailDuplicatePass::TailDuplicateBlocks(MachineFunction &MF) {
             MachineOperand &UseMO = UI.getOperand();
             MachineInstr *UseMI = &*UI;
             ++UI;
-            if (UseMI->getParent() == DefBB)
+            if (UseMI->getParent() == DefBB && !UseMI->isPHI())
               continue;
             SSAUpdate.RewriteUse(UseMO);
           }
@@ -271,6 +273,7 @@ bool TailDuplicatePass::TailDuplicateBlocks(MachineFunction &MF) {
       MadeChange = true;
     }
   }
+  NumAddedPHIs += NewPHIs.size();
 
   return MadeChange;
 }
@@ -293,6 +296,24 @@ static unsigned getPHISrcRegOpIdx(MachineInstr *MI, MachineBasicBlock *SrcBB) {
   return 0;
 }
 
+
+// Remember which registers are used by phis in this block. This is
+// used to determine which registers are liveout while modifying the
+// block (which is why we need to copy the information).
+static void getRegsUsedByPHIs(const MachineBasicBlock &BB,
+                              DenseSet<unsigned> *UsedByPhi) {
+  for(MachineBasicBlock::const_iterator I = BB.begin(), E = BB.end();
+      I != E; ++I) {
+    const MachineInstr &MI = *I;
+    if (!MI.isPHI())
+      break;
+    for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
+      unsigned SrcReg = MI.getOperand(i).getReg();
+      UsedByPhi->insert(SrcReg);
+    }
+  }
+}
+
 /// AddSSAUpdateEntry - Add a definition and source virtual registers pair for
 /// SSA update.
 void TailDuplicatePass::AddSSAUpdateEntry(unsigned OrigReg, unsigned NewReg,
@@ -315,7 +336,9 @@ void TailDuplicatePass::ProcessPHI(MachineInstr *MI,
                                    MachineBasicBlock *TailBB,
                                    MachineBasicBlock *PredBB,
                                    DenseMap<unsigned, unsigned> &LocalVRMap,
-                         SmallVector<std::pair<unsigned,unsigned>, 4> &Copies) {
+                           SmallVector<std::pair<unsigned,unsigned>, 4> &Copies,
+                                   const DenseSet<unsigned> &RegsUsedByPhi,
+                                   bool Remove) {
   unsigned DefReg = MI->getOperand(0).getReg();
   unsigned SrcOpIdx = getPHISrcRegOpIdx(MI, PredBB);
   assert(SrcOpIdx && "Unable to find matching PHI source?");
@@ -327,9 +350,12 @@ void TailDuplicatePass::ProcessPHI(MachineInstr *MI,
   // available value liveout of the block.
   unsigned NewDef = MRI->createVirtualRegister(RC);
   Copies.push_back(std::make_pair(NewDef, SrcReg));
-  if (isDefLiveOut(DefReg, TailBB, MRI))
+  if (isDefLiveOut(DefReg, TailBB, MRI) || RegsUsedByPhi.count(DefReg))
     AddSSAUpdateEntry(DefReg, NewDef, PredBB);
 
+  if (!Remove)
+    return;
+
   // Remove PredBB from the PHI node.
   MI->RemoveOperand(SrcOpIdx+1);
   MI->RemoveOperand(SrcOpIdx);
@@ -343,7 +369,8 @@ void TailDuplicatePass::DuplicateInstruction(MachineInstr *MI,
                                      MachineBasicBlock *TailBB,
                                      MachineBasicBlock *PredBB,
                                      MachineFunction &MF,
-                                     DenseMap<unsigned, unsigned> &LocalVRMap) {
+                                     DenseMap<unsigned, unsigned> &LocalVRMap,
+                                     const DenseSet<unsigned> &UsedByPhi) {
   MachineInstr *NewMI = TII->duplicate(MI, MF);
   for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) {
     MachineOperand &MO = NewMI->getOperand(i);
@@ -357,7 +384,7 @@ void TailDuplicatePass::DuplicateInstruction(MachineInstr *MI,
       unsigned NewReg = MRI->createVirtualRegister(RC);
       MO.setReg(NewReg);
       LocalVRMap.insert(std::make_pair(Reg, NewReg));
-      if (isDefLiveOut(Reg, TailBB, MRI))
+      if (isDefLiveOut(Reg, TailBB, MRI) || UsedByPhi.count(Reg))
         AddSSAUpdateEntry(Reg, NewReg, PredBB);
     } else {
       DenseMap<unsigned, unsigned>::iterator VI = LocalVRMap.find(Reg);
@@ -416,6 +443,13 @@ TailDuplicatePass::UpdateSuccessorsPHIs(MachineBasicBlock *FromBB, bool isDead,
         // This register is defined in the tail block.
         for (unsigned j = 0, ee = LI->second.size(); j != ee; ++j) {
           MachineBasicBlock *SrcBB = LI->second[j].first;
+          // If we didn't duplicate a bb into a particular predecessor, we
+          // might still have added an entry to SSAUpdateVals to correcly
+          // recompute SSA. If that case, avoid adding a dummy extra argument
+          // this PHI.
+          if (!SrcBB->isSuccessor(SuccBB))
+            continue;
+
           unsigned SrcReg = LI->second[j].second;
           if (Idx != 0) {
             II->getOperand(Idx).setReg(SrcReg);
@@ -448,14 +482,15 @@ TailDuplicatePass::UpdateSuccessorsPHIs(MachineBasicBlock *FromBB, bool isDead,
   }
 }
 
-/// TailDuplicate - If it is profitable, duplicate TailBB's contents in each
-/// of its predecessors.
+/// shouldTailDuplicate - Determine if it is profitable to duplicate this block.
 bool
-TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF,
-                                 SmallVector<MachineBasicBlock*, 8> &TDBBs,
-                                 SmallVector<MachineInstr*, 16> &Copies) {
-  // Set the limit on the number of instructions to duplicate, with a default
-  // of one less than the tail-merge threshold. When optimizing for size,
+TailDuplicatePass::shouldTailDuplicate(const MachineFunction &MF,
+                                       MachineBasicBlock &TailBB) {
+  // Only duplicate blocks that end with unconditional branches.
+  if (TailBB.canFallThrough())
+    return false;
+
+  // Set the limit on the cost to duplicate. When optimizing for size,
   // duplicate only one, because one branch instruction can be eliminated to
   // compensate for the duplication.
   unsigned MaxDuplicateCount;
@@ -466,12 +501,12 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF,
     MaxDuplicateCount = TailDuplicateSize;
 
   if (PreRegAlloc) {
-    if (TailBB->empty())
+    if (TailBB.empty())
       return false;
-    const TargetInstrDesc &TID = TailBB->back().getDesc();
+    const TargetInstrDesc &TID = TailBB.back().getDesc();
     // Pre-regalloc tail duplication hurts compile time and doesn't help
-    // much except for indirect branches and returns.
-    if (!TID.isIndirectBranch() && !TID.isReturn())
+    // much except for indirect branches.
+    if (!TID.isIndirectBranch())
       return false;
     // If the target has hardware branch prediction that can handle indirect
     // branches, duplicating them can often make them predictable when there
@@ -482,15 +517,15 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF,
   }
 
   // Don't try to tail-duplicate single-block loops.
-  if (TailBB->isSuccessor(TailBB))
+  if (TailBB.isSuccessor(&TailBB))
     return false;
 
   // Check the instructions in the block to determine whether tail-duplication
   // is invalid or unlikely to be profitable.
   unsigned InstrCount = 0;
   bool HasCall = false;
-  for (MachineBasicBlock::iterator I = TailBB->begin();
-       I != TailBB->end(); ++I) {
+  for (MachineBasicBlock::const_iterator I = TailBB.begin(); I != TailBB.end();
+       ++I) {
     // Non-duplicable things shouldn't be tail-duplicated.
     if (I->getDesc().isNotDuplicable()) return false;
     // Do not duplicate 'return' instructions if this is a pre-regalloc run.
@@ -510,6 +545,18 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF,
   if (InstrCount > 1 && (PreRegAlloc && HasCall))
     return false;
 
+  return true;
+}
+
+/// TailDuplicate - If it is profitable, duplicate TailBB's contents in each
+/// of its predecessors.
+bool
+TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF,
+                                 SmallVector<MachineBasicBlock*, 8> &TDBBs,
+                                 SmallVector<MachineInstr*, 16> &Copies) {
+  if (!shouldTailDuplicate(MF, *TailBB))
+    return false;
+
   DEBUG(dbgs() << "\n*** Tail-duplicating BB#" << TailBB->getNumber() << '\n');
 
   // Iterate through all the unique predecessors and tail-duplicate this
@@ -518,13 +565,17 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF,
   bool Changed = false;
   SmallSetVector<MachineBasicBlock*, 8> Preds(TailBB->pred_begin(),
                                               TailBB->pred_end());
+  DenseSet<unsigned> UsedByPhi;
+  getRegsUsedByPHIs(*TailBB, &UsedByPhi);
   for (SmallSetVector<MachineBasicBlock *, 8>::iterator PI = Preds.begin(),
        PE = Preds.end(); PI != PE; ++PI) {
     MachineBasicBlock *PredBB = *PI;
 
     assert(TailBB != PredBB &&
            "Single-block loop should have been rejected earlier!");
-    if (PredBB->succ_size() > 1) continue;
+    // EH edges are ignored by AnalyzeBranch.
+    if (PredBB->succ_size() > 1)
+      continue;
 
     MachineBasicBlock *PredTBB, *PredFBB;
     SmallVector<MachineOperand, 4> PredCond;
@@ -532,9 +583,6 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF,
       continue;
     if (!PredCond.empty())
       continue;
-    // EH edges are ignored by AnalyzeBranch.
-    if (PredBB->succ_size() != 1)
-      continue;
     // Don't duplicate into a fall-through predecessor (at least for now).
     if (PredBB->isLayoutSuccessor(TailBB) && PredBB->canFallThrough())
       continue;
@@ -557,11 +605,11 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF,
       if (MI->isPHI()) {
         // Replace the uses of the def of the PHI with the register coming
         // from PredBB.
-        ProcessPHI(MI, TailBB, PredBB, LocalVRMap, CopyInfos);
+        ProcessPHI(MI, TailBB, PredBB, LocalVRMap, CopyInfos, UsedByPhi, true);
       } else {
         // Replace def of virtual registers with new registers, and update
         // uses with PHI source register or the new registers.
-        DuplicateInstruction(MI, TailBB, PredBB, MF, LocalVRMap);
+        DuplicateInstruction(MI, TailBB, PredBB, MF, LocalVRMap, UsedByPhi);
       }
     }
     MachineBasicBlock::iterator Loc = PredBB->getFirstTerminator();
@@ -590,12 +638,11 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF,
   MachineBasicBlock *PrevBB = prior(MachineFunction::iterator(TailBB));
   MachineBasicBlock *PriorTBB = 0, *PriorFBB = 0;
   SmallVector<MachineOperand, 4> PriorCond;
-  bool PriorUnAnalyzable =
-    TII->AnalyzeBranch(*PrevBB, PriorTBB, PriorFBB, PriorCond, true);
   // This has to check PrevBB->succ_size() because EH edges are ignored by
   // AnalyzeBranch.
-  if (!PriorUnAnalyzable && PriorCond.empty() && !PriorTBB &&
-      TailBB->pred_size() == 1 && PrevBB->succ_size() == 1 &&
+  if (PrevBB->succ_size() == 1 && 
+      !TII->AnalyzeBranch(*PrevBB, PriorTBB, PriorFBB, PriorCond, true) &&
+      PriorCond.empty() && !PriorTBB && TailBB->pred_size() == 1 &&
       !TailBB->hasAddressTaken()) {
     DEBUG(dbgs() << "\nMerging into block: " << *PrevBB
           << "From MBB: " << *TailBB);
@@ -608,7 +655,7 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF,
         // Replace the uses of the def of the PHI with the register coming
         // from PredBB.
         MachineInstr *MI = &*I++;
-        ProcessPHI(MI, TailBB, PrevBB, LocalVRMap, CopyInfos);
+        ProcessPHI(MI, TailBB, PrevBB, LocalVRMap, CopyInfos, UsedByPhi, true);
         if (MI->getParent())
           MI->eraseFromParent();
       }
@@ -618,7 +665,7 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF,
         // Replace def of virtual registers with new registers, and update
         // uses with PHI source register or the new registers.
         MachineInstr *MI = &*I++;
-        DuplicateInstruction(MI, TailBB, PrevBB, MF, LocalVRMap);
+        DuplicateInstruction(MI, TailBB, PrevBB, MF, LocalVRMap, UsedByPhi);
         MI->eraseFromParent();
       }
       MachineBasicBlock::iterator Loc = PrevBB->getFirstTerminator();
@@ -639,6 +686,57 @@ TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF,
     Changed = true;
   }
 
+  // If this is after register allocation, there are no phis to fix.
+  if (!PreRegAlloc)
+    return Changed;
+
+  // If we made no changes so far, we are safe.
+  if (!Changed)
+    return Changed;
+
+
+  // Handle the nasty case in that we duplicated a block that is part of a loop
+  // into some but not all of its predecessors. For example:
+  //    1 -> 2 <-> 3                 |
+  //          \                      |
+  //           \---> rest            |
+  // if we duplicate 2 into 1 but not into 3, we end up with
+  // 12 -> 3 <-> 2 -> rest           |
+  //   \             /               |
+  //    \----->-----/                |
+  // If there was a "var = phi(1, 3)" in 2, it has to be ultimately replaced
+  // with a phi in 3 (which now dominates 2).
+  // What we do here is introduce a copy in 3 of the register defined by the
+  // phi, just like when we are duplicating 2 into 3, but we don't copy any
+  // real instructions or remove the 3 -> 2 edge from the phi in 2.
+  for (SmallSetVector<MachineBasicBlock *, 8>::iterator PI = Preds.begin(),
+       PE = Preds.end(); PI != PE; ++PI) {
+    MachineBasicBlock *PredBB = *PI;
+    if (std::find(TDBBs.begin(), TDBBs.end(), PredBB) != TDBBs.end())
+      continue;
+
+    // EH edges
+    if (PredBB->succ_size() != 1)
+      continue;
+
+    DenseMap<unsigned, unsigned> LocalVRMap;
+    SmallVector<std::pair<unsigned,unsigned>, 4> CopyInfos;
+    MachineBasicBlock::iterator I = TailBB->begin();
+    // Process PHI instructions first.
+    while (I != TailBB->end() && I->isPHI()) {
+      // Replace the uses of the def of the PHI with the register coming
+      // from PredBB.
+      MachineInstr *MI = &*I++;
+      ProcessPHI(MI, TailBB, PredBB, LocalVRMap, CopyInfos, UsedByPhi, false);
+    }
+    MachineBasicBlock::iterator Loc = PredBB->getFirstTerminator();
+    for (unsigned i = 0, e = CopyInfos.size(); i != e; ++i) {
+      Copies.push_back(BuildMI(*PredBB, Loc, DebugLoc(),
+                               TII->get(TargetOpcode::COPY),
+                               CopyInfos[i].first).addReg(CopyInfos[i].second));
+    }
+  }
+
   return Changed;
 }
 
@@ -655,4 +753,3 @@ void TailDuplicatePass::RemoveDeadBlock(MachineBasicBlock *MBB) {
   // Remove the block.
   MBB->eraseFromParent();
 }
-
diff --git a/lib/CodeGen/TargetInstrInfoImpl.cpp b/lib/CodeGen/TargetInstrInfoImpl.cpp
index b9fcd38..34e2b33 100644
--- a/lib/CodeGen/TargetInstrInfoImpl.cpp
+++ b/lib/CodeGen/TargetInstrInfoImpl.cpp
@@ -212,8 +212,7 @@ static const TargetRegisterClass *canFoldCopy(const MachineInstr *MI,
   if (TargetRegisterInfo::isPhysicalRegister(LiveOp.getReg()))
     return RC->contains(LiveOp.getReg()) ? RC : 0;
 
-  const TargetRegisterClass *LiveRC = MRI.getRegClass(LiveReg);
-  if (RC == LiveRC || RC->hasSubClass(LiveRC))
+  if (RC->hasSubClassEq(MRI.getRegClass(LiveReg)))
     return RC;
 
   // FIXME: Allow folding when register classes are memory compatible.
diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index 6ed91b0..cdac42d 100644
--- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -225,10 +225,11 @@ void TargetLoweringObjectFileELF::emitPersonalityValue(MCStreamer &Streamer,
 
 static SectionKind
 getELFKindForNamedSection(StringRef Name, SectionKind K) {
-  // FIXME: Why is this here? Codegen is should not be in the business
-  // of figuring section flags. If the user wrote section(".eh_frame"),
-  // we should just pass that to MC which will defer to the assembly
-  // or use its default if producing an object file.
+  // N.B.: The defaults used in here are no the same ones used in MC.
+  // We follow gcc, MC follows gas. For example, given ".section .eh_frame",
+  // both gas and MC will produce a section with no flags. Given
+  // section(".eh_frame") gcc will produce
+  // .section	.eh_frame,"a",@progbits
   if (Name.empty() || Name[0] != '.') return K;
 
   // Some lame default implementation based on some magic section names.
@@ -254,9 +255,6 @@ getELFKindForNamedSection(StringRef Name, SectionKind K) {
       Name.startswith(".llvm.linkonce.tb."))
     return SectionKind::getThreadBSS();
 
-  if (Name == ".eh_frame")
-    return SectionKind::getDataRel();
-
   return K;
 }
 
@@ -484,11 +482,6 @@ getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
 
 void TargetLoweringObjectFileMachO::Initialize(MCContext &Ctx,
                                                const TargetMachine &TM) {
-  // _foo.eh symbols are currently always exported so that the linker knows
-  // about them.  This is not necessary on 10.6 and later, but it
-  // doesn't hurt anything.
-  // FIXME: I need to get this from Triple.
-  IsFunctionEHSymbolGlobal = true;
   IsFunctionEHFrameSymbolPrivate = false;
   SupportsWeakOmittedEHFrame = false;
 
@@ -994,6 +987,20 @@ void TargetLoweringObjectFileCOFF::Initialize(MCContext &Ctx,
     getContext().getCOFFSection(".drectve",
                                 COFF::IMAGE_SCN_LNK_INFO,
                                 SectionKind::getMetadata());
+
+  PDataSection =
+    getContext().getCOFFSection(".pdata",
+                                COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                                COFF::IMAGE_SCN_MEM_READ |
+                                COFF::IMAGE_SCN_MEM_WRITE,
+                                SectionKind::getDataRel());
+
+  XDataSection =
+    getContext().getCOFFSection(".xdata",
+                                COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                                COFF::IMAGE_SCN_MEM_READ |
+                                COFF::IMAGE_SCN_MEM_WRITE,
+                                SectionKind::getDataRel());
 }
 
 const MCSection *TargetLoweringObjectFileCOFF::getEHFrameSection() const {
@@ -1004,6 +1011,28 @@ const MCSection *TargetLoweringObjectFileCOFF::getEHFrameSection() const {
                                      SectionKind::getDataRel());
 }
 
+const MCSection *TargetLoweringObjectFileCOFF::getWin64EHFuncTableSection(
+                                                       StringRef suffix) const {
+  if (suffix == "")
+    return PDataSection;
+  return getContext().getCOFFSection((".pdata"+suffix).str(),
+                                     COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                                     COFF::IMAGE_SCN_MEM_READ |
+                                     COFF::IMAGE_SCN_MEM_WRITE,
+                                     SectionKind::getDataRel());
+}
+
+const MCSection *TargetLoweringObjectFileCOFF::getWin64EHTableSection(
+                                                       StringRef suffix) const {
+  if (suffix == "")
+    return XDataSection;
+  return getContext().getCOFFSection((".xdata"+suffix).str(),
+                                     COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                                     COFF::IMAGE_SCN_MEM_READ |
+                                     COFF::IMAGE_SCN_MEM_WRITE,
+                                     SectionKind::getDataRel());
+}
+
 
 static unsigned
 getCOFFSectionFlags(SectionKind K) {
diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index 52ea872..f54d879 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1125,6 +1125,7 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) {
             break; // The tied operands have been eliminated.
         }
 
+        bool IsEarlyClobber = false;
         bool RemovedKillFlag = false;
         bool AllUsesCopied = true;
         unsigned LastCopiedReg = 0;
@@ -1132,7 +1133,11 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) {
         for (unsigned tpi = 0, tpe = TiedPairs.size(); tpi != tpe; ++tpi) {
           unsigned SrcIdx = TiedPairs[tpi].first;
           unsigned DstIdx = TiedPairs[tpi].second;
-          unsigned regA = mi->getOperand(DstIdx).getReg();
+
+          const MachineOperand &DstMO = mi->getOperand(DstIdx);
+          unsigned regA = DstMO.getReg();
+          IsEarlyClobber |= DstMO.isEarlyClobber();
+
           // Grab regB from the instruction because it may have changed if the
           // instruction was commuted.
           regB = mi->getOperand(SrcIdx).getReg();
@@ -1196,15 +1201,17 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) {
         }
 
         if (AllUsesCopied) {
-          // Replace other (un-tied) uses of regB with LastCopiedReg.
-          for (unsigned i = 0, e = mi->getNumOperands(); i != e; ++i) {
-            MachineOperand &MO = mi->getOperand(i);
-            if (MO.isReg() && MO.getReg() == regB && MO.isUse()) {
-              if (MO.isKill()) {
-                MO.setIsKill(false);
-                RemovedKillFlag = true;
+          if (!IsEarlyClobber) {
+            // Replace other (un-tied) uses of regB with LastCopiedReg.
+            for (unsigned i = 0, e = mi->getNumOperands(); i != e; ++i) {
+              MachineOperand &MO = mi->getOperand(i);
+              if (MO.isReg() && MO.getReg() == regB && MO.isUse()) {
+                if (MO.isKill()) {
+                  MO.setIsKill(false);
+                  RemovedKillFlag = true;
+                }
+                MO.setReg(LastCopiedReg);
               }
-              MO.setReg(LastCopiedReg);
             }
           }
 
diff --git a/lib/CodeGen/UnreachableBlockElim.cpp b/lib/CodeGen/UnreachableBlockElim.cpp
index 48d8ab1..52693f0 100644
--- a/lib/CodeGen/UnreachableBlockElim.cpp
+++ b/lib/CodeGen/UnreachableBlockElim.cpp
@@ -196,8 +196,11 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) {
         temp->eraseFromParent();
         ModifiedPHI = true;
 
-        if (Input != Output)
-          F.getRegInfo().replaceRegWith(Output, Input);
+        if (Input != Output) {
+          MachineRegisterInfo &MRI = F.getRegInfo();
+          MRI.constrainRegClass(Input, MRI.getRegClass(Output));
+          MRI.replaceRegWith(Output, Input);
+        }
 
         continue;
       }
diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp
index 226b78f..7557979 100644
--- a/lib/CodeGen/VirtRegMap.cpp
+++ b/lib/CodeGen/VirtRegMap.cpp
@@ -42,6 +42,7 @@
 using namespace llvm;
 
 STATISTIC(NumSpills  , "Number of register spills");
+STATISTIC(NumIdCopies, "Number of identity moves eliminated after rewriting");
 
 //===----------------------------------------------------------------------===//
 //  VirtRegMap implementation
@@ -318,6 +319,7 @@ void VirtRegMap::rewrite(SlotIndexes *Indexes) {
 
       // Finally, remove any identity copies.
       if (MI->isIdentityCopy()) {
+        ++NumIdCopies;
         if (MI->getNumOperands() == 2) {
           DEBUG(dbgs() << "Deleting identity copy.\n");
           RemoveMachineInstrFromMaps(MI);
diff --git a/lib/ExecutionEngine/CMakeLists.txt b/lib/ExecutionEngine/CMakeLists.txt
index 8bff265..58caae8 100644
--- a/lib/ExecutionEngine/CMakeLists.txt
+++ b/lib/ExecutionEngine/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_llvm_library(LLVMExecutionEngine
   ExecutionEngine.cpp
   ExecutionEngineBindings.cpp
+  TargetSelect.cpp
   )
 
 add_subdirectory(Interpreter)
diff --git a/lib/ExecutionEngine/ExecutionEngine.cpp b/lib/ExecutionEngine/ExecutionEngine.cpp
index 2b1e878..7652090 100644
--- a/lib/ExecutionEngine/ExecutionEngine.cpp
+++ b/lib/ExecutionEngine/ExecutionEngine.cpp
@@ -29,6 +29,7 @@
 #include "llvm/Support/DynamicLibrary.h"
 #include "llvm/Support/Host.h"
 #include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
 #include <cmath>
 #include <cstring>
 using namespace llvm;
@@ -42,20 +43,14 @@ ExecutionEngine *(*ExecutionEngine::JITCtor)(
   JITMemoryManager *JMM,
   CodeGenOpt::Level OptLevel,
   bool GVsWithCode,
-  CodeModel::Model CMM,
-  StringRef MArch,
-  StringRef MCPU,
-  const SmallVectorImpl<std::string>& MAttrs) = 0;
+  TargetMachine *TM) = 0;
 ExecutionEngine *(*ExecutionEngine::MCJITCtor)(
   Module *M,
   std::string *ErrorStr,
   JITMemoryManager *JMM,
   CodeGenOpt::Level OptLevel,
   bool GVsWithCode,
-  CodeModel::Model CMM,
-  StringRef MArch,
-  StringRef MCPU,
-  const SmallVectorImpl<std::string>& MAttrs) = 0;
+  TargetMachine *TM) = 0;
 ExecutionEngine *(*ExecutionEngine::InterpCtor)(Module *M,
                                                 std::string *ErrorStr) = 0;
 
@@ -419,6 +414,35 @@ ExecutionEngine *ExecutionEngine::create(Module *M,
       .create();
 }
 
+/// createJIT - This is the factory method for creating a JIT for the current
+/// machine, it does not fall back to the interpreter.  This takes ownership
+/// of the module.
+ExecutionEngine *ExecutionEngine::createJIT(Module *M,
+                                            std::string *ErrorStr,
+                                            JITMemoryManager *JMM,
+                                            CodeGenOpt::Level OptLevel,
+                                            bool GVsWithCode,
+                                            CodeModel::Model CMM) {
+  if (ExecutionEngine::JITCtor == 0) {
+    if (ErrorStr)
+      *ErrorStr = "JIT has not been linked in.";
+    return 0;
+  }
+
+  // Use the defaults for extra parameters.  Users can use EngineBuilder to
+  // set them.
+  StringRef MArch = "";
+  StringRef MCPU = "";
+  SmallVector<std::string, 1> MAttrs;
+
+  TargetMachine *TM =
+          EngineBuilder::selectTarget(M, MArch, MCPU, MAttrs, ErrorStr);
+  if (!TM || (ErrorStr && ErrorStr->length() > 0)) return 0;
+  TM->setCodeModel(CMM);
+
+  return ExecutionEngine::JITCtor(M, ErrorStr, JMM, OptLevel, GVsWithCode, TM);
+}
+
 ExecutionEngine *EngineBuilder::create() {
   // Make sure we can resolve symbols in the program as well. The zero arg
   // to the function tells DynamicLibrary to load the program, not a library.
@@ -441,18 +465,21 @@ ExecutionEngine *EngineBuilder::create() {
   // Unless the interpreter was explicitly selected or the JIT is not linked,
   // try making a JIT.
   if (WhichEngine & EngineKind::JIT) {
-    if (UseMCJIT && ExecutionEngine::MCJITCtor) {
-      ExecutionEngine *EE =
-        ExecutionEngine::MCJITCtor(M, ErrorStr, JMM, OptLevel,
-                                   AllocateGVsWithCode, CMModel,
-                                   MArch, MCPU, MAttrs);
-      if (EE) return EE;
-    } else if (ExecutionEngine::JITCtor) {
-      ExecutionEngine *EE =
-        ExecutionEngine::JITCtor(M, ErrorStr, JMM, OptLevel,
-                                 AllocateGVsWithCode, CMModel,
-                                 MArch, MCPU, MAttrs);
-      if (EE) return EE;
+    if (TargetMachine *TM =
+        EngineBuilder::selectTarget(M, MArch, MCPU, MAttrs, ErrorStr)) {
+      TM->setCodeModel(CMModel);
+
+      if (UseMCJIT && ExecutionEngine::MCJITCtor) {
+        ExecutionEngine *EE =
+          ExecutionEngine::MCJITCtor(M, ErrorStr, JMM, OptLevel,
+                                     AllocateGVsWithCode, TM);
+        if (EE) return EE;
+      } else if (ExecutionEngine::JITCtor) {
+        ExecutionEngine *EE =
+          ExecutionEngine::JITCtor(M, ErrorStr, JMM, OptLevel,
+                                   AllocateGVsWithCode, TM);
+        if (EE) return EE;
+      }
     }
   }
 
diff --git a/lib/ExecutionEngine/JIT/CMakeLists.txt b/lib/ExecutionEngine/JIT/CMakeLists.txt
index 42020d6..cefb0ae 100644
--- a/lib/ExecutionEngine/JIT/CMakeLists.txt
+++ b/lib/ExecutionEngine/JIT/CMakeLists.txt
@@ -9,5 +9,4 @@ add_llvm_library(LLVMJIT
   JITEmitter.cpp
   JITMemoryManager.cpp
   OProfileJITEventListener.cpp
-  TargetSelect.cpp
   )
diff --git a/lib/ExecutionEngine/JIT/JIT.cpp b/lib/ExecutionEngine/JIT/JIT.cpp
index d1f87ac..8fceaf2 100644
--- a/lib/ExecutionEngine/JIT/JIT.cpp
+++ b/lib/ExecutionEngine/JIT/JIT.cpp
@@ -203,39 +203,18 @@ void DarwinRegisterFrame(void* FrameBegin) {
 /// createJIT - This is the factory method for creating a JIT for the current
 /// machine, it does not fall back to the interpreter.  This takes ownership
 /// of the module.
-ExecutionEngine *ExecutionEngine::createJIT(Module *M,
-                                            std::string *ErrorStr,
-                                            JITMemoryManager *JMM,
-                                            CodeGenOpt::Level OptLevel,
-                                            bool GVsWithCode,
-                                            CodeModel::Model CMM) {
-  // Use the defaults for extra parameters.  Users can use EngineBuilder to
-  // set them.
-  StringRef MArch = "";
-  StringRef MCPU = "";
-  SmallVector<std::string, 1> MAttrs;
-  return JIT::createJIT(M, ErrorStr, JMM, OptLevel, GVsWithCode, CMM,
-                        MArch, MCPU, MAttrs);
-}
-
 ExecutionEngine *JIT::createJIT(Module *M,
                                 std::string *ErrorStr,
                                 JITMemoryManager *JMM,
                                 CodeGenOpt::Level OptLevel,
                                 bool GVsWithCode,
-                                CodeModel::Model CMM,
-                                StringRef MArch,
-                                StringRef MCPU,
-                                const SmallVectorImpl<std::string>& MAttrs) {
+                                TargetMachine *TM) {
   // Try to register the program as a source of symbols to resolve against.
+  //
+  // FIXME: Don't do this here.
   sys::DynamicLibrary::LoadLibraryPermanently(0, NULL);
 
-  // Pick a target either via -march or by guessing the native arch.
-  TargetMachine *TM = JIT::selectTarget(M, MArch, MCPU, MAttrs, ErrorStr);
-  if (!TM || (ErrorStr && ErrorStr->length() > 0)) return 0;
-  TM->setCodeModel(CMM);
-
-  // If the target supports JIT code generation, create a the JIT.
+  // If the target supports JIT code generation, create the JIT.
   if (TargetJITInfo *TJ = TM->getJITInfo()) {
     return new JIT(M, *TM, *TJ, JMM, OptLevel, GVsWithCode);
   } else {
diff --git a/lib/ExecutionEngine/JIT/JIT.h b/lib/ExecutionEngine/JIT/JIT.h
index b576c16..b879fc3 100644
--- a/lib/ExecutionEngine/JIT/JIT.h
+++ b/lib/ExecutionEngine/JIT/JIT.h
@@ -181,23 +181,12 @@ public:
   ///
   JITCodeEmitter *getCodeEmitter() const { return JCE; }
 
-  /// selectTarget - Pick a target either via -march or by guessing the native
-  /// arch.  Add any CPU features specified via -mcpu or -mattr.
-  static TargetMachine *selectTarget(Module *M,
-                                     StringRef MArch,
-                                     StringRef MCPU,
-                                     const SmallVectorImpl<std::string>& MAttrs,
-                                     std::string *Err);
-
   static ExecutionEngine *createJIT(Module *M,
                                     std::string *ErrorStr,
                                     JITMemoryManager *JMM,
                                     CodeGenOpt::Level OptLevel,
                                     bool GVsWithCode,
-                                    CodeModel::Model CMM,
-                                    StringRef MArch,
-                                    StringRef MCPU,
-                                    const SmallVectorImpl<std::string>& MAttrs);
+                                    TargetMachine *TM);
 
   // Run the JIT on F and return information about the generated code
   void runJITOnFunction(Function *F, MachineCodeInfo *MCI = 0);
diff --git a/lib/ExecutionEngine/MCJIT/CMakeLists.txt b/lib/ExecutionEngine/MCJIT/CMakeLists.txt
index 6553079..38fdffa 100644
--- a/lib/ExecutionEngine/MCJIT/CMakeLists.txt
+++ b/lib/ExecutionEngine/MCJIT/CMakeLists.txt
@@ -1,5 +1,4 @@
 add_llvm_library(LLVMMCJIT
   MCJIT.cpp
-  TargetSelect.cpp
   Intercept.cpp
   )
diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/lib/ExecutionEngine/MCJIT/MCJIT.cpp
index 3d4ee36..4475f4d 100644
--- a/lib/ExecutionEngine/MCJIT/MCJIT.cpp
+++ b/lib/ExecutionEngine/MCJIT/MCJIT.cpp
@@ -38,27 +38,15 @@ ExecutionEngine *MCJIT::createJIT(Module *M,
                                   JITMemoryManager *JMM,
                                   CodeGenOpt::Level OptLevel,
                                   bool GVsWithCode,
-                                  CodeModel::Model CMM,
-                                  StringRef MArch,
-                                  StringRef MCPU,
-                                  const SmallVectorImpl<std::string>& MAttrs) {
+                                  TargetMachine *TM) {
   // Try to register the program as a source of symbols to resolve against.
   //
   // FIXME: Don't do this here.
   sys::DynamicLibrary::LoadLibraryPermanently(0, NULL);
 
-  // Pick a target either via -march or by guessing the native arch.
-  //
-  // FIXME: This should be lifted out of here, it isn't something which should
-  // be part of the JIT policy, rather the burden for this selection should be
-  // pushed to clients.
-  TargetMachine *TM = MCJIT::selectTarget(M, MArch, MCPU, MAttrs, ErrorStr);
-  if (!TM || (ErrorStr && ErrorStr->length() > 0)) return 0;
-  TM->setCodeModel(CMM);
-
   // If the target supports JIT code generation, create the JIT.
   if (TargetJITInfo *TJ = TM->getJITInfo())
-    return new MCJIT(M, TM, *TJ, new MCJITMemoryManager(JMM), OptLevel,
+    return new MCJIT(M, TM, *TJ, new MCJITMemoryManager(JMM, M), OptLevel,
                      GVsWithCode);
 
   if (ErrorStr)
@@ -114,8 +102,12 @@ void *MCJIT::getPointerToFunction(Function *F) {
     return Addr;
   }
 
-  Twine Name = TM->getMCAsmInfo()->getGlobalPrefix() + F->getName();
-  return (void*)Dyld.getSymbolAddress(Name.str());
+  // FIXME: Should we be using the mangler for this? Probably.
+  StringRef BaseName = F->getName();
+  if (BaseName[0] == '\1')
+    return (void*)Dyld.getSymbolAddress(BaseName.substr(1));
+  return (void*)Dyld.getSymbolAddress((TM->getMCAsmInfo()->getGlobalPrefix()
+                                       + BaseName).str());
 }
 
 void *MCJIT::recompileAndRelinkFunction(Function *F) {
diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.h b/lib/ExecutionEngine/MCJIT/MCJIT.h
index 1b50766..b64c21a 100644
--- a/lib/ExecutionEngine/MCJIT/MCJIT.h
+++ b/lib/ExecutionEngine/MCJIT/MCJIT.h
@@ -76,22 +76,12 @@ public:
     MCJITCtor = createJIT;
   }
 
-  // FIXME: This routine is scheduled for termination. Do not use it.
-  static TargetMachine *selectTarget(Module *M,
-                                     StringRef MArch,
-                                     StringRef MCPU,
-                                     const SmallVectorImpl<std::string>& MAttrs,
-                                     std::string *Err);
-
   static ExecutionEngine *createJIT(Module *M,
                                     std::string *ErrorStr,
                                     JITMemoryManager *JMM,
                                     CodeGenOpt::Level OptLevel,
                                     bool GVsWithCode,
-                                    CodeModel::Model CMM,
-                                    StringRef MArch,
-                                    StringRef MCPU,
-                                    const SmallVectorImpl<std::string>& MAttrs);
+                                    TargetMachine *TM);
 
   // @}
 };
diff --git a/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h b/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h
index e3c6fda..40bc031 100644
--- a/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h
+++ b/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h
@@ -26,7 +26,7 @@ class MCJITMemoryManager : public RTDyldMemoryManager {
   // FIXME: Multiple modules.
   Module *M;
 public:
-  MCJITMemoryManager(JITMemoryManager *jmm) : JMM(jmm) {}
+  MCJITMemoryManager(JITMemoryManager *jmm, Module *m) : JMM(jmm), M(m) {}
 
   // Allocate ActualSize bytes, or more, for the named function. Return
   // a pointer to the allocated memory and update Size to reflect how much
@@ -36,6 +36,11 @@ public:
     //        prefix.
     if (Name[0] == '_') ++Name;
     Function *F = M->getFunction(Name);
+    // Some ObjC names have a prefixed \01 in the IR. If we failed to find
+    // the symbol and it's of the ObjC conventions (starts with "-"), try
+    // prepending a \01 and see if we can find it that way.
+    if (!F && Name[0] == '-')
+      F = M->getFunction((Twine("\1") + Name).str());
     assert(F && "No matching function in JIT IR Module!");
     return JMM->startFunctionBody(F, Size);
   }
@@ -48,6 +53,11 @@ public:
     //        prefix.
     if (Name[0] == '_') ++Name;
     Function *F = M->getFunction(Name);
+    // Some ObjC names have a prefixed \01 in the IR. If we failed to find
+    // the symbol and it's of the ObjC conventions (starts with "-"), try
+    // prepending a \01 and see if we can find it that way.
+    if (!F && Name[0] == '-')
+      F = M->getFunction((Twine("\1") + Name).str());
     assert(F && "No matching function in JIT IR Module!");
     JMM->endFunctionBody(F, FunctionStart, FunctionEnd);
   }
diff --git a/lib/ExecutionEngine/MCJIT/TargetSelect.cpp b/lib/ExecutionEngine/MCJIT/TargetSelect.cpp
deleted file mode 100644
index 50f6593..0000000
--- a/lib/ExecutionEngine/MCJIT/TargetSelect.cpp
+++ /dev/null
@@ -1,91 +0,0 @@
-//===-- TargetSelect.cpp - Target Chooser Code ----------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This just asks the TargetRegistry for the appropriate JIT to use, and allows
-// the user to specify a specific one on the commandline with -march=x. Clients
-// should initialize targets prior to calling createJIT.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MCJIT.h"
-#include "llvm/Module.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/Host.h"
-#include "llvm/Target/SubtargetFeature.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegistry.h"
-using namespace llvm;
-
-/// selectTarget - Pick a target either via -march or by guessing the native
-/// arch.  Add any CPU features specified via -mcpu or -mattr.
-TargetMachine *MCJIT::selectTarget(Module *Mod,
-                                 StringRef MArch,
-                                 StringRef MCPU,
-                                 const SmallVectorImpl<std::string>& MAttrs,
-                                 std::string *ErrorStr) {
-  Triple TheTriple(Mod->getTargetTriple());
-  if (TheTriple.getTriple().empty())
-    TheTriple.setTriple(sys::getHostTriple());
-
-  // Adjust the triple to match what the user requested.
-  const Target *TheTarget = 0;
-  if (!MArch.empty()) {
-    for (TargetRegistry::iterator it = TargetRegistry::begin(),
-           ie = TargetRegistry::end(); it != ie; ++it) {
-      if (MArch == it->getName()) {
-        TheTarget = &*it;
-        break;
-      }
-    }
-
-    if (!TheTarget) {
-      *ErrorStr = "No available targets are compatible with this -march, "
-        "see -version for the available targets.\n";
-      return 0;
-    }
-
-    // Adjust the triple to match (if known), otherwise stick with the
-    // module/host triple.
-    Triple::ArchType Type = Triple::getArchTypeForLLVMName(MArch);
-    if (Type != Triple::UnknownArch)
-      TheTriple.setArch(Type);
-  } else {
-    std::string Error;
-    TheTarget = TargetRegistry::lookupTarget(TheTriple.getTriple(), Error);
-    if (TheTarget == 0) {
-      if (ErrorStr)
-        *ErrorStr = Error;
-      return 0;
-    }
-  }
-
-  if (!TheTarget->hasJIT()) {
-    errs() << "WARNING: This target JIT is not designed for the host you are"
-           << " running.  If bad things happen, please choose a different "
-           << "-march switch.\n";
-  }
-
-  // Package up features to be passed to target/subtarget
-  std::string FeaturesStr;
-  if (!MCPU.empty() || !MAttrs.empty()) {
-    SubtargetFeatures Features;
-    Features.setCPU(MCPU);
-    for (unsigned i = 0; i != MAttrs.size(); ++i)
-      Features.AddFeature(MAttrs[i]);
-    FeaturesStr = Features.getString();
-  }
-
-  // Allocate a target...
-  TargetMachine *Target =
-    TheTarget->createTargetMachine(TheTriple.getTriple(), FeaturesStr);
-  assert(Target && "Could not allocate target machine!");
-  return Target;
-}
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
index 065e5e3..eda4cbb 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
@@ -129,18 +129,19 @@ void RuntimeDyldImpl::extractFunction(StringRef Name, uint8_t *StartAddress,
                                       uint8_t *EndAddress) {
   // Allocate memory for the function via the memory manager.
   uintptr_t Size = EndAddress - StartAddress + 1;
-  uint8_t *Mem = MemMgr->startFunctionBody(Name.data(), Size);
+  uintptr_t AllocSize = Size;
+  uint8_t *Mem = MemMgr->startFunctionBody(Name.data(), AllocSize);
   assert(Size >= (uint64_t)(EndAddress - StartAddress + 1) &&
          "Memory manager failed to allocate enough memory!");
   // Copy the function payload into the memory block.
-  memcpy(Mem, StartAddress, EndAddress - StartAddress + 1);
+  memcpy(Mem, StartAddress, Size);
   MemMgr->endFunctionBody(Name.data(), Mem, Mem + Size);
   // Remember where we put it.
   Functions[Name] = sys::MemoryBlock(Mem, Size);
   // Default the assigned address for this symbol to wherever this
   // allocated it.
   SymbolTable[Name] = Mem;
-  DEBUG(dbgs() << "    allocated to " << Mem << "\n");
+  DEBUG(dbgs() << "    allocated to [" << Mem << ", " << Mem + Size << "]\n");
 }
 
 bool RuntimeDyldImpl::
@@ -268,9 +269,9 @@ loadSegment32(const MachOObject *Obj,
     if (!Sect)
       return Error("unable to load section: '" + Twine(SectNum) + "'");
 
-    // FIXME: Improve check.
+    // FIXME: For the time being, we're only loading text segments.
     if (Sect->Flags != 0x80000400)
-      return Error("unsupported section type!");
+      continue;
 
     // Address and names of symbols in the section.
     typedef std::pair<uint64_t, StringRef> SymbolEntry;
@@ -295,11 +296,11 @@ loadSegment32(const MachOObject *Obj,
 
       // FIXME: Check the symbol type and flags.
       if (STE->Type != 0xF)  // external, defined in this section.
-        return Error("unexpected symbol type!");
+        continue;
       // Flags == 0x8 marks a thumb function for ARM, which is fine as it
       // doesn't require any special handling here.
       if (STE->Flags != 0x0 && STE->Flags != 0x8)
-        return Error("unexpected symbol type!");
+        continue;
 
       // Remember the symbol.
       Symbols.push_back(SymbolEntry(STE->Value, Name));
@@ -310,6 +311,10 @@ loadSegment32(const MachOObject *Obj,
     // Sort the symbols by address, just in case they didn't come in that way.
     array_pod_sort(Symbols.begin(), Symbols.end());
 
+    // If there weren't any functions (odd, but just in case...)
+    if (!Symbols.size())
+      continue;
+
     // Extract the function data.
     uint8_t *Base = (uint8_t*)Obj->getData(SegmentLC->FileOffset,
                                            SegmentLC->FileSize).data();
@@ -403,9 +408,9 @@ loadSegment64(const MachOObject *Obj,
     if (!Sect)
       return Error("unable to load section: '" + Twine(SectNum) + "'");
 
-    // FIXME: Improve check.
+    // FIXME: For the time being, we're only loading text segments.
     if (Sect->Flags != 0x80000400)
-      return Error("unsupported section type!");
+      continue;
 
     // Address and names of symbols in the section.
     typedef std::pair<uint64_t, StringRef> SymbolEntry;
@@ -430,9 +435,9 @@ loadSegment64(const MachOObject *Obj,
 
       // FIXME: Check the symbol type and flags.
       if (STE->Type != 0xF)  // external, defined in this section.
-        return Error("unexpected symbol type!");
+        continue;
       if (STE->Flags != 0x0)
-        return Error("unexpected symbol type!");
+        continue;
 
       // Remember the symbol.
       Symbols.push_back(SymbolEntry(STE->Value, Name));
@@ -443,6 +448,10 @@ loadSegment64(const MachOObject *Obj,
     // Sort the symbols by address, just in case they didn't come in that way.
     array_pod_sort(Symbols.begin(), Symbols.end());
 
+    // If there weren't any functions (odd, but just in case...)
+    if (!Symbols.size())
+      continue;
+
     // Extract the function data.
     uint8_t *Base = (uint8_t*)Obj->getData(Segment64LC->FileOffset,
                                            Segment64LC->FileSize).data();
diff --git a/lib/ExecutionEngine/JIT/TargetSelect.cpp b/lib/ExecutionEngine/TargetSelect.cpp
index 8d92ab0..a8822e5 100644
--- a/lib/ExecutionEngine/JIT/TargetSelect.cpp
+++ b/lib/ExecutionEngine/TargetSelect.cpp
@@ -13,7 +13,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "JIT.h"
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
 #include "llvm/Module.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/Support/CommandLine.h"
@@ -26,11 +26,11 @@ using namespace llvm;
 
 /// selectTarget - Pick a target either via -march or by guessing the native
 /// arch.  Add any CPU features specified via -mcpu or -mattr.
-TargetMachine *JIT::selectTarget(Module *Mod,
-                                 StringRef MArch,
-                                 StringRef MCPU,
-                                 const SmallVectorImpl<std::string>& MAttrs,
-                                 std::string *ErrorStr) {
+TargetMachine *EngineBuilder::selectTarget(Module *Mod,
+                              StringRef MArch,
+                              StringRef MCPU,
+                              const SmallVectorImpl<std::string>& MAttrs,
+                              std::string *ErrorStr) {
   Triple TheTriple(Mod->getTargetTriple());
   if (TheTriple.getTriple().empty())
     TheTriple.setTriple(sys::getHostTriple());
diff --git a/lib/MC/CMakeLists.txt b/lib/MC/CMakeLists.txt
index 6aed059..a77ecd3 100644
--- a/lib/MC/CMakeLists.txt
+++ b/lib/MC/CMakeLists.txt
@@ -30,6 +30,7 @@ add_llvm_library(LLVMMC
   MCStreamer.cpp
   MCSymbol.cpp
   MCValue.cpp
+  MCWin64EH.cpp
   MachObjectWriter.cpp
   WinCOFFStreamer.cpp
   WinCOFFObjectWriter.cpp
diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp
index 23c6d4c..59e1b8e 100644
--- a/lib/MC/ELFObjectWriter.cpp
+++ b/lib/MC/ELFObjectWriter.cpp
@@ -25,6 +25,8 @@
 #include "llvm/Support/ELF.h"
 #include "llvm/Target/TargetAsmBackend.h"
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/ADT/Statistic.h"
 
 #include "../Target/X86/X86FixupKinds.h"
 #include "../Target/ARM/ARMFixupKinds.h"
@@ -32,6 +34,9 @@
 #include <vector>
 using namespace llvm;
 
+#undef  DEBUG_TYPE
+#define DEBUG_TYPE "reloc-info"
+
 bool ELFObjectWriter::isFixupKindPCRel(const MCAssembler &Asm, unsigned Kind) {
   const MCFixupKindInfo &FKI =
     Asm.getBackend().getFixupKindInfo((MCFixupKind) Kind);
@@ -46,6 +51,7 @@ bool ELFObjectWriter::RelocNeedsGOT(MCSymbolRefExpr::VariantKind Variant) {
   case MCSymbolRefExpr::VK_GOT:
   case MCSymbolRefExpr::VK_PLT:
   case MCSymbolRefExpr::VK_GOTPCREL:
+  case MCSymbolRefExpr::VK_GOTOFF:
   case MCSymbolRefExpr::VK_TPOFF:
   case MCSymbolRefExpr::VK_TLSGD:
   case MCSymbolRefExpr::VK_GOTTPOFF:
@@ -181,8 +187,13 @@ uint64_t ELFObjectWriter::SymbolValue(MCSymbolData &Data,
   if (!Symbol.isInSection())
     return 0;
 
-  if (Data.getFragment())
-    return Layout.getSymbolOffset(&Data);
+
+  if (Data.getFragment()) {
+    if (Data.getFlags() & ELF_Other_ThumbFunc)
+      return Layout.getSymbolOffset(&Data)+1;
+    else
+      return Layout.getSymbolOffset(&Data);
+  }
 
   return 0;
 }
@@ -319,7 +330,9 @@ void ELFObjectWriter::WriteSymbolTable(MCDataFragment *SymtabF,
 
 const MCSymbol *ELFObjectWriter::SymbolToReloc(const MCAssembler &Asm,
                                                const MCValue &Target,
-                                               const MCFragment &F) const {
+                                               const MCFragment &F, 
+                                               const MCFixup &Fixup,
+                                               bool IsPCRel) const {
   const MCSymbol &Symbol = Target.getSymA()->getSymbol();
   const MCSymbol &ASymbol = Symbol.AliasedSymbol();
   const MCSymbol *Renamed = Renames.lookup(&Symbol);
@@ -342,7 +355,7 @@ const MCSymbol *ELFObjectWriter::SymbolToReloc(const MCAssembler &Asm,
   const SectionKind secKind = Section.getKind();
 
   if (secKind.isBSS())
-    return ExplicitRelSym(Asm, Target, F, true);
+    return ExplicitRelSym(Asm, Target, F, Fixup, IsPCRel);
 
   if (secKind.isThreadLocal()) {
     if (Renamed)
@@ -365,13 +378,14 @@ const MCSymbol *ELFObjectWriter::SymbolToReloc(const MCAssembler &Asm,
 
   if (Section.getFlags() & ELF::SHF_MERGE) {
     if (Target.getConstant() == 0)
-      return NULL;
+      return ExplicitRelSym(Asm, Target, F, Fixup, IsPCRel);
     if (Renamed)
       return Renamed;
     return &Symbol;
   }
 
-  return ExplicitRelSym(Asm, Target, F, false);
+  return ExplicitRelSym(Asm, Target, F, Fixup, IsPCRel);
+
 }
 
 
@@ -390,7 +404,7 @@ void ELFObjectWriter::RecordRelocation(const MCAssembler &Asm,
   if (!Target.isAbsolute()) {
     const MCSymbol &Symbol = Target.getSymA()->getSymbol();
     const MCSymbol &ASymbol = Symbol.AliasedSymbol();
-    RelocSymbol = SymbolToReloc(Asm, Target, *Fragment);
+    RelocSymbol = SymbolToReloc(Asm, Target, *Fragment, Fixup, IsPCRel);
 
     if (const MCSymbolRefExpr *RefB = Target.getSymB()) {
       const MCSymbol &SymbolB = RefB->getSymbol();
@@ -532,6 +546,7 @@ void ELFObjectWriter::ComputeSymbolTable(MCAssembler &Asm,
                                          RevGroupMapTy RevGroupMap,
                                          unsigned NumRegularSections) {
   // FIXME: Is this the correct place to do this?
+  // FIXME: Why is an undefined reference to _GLOBAL_OFFSET_TABLE_ needed?
   if (NeedsGOT) {
     llvm::StringRef Name = "_GLOBAL_OFFSET_TABLE_";
     MCSymbol *Sym = Asm.getContext().GetOrCreateSymbol(Name);
@@ -1261,32 +1276,93 @@ void ARMELFObjectWriter::WriteEFlags() {
 
 // In ARM, _MergedGlobals and other most symbols get emitted directly.
 // I.e. not as an offset to a section symbol.
-// This code is a first-cut approximation of what ARM/gcc does.
+// This code is an approximation of what ARM/gcc does.
+
+STATISTIC(PCRelCount, "Total number of PIC Relocations");
+STATISTIC(NonPCRelCount, "Total number of non-PIC relocations");
 
 const MCSymbol *ARMELFObjectWriter::ExplicitRelSym(const MCAssembler &Asm,
                                                    const MCValue &Target,
                                                    const MCFragment &F,
-                                                   bool IsBSS) const {
+                                                   const MCFixup &Fixup,
+                                                   bool IsPCRel) const {
   const MCSymbol &Symbol = Target.getSymA()->getSymbol();
   bool EmitThisSym = false;
 
-  if (IsBSS) {
-    EmitThisSym = StringSwitch<bool>(Symbol.getName())
-      .Case("_MergedGlobals", true)
-      .Default(false);
+  const MCSectionELF &Section =
+    static_cast<const MCSectionELF&>(Symbol.getSection());
+  bool InNormalSection = true;
+  unsigned RelocType = 0;
+  RelocType = GetRelocTypeInner(Target, Fixup, IsPCRel);
+
+  DEBUG(
+      const MCSymbolRefExpr::VariantKind Kind = Target.getSymA()->getKind();
+      MCSymbolRefExpr::VariantKind Kind2;
+      Kind2 = Target.getSymB() ?  Target.getSymB()->getKind() :
+        MCSymbolRefExpr::VK_None;
+      dbgs() << "considering symbol "
+        << Section.getSectionName() << "/"
+        << Symbol.getName() << "/"
+        << " Rel:" << (unsigned)RelocType
+        << " Kind: " << (int)Kind << "/" << (int)Kind2
+        << " Tmp:"
+        << Symbol.isAbsolute() << "/" << Symbol.isDefined() << "/"
+        << Symbol.isVariable() << "/" << Symbol.isTemporary()
+        << " Counts:" << PCRelCount << "/" << NonPCRelCount << "\n");
+
+  if (IsPCRel) { ++PCRelCount;
+    switch (RelocType) {
+    default:
+      // Most relocation types are emitted as explicit symbols
+      InNormalSection =
+        StringSwitch<bool>(Section.getSectionName())
+        .Case(".data.rel.ro.local", false)
+        .Case(".data.rel", false)
+        .Case(".bss", false)
+        .Default(true);
+      EmitThisSym = true;
+      break;
+    case ELF::R_ARM_ABS32:
+      // But things get strange with R_ARM_ABS32
+      // In this case, most things that go in .rodata show up
+      // as section relative relocations
+      InNormalSection =
+        StringSwitch<bool>(Section.getSectionName())
+        .Case(".data.rel.ro.local", false)
+        .Case(".data.rel", false)
+        .Case(".rodata", false)
+        .Case(".bss", false)
+        .Default(true);
+      EmitThisSym = false;
+      break;
+    }
   } else {
-    EmitThisSym = StringSwitch<bool>(Symbol.getName())
-      .Case("_MergedGlobals", true)
-      .StartsWith(".L.str", true)
-      .Default(false);
+    NonPCRelCount++;
+    InNormalSection =
+      StringSwitch<bool>(Section.getSectionName())
+      .Case(".data.rel.ro.local", false)
+      .Case(".rodata", false)
+      .Case(".data.rel", false)
+      .Case(".bss", false)
+      .Default(true);
+
+    switch (RelocType) {
+    default: EmitThisSym = true; break;
+    case ELF::R_ARM_ABS32: EmitThisSym = false; break;
+    }
   }
+
   if (EmitThisSym)
     return &Symbol;
-  if (! Symbol.isTemporary())
+  if (! Symbol.isTemporary() && InNormalSection) {
     return &Symbol;
+  }
   return NULL;
 }
 
+// Need to examine the Fixup when determining whether to 
+// emit the relocation as an explicit symbol or as a section relative
+// offset
 unsigned ARMELFObjectWriter::GetRelocType(const MCValue &Target,
                                           const MCFixup &Fixup,
                                           bool IsPCRel,
@@ -1295,6 +1371,20 @@ unsigned ARMELFObjectWriter::GetRelocType(const MCValue &Target,
   MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ?
     MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
 
+  unsigned Type = GetRelocTypeInner(Target, Fixup, IsPCRel);
+
+  if (RelocNeedsGOT(Modifier))
+    NeedsGOT = true;
+  
+  return Type;
+}
+
+unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
+                                               const MCFixup &Fixup,
+                                               bool IsPCRel) const  {
+  MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ?
+    MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
+
   unsigned Type = 0;
   if (IsPCRel) {
     switch ((unsigned)Fixup.getKind()) {
@@ -1303,7 +1393,7 @@ unsigned ARMELFObjectWriter::GetRelocType(const MCValue &Target,
       switch (Modifier) {
       default: llvm_unreachable("Unsupported Modifier");
       case MCSymbolRefExpr::VK_None:
-        Type = ELF::R_ARM_BASE_PREL;
+        Type = ELF::R_ARM_REL32;
         break;
       case MCSymbolRefExpr::VK_ARM_TLSGD:
         assert(0 && "unimplemented");
@@ -1342,6 +1432,17 @@ unsigned ARMELFObjectWriter::GetRelocType(const MCValue &Target,
     case ARM::fixup_t2_movw_lo16_pcrel:
       Type = ELF::R_ARM_THM_MOVW_PREL_NC;
       break;
+    case ARM::fixup_arm_thumb_bl:
+    case ARM::fixup_arm_thumb_blx:
+      switch (Modifier) {
+      case MCSymbolRefExpr::VK_ARM_PLT:
+        Type = ELF::R_ARM_THM_CALL;
+        break;
+      default:
+        Type = ELF::R_ARM_NONE;
+        break;
+      }
+      break;
     }
   } else {
     switch ((unsigned)Fixup.getKind()) {
@@ -1399,9 +1500,6 @@ unsigned ARMELFObjectWriter::GetRelocType(const MCValue &Target,
     }
   }
 
-  if (RelocNeedsGOT(Modifier))
-    NeedsGOT = true;
-
   return Type;
 }
 
@@ -1613,6 +1711,9 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target,
         case MCSymbolRefExpr::VK_DTPOFF:
           Type = ELF::R_386_TLS_LDO_32;
           break;
+        case MCSymbolRefExpr::VK_GOTTPOFF:
+          Type = ELF::R_386_TLS_IE_32;
+          break;
         }
         break;
       case FK_Data_2: Type = ELF::R_386_16; break;
diff --git a/lib/MC/ELFObjectWriter.h b/lib/MC/ELFObjectWriter.h
index f1d514a..7593099 100644
--- a/lib/MC/ELFObjectWriter.h
+++ b/lib/MC/ELFObjectWriter.h
@@ -140,15 +140,18 @@ class ELFObjectWriter : public MCObjectWriter {
     unsigned ShstrtabIndex;
 
 
-    const MCSymbol *SymbolToReloc(const MCAssembler &Asm,
-                                  const MCValue &Target,
-                                  const MCFragment &F) const;
+    virtual const MCSymbol *SymbolToReloc(const MCAssembler &Asm,
+                                          const MCValue &Target,
+                                          const MCFragment &F,
+                                          const MCFixup &Fixup,
+                                          bool IsPCRel) const;
 
     // For arch-specific emission of explicit reloc symbol
     virtual const MCSymbol *ExplicitRelSym(const MCAssembler &Asm,
                                            const MCValue &Target,
                                            const MCFragment &F,
-                                           bool IsBSS) const {
+                                           const MCFixup &Fixup,
+                                           bool IsPCRel) const {
       return NULL;
     }
 
@@ -380,11 +383,16 @@ class ELFObjectWriter : public MCObjectWriter {
     virtual const MCSymbol *ExplicitRelSym(const MCAssembler &Asm,
                                            const MCValue &Target,
                                            const MCFragment &F,
-                                           bool IsBSS) const;
+                                           const MCFixup &Fixup,
+                                           bool IsPCRel) const;
 
     virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
                                   bool IsPCRel, bool IsRelocWithSymbol,
                                   int64_t Addend);
+  private:
+    unsigned GetRelocTypeInner(const MCValue &Target,
+                               const MCFixup &Fixup, bool IsPCRel) const;
+    
   };
 
   //===- MBlazeELFObjectWriter -------------------------------------------===//
diff --git a/lib/MC/MCAsmInfo.cpp b/lib/MC/MCAsmInfo.cpp
index 541dd08..73b259e 100644
--- a/lib/MC/MCAsmInfo.cpp
+++ b/lib/MC/MCAsmInfo.cpp
@@ -74,9 +74,8 @@ MCAsmInfo::MCAsmInfo() {
   HasLEB128 = false;
   SupportsDebugInformation = false;
   ExceptionsType = ExceptionHandling::None;
-  DwarfRequiresFrameSection = true;
   DwarfUsesInlineInfoSection = false;
-  DwarfUsesAbsoluteLabelForStmtList = true;
+  DwarfRequiresRelocationForSectionOffset = true;
   DwarfSectionOffsetDirective = 0;
   DwarfUsesLabelOffsetForRanges = true;
   HasMicrosoftFastStdCallMangling = false;
diff --git a/lib/MC/MCAsmInfoDarwin.cpp b/lib/MC/MCAsmInfoDarwin.cpp
index 4dd1d44..5851cb0 100644
--- a/lib/MC/MCAsmInfoDarwin.cpp
+++ b/lib/MC/MCAsmInfoDarwin.cpp
@@ -56,6 +56,6 @@ MCAsmInfoDarwin::MCAsmInfoDarwin() {
   HasNoDeadStrip = true;
   HasSymbolResolver = true;
 
-  DwarfUsesAbsoluteLabelForStmtList = false;
+  DwarfRequiresRelocationForSectionOffset = false;
   DwarfUsesLabelOffsetForRanges = false;
 }
diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp
index 9717c01..e8b09fc 100644
--- a/lib/MC/MCAsmStreamer.cpp
+++ b/lib/MC/MCAsmStreamer.cpp
@@ -54,6 +54,8 @@ class MCAsmStreamer : public MCStreamer {
 
   bool needsSet(const MCExpr *Value);
 
+  void EmitRegisterName(int64_t Register);
+
 public:
   MCAsmStreamer(MCContext &Context, formatted_raw_ostream &os,
                 bool isVerboseAsm, bool useLoc, bool useCFI,
@@ -193,6 +195,7 @@ public:
                                      unsigned Isa, unsigned Discriminator,
                                      StringRef FileName);
 
+  virtual void EmitCFISections(bool EH, bool Debug);
   virtual void EmitCFIStartProc();
   virtual void EmitCFIEndProc();
   virtual void EmitCFIDefCfa(int64_t Register, int64_t Offset);
@@ -207,6 +210,21 @@ public:
   virtual void EmitCFIRelOffset(int64_t Register, int64_t Offset);
   virtual void EmitCFIAdjustCfaOffset(int64_t Adjustment);
 
+  virtual void EmitWin64EHStartProc(const MCSymbol *Symbol);
+  virtual void EmitWin64EHEndProc();
+  virtual void EmitWin64EHStartChained();
+  virtual void EmitWin64EHEndChained();
+  virtual void EmitWin64EHHandler(const MCSymbol *Sym, bool Unwind,
+                                  bool Except);
+  virtual void EmitWin64EHHandlerData();
+  virtual void EmitWin64EHPushReg(unsigned Register);
+  virtual void EmitWin64EHSetFrame(unsigned Register, unsigned Offset);
+  virtual void EmitWin64EHAllocStack(unsigned Size);
+  virtual void EmitWin64EHSaveReg(unsigned Register, unsigned Offset);
+  virtual void EmitWin64EHSaveXMM(unsigned Register, unsigned Offset);
+  virtual void EmitWin64EHPushFrame(bool Code);
+  virtual void EmitWin64EHEndProlog();
+
   virtual void EmitFnStart();
   virtual void EmitFnEnd();
   virtual void EmitCantUnwind();
@@ -322,7 +340,8 @@ void MCAsmStreamer::EmitThumbFunc(MCSymbol *Func) {
   // This needs to emit to a temporary string to get properly quoted
   // MCSymbols when they have spaces in them.
   OS << "\t.thumb_func";
-  if (Func)
+  // Only Mach-O hasSubsectionsViaSymbols()
+  if (MAI.hasSubsectionsViaSymbols())
     OS << '\t' << *Func;
   EmitEOL();
 }
@@ -351,7 +370,7 @@ void MCAsmStreamer::EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
                                               const MCSymbol *Label) {
   EmitIntValue(dwarf::DW_CFA_advance_loc4, 1);
   const MCExpr *AddrDelta = BuildSymbolDiff(getContext(), Label, LastLabel);
-  AddrDelta = ForceExpAbs(this, getContext(), AddrDelta);
+  AddrDelta = ForceExpAbs(AddrDelta);
   EmitValue(AddrDelta, 4);
 }
 
@@ -764,6 +783,24 @@ void MCAsmStreamer::EmitDwarfLocDirective(unsigned FileNo, unsigned Line,
   EmitEOL();
 }
 
+void MCAsmStreamer::EmitCFISections(bool EH, bool Debug) {
+  MCStreamer::EmitCFISections(EH, Debug);
+
+  if (!UseCFI)
+    return;
+
+  OS << "\t.cfi_sections ";
+  if (EH) {
+    OS << ".eh_frame";
+    if (Debug)
+      OS << ", .debug_frame";
+  } else if (Debug) {
+    OS << ".debug_frame";
+  }
+
+  EmitEOL();
+}
+
 void MCAsmStreamer::EmitCFIStartProc() {
   MCStreamer::EmitCFIStartProc();
 
@@ -784,13 +821,25 @@ void MCAsmStreamer::EmitCFIEndProc() {
   EmitEOL();
 }
 
+void MCAsmStreamer::EmitRegisterName(int64_t Register) {
+  if (InstPrinter) {
+    const TargetAsmInfo &asmInfo = getContext().getTargetAsmInfo();
+    unsigned LLVMRegister = asmInfo.getLLVMRegNum(Register, true);
+    InstPrinter->printRegName(OS, LLVMRegister);
+  } else {
+    OS << Register;
+  }
+}
+
 void MCAsmStreamer::EmitCFIDefCfa(int64_t Register, int64_t Offset) {
   MCStreamer::EmitCFIDefCfa(Register, Offset);
 
   if (!UseCFI)
     return;
 
-  OS << ".cfi_def_cfa " << Register << ", " << Offset;
+  OS << "\t.cfi_def_cfa ";
+  EmitRegisterName(Register);
+  OS << ", " << Offset;
   EmitEOL();
 }
 
@@ -810,7 +859,8 @@ void MCAsmStreamer::EmitCFIDefCfaRegister(int64_t Register) {
   if (!UseCFI)
     return;
 
-  OS << "\t.cfi_def_cfa_register " << Register;
+  OS << "\t.cfi_def_cfa_register ";
+  EmitRegisterName(Register);
   EmitEOL();
 }
 
@@ -820,7 +870,9 @@ void MCAsmStreamer::EmitCFIOffset(int64_t Register, int64_t Offset) {
   if (!UseCFI)
     return;
 
-  OS << "\t.cfi_offset " << Register << ", " << Offset;
+  OS << "\t.cfi_offset ";
+  EmitRegisterName(Register);
+  OS << ", " << Offset;
   EmitEOL();
 }
 
@@ -871,7 +923,8 @@ void MCAsmStreamer::EmitCFISameValue(int64_t Register) {
   if (!UseCFI)
     return;
 
-  OS << "\t.cfi_same_value " << Register;
+  OS << "\t.cfi_same_value ";
+  EmitRegisterName(Register);
   EmitEOL();
 }
 
@@ -881,7 +934,9 @@ void MCAsmStreamer::EmitCFIRelOffset(int64_t Register, int64_t Offset) {
   if (!UseCFI)
     return;
 
-  OS << "\t.cfi_rel_offset " << Register << ", " << Offset;
+  OS << "\t.cfi_rel_offset ";
+  EmitRegisterName(Register);
+  OS << ", " << Offset;
   EmitEOL();
 }
 
@@ -895,6 +950,115 @@ void MCAsmStreamer::EmitCFIAdjustCfaOffset(int64_t Adjustment) {
   EmitEOL();
 }
 
+void MCAsmStreamer::EmitWin64EHStartProc(const MCSymbol *Symbol) {
+  MCStreamer::EmitWin64EHStartProc(Symbol);
+
+  OS << ".seh_proc " << *Symbol;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHEndProc() {
+  MCStreamer::EmitWin64EHEndProc();
+
+  OS << "\t.seh_endproc";
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHStartChained() {
+  MCStreamer::EmitWin64EHStartChained();
+
+  OS << "\t.seh_startchained";
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHEndChained() {
+  MCStreamer::EmitWin64EHEndChained();
+
+  OS << "\t.seh_endchained";
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHHandler(const MCSymbol *Sym, bool Unwind,
+                                       bool Except) {
+  MCStreamer::EmitWin64EHHandler(Sym, Unwind, Except);
+
+  OS << "\t.seh_handler " << *Sym;
+  if (Unwind)
+    OS << ", @unwind";
+  if (Except)
+    OS << ", @except";
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHHandlerData() {
+  MCStreamer::EmitWin64EHHandlerData();
+
+  // Switch sections. Don't call SwitchSection directly, because that will
+  // cause the section switch to be visible in the emitted assembly.
+  // We only do this so the section switch that terminates the handler
+  // data block is visible.
+  MCWin64EHUnwindInfo *CurFrame = getCurrentW64UnwindInfo();
+  StringRef suffix=MCWin64EHUnwindEmitter::GetSectionSuffix(CurFrame->Function);
+  const MCSection *xdataSect =
+    getContext().getTargetAsmInfo().getWin64EHTableSection(suffix);
+  if (xdataSect)
+    SwitchSectionNoChange(xdataSect);
+
+  OS << "\t.seh_handlerdata";
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHPushReg(unsigned Register) {
+  MCStreamer::EmitWin64EHPushReg(Register);
+
+  OS << "\t.seh_pushreg " << Register;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHSetFrame(unsigned Register, unsigned Offset) {
+  MCStreamer::EmitWin64EHSetFrame(Register, Offset);
+
+  OS << "\t.seh_setframe " << Register << ", " << Offset;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHAllocStack(unsigned Size) {
+  MCStreamer::EmitWin64EHAllocStack(Size);
+
+  OS << "\t.seh_stackalloc " << Size;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHSaveReg(unsigned Register, unsigned Offset) {
+  MCStreamer::EmitWin64EHSaveReg(Register, Offset);
+
+  OS << "\t.seh_savereg " << Register << ", " << Offset;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHSaveXMM(unsigned Register, unsigned Offset) {
+  MCStreamer::EmitWin64EHSaveXMM(Register, Offset);
+
+  OS << "\t.seh_savexmm " << Register << ", " << Offset;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHPushFrame(bool Code) {
+  MCStreamer::EmitWin64EHPushFrame(Code);
+
+  OS << "\t.seh_pushframe";
+  if (Code)
+    OS << " @code";
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHEndProlog(void) {
+  MCStreamer::EmitWin64EHEndProlog();
+
+  OS << "\t.seh_endprologue";
+  EmitEOL();
+}
+
 void MCAsmStreamer::AddEncodingComment(const MCInst &Inst) {
   raw_ostream &OS = GetCommentOS();
   SmallString<256> Code;
@@ -1005,8 +1169,10 @@ void MCAsmStreamer::EmitPersonality(const MCSymbol *Personality) {
 }
 
 void MCAsmStreamer::EmitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset) {
-  OS << "\t.setfp\t" << InstPrinter->getRegName(FpReg)
-     << ", "        << InstPrinter->getRegName(SpReg);
+  OS << "\t.setfp\t";
+  InstPrinter->printRegName(OS, FpReg);
+  OS << ", ";
+  InstPrinter->printRegName(OS, SpReg);
   if (Offset)
     OS << ", #" << Offset;
   EmitEOL();
@@ -1025,10 +1191,12 @@ void MCAsmStreamer::EmitRegSave(const SmallVectorImpl<unsigned> &RegList,
   else
     OS << "\t.save\t{";
 
-  OS << InstPrinter->getRegName(RegList[0]);
+  InstPrinter->printRegName(OS, RegList[0]);
 
-  for (unsigned i = 1, e = RegList.size(); i != e; ++i)
-    OS << ", " << InstPrinter->getRegName(RegList[i]);
+  for (unsigned i = 1, e = RegList.size(); i != e; ++i) {
+    OS << ", ";
+    InstPrinter->printRegName(OS, RegList[i]);
+  }
 
   OS << "}";
   EmitEOL();
@@ -1070,8 +1238,8 @@ void MCAsmStreamer::Finish() {
   if (getContext().hasDwarfFiles() && !UseLoc)
     MCDwarfFileTable::Emit(this);
 
-  if (getNumFrameInfos() && !UseCFI)
-    MCDwarfFrameEmitter::Emit(*this, false);
+  if (!UseCFI)
+    EmitFrames(false);
 }
 
 MCStreamer *llvm::createAsmStreamer(MCContext &Context,
diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp
index 8360fc9f..527a63c 100644
--- a/lib/MC/MCAssembler.cpp
+++ b/lib/MC/MCAssembler.cpp
@@ -759,8 +759,7 @@ bool MCAssembler::RelaxDwarfCallFrameFragment(MCAsmLayout &Layout,
   SmallString<8> &Data = DF.getContents();
   Data.clear();
   raw_svector_ostream OSE(Data);
-  const TargetAsmInfo &AsmInfo = getContext().getTargetAsmInfo();
-  MCDwarfFrameEmitter::EncodeAdvanceLoc(AddrDelta, OSE, AsmInfo);
+  MCDwarfFrameEmitter::EncodeAdvanceLoc(AddrDelta, OSE);
   OSE.flush();
   return OldSize != Data.size();
 }
diff --git a/lib/MC/MCDisassembler/Disassembler.cpp b/lib/MC/MCDisassembler/Disassembler.cpp
index ced57e8..6e636f0 100644
--- a/lib/MC/MCDisassembler/Disassembler.cpp
+++ b/lib/MC/MCDisassembler/Disassembler.cpp
@@ -6,11 +6,10 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
+
 #include "Disassembler.h"
-#include <stdio.h>
 #include "llvm-c/Disassembler.h"
 
-#include <string>
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCDisassembler.h"
 #include "llvm/MC/MCInst.h"
@@ -27,17 +26,12 @@ class Target;
 } // namespace llvm
 using namespace llvm;
 
-#ifdef __cplusplus
-extern "C" {
-#endif // __cplusplus
-
-//
 // LLVMCreateDisasm() creates a disassembler for the TripleName.  Symbolic
 // disassembly is supported by passing a block of information in the DisInfo
-// parameter and specifing the TagType and call back functions as described in
+// parameter and specifying the TagType and callback functions as described in
 // the header llvm-c/Disassembler.h .  The pointer to the block and the 
-// functions can all be passed as NULL.  If successful this returns a
-// disassembler context if not it returns NULL.
+// functions can all be passed as NULL.  If successful, this returns a
+// disassembler context.  If not, it returns NULL.
 //
 LLVMDisasmContextRef LLVMCreateDisasm(const char *TripleName, void *DisInfo,
                                       int TagType, LLVMOpInfoCallback GetOpInfo,
@@ -108,7 +102,6 @@ namespace {
 // The memory object created by LLVMDisasmInstruction().
 //
 class DisasmMemoryObject : public MemoryObject {
-private:
   uint8_t *Bytes;
   uint64_t Size;
   uint64_t BasePC;
@@ -126,7 +119,7 @@ public:
     return 0;
   }
 };
-} // namespace
+} // end anonymous namespace
 
 //
 // LLVMDisasmInstruction() disassembles a single instruction using the
@@ -154,18 +147,15 @@ size_t LLVMDisasmInstruction(LLVMDisasmContextRef DCR, uint8_t *Bytes,
   if (!DisAsm->getInstruction(Inst, Size, MemoryObject, PC, /*REMOVE*/ nulls()))
     return 0;
 
-  std::string InsnStr;
-  raw_string_ostream OS(InsnStr);
+  SmallVector<char, 64> InsnStr;
+  raw_svector_ostream OS(InsnStr);
   IP->printInst(&Inst, OS);
   OS.flush();
 
+  assert(OutStringSize != 0 && "Output buffer cannot be zero size");
   size_t OutputSize = std::min(OutStringSize-1, InsnStr.size());
   std::memcpy(OutString, InsnStr.data(), OutputSize);
   OutString[OutputSize] = '\0'; // Terminate string.
 
   return Size;
 }
-
-#ifdef __cplusplus
-}
-#endif // __cplusplus
diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp
index f61f0c2..13cb81a 100644
--- a/lib/MC/MCDwarf.cpp
+++ b/lib/MC/MCDwarf.cpp
@@ -501,10 +501,13 @@ namespace {
     int CFAOffset;
     int CIENum;
     bool UsingCFI;
+    bool IsEH;
+    const MCSymbol *SectionStart;
 
   public:
-    FrameEmitterImpl(bool usingCFI) : CFAOffset(0), CIENum(0),
-                     UsingCFI(usingCFI) {
+    FrameEmitterImpl(bool usingCFI, bool isEH, const MCSymbol *sectionStart) :
+      CFAOffset(0), CIENum(0), UsingCFI(usingCFI), IsEH(isEH),
+      SectionStart(sectionStart) {
     }
 
     const MCSymbol &EmitCIE(MCStreamer &streamer,
@@ -514,8 +517,7 @@ namespace {
                             unsigned lsdaEncoding);
     MCSymbol *EmitFDE(MCStreamer &streamer,
                       const MCSymbol &cieStart,
-                      const MCDwarfFrameInfo &frame,
-                      bool forceLsda);
+                      const MCDwarfFrameInfo &frame);
     void EmitCFIInstructions(MCStreamer &streamer,
                              const std::vector<MCCFIInstruction> &Instrs,
                              MCSymbol *BaseLabel);
@@ -537,8 +539,6 @@ void FrameEmitterImpl::EmitCFIInstruction(MCStreamer &Streamer,
 
     // If advancing cfa.
     if (Dst.isReg() && Dst.getReg() == MachineLocation::VirtualFP) {
-      assert(!Src.isReg() && "Machine move not supported yet.");
-
       if (Src.getReg() == MachineLocation::VirtualFP) {
         Streamer.EmitIntValue(dwarf::DW_CFA_def_cfa_offset, 1);
       } else {
@@ -630,11 +630,9 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(MCStreamer &streamer,
                                           unsigned lsdaEncoding) {
   MCContext &context = streamer.getContext();
   const TargetAsmInfo &asmInfo = context.getTargetAsmInfo();
-  const MCSection &section = *asmInfo.getEHFrameSection();
-  streamer.SwitchSection(&section);
 
   MCSymbol *sectionStart;
-  if (asmInfo.isFunctionEHFrameSymbolPrivate())
+  if (asmInfo.isFunctionEHFrameSymbolPrivate() || !IsEH)
     sectionStart = context.CreateTempSymbol();
   else
     sectionStart = context.GetOrCreateSymbol(Twine("EH_frame") + Twine(CIENum));
@@ -650,20 +648,23 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(MCStreamer &streamer,
   streamer.EmitAbsValue(Length, 4);
 
   // CIE ID
-  streamer.EmitIntValue(0, 4);
+  unsigned CIE_ID = IsEH ? 0 : -1;
+  streamer.EmitIntValue(CIE_ID, 4);
 
   // Version
   streamer.EmitIntValue(dwarf::DW_CIE_VERSION, 1);
 
   // Augmentation String
   SmallString<8> Augmentation;
-  Augmentation += "z";
-  if (personality)
-    Augmentation += "P";
-  if (lsda)
-    Augmentation += "L";
-  Augmentation += "R";
-  streamer.EmitBytes(Augmentation.str(), 0);
+  if (IsEH) {
+    Augmentation += "z";
+    if (personality)
+      Augmentation += "P";
+    if (lsda)
+      Augmentation += "L";
+    Augmentation += "R";
+    streamer.EmitBytes(Augmentation.str(), 0);
+  }
   streamer.EmitIntValue(0, 1);
 
   // Code Alignment Factor
@@ -678,33 +679,32 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(MCStreamer &streamer,
   // Augmentation Data Length (optional)
 
   unsigned augmentationLength = 0;
-  if (personality) {
-    // Personality Encoding
-    augmentationLength += 1;
-    // Personality
-    augmentationLength += getSizeForEncoding(streamer, personalityEncoding);
-  }
-  if (lsda) {
+  if (IsEH) {
+    if (personality) {
+      // Personality Encoding
+      augmentationLength += 1;
+      // Personality
+      augmentationLength += getSizeForEncoding(streamer, personalityEncoding);
+    }
+    if (lsda)
+      augmentationLength += 1;
+    // Encoding of the FDE pointers
     augmentationLength += 1;
-  }
-  // Encoding of the FDE pointers
-  augmentationLength += 1;
 
-  streamer.EmitULEB128IntValue(augmentationLength);
+    streamer.EmitULEB128IntValue(augmentationLength);
 
-  // Augmentation Data (optional)
-  if (personality) {
-    // Personality Encoding
-    streamer.EmitIntValue(personalityEncoding, 1);
-    // Personality
-    EmitPersonality(streamer, *personality, personalityEncoding);
-  }
-  if (lsda) {
-    // LSDA Encoding
-    streamer.EmitIntValue(lsdaEncoding, 1);
+    // Augmentation Data (optional)
+    if (personality) {
+      // Personality Encoding
+      streamer.EmitIntValue(personalityEncoding, 1);
+      // Personality
+      EmitPersonality(streamer, *personality, personalityEncoding);
+    }
+    if (lsda)
+      streamer.EmitIntValue(lsdaEncoding, 1); // LSDA Encoding
+    // Encoding of the FDE pointers
+    streamer.EmitIntValue(asmInfo.getFDEEncoding(UsingCFI), 1);
   }
-  // Encoding of the FDE pointers
-  streamer.EmitIntValue(asmInfo.getFDEEncoding(UsingCFI), 1);
 
   // Initial Instructions
 
@@ -724,7 +724,7 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(MCStreamer &streamer,
   EmitCFIInstructions(streamer, Instructions, NULL);
 
   // Padding
-  streamer.EmitValueToAlignment(4);
+  streamer.EmitValueToAlignment(IsEH ? 4 : asmInfo.getPointerSize());
 
   streamer.EmitLabel(sectionEnd);
   return *sectionStart;
@@ -732,16 +732,15 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(MCStreamer &streamer,
 
 MCSymbol *FrameEmitterImpl::EmitFDE(MCStreamer &streamer,
                                     const MCSymbol &cieStart,
-                                    const MCDwarfFrameInfo &frame,
-                                    bool forceLsda) {
+                                    const MCDwarfFrameInfo &frame) {
   MCContext &context = streamer.getContext();
   MCSymbol *fdeStart = context.CreateTempSymbol();
   MCSymbol *fdeEnd = context.CreateTempSymbol();
-  const TargetAsmInfo &asmInfo = context.getTargetAsmInfo();
+  const TargetAsmInfo &TAsmInfo = context.getTargetAsmInfo();
 
-  if (!asmInfo.isFunctionEHFrameSymbolPrivate()) {
-    Twine EHName = frame.Function->getName() + Twine(".eh");
-    MCSymbol *EHSym = context.GetOrCreateSymbol(EHName);
+  if (!TAsmInfo.isFunctionEHFrameSymbolPrivate() && IsEH) {
+    MCSymbol *EHSym = context.GetOrCreateSymbol(
+      frame.Function->getName() + Twine(".eh"));
     streamer.EmitEHSymAttributes(frame.Function, EHSym);
     streamer.EmitLabel(EHSym);
   }
@@ -751,45 +750,54 @@ MCSymbol *FrameEmitterImpl::EmitFDE(MCStreamer &streamer,
   streamer.EmitAbsValue(Length, 4);
 
   streamer.EmitLabel(fdeStart);
+
   // CIE Pointer
-  const MCExpr *offset = MakeStartMinusEndExpr(streamer, cieStart, *fdeStart,
-                                               0);
-  streamer.EmitAbsValue(offset, 4);
-  unsigned fdeEncoding = asmInfo.getFDEEncoding(UsingCFI);
+  const MCAsmInfo &asmInfo = context.getAsmInfo();
+  if (IsEH) {
+    const MCExpr *offset = MakeStartMinusEndExpr(streamer, cieStart, *fdeStart,
+                                                 0);
+    streamer.EmitAbsValue(offset, 4);
+  } else if (!asmInfo.doesDwarfRequireRelocationForSectionOffset()) {
+    const MCExpr *offset = MakeStartMinusEndExpr(streamer, *SectionStart,
+                                                 cieStart, 0);
+    streamer.EmitAbsValue(offset, 4);
+  } else {
+    streamer.EmitSymbolValue(&cieStart, 4);
+  }
+  unsigned fdeEncoding = TAsmInfo.getFDEEncoding(UsingCFI);
   unsigned size = getSizeForEncoding(streamer, fdeEncoding);
 
   // PC Begin
-  EmitSymbol(streamer, *frame.Begin, fdeEncoding);
+  unsigned PCBeginEncoding = IsEH ? fdeEncoding :
+    (unsigned)dwarf::DW_EH_PE_absptr;
+  unsigned PCBeginSize = getSizeForEncoding(streamer, PCBeginEncoding);
+  EmitSymbol(streamer, *frame.Begin, PCBeginEncoding);
 
   // PC Range
   const MCExpr *Range = MakeStartMinusEndExpr(streamer, *frame.Begin,
                                               *frame.End, 0);
   streamer.EmitAbsValue(Range, size);
 
-  // Augmentation Data Length
-  unsigned augmentationLength = 0;
-
-  if (frame.Lsda || forceLsda)
-    augmentationLength += getSizeForEncoding(streamer, frame.LsdaEncoding);
+  if (IsEH) {
+    // Augmentation Data Length
+    unsigned augmentationLength = 0;
 
-  streamer.EmitULEB128IntValue(augmentationLength);
+    if (frame.Lsda)
+      augmentationLength += getSizeForEncoding(streamer, frame.LsdaEncoding);
 
-  // Augmentation Data
+    streamer.EmitULEB128IntValue(augmentationLength);
 
-  // When running in "CodeGen compatibility mode" a FDE with no LSDA can be
-  // assigned to a CIE that requires one. In that case we output a 0 (as does
-  // CodeGen).
-  if (frame.Lsda)
-    EmitSymbol(streamer, *frame.Lsda, frame.LsdaEncoding);
-  else if (forceLsda)
-    streamer.EmitIntValue(0, getSizeForEncoding(streamer, frame.LsdaEncoding));
+    // Augmentation Data
+    if (frame.Lsda)
+      EmitSymbol(streamer, *frame.Lsda, frame.LsdaEncoding);
+  }
 
   // Call Frame Instructions
 
   EmitCFIInstructions(streamer, frame.Instructions, frame.Begin);
 
   // Padding
-  streamer.EmitValueToAlignment(size);
+  streamer.EmitValueToAlignment(PCBeginSize);
 
   return fdeEnd;
 }
@@ -835,89 +843,32 @@ namespace llvm {
   };
 }
 
-// This is an implementation of CIE and FDE emission that is bug by bug
-// compatible with the one in CodeGen. It is useful during the transition
-// to make it easy to compare the outputs, but should probably be removed
-// afterwards.
-void MCDwarfFrameEmitter::EmitDarwin(MCStreamer &streamer,
-                                     bool usingCFI) {
-  FrameEmitterImpl Emitter(usingCFI);
-  DenseMap<const MCSymbol*, const MCSymbol*> Personalities;
-  const MCSymbol *aCIE = NULL;
-  const MCDwarfFrameInfo *aFrame = NULL;
-
-  for (unsigned i = 0, n = streamer.getNumFrameInfos(); i < n; ++i) {
-    const MCDwarfFrameInfo &frame = streamer.getFrameInfo(i);
-    if (!frame.Personality)
-      continue;
-    if (Personalities.count(frame.Personality))
-      continue;
-
-    const MCSymbol *cieStart = &Emitter.EmitCIE(streamer, frame.Personality,
-                                                frame.PersonalityEncoding,
-                                                frame.Lsda,
-                                                frame.LsdaEncoding);
-    aCIE = cieStart;
-    aFrame = &frame;
-    Personalities[frame.Personality] = cieStart;
-  }
-
-  if (Personalities.empty()) {
-    const MCDwarfFrameInfo &frame = streamer.getFrameInfo(0);
-    aCIE = &Emitter.EmitCIE(streamer, frame.Personality,
-                            frame.PersonalityEncoding, frame.Lsda,
-                            frame.LsdaEncoding);
-    aFrame = &frame;
-  }
-
-  MCSymbol *fdeEnd = NULL;
-  for (unsigned i = 0, n = streamer.getNumFrameInfos(); i < n; ++i) {
-    const MCDwarfFrameInfo &frame = streamer.getFrameInfo(i);
-    const MCSymbol *cieStart = Personalities[frame.Personality];
-    bool hasLSDA;
-    if (!cieStart) {
-      cieStart = aCIE;
-      hasLSDA = aFrame->Lsda;
-    } else {
-      hasLSDA = true;
-    }
-
-    fdeEnd = Emitter.EmitFDE(streamer, *cieStart, frame,
-                             hasLSDA);
-    if (i != n - 1)
-      streamer.EmitLabel(fdeEnd);
-  }
-
-  const MCContext &context = streamer.getContext();
-  const TargetAsmInfo &asmInfo = context.getTargetAsmInfo();
-  streamer.EmitValueToAlignment(asmInfo.getPointerSize());
-  if (fdeEnd)
-    streamer.EmitLabel(fdeEnd);
-}
-
 void MCDwarfFrameEmitter::Emit(MCStreamer &streamer,
-                               bool usingCFI) {
-  const MCContext &context = streamer.getContext();
+                               bool usingCFI,
+                               bool isEH) {
+  MCContext &context = streamer.getContext();
   const TargetAsmInfo &asmInfo = context.getTargetAsmInfo();
-  if (!asmInfo.isFunctionEHFrameSymbolPrivate()) {
-    EmitDarwin(streamer, usingCFI);
-    return;
-  }
+  const MCSection &section = isEH ?
+    *asmInfo.getEHFrameSection() : *asmInfo.getDwarfFrameSection();
+  streamer.SwitchSection(&section);
+  MCSymbol *SectionStart = context.CreateTempSymbol();
+  streamer.EmitLabel(SectionStart);
 
   MCSymbol *fdeEnd = NULL;
   DenseMap<CIEKey, const MCSymbol*> CIEStarts;
-  FrameEmitterImpl Emitter(usingCFI);
+  FrameEmitterImpl Emitter(usingCFI, isEH, SectionStart);
 
+  const MCSymbol *DummyDebugKey = NULL;
   for (unsigned i = 0, n = streamer.getNumFrameInfos(); i < n; ++i) {
     const MCDwarfFrameInfo &frame = streamer.getFrameInfo(i);
     CIEKey key(frame.Personality, frame.PersonalityEncoding,
                frame.LsdaEncoding);
-    const MCSymbol *&cieStart = CIEStarts[key];
+    const MCSymbol *&cieStart = isEH ? CIEStarts[key] : DummyDebugKey;
     if (!cieStart)
       cieStart = &Emitter.EmitCIE(streamer, frame.Personality,
                                   frame.PersonalityEncoding, frame.Lsda,
                                   frame.LsdaEncoding);
-    fdeEnd = Emitter.EmitFDE(streamer, *cieStart, frame, false);
+    fdeEnd = Emitter.EmitFDE(streamer, *cieStart, frame);
     if (i != n - 1)
       streamer.EmitLabel(fdeEnd);
   }
@@ -931,28 +882,21 @@ void MCDwarfFrameEmitter::EmitAdvanceLoc(MCStreamer &Streamer,
                                          uint64_t AddrDelta) {
   SmallString<256> Tmp;
   raw_svector_ostream OS(Tmp);
-  const TargetAsmInfo &AsmInfo = Streamer.getContext().getTargetAsmInfo();
-  MCDwarfFrameEmitter::EncodeAdvanceLoc(AddrDelta, OS, AsmInfo);
+  MCDwarfFrameEmitter::EncodeAdvanceLoc(AddrDelta, OS);
   Streamer.EmitBytes(OS.str(), /*AddrSpace=*/0);
 }
 
 void MCDwarfFrameEmitter::EncodeAdvanceLoc(uint64_t AddrDelta,
-                                           raw_ostream &OS,
-                                           const TargetAsmInfo &AsmInfo) {
-  // This is a small hack to facilitate the transition to CFI on OS X. It
-  // relaxes all address advances which lets us produces identical output
-  // to the one produce by CodeGen.
-  const bool Relax = !AsmInfo.isFunctionEHFrameSymbolPrivate();
-
+                                           raw_ostream &OS) {
   // FIXME: Assumes the code alignment factor is 1.
   if (AddrDelta == 0) {
-  } else if (isUIntN(6, AddrDelta) && !Relax) {
+  } else if (isUIntN(6, AddrDelta)) {
     uint8_t Opcode = dwarf::DW_CFA_advance_loc | AddrDelta;
     OS << Opcode;
-  } else if (isUInt<8>(AddrDelta) && !Relax) {
+  } else if (isUInt<8>(AddrDelta)) {
     OS << uint8_t(dwarf::DW_CFA_advance_loc1);
     OS << uint8_t(AddrDelta);
-  } else if (isUInt<16>(AddrDelta) && !Relax) {
+  } else if (isUInt<16>(AddrDelta)) {
     // FIXME: check what is the correct behavior on a big endian machine.
     OS << uint8_t(dwarf::DW_CFA_advance_loc2);
     OS << uint8_t( AddrDelta       & 0xff);
diff --git a/lib/MC/MCELF.cpp b/lib/MC/MCELF.cpp
index ce7783e..2c3f8e8 100644
--- a/lib/MC/MCELF.cpp
+++ b/lib/MC/MCELF.cpp
@@ -57,13 +57,13 @@ void MCELF::SetVisibility(MCSymbolData &SD, unsigned Visibility) {
   assert(Visibility == ELF::STV_DEFAULT || Visibility == ELF::STV_INTERNAL ||
          Visibility == ELF::STV_HIDDEN || Visibility == ELF::STV_PROTECTED);
 
-  uint32_t OtherFlags = SD.getFlags() & ~(0xf << ELF_STV_Shift);
+  uint32_t OtherFlags = SD.getFlags() & ~(0x3 << ELF_STV_Shift);
   SD.setFlags(OtherFlags | (Visibility << ELF_STV_Shift));
 }
 
 unsigned MCELF::GetVisibility(MCSymbolData &SD) {
   unsigned Visibility =
-    (SD.getFlags() & (0xf << ELF_STV_Shift)) >> ELF_STV_Shift;
+    (SD.getFlags() & (0x3 << ELF_STV_Shift)) >> ELF_STV_Shift;
   assert(Visibility == ELF::STV_DEFAULT || Visibility == ELF::STV_INTERNAL ||
          Visibility == ELF::STV_HIDDEN || Visibility == ELF::STV_PROTECTED);
   return Visibility;
diff --git a/lib/MC/MCELFStreamer.cpp b/lib/MC/MCELFStreamer.cpp
index be8e2e3..bbb2789 100644
--- a/lib/MC/MCELFStreamer.cpp
+++ b/lib/MC/MCELFStreamer.cpp
@@ -66,6 +66,11 @@ void MCELFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
 
 void MCELFStreamer::EmitThumbFunc(MCSymbol *Func) {
   // FIXME: Anything needed here to flag the function as thumb?
+
+  getAssembler().setIsThumbFunc(Func);
+
+  MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Func);
+  SD.setFlags(SD.getFlags() | ELF_Other_ThumbFunc);
 }
 
 void MCELFStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
@@ -345,8 +350,7 @@ void MCELFStreamer::EmitInstToData(const MCInst &Inst) {
 }
 
 void MCELFStreamer::Finish() {
-  if (getNumFrameInfos())
-    MCDwarfFrameEmitter::Emit(*this, true);
+  EmitFrames(true);
 
   for (std::vector<LocalCommon>::const_iterator i = LocalCommons.begin(),
                                                 e = LocalCommons.end();
diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp
index 3a674d7..fcf1aab 100644
--- a/lib/MC/MCExpr.cpp
+++ b/lib/MC/MCExpr.cpp
@@ -42,8 +42,8 @@ void MCExpr::print(raw_ostream &OS) const {
     // absolute names.
     bool UseParens = Sym.getName()[0] == '$';
 
-    if (SRE.getKind() == MCSymbolRefExpr::VK_PPC_HA16 ||
-        SRE.getKind() == MCSymbolRefExpr::VK_PPC_LO16) {
+    if (SRE.getKind() == MCSymbolRefExpr::VK_PPC_DARWIN_HA16 ||
+        SRE.getKind() == MCSymbolRefExpr::VK_PPC_DARWIN_LO16) {
       OS << MCSymbolRefExpr::getVariantKindName(SRE.getKind());
       UseParens = true;
     }
@@ -61,8 +61,8 @@ void MCExpr::print(raw_ostream &OS) const {
         SRE.getKind() == MCSymbolRefExpr::VK_ARM_GOTTPOFF)
       OS << MCSymbolRefExpr::getVariantKindName(SRE.getKind());
     else if (SRE.getKind() != MCSymbolRefExpr::VK_None &&
-             SRE.getKind() != MCSymbolRefExpr::VK_PPC_HA16 &&
-             SRE.getKind() != MCSymbolRefExpr::VK_PPC_LO16)
+             SRE.getKind() != MCSymbolRefExpr::VK_PPC_DARWIN_HA16 &&
+             SRE.getKind() != MCSymbolRefExpr::VK_PPC_DARWIN_LO16)
       OS << '@' << MCSymbolRefExpr::getVariantKindName(SRE.getKind());
 
     return;
@@ -197,8 +197,10 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
   case VK_ARM_GOTTPOFF: return "(gottpoff)";
   case VK_ARM_TLSGD: return "(tlsgd)";
   case VK_PPC_TOC: return "toc";
-  case VK_PPC_HA16: return "ha16";
-  case VK_PPC_LO16: return "lo16";
+  case VK_PPC_DARWIN_HA16: return "ha16";
+  case VK_PPC_DARWIN_LO16: return "lo16";
+  case VK_PPC_GAS_HA16: return "ha";
+  case VK_PPC_GAS_LO16: return "l";
   }
 }
 
diff --git a/lib/MC/MCInstPrinter.cpp b/lib/MC/MCInstPrinter.cpp
index 212b85e..81a939f 100644
--- a/lib/MC/MCInstPrinter.cpp
+++ b/lib/MC/MCInstPrinter.cpp
@@ -20,7 +20,6 @@ StringRef MCInstPrinter::getOpcodeName(unsigned Opcode) const {
   return "";
 }
 
-StringRef MCInstPrinter::getRegName(unsigned RegNo) const {
+void MCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
   assert(0 && "Target should implement this");
-  return "";
 }
diff --git a/lib/MC/MCMachOStreamer.cpp b/lib/MC/MCMachOStreamer.cpp
index 3da5b49..12aeb4f 100644
--- a/lib/MC/MCMachOStreamer.cpp
+++ b/lib/MC/MCMachOStreamer.cpp
@@ -377,8 +377,7 @@ void MCMachOStreamer::EmitInstToData(const MCInst &Inst) {
 }
 
 void MCMachOStreamer::Finish() {
-  if (getNumFrameInfos())
-    MCDwarfFrameEmitter::Emit(*this, true);
+  EmitFrames(true);
 
   // We have to set the fragment atom associations so we can relax properly for
   // Mach-O.
diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp
index 0f349d0..e230c53 100644
--- a/lib/MC/MCObjectStreamer.cpp
+++ b/lib/MC/MCObjectStreamer.cpp
@@ -127,7 +127,7 @@ void MCObjectStreamer::EmitULEB128Value(const MCExpr *Value) {
     EmitULEB128IntValue(IntValue);
     return;
   }
-  Value = ForceExpAbs(this, getContext(), Value);
+  Value = ForceExpAbs(Value);
   new MCLEBFragment(*Value, false, getCurrentSectionData());
 }
 
@@ -137,7 +137,7 @@ void MCObjectStreamer::EmitSLEB128Value(const MCExpr *Value) {
     EmitSLEB128IntValue(IntValue);
     return;
   }
-  Value = ForceExpAbs(this, getContext(), Value);
+  Value = ForceExpAbs(Value);
   new MCLEBFragment(*Value, true, getCurrentSectionData());
 }
 
@@ -209,7 +209,7 @@ void MCObjectStreamer::EmitDwarfAdvanceLineAddr(int64_t LineDelta,
     MCDwarfLineAddr::Emit(this, LineDelta, Res);
     return;
   }
-  AddrDelta = ForceExpAbs(this, getContext(), AddrDelta);
+  AddrDelta = ForceExpAbs(AddrDelta);
   new MCDwarfLineAddrFragment(LineDelta, *AddrDelta, getCurrentSectionData());
 }
 
@@ -221,7 +221,7 @@ void MCObjectStreamer::EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
     MCDwarfFrameEmitter::EmitAdvanceLoc(*this, Res);
     return;
   }
-  AddrDelta = ForceExpAbs(this, getContext(), AddrDelta);
+  AddrDelta = ForceExpAbs(AddrDelta);
   new MCDwarfCallFrameFragment(*AddrDelta, getCurrentSectionData());
 }
 
diff --git a/lib/MC/MCParser/AsmLexer.cpp b/lib/MC/MCParser/AsmLexer.cpp
index a3d3a49..0c1f8f0 100644
--- a/lib/MC/MCParser/AsmLexer.cpp
+++ b/lib/MC/MCParser/AsmLexer.cpp
@@ -388,6 +388,7 @@ AsmToken AsmLexer::LexToken() {
   case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
   case '$': return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
   case '@': return AsmToken(AsmToken::At, StringRef(TokStart, 1));
+  case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1));
   case '=':
     if (*CurPtr == '=')
       return ++CurPtr, AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp
index d8fd27d..4f55cea 100644
--- a/lib/MC/MCParser/AsmParser.cpp
+++ b/lib/MC/MCParser/AsmParser.cpp
@@ -27,6 +27,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCDwarf.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/raw_ostream.h"
@@ -36,15 +37,21 @@
 #include <vector>
 using namespace llvm;
 
+static cl::opt<bool>
+FatalAssemblerWarnings("fatal-assembler-warnings",
+                       cl::desc("Consider warnings as error"));
+
 namespace {
 
 /// \brief Helper class for tracking macro definitions.
 struct Macro {
   StringRef Name;
   StringRef Body;
+  std::vector<StringRef> Parameters;
 
 public:
-  Macro(StringRef N, StringRef B) : Name(N), Body(B) {}
+  Macro(StringRef N, StringRef B, const std::vector<StringRef> &P) :
+    Name(N), Body(B), Parameters(P) {}
 };
 
 /// \brief Helper class for storing information about an active macro
@@ -64,7 +71,7 @@ struct MacroInstantiation {
 
 public:
   MacroInstantiation(const Macro *M, SMLoc IL, SMLoc EL,
-                     const std::vector<std::vector<AsmToken> > &A);
+                     MemoryBuffer *I);
 };
 
 /// \brief The concrete assembly parser instance.
@@ -128,7 +135,7 @@ public:
   virtual MCContext &getContext() { return Ctx; }
   virtual MCStreamer &getStreamer() { return Out; }
 
-  virtual void Warning(SMLoc L, const Twine &Meg);
+  virtual bool Warning(SMLoc L, const Twine &Meg);
   virtual bool Error(SMLoc L, const Twine &Msg);
 
   const AsmToken &Lex();
@@ -146,6 +153,10 @@ private:
   bool ParseStatement();
 
   bool HandleMacroEntry(StringRef Name, SMLoc NameLoc, const Macro *M);
+  bool expandMacro(SmallString<256> &Buf, StringRef Body,
+                   const std::vector<StringRef> &Parameters,
+                   const std::vector<std::vector<AsmToken> > &A,
+                   const SMLoc &L);
   void HandleMacroExit();
 
   void PrintMacroInstantiations();
@@ -243,6 +254,8 @@ public:
     AddDirectiveHandler<&GenericAsmParser::ParseDirectiveStabs>(".stabs");
 
     // CFI directives.
+    AddDirectiveHandler<&GenericAsmParser::ParseDirectiveCFISections>(
+                                                               ".cfi_sections");
     AddDirectiveHandler<&GenericAsmParser::ParseDirectiveCFIStartProc>(
                                                               ".cfi_startproc");
     AddDirectiveHandler<&GenericAsmParser::ParseDirectiveCFIEndProc>(
@@ -289,6 +302,7 @@ public:
   bool ParseDirectiveLine(StringRef, SMLoc DirectiveLoc);
   bool ParseDirectiveLoc(StringRef, SMLoc DirectiveLoc);
   bool ParseDirectiveStabs(StringRef, SMLoc DirectiveLoc);
+  bool ParseDirectiveCFISections(StringRef, SMLoc DirectiveLoc);
   bool ParseDirectiveCFIStartProc(StringRef, SMLoc DirectiveLoc);
   bool ParseDirectiveCFIEndProc(StringRef, SMLoc DirectiveLoc);
   bool ParseDirectiveCFIDefCfa(StringRef, SMLoc DirectiveLoc);
@@ -367,9 +381,12 @@ void AsmParser::PrintMacroInstantiations() {
                  "note");
 }
 
-void AsmParser::Warning(SMLoc L, const Twine &Msg) {
+bool AsmParser::Warning(SMLoc L, const Twine &Msg) {
+  if (FatalAssemblerWarnings)
+    return Error(L, Msg);
   PrintMessage(L, Msg, "warning");
   PrintMacroInstantiations();
+  return false;
 }
 
 bool AsmParser::Error(SMLoc L, const Twine &Msg) {
@@ -380,7 +397,8 @@ bool AsmParser::Error(SMLoc L, const Twine &Msg) {
 }
 
 bool AsmParser::EnterIncludeFile(const std::string &Filename) {
-  int NewBuf = SrcMgr.AddIncludeFile(Filename, Lexer.getLoc());
+  std::string IncludedFile;
+  int NewBuf = SrcMgr.AddIncludeFile(Filename, Lexer.getLoc(), IncludedFile);
   if (NewBuf == -1)
     return true;
 
@@ -542,7 +560,7 @@ bool AsmParser::ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
 
     StringRef Identifier;
     if (ParseIdentifier(Identifier))
-      return false;
+      return true;
 
     // This is a symbol reference.
     std::pair<StringRef, StringRef> Split = Identifier.split('@');
@@ -1126,9 +1144,9 @@ bool AsmParser::ParseStatement() {
     if (!getTargetParser().ParseDirective(ID))
       return false;
 
-    Warning(IDLoc, "ignoring directive for now");
+    bool retval = Warning(IDLoc, "ignoring directive for now");
     EatToEndOfStatement();
-    return false;
+    return retval;
   }
 
   CheckForValidSection();
@@ -1171,27 +1189,33 @@ bool AsmParser::ParseStatement() {
   return false;
 }
 
-MacroInstantiation::MacroInstantiation(const Macro *M, SMLoc IL, SMLoc EL,
-                                   const std::vector<std::vector<AsmToken> > &A)
-  : TheMacro(M), InstantiationLoc(IL), ExitLoc(EL)
-{
-  // Macro instantiation is lexical, unfortunately. We construct a new buffer
-  // to hold the macro body with substitutions.
-  SmallString<256> Buf;
+bool AsmParser::expandMacro(SmallString<256> &Buf, StringRef Body,
+                            const std::vector<StringRef> &Parameters,
+                            const std::vector<std::vector<AsmToken> > &A,
+                            const SMLoc &L) {
   raw_svector_ostream OS(Buf);
+  unsigned NParameters = Parameters.size();
+  if (NParameters != 0 && NParameters != A.size())
+    return Error(L, "Wrong number of arguments");
 
-  StringRef Body = M->Body;
   while (!Body.empty()) {
     // Scan for the next substitution.
     std::size_t End = Body.size(), Pos = 0;
     for (; Pos != End; ++Pos) {
       // Check for a substitution or escape.
-      if (Body[Pos] != '$' || Pos + 1 == End)
-        continue;
-
-      char Next = Body[Pos + 1];
-      if (Next == '$' || Next == 'n' || isdigit(Next))
-        break;
+      if (!NParameters) {
+        // This macro has no parameters, look for $0, $1, etc.
+        if (Body[Pos] != '$' || Pos + 1 == End)
+          continue;
+
+        char Next = Body[Pos + 1];
+        if (Next == '$' || Next == 'n' || isdigit(Next))
+          break;
+      } else {
+        // This macro has parameters, look for \foo, \bar, etc.
+        if (Body[Pos] == '\\' && Pos + 1 != End)
+          break;
+      }
     }
 
     // Add the prefix.
@@ -1201,41 +1225,69 @@ MacroInstantiation::MacroInstantiation(const Macro *M, SMLoc IL, SMLoc EL,
     if (Pos == End)
       break;
 
-    switch (Body[Pos+1]) {
-       // $$ => $
-    case '$':
-      OS << '$';
-      break;
+    if (!NParameters) {
+      switch (Body[Pos+1]) {
+        // $$ => $
+      case '$':
+        OS << '$';
+        break;
 
-      // $n => number of arguments
-    case 'n':
-      OS << A.size();
-      break;
+        // $n => number of arguments
+      case 'n':
+        OS << A.size();
+        break;
 
-       // $[0-9] => argument
-    default: {
-      // Missing arguments are ignored.
-      unsigned Index = Body[Pos+1] - '0';
-      if (Index >= A.size())
+        // $[0-9] => argument
+      default: {
+        // Missing arguments are ignored.
+        unsigned Index = Body[Pos+1] - '0';
+        if (Index >= A.size())
+          break;
+
+        // Otherwise substitute with the token values, with spaces eliminated.
+        for (std::vector<AsmToken>::const_iterator it = A[Index].begin(),
+               ie = A[Index].end(); it != ie; ++it)
+          OS << it->getString();
         break;
+      }
+      }
+      Pos += 2;
+    } else {
+      unsigned I = Pos + 1;
+      while (isalnum(Body[I]) && I + 1 != End)
+        ++I;
+
+      const char *Begin = Body.data() + Pos +1;
+      StringRef Argument(Begin, I - (Pos +1));
+      unsigned Index = 0;
+      for (; Index < NParameters; ++Index)
+        if (Parameters[Index] == Argument)
+          break;
+
+      // FIXME: We should error at the macro definition.
+      if (Index == NParameters)
+        return Error(L, "Parameter not found");
 
-      // Otherwise substitute with the token values, with spaces eliminated.
       for (std::vector<AsmToken>::const_iterator it = A[Index].begin(),
              ie = A[Index].end(); it != ie; ++it)
         OS << it->getString();
-      break;
-    }
-    }
 
+      Pos += 1 + Argument.size();
+    }
     // Update the scan point.
-    Body = Body.substr(Pos + 2);
+    Body = Body.substr(Pos);
   }
 
   // We include the .endmacro in the buffer as our queue to exit the macro
   // instantiation.
   OS << ".endmacro\n";
+  return false;
+}
 
-  Instantiation = MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>");
+MacroInstantiation::MacroInstantiation(const Macro *M, SMLoc IL, SMLoc EL,
+                                       MemoryBuffer *I)
+  : TheMacro(M), Instantiation(I), InstantiationLoc(IL), ExitLoc(EL)
+{
 }
 
 bool AsmParser::HandleMacroEntry(StringRef Name, SMLoc NameLoc,
@@ -1272,11 +1324,22 @@ bool AsmParser::HandleMacroEntry(StringRef Name, SMLoc NameLoc,
     Lex();
   }
 
+  // Macro instantiation is lexical, unfortunately. We construct a new buffer
+  // to hold the macro body with substitutions.
+  SmallString<256> Buf;
+  StringRef Body = M->Body;
+
+  if (expandMacro(Buf, Body, M->Parameters, MacroArguments, getTok().getLoc()))
+    return true;
+
+  MemoryBuffer *Instantiation =
+    MemoryBuffer::getMemBufferCopy(Buf.str(), "<instantiation>");
+
   // Create the macro instantiation object and add to the current macro
   // instantiation stack.
   MacroInstantiation *MI = new MacroInstantiation(M, NameLoc,
                                                   getTok().getLoc(),
-                                                  MacroArguments);
+                                                  Instantiation);
   ActiveMacros.push_back(MI);
 
   // Jump to the macro instantiation and prime the lexer.
@@ -2265,6 +2328,39 @@ bool GenericAsmParser::ParseDirectiveStabs(StringRef Directive,
   return TokError("unsupported directive '" + Directive + "'");
 }
 
+/// ParseDirectiveCFISections
+/// ::= .cfi_sections section [, section]
+bool GenericAsmParser::ParseDirectiveCFISections(StringRef,
+                                                 SMLoc DirectiveLoc) {
+  StringRef Name;
+  bool EH = false;
+  bool Debug = false;
+
+  if (getParser().ParseIdentifier(Name))
+    return TokError("Expected an identifier");
+
+  if (Name == ".eh_frame")
+    EH = true;
+  else if (Name == ".debug_frame")
+    Debug = true;
+
+  if (getLexer().is(AsmToken::Comma)) {
+    Lex();
+
+    if (getParser().ParseIdentifier(Name))
+      return TokError("Expected an identifier");
+
+    if (Name == ".eh_frame")
+      EH = true;
+    else if (Name == ".debug_frame")
+      Debug = true;
+  }
+
+  getStreamer().EmitCFISections(EH, Debug);
+
+  return false;
+}
+
 /// ParseDirectiveCFIStartProc
 /// ::= .cfi_startproc
 bool GenericAsmParser::ParseDirectiveCFIStartProc(StringRef,
@@ -2285,7 +2381,7 @@ bool GenericAsmParser::ParseRegisterOrRegisterNumber(int64_t &Register,
                                                      SMLoc DirectiveLoc) {
   unsigned RegNo;
 
-  if (getLexer().is(AsmToken::Percent)) {
+  if (getLexer().isNot(AsmToken::Integer)) {
     if (getParser().getTargetParser().ParseRegister(RegNo, DirectiveLoc,
       DirectiveLoc))
       return true;
@@ -2493,13 +2589,27 @@ bool GenericAsmParser::ParseDirectiveMacrosOnOff(StringRef Directive,
 }
 
 /// ParseDirectiveMacro
-/// ::= .macro name
+/// ::= .macro name [parameters]
 bool GenericAsmParser::ParseDirectiveMacro(StringRef Directive,
                                            SMLoc DirectiveLoc) {
   StringRef Name;
   if (getParser().ParseIdentifier(Name))
     return TokError("expected identifier in directive");
 
+  std::vector<StringRef> Parameters;
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    for(;;) {
+      StringRef Parameter;
+      if (getParser().ParseIdentifier(Parameter))
+        return TokError("expected identifier in directive");
+      Parameters.push_back(Parameter);
+
+      if (getLexer().isNot(AsmToken::Comma))
+        break;
+      Lex();
+    }
+  }
+
   if (getLexer().isNot(AsmToken::EndOfStatement))
     return TokError("unexpected token in '.macro' directive");
 
@@ -2537,7 +2647,7 @@ bool GenericAsmParser::ParseDirectiveMacro(StringRef Directive,
   const char *BodyStart = StartToken.getLoc().getPointer();
   const char *BodyEnd = EndToken.getLoc().getPointer();
   StringRef Body = StringRef(BodyStart, BodyEnd - BodyStart);
-  getParser().MacroMap[Name] = new Macro(Name, Body);
+  getParser().MacroMap[Name] = new Macro(Name, Body, Parameters);
   return false;
 }
 
diff --git a/lib/MC/MCParser/COFFAsmParser.cpp b/lib/MC/MCParser/COFFAsmParser.cpp
index 5ecab03..64f6355 100644
--- a/lib/MC/MCParser/COFFAsmParser.cpp
+++ b/lib/MC/MCParser/COFFAsmParser.cpp
@@ -14,6 +14,9 @@
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCSectionCOFF.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetAsmParser.h"
 #include "llvm/Support/COFF.h"
 using namespace llvm;
 
@@ -41,6 +44,34 @@ class COFFAsmParser : public MCAsmParserExtension {
     AddDirectiveHandler<&COFFAsmParser::ParseDirectiveScl>(".scl");
     AddDirectiveHandler<&COFFAsmParser::ParseDirectiveType>(".type");
     AddDirectiveHandler<&COFFAsmParser::ParseDirectiveEndef>(".endef");
+
+    // Win64 EH directives.
+    AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveStartProc>(
+                                                                   ".seh_proc");
+    AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveEndProc>(
+                                                                ".seh_endproc");
+    AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveStartChained>(
+                                                           ".seh_startchained");
+    AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveEndChained>(
+                                                             ".seh_endchained");
+    AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveHandler>(
+                                                                ".seh_handler");
+    AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveHandlerData>(
+                                                            ".seh_handlerdata");
+    AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectivePushReg>(
+                                                                ".seh_pushreg");
+    AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveSetFrame>(
+                                                               ".seh_setframe");
+    AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveAllocStack>(
+                                                             ".seh_stackalloc");
+    AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveSaveReg>(
+                                                                ".seh_savereg");
+    AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveSaveXMM>(
+                                                                ".seh_savexmm");
+    AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectivePushFrame>(
+                                                              ".seh_pushframe");
+    AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveEndProlog>(
+                                                            ".seh_endprologue");
   }
 
   bool ParseSectionDirectiveText(StringRef, SMLoc) {
@@ -70,6 +101,23 @@ class COFFAsmParser : public MCAsmParserExtension {
   bool ParseDirectiveType(StringRef, SMLoc);
   bool ParseDirectiveEndef(StringRef, SMLoc);
 
+  // Win64 EH directives.
+  bool ParseSEHDirectiveStartProc(StringRef, SMLoc);
+  bool ParseSEHDirectiveEndProc(StringRef, SMLoc);
+  bool ParseSEHDirectiveStartChained(StringRef, SMLoc);
+  bool ParseSEHDirectiveEndChained(StringRef, SMLoc);
+  bool ParseSEHDirectiveHandler(StringRef, SMLoc);
+  bool ParseSEHDirectiveHandlerData(StringRef, SMLoc);
+  bool ParseSEHDirectivePushReg(StringRef, SMLoc);
+  bool ParseSEHDirectiveSetFrame(StringRef, SMLoc);
+  bool ParseSEHDirectiveAllocStack(StringRef, SMLoc);
+  bool ParseSEHDirectiveSaveReg(StringRef, SMLoc);
+  bool ParseSEHDirectiveSaveXMM(StringRef, SMLoc);
+  bool ParseSEHDirectivePushFrame(StringRef, SMLoc);
+  bool ParseSEHDirectiveEndProlog(StringRef, SMLoc);
+
+  bool ParseAtUnwindOrAtExcept(bool &unwind, bool &except);
+  bool ParseSEHRegisterNumber(unsigned &RegNo);
 public:
   COFFAsmParser() {}
 };
@@ -135,6 +183,256 @@ bool COFFAsmParser::ParseDirectiveEndef(StringRef, SMLoc) {
   return false;
 }
 
+bool COFFAsmParser::ParseSEHDirectiveStartProc(StringRef, SMLoc) {
+  StringRef SymbolID;
+  if (getParser().ParseIdentifier(SymbolID))
+    return true;
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  MCSymbol *Symbol = getContext().GetOrCreateSymbol(SymbolID);
+
+  Lex();
+  getStreamer().EmitWin64EHStartProc(Symbol);
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectiveEndProc(StringRef, SMLoc) {
+  Lex();
+  getStreamer().EmitWin64EHEndProc();
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectiveStartChained(StringRef, SMLoc) {
+  Lex();
+  getStreamer().EmitWin64EHStartChained();
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectiveEndChained(StringRef, SMLoc) {
+  Lex();
+  getStreamer().EmitWin64EHEndChained();
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectiveHandler(StringRef, SMLoc) {
+  StringRef SymbolID;
+  if (getParser().ParseIdentifier(SymbolID))
+    return true;
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("you must specify one or both of @unwind or @except");
+  Lex();
+  bool unwind = false, except = false;
+  if (ParseAtUnwindOrAtExcept(unwind, except))
+    return true;
+  if (getLexer().is(AsmToken::Comma)) {
+    Lex();
+    if (ParseAtUnwindOrAtExcept(unwind, except))
+      return true;
+  }
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  MCSymbol *handler = getContext().GetOrCreateSymbol(SymbolID);
+
+  Lex();
+  getStreamer().EmitWin64EHHandler(handler, unwind, except);
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectiveHandlerData(StringRef, SMLoc) {
+  Lex();
+  getStreamer().EmitWin64EHHandlerData();
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectivePushReg(StringRef, SMLoc L) {
+  unsigned Reg;
+  if (ParseSEHRegisterNumber(Reg))
+    return true;
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  Lex();
+  getStreamer().EmitWin64EHPushReg(Reg);
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectiveSetFrame(StringRef, SMLoc L) {
+  unsigned Reg;
+  int64_t Off;
+  if (ParseSEHRegisterNumber(Reg))
+    return true;
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("you must specify a stack pointer offset");
+
+  Lex();
+  SMLoc startLoc = getLexer().getLoc();
+  if (getParser().ParseAbsoluteExpression(Off))
+    return true;
+
+  if (Off & 0x0F)
+    return Error(startLoc, "offset is not a multiple of 16");
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  Lex();
+  getStreamer().EmitWin64EHSetFrame(Reg, Off);
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectiveAllocStack(StringRef, SMLoc) {
+  int64_t Size;
+  SMLoc startLoc = getLexer().getLoc();
+  if (getParser().ParseAbsoluteExpression(Size))
+    return true;
+
+  if (Size & 7)
+    return Error(startLoc, "size is not a multiple of 8");
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  Lex();
+  getStreamer().EmitWin64EHAllocStack(Size);
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectiveSaveReg(StringRef, SMLoc L) {
+  unsigned Reg;
+  int64_t Off;
+  if (ParseSEHRegisterNumber(Reg))
+    return true;
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("you must specify an offset on the stack");
+
+  Lex();
+  SMLoc startLoc = getLexer().getLoc();
+  if (getParser().ParseAbsoluteExpression(Off))
+    return true;
+
+  if (Off & 7)
+    return Error(startLoc, "size is not a multiple of 8");
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  Lex();
+  // FIXME: Err on %xmm* registers
+  getStreamer().EmitWin64EHSaveReg(Reg, Off);
+  return false;
+}
+
+// FIXME: This method is inherently x86-specific. It should really be in the
+// x86 backend.
+bool COFFAsmParser::ParseSEHDirectiveSaveXMM(StringRef, SMLoc L) {
+  unsigned Reg;
+  int64_t Off;
+  if (ParseSEHRegisterNumber(Reg))
+    return true;
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("you must specify an offset on the stack");
+
+  Lex();
+  SMLoc startLoc = getLexer().getLoc();
+  if (getParser().ParseAbsoluteExpression(Off))
+    return true;
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  if (Off & 0x0F)
+    return Error(startLoc, "offset is not a multiple of 16");
+
+  Lex();
+  // FIXME: Err on non-%xmm* registers
+  getStreamer().EmitWin64EHSaveXMM(Reg, Off);
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectivePushFrame(StringRef, SMLoc) {
+  bool Code = false;
+  StringRef CodeID;
+  if (getLexer().is(AsmToken::At)) {
+    SMLoc startLoc = getLexer().getLoc();
+    Lex();
+    if (!getParser().ParseIdentifier(CodeID)) {
+      if (CodeID != "code")
+        return Error(startLoc, "expected @code");
+      Code = true;
+    }
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  Lex();
+  getStreamer().EmitWin64EHPushFrame(Code);
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectiveEndProlog(StringRef, SMLoc) {
+  Lex();
+  getStreamer().EmitWin64EHEndProlog();
+  return false;
+}
+
+bool COFFAsmParser::ParseAtUnwindOrAtExcept(bool &unwind, bool &except) {
+  StringRef identifier;
+  if (getLexer().isNot(AsmToken::At))
+    return TokError("a handler attribute must begin with '@'");
+  SMLoc startLoc = getLexer().getLoc();
+  Lex();
+  if (getParser().ParseIdentifier(identifier))
+    return Error(startLoc, "expected @unwind or @except");
+  if (identifier == "unwind")
+    unwind = true;
+  else if (identifier == "except")
+    except = true;
+  else
+    return Error(startLoc, "expected @unwind or @except");
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHRegisterNumber(unsigned &RegNo) {
+  SMLoc startLoc = getLexer().getLoc();
+  if (getLexer().is(AsmToken::Percent)) {
+    const TargetAsmInfo &asmInfo = getContext().getTargetAsmInfo();
+    SMLoc endLoc;
+    unsigned LLVMRegNo;
+    if (getParser().getTargetParser().ParseRegister(LLVMRegNo,startLoc,endLoc))
+      return true;
+
+    // Check that this is a non-volatile register.
+    const unsigned *NVRegs = asmInfo.getCalleeSavedRegs();
+    unsigned i;
+    for (i = 0; NVRegs[i] != 0; ++i)
+      if (NVRegs[i] == LLVMRegNo)
+        break;
+    if (NVRegs[i] == 0)
+      return Error(startLoc, "expected non-volatile register");
+
+    int SEHRegNo = asmInfo.getSEHRegNum(LLVMRegNo);
+    if (SEHRegNo < 0)
+      return Error(startLoc,"register can't be represented in SEH unwind info");
+    RegNo = SEHRegNo;
+  }
+  else {
+    int64_t n;
+    if (getParser().ParseAbsoluteExpression(n))
+      return true;
+    if (n > 15)
+      return Error(startLoc, "register number is too high");
+    RegNo = n;
+  }
+
+  return false;
+}
+
 namespace llvm {
 
 MCAsmParserExtension *createCOFFAsmParser() {
diff --git a/lib/MC/MCParser/DarwinAsmParser.cpp b/lib/MC/MCParser/DarwinAsmParser.cpp
index 3c092cd..6f45068 100644
--- a/lib/MC/MCParser/DarwinAsmParser.cpp
+++ b/lib/MC/MCParser/DarwinAsmParser.cpp
@@ -369,11 +369,9 @@ bool DarwinAsmParser::ParseDirectiveDumpOrLoad(StringRef Directive,
   // FIXME: If/when .dump and .load are implemented they will be done in the
   // the assembly parser and not have any need for an MCStreamer API.
   if (IsDump)
-    Warning(IDLoc, "ignoring directive .dump for now");
+    return Warning(IDLoc, "ignoring directive .dump for now");
   else
-    Warning(IDLoc, "ignoring directive .load for now");
-
-  return false;
+    return Warning(IDLoc, "ignoring directive .load for now");
 }
 
 /// ParseDirectiveLsym
diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp
index fa245b1..ae3ed0f 100644
--- a/lib/MC/MCStreamer.cpp
+++ b/lib/MC/MCStreamer.cpp
@@ -15,17 +15,22 @@
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetAsmInfo.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/Twine.h"
 #include <cstdlib>
 using namespace llvm;
 
-MCStreamer::MCStreamer(MCContext &Ctx) : Context(Ctx) {
+MCStreamer::MCStreamer(MCContext &Ctx) : Context(Ctx), EmitEHFrame(true),
+                                         EmitDebugFrame(false),
+                                         CurrentW64UnwindInfo(0) {
   const MCSection *section = NULL;
   SectionStack.push_back(std::make_pair(section, section));
 }
 
 MCStreamer::~MCStreamer() {
+  for (unsigned i = 0; i < getNumW64UnwindInfos(); ++i)
+    delete W64UnwindInfos[i];
 }
 
 const MCExpr *MCStreamer::BuildSymbolDiff(MCContext &Context,
@@ -41,14 +46,14 @@ const MCExpr *MCStreamer::BuildSymbolDiff(MCContext &Context,
   return AddrDelta;
 }
 
-const MCExpr *MCStreamer::ForceExpAbs(MCStreamer *Streamer,
-                                      MCContext &Context, const MCExpr* Expr) {
- if (Context.getAsmInfo().hasAggressiveSymbolFolding())
-   return Expr;
+const MCExpr *MCStreamer::ForceExpAbs(const MCExpr* Expr) {
+  if (Context.getAsmInfo().hasAggressiveSymbolFolding() ||
+      isa<MCSymbolRefExpr>(Expr))
+    return Expr;
 
- MCSymbol *ABS = Context.CreateTempSymbol();
- Streamer->EmitAssignment(ABS, Expr);
- return MCSymbolRefExpr::Create(ABS, Context);
+  MCSymbol *ABS = Context.CreateTempSymbol();
+  EmitAssignment(ABS, Expr);
+  return MCSymbolRefExpr::Create(ABS, Context);
 }
 
 raw_ostream &MCStreamer::GetCommentOS() {
@@ -76,9 +81,11 @@ void MCStreamer::EmitIntValue(uint64_t Value, unsigned Size,
   assert((isUIntN(8 * Size, Value) || isIntN(8 * Size, Value)) &&
          "Invalid size");
   char buf[8];
-  // FIXME: Endianness assumption.
-  for (unsigned i = 0; i != Size; ++i)
-    buf[i] = uint8_t(Value >> (i * 8));
+  const bool isLittleEndian = Context.getTargetAsmInfo().isLittleEndian();
+  for (unsigned i = 0; i != Size; ++i) {
+    unsigned index = isLittleEndian ? i : (Size - i - 1);
+    buf[i] = uint8_t(Value >> (index * 8));
+  }
   EmitBytes(StringRef(buf, Size), AddrSpace);
 }
 
@@ -102,13 +109,8 @@ void MCStreamer::EmitSLEB128IntValue(int64_t Value, unsigned AddrSpace) {
 
 void MCStreamer::EmitAbsValue(const MCExpr *Value, unsigned Size,
                               unsigned AddrSpace) {
-  if (getContext().getAsmInfo().hasAggressiveSymbolFolding()) {
-    EmitValue(Value, Size, AddrSpace);
-    return;
-  }
-  MCSymbol *ABS = getContext().CreateTempSymbol();
-  EmitAssignment(ABS, Value);
-  EmitSymbolValue(ABS, Size, AddrSpace);
+  const MCExpr *ABS = ForceExpAbs(Value);
+  EmitValue(ABS, Size, AddrSpace);
 }
 
 
@@ -176,6 +178,12 @@ void MCStreamer::EmitLabel(MCSymbol *Symbol) {
     LastNonPrivate = Symbol;
 }
 
+void MCStreamer::EmitCFISections(bool EH, bool Debug) {
+  assert(EH || Debug);
+  EmitEHFrame = EH;
+  EmitDebugFrame = Debug;
+}
+
 void MCStreamer::EmitCFIStartProc() {
   MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
   if (CurFrame && !CurFrame->End)
@@ -303,6 +311,156 @@ void MCStreamer::EmitCFISameValue(int64_t Register) {
   CurFrame->Instructions.push_back(Instruction);
 }
 
+void MCStreamer::setCurrentW64UnwindInfo(MCWin64EHUnwindInfo *Frame) {
+  W64UnwindInfos.push_back(Frame);
+  CurrentW64UnwindInfo = W64UnwindInfos.back();
+}
+
+void MCStreamer::EnsureValidW64UnwindInfo() {
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  if (!CurFrame || CurFrame->End)
+    report_fatal_error("No open Win64 EH frame function!");
+}
+
+void MCStreamer::EmitWin64EHStartProc(const MCSymbol *Symbol) {
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  if (CurFrame && !CurFrame->End)
+    report_fatal_error("Starting a function before ending the previous one!");
+  MCWin64EHUnwindInfo *Frame = new MCWin64EHUnwindInfo;
+  Frame->Begin = getContext().CreateTempSymbol();
+  Frame->Function = Symbol;
+  EmitLabel(Frame->Begin);
+  setCurrentW64UnwindInfo(Frame);
+}
+
+void MCStreamer::EmitWin64EHEndProc() {
+  EnsureValidW64UnwindInfo();
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  if (CurFrame->ChainedParent)
+    report_fatal_error("Not all chained regions terminated!");
+  CurFrame->End = getContext().CreateTempSymbol();
+  EmitLabel(CurFrame->End);
+}
+
+void MCStreamer::EmitWin64EHStartChained() {
+  EnsureValidW64UnwindInfo();
+  MCWin64EHUnwindInfo *Frame = new MCWin64EHUnwindInfo;
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  Frame->Begin = getContext().CreateTempSymbol();
+  Frame->Function = CurFrame->Function;
+  Frame->ChainedParent = CurFrame;
+  EmitLabel(Frame->Begin);
+  setCurrentW64UnwindInfo(Frame);
+}
+
+void MCStreamer::EmitWin64EHEndChained() {
+  EnsureValidW64UnwindInfo();
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  if (!CurFrame->ChainedParent)
+    report_fatal_error("End of a chained region outside a chained region!");
+  CurFrame->End = getContext().CreateTempSymbol();
+  EmitLabel(CurFrame->End);
+  CurrentW64UnwindInfo = CurFrame->ChainedParent;
+}
+
+void MCStreamer::EmitWin64EHHandler(const MCSymbol *Sym, bool Unwind,
+                                    bool Except) {
+  EnsureValidW64UnwindInfo();
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  if (CurFrame->ChainedParent)
+    report_fatal_error("Chained unwind areas can't have handlers!");
+  CurFrame->ExceptionHandler = Sym;
+  if (!Except && !Unwind)
+    report_fatal_error("Don't know what kind of handler this is!");
+  if (Unwind)
+    CurFrame->HandlesUnwind = true;
+  if (Except)
+    CurFrame->HandlesExceptions = true;
+}
+
+void MCStreamer::EmitWin64EHHandlerData() {
+  EnsureValidW64UnwindInfo();
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  if (CurFrame->ChainedParent)
+    report_fatal_error("Chained unwind areas can't have handlers!");
+}
+
+void MCStreamer::EmitWin64EHPushReg(unsigned Register) {
+  EnsureValidW64UnwindInfo();
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  MCSymbol *Label = getContext().CreateTempSymbol();
+  MCWin64EHInstruction Inst(Win64EH::UOP_PushNonVol, Label, Register);
+  EmitLabel(Label);
+  CurFrame->Instructions.push_back(Inst);
+}
+
+void MCStreamer::EmitWin64EHSetFrame(unsigned Register, unsigned Offset) {
+  EnsureValidW64UnwindInfo();
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  if (CurFrame->LastFrameInst >= 0)
+    report_fatal_error("Frame register and offset already specified!");
+  if (Offset & 0x0F)
+    report_fatal_error("Misaligned frame pointer offset!");
+  MCWin64EHInstruction Inst(Win64EH::UOP_SetFPReg, NULL, Register, Offset);
+  CurFrame->LastFrameInst = CurFrame->Instructions.size();
+  CurFrame->Instructions.push_back(Inst);
+}
+
+void MCStreamer::EmitWin64EHAllocStack(unsigned Size) {
+  EnsureValidW64UnwindInfo();
+  if (Size & 7)
+    report_fatal_error("Misaligned stack allocation!");
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  MCSymbol *Label = getContext().CreateTempSymbol();
+  MCWin64EHInstruction Inst(Label, Size);
+  EmitLabel(Label);
+  CurFrame->Instructions.push_back(Inst);
+}
+
+void MCStreamer::EmitWin64EHSaveReg(unsigned Register, unsigned Offset) {
+  EnsureValidW64UnwindInfo();
+  if (Offset & 7)
+    report_fatal_error("Misaligned saved register offset!");
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  MCSymbol *Label = getContext().CreateTempSymbol();
+  MCWin64EHInstruction Inst(
+     Offset > 512*1024-8 ? Win64EH::UOP_SaveNonVolBig : Win64EH::UOP_SaveNonVol,
+                            Label, Register, Offset);
+  EmitLabel(Label);
+  CurFrame->Instructions.push_back(Inst);
+}
+
+void MCStreamer::EmitWin64EHSaveXMM(unsigned Register, unsigned Offset) {
+  EnsureValidW64UnwindInfo();
+  if (Offset & 0x0F)
+    report_fatal_error("Misaligned saved vector register offset!");
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  MCSymbol *Label = getContext().CreateTempSymbol();
+  MCWin64EHInstruction Inst(
+    Offset > 512*1024-16 ? Win64EH::UOP_SaveXMM128Big : Win64EH::UOP_SaveXMM128,
+                            Label, Register, Offset);
+  EmitLabel(Label);
+  CurFrame->Instructions.push_back(Inst);
+}
+
+void MCStreamer::EmitWin64EHPushFrame(bool Code) {
+  EnsureValidW64UnwindInfo();
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  if (CurFrame->Instructions.size() > 0)
+    report_fatal_error("If present, PushMachFrame must be the first UOP");
+  MCSymbol *Label = getContext().CreateTempSymbol();
+  MCWin64EHInstruction Inst(Win64EH::UOP_PushMachFrame, Label, Code);
+  EmitLabel(Label);
+  CurFrame->Instructions.push_back(Inst);
+}
+
+void MCStreamer::EmitWin64EHEndProlog() {
+  EnsureValidW64UnwindInfo();
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  CurFrame->PrologEnd = getContext().CreateTempSymbol();
+  EmitLabel(CurFrame->PrologEnd);
+}
+
 void MCStreamer::EmitFnStart() {
   errs() << "Not implemented yet\n";
   abort();
@@ -357,3 +515,21 @@ void MCStreamer::EmitRawText(const Twine &T) {
   T.toVector(Str);
   EmitRawText(Str.str());
 }
+
+void MCStreamer::EmitFrames(bool usingCFI) {
+  if (!getNumFrameInfos())
+    return;
+
+  if (EmitEHFrame)
+    MCDwarfFrameEmitter::Emit(*this, usingCFI, true);
+
+  if (EmitDebugFrame)
+    MCDwarfFrameEmitter::Emit(*this, usingCFI, false);
+}
+
+void MCStreamer::EmitW64Tables() {
+  if (!getNumW64UnwindInfos())
+    return;
+
+  MCWin64EHUnwindEmitter::Emit(*this);
+}
diff --git a/lib/MC/MCWin64EH.cpp b/lib/MC/MCWin64EH.cpp
new file mode 100644
index 0000000..9453f5c
--- /dev/null
+++ b/lib/MC/MCWin64EH.cpp
@@ -0,0 +1,258 @@
+//===- lib/MC/MCWin64EH.cpp - MCWin64EH implementation --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCWin64EH.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Target/TargetAsmInfo.h"
+
+namespace llvm {
+
+// NOTE: All relocations generated here are 4-byte image-relative.
+
+static uint8_t CountOfUnwindCodes(std::vector<MCWin64EHInstruction> &instArray){
+  uint8_t count = 0;
+  for (std::vector<MCWin64EHInstruction>::const_iterator I = instArray.begin(),
+       E = instArray.end(); I != E; ++I) {
+    switch (I->getOperation()) {
+    case Win64EH::UOP_PushNonVol:
+    case Win64EH::UOP_AllocSmall:
+    case Win64EH::UOP_SetFPReg:
+    case Win64EH::UOP_PushMachFrame:
+      count += 1;
+      break;
+    case Win64EH::UOP_SaveNonVol:
+    case Win64EH::UOP_SaveXMM128:
+      count += 2;
+      break;
+    case Win64EH::UOP_SaveNonVolBig:
+    case Win64EH::UOP_SaveXMM128Big:
+      count += 3;
+      break;
+    case Win64EH::UOP_AllocLarge:
+      if (I->getSize() > 512*1024-8)
+        count += 3;
+      else
+        count += 2;
+      break;
+    }
+  }
+  return count;
+}
+
+static void EmitAbsDifference(MCStreamer &streamer, MCSymbol *lhs,
+                              MCSymbol *rhs) {
+  MCContext &context = streamer.getContext();
+  const MCExpr *diff = MCBinaryExpr::CreateSub(MCSymbolRefExpr::Create(
+                                                                  lhs, context),
+                                               MCSymbolRefExpr::Create(
+                                                                  rhs, context),
+                                               context);
+  streamer.EmitAbsValue(diff, 1);
+
+}
+
+static void EmitUnwindCode(MCStreamer &streamer, MCSymbol *begin,
+                           MCWin64EHInstruction &inst) {
+  uint8_t b1, b2;
+  uint16_t w;
+  b2 = (inst.getOperation() & 0x0F);
+  switch (inst.getOperation()) {
+  case Win64EH::UOP_PushNonVol:
+    EmitAbsDifference(streamer, inst.getLabel(), begin);
+    b2 |= (inst.getRegister() & 0x0F) << 4;
+    streamer.EmitIntValue(b2, 1);
+    break;
+  case Win64EH::UOP_AllocLarge:
+    EmitAbsDifference(streamer, inst.getLabel(), begin);
+    if (inst.getSize() > 512*1024-8) {
+      b2 |= 0x10;
+      streamer.EmitIntValue(b2, 1);
+      w = inst.getSize() & 0xFFF8;
+      streamer.EmitIntValue(w, 2);
+      w = inst.getSize() >> 16;
+    } else {
+      streamer.EmitIntValue(b2, 1);
+      w = inst.getSize() >> 3;
+    }
+    streamer.EmitIntValue(w, 2);
+    break;
+  case Win64EH::UOP_AllocSmall:
+    b2 |= (((inst.getSize()-8) >> 3) & 0x0F) << 4;
+    EmitAbsDifference(streamer, inst.getLabel(), begin);
+    streamer.EmitIntValue(b2, 1);
+    break;
+  case Win64EH::UOP_SetFPReg:
+    b1 = inst.getOffset() & 0xF0;
+    streamer.EmitIntValue(b1, 1);
+    streamer.EmitIntValue(b2, 1);
+    break;
+  case Win64EH::UOP_SaveNonVol:
+  case Win64EH::UOP_SaveXMM128:
+    b2 |= (inst.getRegister() & 0x0F) << 4;
+    EmitAbsDifference(streamer, inst.getLabel(), begin);
+    streamer.EmitIntValue(b2, 1);
+    w = inst.getOffset() >> 3;
+    if (inst.getOperation() == Win64EH::UOP_SaveXMM128)
+      w >>= 1;
+    streamer.EmitIntValue(w, 2);
+    break;
+  case Win64EH::UOP_SaveNonVolBig:
+  case Win64EH::UOP_SaveXMM128Big:
+    b2 |= (inst.getRegister() & 0x0F) << 4;
+    EmitAbsDifference(streamer, inst.getLabel(), begin);
+    streamer.EmitIntValue(b2, 1);
+    if (inst.getOperation() == Win64EH::UOP_SaveXMM128Big)
+      w = inst.getOffset() & 0xFFF0;
+    else
+      w = inst.getOffset() & 0xFFF8;
+    streamer.EmitIntValue(w, 2);
+    w = inst.getOffset() >> 16;
+    streamer.EmitIntValue(w, 2);
+    break;
+  case Win64EH::UOP_PushMachFrame:
+    if (inst.isPushCodeFrame())
+      b2 |= 0x10;
+    EmitAbsDifference(streamer, inst.getLabel(), begin);
+    streamer.EmitIntValue(b2, 1);
+    break;
+  }
+}
+
+static void EmitRuntimeFunction(MCStreamer &streamer,
+                                const MCWin64EHUnwindInfo *info) {
+  MCContext &context = streamer.getContext();
+
+  streamer.EmitValueToAlignment(4);
+  streamer.EmitValue(MCSymbolRefExpr::Create(info->Begin, context), 4);
+  streamer.EmitValue(MCSymbolRefExpr::Create(info->End, context), 4);
+  streamer.EmitValue(MCSymbolRefExpr::Create(info->Symbol, context), 4);
+}
+
+static void EmitUnwindInfo(MCStreamer &streamer, MCWin64EHUnwindInfo *info) {
+  // If this UNWIND_INFO already has a symbol, it's already been emitted.
+  if (info->Symbol) return;
+
+  MCContext &context = streamer.getContext();
+  streamer.EmitValueToAlignment(4);
+  // Upper 3 bits are the version number (currently 1).
+  uint8_t flags = 0x01;
+  info->Symbol = context.CreateTempSymbol();
+  streamer.EmitLabel(info->Symbol);
+
+  if (info->ChainedParent)
+    flags |= Win64EH::UNW_ChainInfo << 3;
+  else {
+    if (info->HandlesUnwind)
+      flags |= Win64EH::UNW_TerminateHandler << 3;
+    if (info->HandlesExceptions)
+      flags |= Win64EH::UNW_ExceptionHandler << 3;
+  }
+  streamer.EmitIntValue(flags, 1);
+
+  if (info->PrologEnd)
+    EmitAbsDifference(streamer, info->PrologEnd, info->Begin);
+  else
+    streamer.EmitIntValue(0, 1);
+
+  uint8_t numCodes = CountOfUnwindCodes(info->Instructions);
+  streamer.EmitIntValue(numCodes, 1);
+
+  uint8_t frame = 0;
+  if (info->LastFrameInst >= 0) {
+    MCWin64EHInstruction &frameInst = info->Instructions[info->LastFrameInst];
+    assert(frameInst.getOperation() == Win64EH::UOP_SetFPReg);
+    frame = (frameInst.getRegister() & 0x0F) |
+            (frameInst.getOffset() & 0xF0);
+  }
+  streamer.EmitIntValue(frame, 1);
+
+  // Emit unwind instructions (in reverse order).
+  uint8_t numInst = info->Instructions.size();
+  for (uint8_t c = 0; c < numInst; ++c) {
+    MCWin64EHInstruction inst = info->Instructions.back();
+    info->Instructions.pop_back();
+    EmitUnwindCode(streamer, info->Begin, inst);
+  }
+
+  if (flags & (Win64EH::UNW_ChainInfo << 3))
+    EmitRuntimeFunction(streamer, info->ChainedParent);
+  else if (flags &
+           ((Win64EH::UNW_TerminateHandler|Win64EH::UNW_ExceptionHandler) << 3))
+    streamer.EmitValue(MCSymbolRefExpr::Create(info->ExceptionHandler, context),
+                       4);
+  else if (numCodes < 2) {
+    // The minimum size of an UNWIND_INFO struct is 8 bytes. If we're not
+    // a chained unwind info, if there is no handler, and if there are fewer
+    // than 2 slots used in the unwind code array, we have to pad to 8 bytes.
+    if (numCodes == 1)
+      streamer.EmitIntValue(0, 2);
+    else
+      streamer.EmitIntValue(0, 4);
+  }
+}
+
+StringRef MCWin64EHUnwindEmitter::GetSectionSuffix(const MCSymbol *func) {
+  if (!func || !func->isInSection()) return "";
+  const MCSection *section = &func->getSection();
+  const MCSectionCOFF *COFFSection;
+  if ((COFFSection = dyn_cast<MCSectionCOFF>(section))) {
+    StringRef name = COFFSection->getSectionName();
+    size_t dollar = name.find('$');
+    size_t dot = name.find('.', 1);
+    if (dollar == StringRef::npos && dot == StringRef::npos)
+      return "";
+    if (dot == StringRef::npos)
+      return name.substr(dollar);
+    if (dollar == StringRef::npos || dot < dollar)
+      return name.substr(dot);
+    return name.substr(dollar);
+  }
+  return "";
+}
+
+void MCWin64EHUnwindEmitter::EmitUnwindInfo(MCStreamer &streamer,
+                                            MCWin64EHUnwindInfo *info) {
+  // Switch sections (the static function above is meant to be called from
+  // here and from Emit().
+  MCContext &context = streamer.getContext();
+  const TargetAsmInfo &asmInfo = context.getTargetAsmInfo();
+  const MCSection *xdataSect =
+    asmInfo.getWin64EHTableSection(GetSectionSuffix(info->Function));
+  streamer.SwitchSection(xdataSect);
+
+  llvm::EmitUnwindInfo(streamer, info);
+}
+
+void MCWin64EHUnwindEmitter::Emit(MCStreamer &streamer) {
+  MCContext &context = streamer.getContext();
+  // Emit the unwind info structs first.
+  const TargetAsmInfo &asmInfo = context.getTargetAsmInfo();
+  for (unsigned i = 0; i < streamer.getNumW64UnwindInfos(); ++i) {
+    MCWin64EHUnwindInfo &info = streamer.getW64UnwindInfo(i);
+    const MCSection *xdataSect =
+      asmInfo.getWin64EHTableSection(GetSectionSuffix(info.Function));
+    streamer.SwitchSection(xdataSect);
+    llvm::EmitUnwindInfo(streamer, &info);
+  }
+  // Now emit RUNTIME_FUNCTION entries.
+  for (unsigned i = 0; i < streamer.getNumW64UnwindInfos(); ++i) {
+    MCWin64EHUnwindInfo &info = streamer.getW64UnwindInfo(i);
+    const MCSection *pdataSect =
+      asmInfo.getWin64EHFuncTableSection(GetSectionSuffix(info.Function));
+    streamer.SwitchSection(pdataSect);
+    EmitRuntimeFunction(streamer, &info);
+  }
+}
+
+} // End of namespace llvm
+
diff --git a/lib/MC/WinCOFFStreamer.cpp b/lib/MC/WinCOFFStreamer.cpp
index 46968e6..6c36c12 100644
--- a/lib/MC/WinCOFFStreamer.cpp
+++ b/lib/MC/WinCOFFStreamer.cpp
@@ -23,6 +23,7 @@
 #include "llvm/MC/MCAsmLayout.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCWin64EH.h"
 #include "llvm/Target/TargetRegistry.h"
 #include "llvm/Target/TargetAsmBackend.h"
 #include "llvm/ADT/StringMap.h"
@@ -74,6 +75,7 @@ public:
                                  unsigned MaxBytesToEmit);
   virtual void EmitFileDirective(StringRef Filename);
   virtual void EmitInstruction(const MCInst &Instruction);
+  virtual void EmitWin64EHHandlerData();
   virtual void Finish();
 
 private:
@@ -377,7 +379,16 @@ void WinCOFFStreamer::EmitInstruction(const MCInst &Instruction) {
                                                 Fragment->getFixups());
 }
 
+void WinCOFFStreamer::EmitWin64EHHandlerData() {
+  MCStreamer::EmitWin64EHHandlerData();
+
+  // We have to emit the unwind info now, because this directive
+  // actually switches to the .xdata section!
+  MCWin64EHUnwindEmitter::EmitUnwindInfo(*this, getCurrentW64UnwindInfo());
+}
+
 void WinCOFFStreamer::Finish() {
+  EmitW64Tables();
   MCObjectStreamer::Finish();
 }
 
diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp
index 23a22ac..74d61c1 100644
--- a/lib/Support/APInt.cpp
+++ b/lib/Support/APInt.cpp
@@ -1375,7 +1375,7 @@ APInt APInt::sqrt() const {
                  uint64_t(::round(::sqrt(double(isSingleWord()?VAL:pVal[0])))));
 #else
     return APInt(BitWidth,
-                 uint64_t(::sqrt(double(isSingleWord()?VAL:pVal[0]))) + 0.5);
+                 uint64_t(::sqrt(double(isSingleWord()?VAL:pVal[0])) + 0.5));
 #endif
   }
 
diff --git a/lib/Support/BranchProbability.cpp b/lib/Support/BranchProbability.cpp
new file mode 100644
index 0000000..97342da
--- /dev/null
+++ b/lib/Support/BranchProbability.cpp
@@ -0,0 +1,44 @@
+//===-------------- lib/Support/BranchProbability.cpp -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements Branch Probability class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+BranchProbability::BranchProbability(uint32_t n, uint32_t d) {
+  assert(d > 0 && "Denomiator cannot be 0!");
+  assert(n <= d && "Probability cannot be bigger than 1!");
+  N = n;
+  D = d;
+}
+
+raw_ostream &BranchProbability::print(raw_ostream &OS) const {
+  OS << N << " / " << D << " = " << ((double)N / D);
+  return OS;
+}
+
+void BranchProbability::dump() const {
+  print(dbgs());
+  dbgs() << "\n";
+}
+
+namespace llvm {
+
+raw_ostream &operator<<(raw_ostream &OS, const BranchProbability &Prob) {
+  Prob.print(OS);
+  return OS;
+}
+
+}
diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt
index a0e997d..867d930 100644
--- a/lib/Support/CMakeLists.txt
+++ b/lib/Support/CMakeLists.txt
@@ -9,6 +9,7 @@ add_llvm_library(LLVMSupport
   APInt.cpp
   APSInt.cpp
   Allocator.cpp
+  BranchProbability.cpp
   circular_raw_ostream.cpp
   CommandLine.cpp
   ConstantRange.cpp
diff --git a/lib/Support/Dwarf.cpp b/lib/Support/Dwarf.cpp
index 74a9fda..0813321 100644
--- a/lib/Support/Dwarf.cpp
+++ b/lib/Support/Dwarf.cpp
@@ -207,6 +207,7 @@ const char *llvm::dwarf::AttributeString(unsigned Attribute) {
   case DW_AT_APPLE_property_getter:      return "DW_AT_APPLE_property_getter";
   case DW_AT_APPLE_property_setter:      return "DW_AT_APPLE_property_setter";
   case DW_AT_APPLE_property_attribute:   return "DW_AT_APPLE_property_attribute";
+  case DW_AT_APPLE_objc_complete_type:   return "DW_AT_APPLE_objc_complete_type";
   }
   return 0;
 }
diff --git a/lib/Support/FoldingSet.cpp b/lib/Support/FoldingSet.cpp
index d2e35b8..1568342 100644
--- a/lib/Support/FoldingSet.cpp
+++ b/lib/Support/FoldingSet.cpp
@@ -92,7 +92,7 @@ void FoldingSetNodeID::AddInteger(long long I) {
 }
 void FoldingSetNodeID::AddInteger(unsigned long long I) {
   AddInteger(unsigned(I));
-  if ((uint64_t)(int)I != I)
+  if ((uint64_t)(unsigned)I != I)
     Bits.push_back(unsigned(I >> 32));
 }
 
diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp
index 911c64a..4299aa4 100644
--- a/lib/Support/Host.cpp
+++ b/lib/Support/Host.cpp
@@ -215,7 +215,8 @@ std::string sys::getHostCPUName() {
       case 37: // Intel Core i7, laptop version.
         return "corei7";
       case 42: // SandyBridge
-        return "sandybridge";
+      case 45:
+        return "corei7-avx";
 
       case 28: // Intel Atom processor. All processors are manufactured using
                // the 45 nm process
diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp
index e2b5b7a..d264be9 100644
--- a/lib/Support/MemoryBuffer.cpp
+++ b/lib/Support/MemoryBuffer.cpp
@@ -67,7 +67,7 @@ static void CopyStringRef(char *Memory, StringRef Data) {
 
 /// GetNamedBuffer - Allocates a new MemoryBuffer with Name copied after it.
 template <typename T>
-static T* GetNamedBuffer(StringRef Buffer, StringRef Name,
+static T *GetNamedBuffer(StringRef Buffer, StringRef Name,
                          bool RequiresNullTerminator) {
   char *Mem = static_cast<char*>(operator new(sizeof(T) + Name.size() + 1));
   CopyStringRef(Mem + sizeof(T), Name);
@@ -94,7 +94,7 @@ public:
 }
 
 /// getMemBuffer - Open the specified memory range as a MemoryBuffer.  Note
-/// that EndPtr[0] must be a null byte and be accessible!
+/// that InputData must be a null terminated if RequiresNullTerminator is true!
 MemoryBuffer *MemoryBuffer::getMemBuffer(StringRef InputData,
                                          StringRef BufferName,
                                          bool RequiresNullTerminator) {
@@ -221,9 +221,9 @@ error_code MemoryBuffer::getFile(const char *Filename,
   OpenFlags |= O_BINARY;  // Open input file in binary mode on win32.
 #endif
   int FD = ::open(Filename, OpenFlags);
-  if (FD == -1) {
+  if (FD == -1)
     return error_code(errno, posix_category());
-  }
+
   error_code ret = getOpenFile(FD, Filename, result, FileSize, FileSize,
                                0, RequiresNullTerminator);
   close(FD);
diff --git a/lib/Support/SourceMgr.cpp b/lib/Support/SourceMgr.cpp
index ef09916..de042a9f 100644
--- a/lib/Support/SourceMgr.cpp
+++ b/lib/Support/SourceMgr.cpp
@@ -49,14 +49,16 @@ SourceMgr::~SourceMgr() {
 /// directory or in one of the IncludeDirs.  If no file is found, this returns
 /// ~0, otherwise it returns the buffer ID of the stacked file.
 unsigned SourceMgr::AddIncludeFile(const std::string &Filename,
-                                   SMLoc IncludeLoc) {
+                                   SMLoc IncludeLoc,
+                                   std::string &IncludedFile) {
   OwningPtr<MemoryBuffer> NewBuf;
-  MemoryBuffer::getFile(Filename.c_str(), NewBuf);
+  IncludedFile = Filename;
+  MemoryBuffer::getFile(IncludedFile.c_str(), NewBuf);
 
   // If the file didn't exist directly, see if it's in an include path.
   for (unsigned i = 0, e = IncludeDirectories.size(); i != e && !NewBuf; ++i) {
-    std::string IncFile = IncludeDirectories[i] + "/" + Filename;
-    MemoryBuffer::getFile(IncFile.c_str(), NewBuf);
+    IncludedFile = IncludeDirectories[i] + "/" + Filename;
+    MemoryBuffer::getFile(IncludedFile.c_str(), NewBuf);
   }
 
   if (NewBuf == 0) return ~0U;
diff --git a/lib/Support/Unix/Host.inc b/lib/Support/Unix/Host.inc
index 8cbec8c..5fd0e5e 100644
--- a/lib/Support/Unix/Host.inc
+++ b/lib/Support/Unix/Host.inc
@@ -44,35 +44,6 @@ std::string sys::getHostTriple() {
   // Normalize the arch, since the host triple may not actually match the host.
   std::string Arch = ArchSplit.first;
 
-  // It would be nice to do this in terms of llvm::Triple, but that is in
-  // Support which is layered above us.
-#if defined(__x86_64__)
-  Arch = "x86_64";
-#elif defined(__i386__)
-  Arch = "i386";
-#elif defined(__ppc64__)
-  Arch = "powerpc64";
-#elif defined(__ppc__)
-  Arch = "powerpc";
-#elif defined(__arm__)
-
-  // FIXME: We need to pick the right ARM triple (which involves querying the
-  // chip). However, for now this is most important for LLVM arch selection, so
-  // we only need to make sure to distinguish ARM and Thumb.
-#  if defined(__thumb__)
-  Arch = "thumb";
-#  else
-  Arch = "arm";
-#  endif
-
-#else
-
-  // FIXME: When enough auto-detection is in place, this should just
-  // #error. Then at least the arch selection is done, and we only need the OS
-  // etc selection to kill off the use of LLVM_HOSTTRIPLE.
-
-#endif
-
   std::string Triple(Arch);
   Triple += '-';
   Triple += ArchSplit.second;
diff --git a/lib/Support/Unix/Program.inc b/lib/Support/Unix/Program.inc
index 9f0a9ef..346baf1 100644
--- a/lib/Support/Unix/Program.inc
+++ b/lib/Support/Unix/Program.inc
@@ -338,7 +338,7 @@ Program::Wait(const sys::Path &path,
       else
         MakeErrMsg(ErrMsg, "Child timed out", 0);
 
-      return -1;   // Timeout detected
+      return -2;   // Timeout detected
     } else if (errno != EINTR) {
       MakeErrMsg(ErrMsg, "Error waiting for child process");
       return -1;
@@ -382,7 +382,9 @@ Program::Wait(const sys::Path &path,
         *ErrMsg += " (core dumped)";
 #endif
     }
-    return -1;
+    // Return a special value to indicate that the process received an unhandled
+    // signal during execution as opposed to failing to execute.
+    return -2;
   }
   return result;
 #else
diff --git a/lib/Support/Windows/Program.inc b/lib/Support/Windows/Program.inc
index 350363c..e486e6e 100644
--- a/lib/Support/Windows/Program.inc
+++ b/lib/Support/Windows/Program.inc
@@ -349,7 +349,8 @@ Program::Wait(const Path &path,
   if (WaitForSingleObject(hProcess, millisecondsToWait) == WAIT_TIMEOUT) {
     if (!TerminateProcess(hProcess, 1)) {
       MakeErrMsg(ErrMsg, "Failed to terminate timed-out program.");
-      return -1;
+      // -2 indicates a crash or timeout as opposed to failure to execute.
+      return -2;
     }
     WaitForSingleObject(hProcess, INFINITE);
   }
@@ -362,7 +363,8 @@ Program::Wait(const Path &path,
   if (!rc) {
     SetLastError(err);
     MakeErrMsg(ErrMsg, "Failed getting status for program.");
-    return -1;
+    // -2 indicates a crash or timeout as opposed to failure to execute.
+    return -2;
   }
 
   return status;
diff --git a/lib/Target/ARM/ARMAsmBackend.cpp b/lib/Target/ARM/ARMAsmBackend.cpp
index f062819..618a2b5 100644
--- a/lib/Target/ARM/ARMAsmBackend.cpp
+++ b/lib/Target/ARM/ARMAsmBackend.cpp
@@ -76,7 +76,7 @@ public:
 { "fixup_arm_thumb_blx",     7,            21,  MCFixupKindInfo::FKF_IsPCRel },
 { "fixup_arm_thumb_cb",      0,            16,  MCFixupKindInfo::FKF_IsPCRel },
 { "fixup_arm_thumb_cp",      1,             8,  MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_thumb_bcc",     1,             8,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_bcc",     0,             8,  MCFixupKindInfo::FKF_IsPCRel },
 // movw / movt: 16-bits immediate but scattered into two chunks 0 - 12, 16 - 19.
 { "fixup_arm_movt_hi16",     0,            20,  0 },
 { "fixup_arm_movw_lo16",     0,            20,  0 },
@@ -164,23 +164,25 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
   case FK_Data_4:
     return Value;
   case ARM::fixup_arm_movt_hi16:
-  case ARM::fixup_arm_movt_hi16_pcrel:
     Value >>= 16;
     // Fallthrough
   case ARM::fixup_arm_movw_lo16:
+  case ARM::fixup_arm_movt_hi16_pcrel:
   case ARM::fixup_arm_movw_lo16_pcrel: {
     unsigned Hi4 = (Value & 0xF000) >> 12;
     unsigned Lo12 = Value & 0x0FFF;
+    assert ((((int64_t)Value) >= -0x8000) && (((int64_t)Value) <= 0x7fff) &&
+            "Out of range pc-relative fixup value!");
     // inst{19-16} = Hi4;
     // inst{11-0} = Lo12;
     Value = (Hi4 << 16) | (Lo12);
     return Value;
   }
   case ARM::fixup_t2_movt_hi16:
-  case ARM::fixup_t2_movt_hi16_pcrel:
     Value >>= 16;
     // Fallthrough
   case ARM::fixup_t2_movw_lo16:
+  case ARM::fixup_t2_movt_hi16_pcrel:
   case ARM::fixup_t2_movw_lo16_pcrel: {
     unsigned Hi4 = (Value & 0xF000) >> 12;
     unsigned i = (Value & 0x800) >> 11;
@@ -190,8 +192,9 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
     // inst{26} = i;
     // inst{14-12} = Mid3;
     // inst{7-0} = Lo8;
+    assert ((((int64_t)Value) >= -0x8000) && (((int64_t)Value) <= 0x7fff) &&
+            "Out of range pc-relative fixup value!");
     Value = (Hi4 << 16) | (i << 26) | (Mid3 << 12) | (Lo8);
-
     uint64_t swapped = (Value & 0xFFFF0000) >> 16;
     swapped |= (Value & 0x0000FFFF) << 16;
     return swapped;
@@ -305,7 +308,7 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
     //
     // Note that the halfwords are stored high first, low second; so we need
     // to transpose the fixup value here to map properly.
-    unsigned isNeg = (int64_t(Value) < 0) ? 1 : 0;
+    unsigned isNeg = (int64_t(Value - 4) < 0) ? 1 : 0;
     uint32_t Binary = 0;
     Value = 0x3fffff & ((Value - 4) >> 1);
     Binary  = (Value & 0x7ff) << 16;    // Low imm11 value.
@@ -323,7 +326,7 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
     //
     // Note that the halfwords are stored high first, low second; so we need
     // to transpose the fixup value here to map properly.
-    unsigned isNeg = (int64_t(Value) < 0) ? 1 : 0;
+    unsigned isNeg = (int64_t(Value-4) < 0) ? 1 : 0;
     uint32_t Binary = 0;
     Value = 0xfffff & ((Value - 2) >> 2);
     Binary  = (Value & 0x3ff) << 17;    // Low imm10L value.
@@ -404,7 +407,6 @@ void ELFARMAsmBackend::ApplyFixup(const MCFixup &Fixup, char *Data,
   if (!Value) return;           // Doesn't change encoding.
 
   unsigned Offset = Fixup.getOffset();
-  assert(Offset % NumBytes == 0 && "Offset mod NumBytes is nonzero!");
 
   // For each byte of the fragment that the fixup touches, mask in the bits from
   // the fixup value. The Value has been "split up" into the appropriate
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index c428e18..eb73902 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -172,47 +172,6 @@ getDebugValueLocation(const MachineInstr *MI) const {
   return Location;
 }
 
-/// getDwarfRegOpSize - get size required to emit given machine location using
-/// dwarf encoding.
-unsigned ARMAsmPrinter::getDwarfRegOpSize(const MachineLocation &MLoc) const {
- const TargetRegisterInfo *RI = TM.getRegisterInfo();
-  if (RI->getDwarfRegNum(MLoc.getReg(), false) != -1)
-    return AsmPrinter::getDwarfRegOpSize(MLoc);
-  else {
-    unsigned Reg = MLoc.getReg();
-    if (Reg >= ARM::S0 && Reg <= ARM::S31) {
-      assert(ARM::S0 + 31 == ARM::S31 && "Unexpected ARM S register numbering");
-      // S registers are described as bit-pieces of a register
-      // S[2x] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 0)
-      // S[2x+1] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 32)
-      
-      unsigned SReg = Reg - ARM::S0;
-      unsigned Rx = 256 + (SReg >> 1);
-      OutStreamer.AddComment("Loc expr size");
-      // DW_OP_regx + ULEB + DW_OP_bit_piece + ULEB + ULEB
-      //   1 + ULEB(Rx) + 1 + 1 + 1
-      return 4 + MCAsmInfo::getULEB128Size(Rx);
-    } 
-    
-    if (Reg >= ARM::Q0 && Reg <= ARM::Q15) {
-      assert(ARM::Q0 + 15 == ARM::Q15 && "Unexpected ARM Q register numbering");
-      // Q registers Q0-Q15 are described by composing two D registers together.
-      // Qx = DW_OP_regx(256+2x) DW_OP_piece(8) DW_OP_regx(256+2x+1) DW_OP_piece(8)
-
-      unsigned QReg = Reg - ARM::Q0;
-      unsigned D1 = 256 + 2 * QReg;
-      unsigned D2 = D1 + 1;
-      
-      OutStreamer.AddComment("Loc expr size");
-      // DW_OP_regx + ULEB + DW_OP_piece + ULEB(8) +
-      // DW_OP_regx + ULEB + DW_OP_piece + ULEB(8);
-      //   6 + ULEB(D1) + ULEB(D2)
-      return 6 + MCAsmInfo::getULEB128Size(D1) + MCAsmInfo::getULEB128Size(D2);
-    }
-  }
-  return 0;
-}
-
 /// EmitDwarfRegOp - Emit dwarf register operation.
 void ARMAsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const {
   const TargetRegisterInfo *RI = TM.getRegisterInfo();
@@ -229,10 +188,6 @@ void ARMAsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const {
       unsigned SReg = Reg - ARM::S0;
       bool odd = SReg & 0x1;
       unsigned Rx = 256 + (SReg >> 1);
-      OutStreamer.AddComment("Loc expr size");
-      // DW_OP_regx + ULEB + DW_OP_bit_piece + ULEB + ULEB
-      //   1 + ULEB(Rx) + 1 + 1 + 1
-      EmitInt16(4 + MCAsmInfo::getULEB128Size(Rx));
 
       OutStreamer.AddComment("DW_OP_regx for S register");
       EmitInt8(dwarf::DW_OP_regx);
@@ -260,12 +215,6 @@ void ARMAsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const {
       unsigned D1 = 256 + 2 * QReg;
       unsigned D2 = D1 + 1;
       
-      OutStreamer.AddComment("Loc expr size");
-      // DW_OP_regx + ULEB + DW_OP_piece + ULEB(8) +
-      // DW_OP_regx + ULEB + DW_OP_piece + ULEB(8);
-      //   6 + ULEB(D1) + ULEB(D2)
-      EmitInt16(6 + MCAsmInfo::getULEB128Size(D1) + MCAsmInfo::getULEB128Size(D2));
-
       OutStreamer.AddComment("DW_OP_regx for Q register: D1");
       EmitInt8(dwarf::DW_OP_regx);
       EmitULEB128(D1);
@@ -286,7 +235,7 @@ void ARMAsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const {
 void ARMAsmPrinter::EmitFunctionEntryLabel() {
   if (AFI->isThumbFunction()) {
     OutStreamer.EmitAssemblerFlag(MCAF_Code16);
-    OutStreamer.EmitThumbFunc(Subtarget->isTargetDarwin()? CurrentFnSym : 0);
+    OutStreamer.EmitThumbFunc(CurrentFnSym);
   }
 
   OutStreamer.EmitLabel(CurrentFnSym);
@@ -416,10 +365,63 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
     case 'q': // Print a NEON quad precision register.
       printOperand(MI, OpNum, O);
       return false;
-    case 'Q':
-    case 'R':
-    case 'H':
-      // These modifiers are not yet supported.
+    case 'y': // Print a VFP single precision register as indexed double.
+      // This uses the ordering of the alias table to get the first 'd' register
+      // that overlaps the 's' register. Also, s0 is an odd register, hence the
+      // odd modulus check below.
+      if (MI->getOperand(OpNum).isReg()) {
+        unsigned Reg = MI->getOperand(OpNum).getReg();
+        const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
+        O << ARMInstPrinter::getRegisterName(TRI->getAliasSet(Reg)[0]) <<
+        (((Reg % 2) == 1) ? "[0]" : "[1]");
+        return false;
+      }
+      return true;
+    case 'B': // Bitwise inverse of integer or symbol without a preceding #.
+      if (!MI->getOperand(OpNum).isImm())
+        return true;
+      O << ~(MI->getOperand(OpNum).getImm());
+      return false;
+    case 'L': // The low 16 bits of an immediate constant.
+      if (!MI->getOperand(OpNum).isImm())
+        return true;
+      O << (MI->getOperand(OpNum).getImm() & 0xffff);
+      return false;
+    case 'M': { // A register range suitable for LDM/STM.
+      if (!MI->getOperand(OpNum).isReg())
+        return true;
+      const MachineOperand &MO = MI->getOperand(OpNum);
+      unsigned RegBegin = MO.getReg();
+      // This takes advantage of the 2 operand-ness of ldm/stm and that we've
+      // already got the operands in registers that are operands to the
+      // inline asm statement.
+      
+      O << "{" << ARMInstPrinter::getRegisterName(RegBegin);
+      
+      // FIXME: The register allocator not only may not have given us the
+      // registers in sequence, but may not be in ascending registers. This
+      // will require changes in the register allocator that'll need to be
+      // propagated down here if the operands change.
+      unsigned RegOps = OpNum + 1;
+      while (MI->getOperand(RegOps).isReg()) {
+        O << ", " 
+          << ARMInstPrinter::getRegisterName(MI->getOperand(RegOps).getReg());
+        RegOps++;
+      }
+
+      O << "}";
+
+      return false;
+    }
+    // These modifiers are not yet supported.
+    case 'p': // The high single-precision register of a VFP double-precision
+              // register.
+    case 'e': // The low doubleword register of a NEON quad register.
+    case 'f': // The high doubleword register of a NEON quad register.
+    case 'h': // A range of VFP/NEON registers suitable for VLD1/VST1.
+    case 'Q': // The least significant register of a pair.
+    case 'R': // The most significant register of a pair.
+    case 'H': // The highest-numbered register of a pair.
       return true;
     }
   }
@@ -432,9 +434,21 @@ bool ARMAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
                                           unsigned OpNum, unsigned AsmVariant,
                                           const char *ExtraCode,
                                           raw_ostream &O) {
-  if (ExtraCode && ExtraCode[0])
-    return true; // Unknown modifier.
-
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+    
+    switch (ExtraCode[0]) {
+      case 'A': // A memory operand for a VLD1/VST1 instruction.
+      default: return true;  // Unknown modifier.
+      case 'm': // The base register of a memory operand.
+        if (!MI->getOperand(OpNum).isReg())
+          return true;
+        O << ARMInstPrinter::getRegisterName(MI->getOperand(OpNum).getReg());
+        return false;
+    }
+  }
+  
   const MachineOperand &MO = MI->getOperand(OpNum);
   assert(MO.isReg() && "unexpected inline asm memory operand");
   O << "[" << ARMInstPrinter::getRegisterName(MO.getReg()) << "]";
@@ -600,6 +614,12 @@ void ARMAsmPrinter::emitAttributes() {
     //
 
     /// ADD additional Else-cases here!
+  } else if (CPUString == "xscale") {
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v5TEJ);
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::ARM_ISA_use,
+                               ARMBuildAttrs::Allowed);
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use,
+                               ARMBuildAttrs::Allowed);
   } else if (CPUString == "generic") {
     // FIXME: Why these defaults?
     AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v4T);
@@ -1188,6 +1208,26 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     }
     return;
   }
+  case ARM::tBXr9_CALL:
+  case ARM::tBX_CALL: {
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::tMOVr);
+      TmpInst.addOperand(MCOperand::CreateReg(ARM::LR));
+      TmpInst.addOperand(MCOperand::CreateReg(ARM::PC));
+      OutStreamer.EmitInstruction(TmpInst);
+    }
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::tBX);
+      TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+      // Add predicate operands.
+      TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+      TmpInst.addOperand(MCOperand::CreateReg(0));
+      OutStreamer.EmitInstruction(TmpInst);
+    }
+    return;
+  }
   case ARM::BMOVPCRXr9_CALL:
   case ARM::BMOVPCRX_CALL: {
     {
@@ -1809,7 +1849,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     }
     {
       MCInst TmpInst;
-      TmpInst.setOpcode(ARM::tBX_RET_vararg);
+      TmpInst.setOpcode(ARM::tBX);
       TmpInst.addOperand(MCOperand::CreateReg(ScratchReg));
       // Predicate.
       TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
@@ -1838,7 +1878,9 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   case ARM::tTAILJMPdND: {
     MCInst TmpInst, TmpInst2;
     LowerARMMachineInstrToMCInst(MI, TmpInst2, *this);
-    TmpInst.setOpcode(ARM::tB);
+    // The Darwin toolchain doesn't support tail call relocations of 16-bit
+    // branches.
+    TmpInst.setOpcode(Opc == ARM::tTAILJMPd ? ARM::t2B : ARM::tB);
     TmpInst.addOperand(TmpInst2.getOperand(0));
     OutStreamer.AddComment("TAILCALL");
     OutStreamer.EmitInstruction(TmpInst);
diff --git a/lib/Target/ARM/ARMAsmPrinter.h b/lib/Target/ARM/ARMAsmPrinter.h
index 1ee1b70..5f9169e 100644
--- a/lib/Target/ARM/ARMAsmPrinter.h
+++ b/lib/Target/ARM/ARMAsmPrinter.h
@@ -89,10 +89,6 @@ public:
 
   MachineLocation getDebugValueLocation(const MachineInstr *MI) const;
 
-  /// getDwarfRegOpSize - get size required to emit given machine location
-  /// using dwarf encoding.
-  virtual unsigned getDwarfRegOpSize(const MachineLocation &MLoc) const;
-
   /// EmitDwarfRegOp - Emit dwarf register operation.
   virtual void EmitDwarfRegOp(const MachineLocation &MLoc) const;
 
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index ea1f08a..4ab37f6 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -684,6 +684,10 @@ int ARMBaseRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
   return ARMGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
 }
 
+int ARMBaseRegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const {
+  return ARMGenRegisterInfo::getLLVMRegNumFull(DwarfRegNo,0);
+}
+
 unsigned ARMBaseRegisterInfo::getRegisterPairEven(unsigned Reg,
                                               const MachineFunction &MF) const {
   switch (Reg) {
@@ -1111,8 +1115,11 @@ materializeFrameBaseRegister(MachineBasicBlock *MBB,
   if (Ins != MBB->end())
     DL = Ins->getDebugLoc();
 
-  MachineInstrBuilder MIB =
-    BuildMI(*MBB, Ins, DL, TII.get(ADDriOpc), BaseReg)
+  const TargetInstrDesc &TID = TII.get(ADDriOpc);
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  MRI.constrainRegClass(BaseReg, TID.OpInfo[0].getRegClass(this));
+
+  MachineInstrBuilder MIB = BuildMI(*MBB, Ins, DL, TID, BaseReg)
     .addFrameIndex(FrameIdx).addImm(Offset);
 
   if (!AFI->isThumb1OnlyFunction())
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h
index 9edf72d..c60d75a 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -172,6 +172,7 @@ public:
   unsigned getEHHandlerRegister() const;
 
   int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+  int getLLVMRegNum(unsigned RegNum, bool isEH) const;
 
   bool isLowRegister(unsigned Reg) const;
 
diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp
index fa73716..16d4ca5 100644
--- a/lib/Target/ARM/ARMCodeEmitter.cpp
+++ b/lib/Target/ARM/ARMCodeEmitter.cpp
@@ -221,6 +221,9 @@ namespace {
       const { return 0; }
     unsigned getAddrMode6AddressOpValue(const MachineInstr &MI, unsigned Op)
       const { return 0; }
+    unsigned getAddrMode6OneLane32AddressOpValue(const MachineInstr &MI,
+                                                 unsigned Op)
+      const { return 0; }
     unsigned getAddrMode6DupAddressOpValue(const MachineInstr &MI, unsigned Op)
       const { return 0; }
     unsigned getAddrMode6OffsetOpValue(const MachineInstr &MI, unsigned Op)
@@ -229,6 +232,8 @@ namespace {
                                             unsigned Op) const { return 0; }
     unsigned getMsbOpValue(const MachineInstr &MI,
                            unsigned Op) const { return 0; }
+    unsigned getSsatBitPosValue(const MachineInstr &MI,
+                                unsigned Op) const { return 0; }
     uint32_t getLdStmModeOpValue(const MachineInstr &MI, unsigned OpIdx)
       const {return 0; }
     uint32_t getLdStSORegOpValue(const MachineInstr &MI, unsigned OpIdx)
@@ -1372,6 +1377,12 @@ void ARMCodeEmitter::emitMiscArithInstruction(const MachineInstr &MI) {
   // Set the conditional execution predicate
   Binary |= II->getPredicate(&MI) << ARMII::CondShift;
 
+  // PKH instructions are finished at this point
+  if (TID.Opcode == ARM::PKHBT || TID.Opcode == ARM::PKHTB) {
+    emitWordLE(Binary);
+    return;
+  }
+
   unsigned OpIdx = 0;
 
   // Encode Rd
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index 3baf274..5cf73c4 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -71,12 +71,10 @@ namespace {
     } Base;
 
     int Offset;
-    unsigned Scale;
-    unsigned PlusReg;
 
     // Innocuous defaults for our address.
     Address()
-     : BaseType(RegBase), Offset(0), Scale(0), PlusReg(0) {
+     : BaseType(RegBase), Offset(0) {
        Base.Reg = 0;
      }
   } Address;
@@ -169,6 +167,7 @@ class ARMFastISel : public FastISel {
     bool SelectCall(const Instruction *I);
     bool SelectSelect(const Instruction *I);
     bool SelectRet(const Instruction *I);
+    bool SelectIntCast(const Instruction *I);
 
     // Utility routines.
   private:
@@ -208,7 +207,8 @@ class ARMFastISel : public FastISel {
     bool DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR);
     const MachineInstrBuilder &AddOptionalDefs(const MachineInstrBuilder &MIB);
     void AddLoadStoreOperands(EVT VT, Address &Addr,
-                              const MachineInstrBuilder &MIB);
+                              const MachineInstrBuilder &MIB,
+                              unsigned Flags);
 };
 
 } // end anonymous namespace
@@ -443,14 +443,14 @@ unsigned ARMFastISel::FastEmitInst_ii(unsigned MachineInstOpcode,
                                       uint64_t Imm1, uint64_t Imm2) {
   unsigned ResultReg = createResultReg(RC);
   const TargetInstrDesc &II = TII.get(MachineInstOpcode);
-  
+
   if (II.getNumDefs() >= 1)
     AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
                     .addImm(Imm1).addImm(Imm2));
   else {
     AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
                     .addImm(Imm1).addImm(Imm2));
-    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, 
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                             TII.get(TargetOpcode::COPY),
                             ResultReg)
                     .addReg(II.ImplicitDefs[0]));
@@ -577,9 +577,6 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, EVT VT) {
 
   Reloc::Model RelocM = TM.getRelocationModel();
 
-  // TODO: No external globals for now.
-  if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) return 0;
-
   // TODO: Need more magic for ARM PIC.
   if (!isThumb && (RelocM == Reloc::PIC_)) return 0;
 
@@ -614,6 +611,23 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, EVT VT) {
           .addImm(0);
   }
   AddOptionalDefs(MIB);
+
+  if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) {
+    unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT));
+    if (isThumb)
+      MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::t2LDRi12),
+                    NewDestReg)
+            .addReg(DestReg)
+            .addImm(0);
+    else
+      MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::LDRi12),
+                    NewDestReg)
+            .addReg(DestReg)
+            .addImm(0);
+    DestReg = NewDestReg;
+    AddOptionalDefs(MIB);
+  }
+
   return DestReg;
 }
 
@@ -853,7 +867,8 @@ void ARMFastISel::ARMSimplifyAddress(Address &Addr, EVT VT) {
 }
 
 void ARMFastISel::AddLoadStoreOperands(EVT VT, Address &Addr,
-                                       const MachineInstrBuilder &MIB) {
+                                       const MachineInstrBuilder &MIB,
+                                       unsigned Flags) {
   // addrmode5 output depends on the selection dag addressing dividing the
   // offset by 4 that it then later multiplies. Do this here as well.
   if (VT.getSimpleVT().SimpleTy == MVT::f32 ||
@@ -867,7 +882,7 @@ void ARMFastISel::AddLoadStoreOperands(EVT VT, Address &Addr,
     MachineMemOperand *MMO =
           FuncInfo.MF->getMachineMemOperand(
                                   MachinePointerInfo::getFixedStack(FI, Offset),
-                                  MachineMemOperand::MOLoad,
+                                  Flags,
                                   MFI.getObjectSize(FI),
                                   MFI.getObjectAlignment(FI));
     // Now add the rest of the operands.
@@ -926,7 +941,7 @@ bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr) {
   ResultReg = createResultReg(RC);
   MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                                     TII.get(Opc), ResultReg);
-  AddLoadStoreOperands(VT, Addr, MIB);
+  AddLoadStoreOperands(VT, Addr, MIB, MachineMemOperand::MOLoad);
   return true;
 }
 
@@ -985,7 +1000,7 @@ bool ARMFastISel::ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr) {
   MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                                     TII.get(StrOpc))
                             .addReg(SrcReg, getKillRegState(true));
-  AddLoadStoreOperands(VT, Addr, MIB);
+  AddLoadStoreOperands(VT, Addr, MIB, MachineMemOperand::MOStore);
   return true;
 }
 
@@ -1131,7 +1146,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) {
   } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
     MVT SourceVT;
     if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
-        (isTypeLegal(TI->getOperand(0)->getType(), SourceVT))) {
+        (isLoadTypeLegal(TI->getOperand(0)->getType(), SourceVT))) {
       unsigned TstOpc = isThumb ? ARM::t2TSTri : ARM::TSTri;
       unsigned OpReg = getRegForValue(TI->getOperand(0));
       AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
@@ -1298,6 +1313,10 @@ bool ARMFastISel::SelectSIToFP(const Instruction *I) {
   if (!isTypeLegal(Ty, DstVT))
     return false;
 
+  // FIXME: Handle sign-extension where necessary.
+  if (!I->getOperand(0)->getType()->isIntegerTy(32))
+    return false;
+
   unsigned Op = getRegForValue(I->getOperand(0));
   if (Op == 0) return false;
 
@@ -1523,7 +1542,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
                                   CallingConv::ID CC,
                                   unsigned &NumBytes) {
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CC, false, TM, ArgLocs, *Context);
+  CCState CCInfo(CC, false, *FuncInfo.MF, TM, ArgLocs, *Context);
   CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CCAssignFnForCall(CC, false));
 
   // Get a count of how many bytes are to be pushed on the stack.
@@ -1636,7 +1655,7 @@ bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
   // Now the return value.
   if (RetVT != MVT::isVoid) {
     SmallVector<CCValAssign, 16> RVLocs;
-    CCState CCInfo(CC, false, TM, RVLocs, *Context);
+    CCState CCInfo(CC, false, *FuncInfo.MF, TM, RVLocs, *Context);
     CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true));
 
     // Copy all of the result registers out of their specified physreg.
@@ -1692,7 +1711,7 @@ bool ARMFastISel::SelectRet(const Instruction *I) {
 
     // Analyze operands of the call, assigning locations to each operand.
     SmallVector<CCValAssign, 16> ValLocs;
-    CCState CCInfo(CC, F.isVarArg(), TM, ValLocs, I->getContext());
+    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs, I->getContext());
     CCInfo.AnalyzeReturn(Outs, CCAssignFnForCall(CC, true /* is Ret */));
 
     const Value *RV = Ret->getOperand(0);
@@ -1839,9 +1858,9 @@ bool ARMFastISel::SelectCall(const Instruction *I) {
   // Can't handle inline asm or worry about intrinsics yet.
   if (isa<InlineAsm>(Callee) || isa<IntrinsicInst>(CI)) return false;
 
-  // Only handle global variable Callees that are direct calls.
+  // Only handle global variable Callees.
   const GlobalValue *GV = dyn_cast<GlobalValue>(Callee);
-  if (!GV || Subtarget->GVIsIndirectSymbol(GV, TM.getRelocationModel()))
+  if (!GV)
     return false;
 
   // Check the calling convention.
@@ -1946,6 +1965,79 @@ bool ARMFastISel::SelectCall(const Instruction *I) {
 
 }
 
+bool ARMFastISel::SelectIntCast(const Instruction *I) {
+  // On ARM, in general, integer casts don't involve legal types; this code
+  // handles promotable integers.  The high bits for a type smaller than
+  // the register size are assumed to be undefined.
+  const Type *DestTy = I->getType();
+  Value *Op = I->getOperand(0);
+  const Type *SrcTy = Op->getType();
+
+  EVT SrcVT, DestVT;
+  SrcVT = TLI.getValueType(SrcTy, true);
+  DestVT = TLI.getValueType(DestTy, true);
+
+  if (isa<TruncInst>(I)) {
+    if (SrcVT != MVT::i32 && SrcVT != MVT::i16 && SrcVT != MVT::i8)
+      return false;
+    if (DestVT != MVT::i16 && DestVT != MVT::i8 && DestVT != MVT::i1)
+      return false;
+
+    unsigned SrcReg = getRegForValue(Op);
+    if (!SrcReg) return false;
+
+    // Because the high bits are undefined, a truncate doesn't generate
+    // any code.
+    UpdateValueMap(I, SrcReg);
+    return true;
+  }
+  if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8)
+    return false;
+
+  unsigned Opc;
+  bool isZext = isa<ZExtInst>(I);
+  bool isBoolZext = false;
+  if (!SrcVT.isSimple())
+    return false;
+  switch (SrcVT.getSimpleVT().SimpleTy) {
+  default: return false;
+  case MVT::i16:
+    if (isZext)
+      Opc = isThumb ? ARM::t2UXTHr : ARM::UXTHr;
+    else
+      Opc = isThumb ? ARM::t2SXTHr : ARM::SXTHr;
+    break;
+  case MVT::i8:
+    if (isZext)
+      Opc = isThumb ? ARM::t2UXTBr : ARM::UXTBr;
+    else
+      Opc = isThumb ? ARM::t2SXTBr : ARM::SXTBr;
+    break;
+  case MVT::i1:
+    if (isZext) {
+      Opc = isThumb ? ARM::t2ANDri : ARM::ANDri;
+      isBoolZext = true;
+      break;
+    }
+    return false;
+  }
+
+  // FIXME: We could save an instruction in many cases by special-casing
+  // load instructions.
+  unsigned SrcReg = getRegForValue(Op);
+  if (!SrcReg) return false;
+
+  unsigned DestReg = createResultReg(TLI.getRegClassFor(MVT::i32));
+  MachineInstrBuilder MIB;
+  MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg)
+        .addReg(SrcReg);
+  if (isBoolZext)
+    MIB.addImm(1);
+  AddOptionalDefs(MIB);
+  UpdateValueMap(I, DestReg);
+  return true;
+}
+
 // TODO: SoftFP support.
 bool ARMFastISel::TargetSelectInstruction(const Instruction *I) {
 
@@ -1983,6 +2075,10 @@ bool ARMFastISel::TargetSelectInstruction(const Instruction *I) {
       return SelectSelect(I);
     case Instruction::Ret:
       return SelectRet(I);
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+      return SelectIntCast(I);
     default: break;
   }
   return false;
diff --git a/lib/Target/ARM/ARMFixupKinds.h b/lib/Target/ARM/ARMFixupKinds.h
index 3d175e3..350c92d 100644
--- a/lib/Target/ARM/ARMFixupKinds.h
+++ b/lib/Target/ARM/ARMFixupKinds.h
@@ -56,7 +56,7 @@ enum Fixups {
   // fixup_arm_thumb_br - 12-bit fixup for Thumb B instructions.
   fixup_arm_thumb_br,
 
-  // fixup_arm_thumb_blx - Fixup for Thumb BL instructions.
+  // fixup_arm_thumb_bl - Fixup for Thumb BL instructions.
   fixup_arm_thumb_bl,
 
   // fixup_arm_thumb_blx - Fixup for Thumb BLX instructions.
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index abe5a31..9ad516d 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -2691,6 +2691,111 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
     default:
       break;
 
+    case Intrinsic::arm_ldrexd: {
+      SDValue MemAddr = N->getOperand(2);
+      DebugLoc dl = N->getDebugLoc();
+      SDValue Chain = N->getOperand(0);
+
+      unsigned NewOpc = ARM::LDREXD;
+      if (Subtarget->isThumb() && Subtarget->hasThumb2())
+        NewOpc = ARM::t2LDREXD;
+
+      // arm_ldrexd returns a i64 value in {i32, i32}
+      std::vector<EVT> ResTys;
+      ResTys.push_back(MVT::i32);
+      ResTys.push_back(MVT::i32);
+      ResTys.push_back(MVT::Other);
+
+      // place arguments in the right order
+      SmallVector<SDValue, 7> Ops;
+      Ops.push_back(MemAddr);
+      Ops.push_back(getAL(CurDAG));
+      Ops.push_back(CurDAG->getRegister(0, MVT::i32));
+      Ops.push_back(Chain);
+      SDNode *Ld = CurDAG->getMachineNode(NewOpc, dl, ResTys, Ops.data(),
+                                          Ops.size());
+      // Transfer memoperands.
+      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+      MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+      cast<MachineSDNode>(Ld)->setMemRefs(MemOp, MemOp + 1);
+
+      // Until there's support for specifing explicit register constraints
+      // like the use of even/odd register pair, hardcode ldrexd to always
+      // use the pair [R0, R1] to hold the load result.
+      Chain = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ARM::R0,
+                                   SDValue(Ld, 0), SDValue(0,0));
+      Chain = CurDAG->getCopyToReg(Chain, dl, ARM::R1,
+                                   SDValue(Ld, 1), Chain.getValue(1));
+
+      // Remap uses.
+      SDValue Glue = Chain.getValue(1);
+      if (!SDValue(N, 0).use_empty()) {
+        SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+                                                ARM::R0, MVT::i32, Glue);
+        Glue = Result.getValue(2);
+        ReplaceUses(SDValue(N, 0), Result);
+      }
+      if (!SDValue(N, 1).use_empty()) {
+        SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+                                                ARM::R1, MVT::i32, Glue);
+        Glue = Result.getValue(2);
+        ReplaceUses(SDValue(N, 1), Result);
+      }
+
+      ReplaceUses(SDValue(N, 2), SDValue(Ld, 2));
+      return NULL;
+    }
+
+    case Intrinsic::arm_strexd: {
+      DebugLoc dl = N->getDebugLoc();
+      SDValue Chain = N->getOperand(0);
+      SDValue Val0 = N->getOperand(2);
+      SDValue Val1 = N->getOperand(3);
+      SDValue MemAddr = N->getOperand(4);
+
+      // Until there's support for specifing explicit register constraints
+      // like the use of even/odd register pair, hardcode strexd to always
+      // use the pair [R2, R3] to hold the i64 (i32, i32) value to be stored.
+      Chain = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ARM::R2, Val0,
+                                   SDValue(0, 0));
+      Chain = CurDAG->getCopyToReg(Chain, dl, ARM::R3, Val1, Chain.getValue(1));
+
+      SDValue Glue = Chain.getValue(1);
+      Val0 = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+                                    ARM::R2, MVT::i32, Glue);
+      Glue = Val0.getValue(1);
+      Val1 = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+                                    ARM::R3, MVT::i32, Glue);
+
+      // Store exclusive double return a i32 value which is the return status
+      // of the issued store.
+      std::vector<EVT> ResTys;
+      ResTys.push_back(MVT::i32);
+      ResTys.push_back(MVT::Other);
+
+      // place arguments in the right order
+      SmallVector<SDValue, 7> Ops;
+      Ops.push_back(Val0);
+      Ops.push_back(Val1);
+      Ops.push_back(MemAddr);
+      Ops.push_back(getAL(CurDAG));
+      Ops.push_back(CurDAG->getRegister(0, MVT::i32));
+      Ops.push_back(Chain);
+
+      unsigned NewOpc = ARM::STREXD;
+      if (Subtarget->isThumb() && Subtarget->hasThumb2())
+        NewOpc = ARM::t2STREXD;
+
+      SDNode *St = CurDAG->getMachineNode(NewOpc, dl, ResTys, Ops.data(),
+                                          Ops.size());
+      // Transfer memoperands.
+      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+      MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+      cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
+
+      return St;
+    }
+
     case Intrinsic::arm_neon_vld1: {
       unsigned DOpcodes[] = { ARM::VLD1d8, ARM::VLD1d16,
                               ARM::VLD1d32, ARM::VLD1d64 };
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 0a31b87..339c858 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -72,6 +72,21 @@ ARMInterworking("arm-interworking", cl::Hidden,
   cl::desc("Enable / disable ARM interworking (for debugging only)"),
   cl::init(true));
 
+namespace llvm {
+  class ARMCCState : public CCState {
+  public:
+    ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
+               const TargetMachine &TM, SmallVector<CCValAssign, 16> &locs,
+               LLVMContext &C, ParmContext PC)
+        : CCState(CC, isVarArg, MF, TM, locs, C) {
+      assert(((PC == Call) || (PC == Prologue)) &&
+             "ARMCCState users must specify whether their context is call"
+             "or prologue generation.");
+      CallOrPrologue = PC;
+    }
+  };
+}
+
 // The APCS parameter registers.
 static const unsigned GPRArgRegs[] = {
   ARM::R0, ARM::R1, ARM::R2, ARM::R3
@@ -396,12 +411,12 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setLibcallCallingConv(RTLIB::UDIV_I8, CallingConv::ARM_AAPCS);
     setLibcallCallingConv(RTLIB::UDIV_I16, CallingConv::ARM_AAPCS);
     setLibcallCallingConv(RTLIB::UDIV_I32, CallingConv::ARM_AAPCS);
-  }
 
-  // Use divmod iOS compiler-rt calls.
-  if (Subtarget->getTargetTriple().getOS() == Triple::IOS) {
-    setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
-    setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
+    // Memory operations
+    // RTABI chapter 4.3.4
+    setLibcallName(RTLIB::MEMCPY,  "__aeabi_memcpy");
+    setLibcallName(RTLIB::MEMMOVE, "__aeabi_memmove");
+    setLibcallName(RTLIB::MEMSET,  "__aeabi_memset");
   }
 
   if (Subtarget->isThumb1Only())
@@ -656,6 +671,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
     setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
     setOperationAction(ISD::EH_SJLJ_DISPATCHSETUP, MVT::Other, Custom);
+    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
   }
 
   setOperationAction(ISD::SETCC,     MVT::i32, Expand);
@@ -730,6 +746,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   setMinStackArgumentAlignment(4);
 
   benefitFromCodePlacementOpt = true;
+
+  setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
 }
 
 // FIXME: It might make sense to define the representative register class as the
@@ -931,11 +949,6 @@ ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const {
   return ARM::createFastISel(funcInfo);
 }
 
-/// getFunctionAlignment - Return the Log2 alignment of this function.
-unsigned ARMTargetLowering::getFunctionAlignment(const Function *F) const {
-  return getTargetMachine().getSubtarget<ARMSubtarget>().isThumb() ? 1 : 2;
-}
-
 /// getMaximalGlobalOffset - Returns the maximal possible offset which can
 /// be used for loads / stores from the global.
 unsigned ARMTargetLowering::getMaximalGlobalOffset() const {
@@ -1073,8 +1086,8 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
 
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 RVLocs, *DAG.getContext());
+  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                    getTargetMachine(), RVLocs, *DAG.getContext(), Call);
   CCInfo.AnalyzeCallResult(Ins,
                            CCAssignFnForNode(CallConv, /* Return*/ true,
                                              isVarArg));
@@ -1208,9 +1221,8 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
-                 *DAG.getContext());
-  CCInfo.setCallOrPrologue(Call);
+  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext(), Call);
   CCInfo.AnalyzeCallOperands(Outs,
                              CCAssignFnForNode(CallConv, /* Return*/ false,
                                                isVarArg));
@@ -1640,13 +1652,13 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
   // results are returned in the same way as what the caller expects.
   if (!CCMatch) {
     SmallVector<CCValAssign, 16> RVLocs1;
-    CCState CCInfo1(CalleeCC, false, getTargetMachine(),
-                    RVLocs1, *DAG.getContext());
+    ARMCCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
+                       getTargetMachine(), RVLocs1, *DAG.getContext(), Call);
     CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC, true, isVarArg));
 
     SmallVector<CCValAssign, 16> RVLocs2;
-    CCState CCInfo2(CallerCC, false, getTargetMachine(),
-                    RVLocs2, *DAG.getContext());
+    ARMCCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
+                       getTargetMachine(), RVLocs2, *DAG.getContext(), Call);
     CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC, true, isVarArg));
 
     if (RVLocs1.size() != RVLocs2.size())
@@ -1672,8 +1684,8 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
     // Check if stack adjustment is needed. For now, do not do this if any
     // argument is passed on the stack.
     SmallVector<CCValAssign, 16> ArgLocs;
-    CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(),
-                   ArgLocs, *DAG.getContext());
+    ARMCCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
+                      getTargetMachine(), ArgLocs, *DAG.getContext(), Call);
     CCInfo.AnalyzeCallOperands(Outs,
                                CCAssignFnForNode(CalleeCC, false, isVarArg));
     if (CCInfo.getNextStackOffset()) {
@@ -1732,8 +1744,8 @@ ARMTargetLowering::LowerReturn(SDValue Chain,
   SmallVector<CCValAssign, 16> RVLocs;
 
   // CCState - Info about the registers and stack slots.
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
-                 *DAG.getContext());
+  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                    getTargetMachine(), RVLocs, *DAG.getContext(), Call);
 
   // Analyze outgoing return values.
   CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv, /* Return */ true,
@@ -2085,7 +2097,8 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
   MachineFunction &MF = DAG.getMachineFunction();
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
 
-  if (Subtarget->useMovt()) {
+  // FIXME: Enable this for static codegen when tool issues are fixed.
+  if (Subtarget->useMovt() && RelocM != Reloc::Static) {
     ++NumMovwMovt;
     // FIXME: Once remat is capable of dealing with instructions with register
     // operands, expand this into two nodes.
@@ -2160,7 +2173,7 @@ ARMTargetLowering::LowerEH_SJLJ_DISPATCHSETUP(SDValue Op, SelectionDAG &DAG)
   const {
   DebugLoc dl = Op.getDebugLoc();
   return DAG.getNode(ARMISD::EH_SJLJ_DISPATCHSETUP, dl, MVT::Other,
-                     Op.getOperand(0));
+                     Op.getOperand(0), Op.getOperand(1));
 }
 
 SDValue
@@ -2425,9 +2438,8 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
-                 *DAG.getContext());
-  CCInfo.setCallOrPrologue(Prologue);
+  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                    getTargetMachine(), ArgLocs, *DAG.getContext(), Prologue);
   CCInfo.AnalyzeFormalArguments(Ins,
                                 CCAssignFnForNode(CallConv, /* Return*/ false,
                                                   isVarArg));
@@ -2525,7 +2537,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
       if (index != lastInsIndex)
         {
           ISD::ArgFlagsTy Flags = Ins[index].Flags;
-          // FIXME: For now, all byval parameter objects are marked mutable. 
+          // FIXME: For now, all byval parameter objects are marked mutable.
           // This can be changed with more analysis.
           // In case of tail call optimization mark all arguments mutable.
           // Since they could be overwritten by lowering of arguments in case of
@@ -2702,10 +2714,11 @@ SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
       }
 
       if (True.getNode() && False.getNode()) {
-        EVT VT = Cond.getValueType();
+        EVT VT = Op.getValueType();
         SDValue ARMcc = Cond.getOperand(2);
         SDValue CCR = Cond.getOperand(3);
         SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
+        assert(True.getValueType() == VT);
         return DAG.getNode(ARMISD::CMOV, dl, VT, True, False, ARMcc, CCR, Cmp);
       }
     }
@@ -4191,7 +4204,16 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
   switch (OpNum) {
   default: llvm_unreachable("Unknown shuffle opcode!");
   case OP_VREV:
-    return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
+    // VREV divides the vector in half and swaps within the half.
+    if (VT.getVectorElementType() == MVT::i32 ||
+        VT.getVectorElementType() == MVT::f32)
+      return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
+    // vrev <4 x i16> -> VREV32
+    if (VT.getVectorElementType() == MVT::i16)
+      return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
+    // vrev <4 x i8> -> VREV16
+    assert(VT.getVectorElementType() == MVT::i8);
+    return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
   case OP_VDUP0:
   case OP_VDUP1:
   case OP_VDUP2:
@@ -4662,10 +4684,10 @@ LowerSDIV_v4i16(SDValue N0, SDValue N1, DebugLoc dl, SelectionDAG &DAG) {
   // Because short has a smaller range than ushort, we can actually get away
   // with only a single newton step.  This requires that we use a weird bias
   // of 89, however (again, this has been exhaustively tested).
-  // float4 result = as_float4(as_int4(xf*recip) + 89);
+  // float4 result = as_float4(as_int4(xf*recip) + 0x89);
   N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
-  N1 = DAG.getConstant(89, MVT::i32);
+  N1 = DAG.getConstant(0x89, MVT::i32);
   N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1);
   N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
@@ -4752,26 +4774,26 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
   N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
   N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
   N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
-  N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
+  SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
 
   // Use reciprocal estimate and two refinement steps.
   // float4 recip = vrecpeq_f32(yf);
   // recip *= vrecpsq_f32(yf, recip);
   // recip *= vrecpsq_f32(yf, recip);
   N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
-                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), N1);
+                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), BN1);
   N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                    DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
-                   N1, N2);
+                   BN1, N2);
   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
   N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                    DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
-                   N1, N2);
+                   BN1, N2);
   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
   // Simply multiplying by the reciprocal estimate can leave us a few ulps
   // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
   // and that it will never cause us to return an answer too large).
-  // float4 result = as_float4(as_int4(xf*recip) + 89);
+  // float4 result = as_float4(as_int4(xf*recip) + 2);
   N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
   N1 = DAG.getConstant(2, MVT::i32);
@@ -4869,12 +4891,21 @@ ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
   unsigned ptr     = MI->getOperand(1).getReg();
   unsigned oldval  = MI->getOperand(2).getReg();
   unsigned newval  = MI->getOperand(3).getReg();
-  unsigned scratch = BB->getParent()->getRegInfo()
-    .createVirtualRegister(ARM::GPRRegisterClass);
   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
   DebugLoc dl = MI->getDebugLoc();
   bool isThumb2 = Subtarget->isThumb2();
 
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  unsigned scratch =
+    MRI.createVirtualRegister(isThumb2 ? ARM::rGPRRegisterClass
+                                       : ARM::GPRRegisterClass);
+
+  if (isThumb2) {
+    MRI.constrainRegClass(dest, ARM::rGPRRegisterClass);
+    MRI.constrainRegClass(oldval, ARM::rGPRRegisterClass);
+    MRI.constrainRegClass(newval, ARM::rGPRRegisterClass);
+  }
+
   unsigned ldrOpc, strOpc;
   switch (Size) {
   default: llvm_unreachable("unsupported size for AtomicCmpSwap!");
@@ -4966,8 +4997,14 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
   unsigned ptr = MI->getOperand(1).getReg();
   unsigned incr = MI->getOperand(2).getReg();
   DebugLoc dl = MI->getDebugLoc();
-
   bool isThumb2 = Subtarget->isThumb2();
+
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  if (isThumb2) {
+    MRI.constrainRegClass(dest, ARM::rGPRRegisterClass);
+    MRI.constrainRegClass(ptr, ARM::rGPRRegisterClass);
+  }
+
   unsigned ldrOpc, strOpc;
   switch (Size) {
   default: llvm_unreachable("unsupported size for AtomicCmpSwap!");
@@ -4996,10 +5033,10 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
                   BB->end());
   exitMBB->transferSuccessorsAndUpdatePHIs(BB);
 
-  MachineRegisterInfo &RegInfo = MF->getRegInfo();
-  unsigned scratch = RegInfo.createVirtualRegister(ARM::GPRRegisterClass);
-  unsigned scratch2 = (!BinOpcode) ? incr :
-    RegInfo.createVirtualRegister(ARM::GPRRegisterClass);
+  TargetRegisterClass *TRC =
+    isThumb2 ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass;
+  unsigned scratch = MRI.createVirtualRegister(TRC);
+  unsigned scratch2 = (!BinOpcode) ? incr : MRI.createVirtualRegister(TRC);
 
   //  thisMBB:
   //   ...
@@ -5062,8 +5099,14 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
   unsigned incr = MI->getOperand(2).getReg();
   unsigned oldval = dest;
   DebugLoc dl = MI->getDebugLoc();
-
   bool isThumb2 = Subtarget->isThumb2();
+
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  if (isThumb2) {
+    MRI.constrainRegClass(dest, ARM::rGPRRegisterClass);
+    MRI.constrainRegClass(ptr, ARM::rGPRRegisterClass);
+  }
+
   unsigned ldrOpc, strOpc, extendOpc;
   switch (Size) {
   default: llvm_unreachable("unsupported size for AtomicCmpSwap!");
@@ -5095,9 +5138,10 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
                   BB->end());
   exitMBB->transferSuccessorsAndUpdatePHIs(BB);
 
-  MachineRegisterInfo &RegInfo = MF->getRegInfo();
-  unsigned scratch = RegInfo.createVirtualRegister(ARM::GPRRegisterClass);
-  unsigned scratch2 = RegInfo.createVirtualRegister(ARM::GPRRegisterClass);
+  TargetRegisterClass *TRC =
+    isThumb2 ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass;
+  unsigned scratch = MRI.createVirtualRegister(TRC);
+  unsigned scratch2 = MRI.createVirtualRegister(TRC);
 
   //  thisMBB:
   //   ...
@@ -5118,7 +5162,7 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
 
   // Sign extend the value, if necessary.
   if (signExtend && extendOpc) {
-    oldval = RegInfo.createVirtualRegister(ARM::GPRRegisterClass);
+    oldval = MRI.createVirtualRegister(ARM::GPRRegisterClass);
     AddDefaultPred(BuildMI(BB, dl, TII->get(extendOpc), oldval).addReg(dest));
   }
 
@@ -6984,6 +7028,14 @@ bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
   return Imm >= 0 && Imm <= 255;
 }
 
+/// isLegalAddImmediate - Return true if the specified immediate is legal
+/// add immediate, that is the target has add instructions which can add
+/// a register with the immediate without having to materialize the
+/// immediate into a register.
+bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
+  return ARM_AM::getSOImmVal(Imm) != -1;
+}
+
 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
                                       bool isSEXTLoad, SDValue &Base,
                                       SDValue &Offset, bool &isInc,
@@ -7226,6 +7278,9 @@ ARMTargetLowering::getConstraintType(const std::string &Constraint) const {
     case 'l': return C_RegisterClass;
     case 'w': return C_RegisterClass;
     }
+  } else {
+    if (Constraint == "Uv")
+      return C_Memory;
   }
   return TargetLowering::getConstraintType(Constraint);
 }
@@ -7337,12 +7392,16 @@ getRegClassForInlineAsmConstraint(const std::string &Constraint,
 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
 /// vector.  If it is invalid, don't add anything to Ops.
 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
-                                                     char Constraint,
+                                                     std::string &Constraint,
                                                      std::vector<SDValue>&Ops,
                                                      SelectionDAG &DAG) const {
   SDValue Result(0, 0);
 
-  switch (Constraint) {
+  // Currently only support length 1 constraints.
+  if (Constraint.length() != 1) return;
+
+  char ConstraintLetter = Constraint[0];
+  switch (ConstraintLetter) {
   default: break;
   case 'I': case 'J': case 'K': case 'L':
   case 'M': case 'N': case 'O':
@@ -7357,7 +7416,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
     if (CVal != CVal64)
       return;
 
-    switch (Constraint) {
+    switch (ConstraintLetter) {
       case 'I':
         if (Subtarget->isThumb1Only()) {
           // This must be a constant between 0 and 255, for ADD
@@ -7620,6 +7679,28 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.writeMem = true;
     return true;
   }
+  case Intrinsic::arm_strexd: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::i64;
+    Info.ptrVal = I.getArgOperand(2);
+    Info.offset = 0;
+    Info.align = 8;
+    Info.vol = false;
+    Info.readMem = false;
+    Info.writeMem = true;
+    return true;
+  }
+  case Intrinsic::arm_ldrexd: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::i64;
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.align = 8;
+    Info.vol = false;
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
   default:
     break;
   }
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index a2e6260..21a9a3a 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -264,6 +264,12 @@ namespace llvm {
     /// the immediate into a register.
     virtual bool isLegalICmpImmediate(int64_t Imm) const;
 
+    /// isLegalAddImmediate - Return true if the specified immediate is legal
+    /// add immediate, that is the target has add instructions which can
+    /// add a register and the immediate without having to materialize
+    /// the immediate into a register.
+    virtual bool isLegalAddImmediate(int64_t Imm) const;
+
     /// getPreIndexedAddressParts - returns true by value, base pointer and
     /// offset pointer and addressing mode by reference if the node's address
     /// can be legally represented as pre-indexed load / store address.
@@ -309,7 +315,7 @@ namespace llvm {
     /// true it means one of the asm constraint of the inline asm instruction
     /// being processed is 'm'.
     virtual void LowerAsmOperandForConstraint(SDValue Op,
-                                              char ConstraintLetter,
+                                              std::string &Constraint,
                                               std::vector<SDValue> &Ops,
                                               SelectionDAG &DAG) const;
 
@@ -321,9 +327,6 @@ namespace llvm {
     /// specified value type.
     virtual TargetRegisterClass *getRegClassFor(EVT VT) const;
 
-    /// getFunctionAlignment - Return the Log2 alignment of this function.
-    virtual unsigned getFunctionAlignment(const Function *F) const;
-
     /// getMaximalGlobalOffset - Returns the maximal possible offset which can
     /// be used for loads / stores from the global.
     virtual unsigned getMaximalGlobalOffset() const;
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index f5fb98e..897d8a5 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -860,6 +860,9 @@ class APKHI<bits<8> opcod, bit tb, dag oops, dag iops, InstrItinClass itin,
 class ARMPat<dag pattern, dag result> : Pat<pattern, result> {
   list<Predicate> Predicates = [IsARM];
 }
+class ARMV5TPat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsARM, HasV5T];
+}
 class ARMV5TEPat<dag pattern, dag result> : Pat<pattern, result> {
   list<Predicate> Predicates = [IsARM, HasV5TE];
 }
@@ -1020,6 +1023,10 @@ class T1LoadStore<bits<4> opA, bits<3> opB> : Encoding16 {
 }
 class T1LdStSP<bits<3> opB>   : T1LoadStore<0b1001, opB>; // SP relative
 
+class T1BranchCond<bits<4> opcode> : Encoding16 {
+  let Inst{15-12} = opcode;
+}
+
 // Helper classes to encode Thumb1 loads and stores. For immediates, the
 // following bits are used for "opA" (see A6.2.4):
 //
@@ -1208,6 +1215,11 @@ class T1Pat<dag pattern, dag result> : Pat<pattern, result> {
   list<Predicate> Predicates = [IsThumb, IsThumb1Only];
 }
 
+// T2v6Pat - Same as Pat<>, but requires V6T2 Thumb2 mode.
+class T2v6Pat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsThumb2, HasV6T2];
+}
+
 // T2Pat - Same as Pat<>, but requires that the compiler be in Thumb2 mode.
 class T2Pat<dag pattern, dag result> : Pat<pattern, result> {
   list<Predicate> Predicates = [IsThumb2];
@@ -1742,9 +1754,10 @@ class N2VImm<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
 
 // NEON 3 vector register format.
 
-class N3VCommon<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4,
-          dag oops, dag iops, Format f, InstrItinClass itin,
-          string opc, string dt, string asm, string cstr, list<dag> pattern>
+class N3VCommon<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, dag oops, dag iops, Format f, InstrItinClass itin,
+                string opc, string dt, string asm, string cstr,
+                list<dag> pattern>
   : NDataI<oops, iops, f, itin, opc, dt, asm, cstr, pattern> {
   let Inst{24}    = op24;
   let Inst{23}    = op23;
@@ -1773,9 +1786,10 @@ class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4,
   let Inst{5}     = Vm{4};
 }
 
-class N3VLane32<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4,
-          dag oops, dag iops, Format f, InstrItinClass itin,
-          string opc, string dt, string asm, string cstr, list<dag> pattern>
+class N3VLane32<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, dag oops, dag iops, Format f, InstrItinClass itin,
+                string opc, string dt, string asm, string cstr,
+                list<dag> pattern>
   : N3VCommon<op24, op23, op21_20, op11_8, op6, op4,
               oops, iops, f, itin, opc, dt, asm, cstr, pattern> {
 
@@ -1793,9 +1807,10 @@ class N3VLane32<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bi
   let Inst{5}     = lane;
 }
 
-class N3VLane16<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4,
-          dag oops, dag iops, Format f, InstrItinClass itin,
-          string opc, string dt, string asm, string cstr, list<dag> pattern>
+class N3VLane16<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, dag oops, dag iops, Format f, InstrItinClass itin,
+                string opc, string dt, string asm, string cstr,
+                list<dag> pattern>
   : N3VCommon<op24, op23, op21_20, op11_8, op6, op4,
               oops, iops, f, itin, opc, dt, asm, cstr, pattern> {
 
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 209c1a3..9af76df 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -58,7 +58,7 @@ def SDT_ARMEH_SJLJ_Setjmp : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisPtrTy<1>,
                                                  SDTCisInt<2>]>;
 def SDT_ARMEH_SJLJ_Longjmp: SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisInt<1>]>;
 
-def SDT_ARMEH_SJLJ_DispatchSetup: SDTypeProfile<0, 0, []>;
+def SDT_ARMEH_SJLJ_DispatchSetup: SDTypeProfile<0, 1, [SDTCisInt<0>]>;
 
 def SDT_ARMMEMBARRIER     : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
 
@@ -475,6 +475,12 @@ def width_imm : Operand<i32>, ImmLeaf<i32, [{
   let EncoderMethod = "getMsbOpValue";
 }
 
+def ssat_imm : Operand<i32>, ImmLeaf<i32, [{
+  return Imm > 0 && Imm <= 32;
+}]> {
+  let EncoderMethod = "getSsatBitPosValue";
+}
+
 // Define ARM specific addressing modes.
 
 def MemMode2AsmOperand : AsmOperandClass {
@@ -588,6 +594,15 @@ def am6offset : Operand<i32>,
   let EncoderMethod = "getAddrMode6OffsetOpValue";
 }
 
+// Special version of addrmode6 to handle alignment encoding for VST1/VLD1
+// (single element from one lane) for size 32.
+def addrmode6oneL32 : Operand<i32>,
+                ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{
+  let PrintMethod = "printAddrMode6Operand";
+  let MIOperandInfo = (ops GPR:$addr, i32imm);
+  let EncoderMethod = "getAddrMode6OneLane32AddressOpValue";
+}
+
 // Special version of addrmode6 to handle alignment encoding for VLD-dup
 // instructions, specifically VLD4-dup.
 def addrmode6dup : Operand<i32>,
@@ -1304,6 +1319,15 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
     let Inst{3-0}  = dst;
   }
 
+  // For disassembly only.
+  def BX_pred : AXI<(outs), (ins GPR:$dst, pred:$p), BrMiscFrm, IIC_Br,
+                  "bx$p\t$dst", [/* pattern left blank */]>,
+              Requires<[IsARM, HasV4T]> {
+    bits<4> dst;
+    let Inst{27-4} = 0b000100101111111111110001;
+    let Inst{3-0}  = dst;
+  }
+
   // ARMV4 only
   // FIXME: We would really like to define this as a vanilla ARMPat like:
   // ARMPat<(brind GPR:$dst), (MOVr PC, GPR:$dst)>
@@ -1321,10 +1345,7 @@ let isCall = 1,
   // FIXME:  Do we really need a non-predicated version? If so, it should
   // at least be a pseudo instruction expanding to the predicated version
   // at MC lowering time.
-  Defs = [R0,  R1,  R2,  R3,  R12, LR,
-          D0,  D1,  D2,  D3,  D4,  D5,  D6,  D7,
-          D16, D17, D18, D19, D20, D21, D22, D23,
-          D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR],
+  Defs = [R0,  R1,  R2,  R3,  R12, LR, QQQQ0, QQQQ2, QQQQ3, CPSR, FPSCR],
   Uses = [SP] in {
   def BL  : ABXI<0b1011, (outs), (ins bl_target:$func, variable_ops),
                 IIC_Br, "bl\t$func",
@@ -1378,10 +1399,7 @@ let isCall = 1,
   // On Darwin R9 is call-clobbered.
   // R7 is marked as a use to prevent frame-pointer assignments from being
   // moved above / below calls.
-  Defs = [R0,  R1,  R2,  R3,  R9,  R12, LR,
-          D0,  D1,  D2,  D3,  D4,  D5,  D6,  D7,
-          D16, D17, D18, D19, D20, D21, D22, D23,
-          D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR],
+  Defs = [R0,  R1,  R2,  R3,  R9,  R12, LR, QQQQ0, QQQQ2, QQQQ3, CPSR, FPSCR],
   Uses = [R7, SP] in {
   def BLr9  : ARMPseudoInst<(outs), (ins bltarget:$func, variable_ops),
                 Size4Bytes, IIC_Br,
@@ -1420,10 +1438,7 @@ let isCall = 1,
 // FIXME: The Thumb versions of these should live in ARMInstrThumb.td
 let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
   // Darwin versions.
-  let Defs = [R0, R1, R2, R3, R9, R12,
-              D0, D1, D2, D3, D4, D5, D6, D7,
-              D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26,
-              D27, D28, D29, D30, D31, PC],
+  let Defs = [R0, R1, R2, R3, R9, R12, QQQQ0, QQQQ2, QQQQ3, PC],
       Uses = [SP] in {
     def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst, variable_ops),
                        IIC_Br, []>, Requires<[IsDarwin]>;
@@ -1449,10 +1464,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
   }
 
   // Non-Darwin versions (the difference is R9).
-  let Defs = [R0, R1, R2, R3, R12,
-              D0, D1, D2, D3, D4, D5, D6, D7,
-              D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26,
-              D27, D28, D29, D30, D31, PC],
+  let Defs = [R0, R1, R2, R3, R12, QQQQ0, QQQQ2, QQQQ3, PC],
       Uses = [SP] in {
     def TCRETURNdiND : PseudoInst<(outs), (ins i32imm:$dst, variable_ops),
                        IIC_Br, []>, Requires<[IsNotDarwin]>;
@@ -2449,7 +2461,7 @@ def USADA8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
 
 // Signed/Unsigned saturate -- for disassembly only
 
-def SSAT : AI<(outs GPR:$Rd), (ins i32imm:$sat_imm, GPR:$a, shift_imm:$sh),
+def SSAT : AI<(outs GPR:$Rd), (ins ssat_imm:$sat_imm, GPR:$a, shift_imm:$sh),
               SatFrm, NoItinerary, "ssat", "\t$Rd, $sat_imm, $a$sh",
               [/* For disassembly only; pattern left blank */]> {
   bits<4> Rd;
@@ -2465,7 +2477,7 @@ def SSAT : AI<(outs GPR:$Rd), (ins i32imm:$sat_imm, GPR:$a, shift_imm:$sh),
   let Inst{3-0} = Rn;
 }
 
-def SSAT16 : AI<(outs GPR:$Rd), (ins i32imm:$sat_imm, GPR:$Rn), SatFrm,
+def SSAT16 : AI<(outs GPR:$Rd), (ins ssat_imm:$sat_imm, GPR:$Rn), SatFrm,
                 NoItinerary, "ssat16", "\t$Rd, $sat_imm, $Rn",
                 [/* For disassembly only; pattern left blank */]> {
   bits<4> Rd;
@@ -3360,8 +3372,9 @@ def LDREXH : AIldrex<0b11, (outs GPR:$Rt), (ins addrmode7:$addr), NoItinerary,
                     "ldrexh", "\t$Rt, $addr", []>;
 def LDREX  : AIldrex<0b00, (outs GPR:$Rt), (ins addrmode7:$addr), NoItinerary,
                     "ldrex", "\t$Rt, $addr", []>;
-def LDREXD : AIldrex<0b01, (outs GPR:$Rt, GPR:$Rt2), (ins addrmode7:$addr),
-                    NoItinerary, "ldrexd", "\t$Rt, $Rt2, $addr", []>;
+let hasExtraDefRegAllocReq = 1 in
+  def LDREXD : AIldrex<0b01, (outs GPR:$Rt, GPR:$Rt2), (ins addrmode7:$addr),
+                      NoItinerary, "ldrexd", "\t$Rt, $Rt2, $addr", []>;
 }
 
 let mayStore = 1, Constraints = "@earlyclobber $Rd" in {
@@ -3371,10 +3384,12 @@ def STREXH : AIstrex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, addrmode7:$addr),
                     NoItinerary, "strexh", "\t$Rd, $Rt, $addr", []>;
 def STREX  : AIstrex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, addrmode7:$addr),
                     NoItinerary, "strex", "\t$Rd, $Rt, $addr", []>;
+}
+
+let hasExtraSrcRegAllocReq = 1, Constraints = "@earlyclobber $Rd" in
 def STREXD : AIstrex<0b01, (outs GPR:$Rd),
                     (ins GPR:$Rt, GPR:$Rt2, addrmode7:$addr),
                     NoItinerary, "strexd", "\t$Rd, $Rt, $Rt2, $addr", []>;
-}
 
 // Clear-Exclusive is for disassembly only.
 def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex",
@@ -3398,7 +3413,8 @@ def SWPB : AIswp<1, (outs GPR:$Rt), (ins GPR:$Rt2, GPR:$Rn), "swpb",
 def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, i32imm:$opc1,
             c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2),
             NoItinerary, "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
-            [/* For disassembly only; pattern left blank */]> {
+            [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
+                          imm:$CRm, imm:$opc2)]> {
   bits<4> opc1;
   bits<4> CRn;
   bits<4> CRd;
@@ -3418,7 +3434,8 @@ def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, i32imm:$opc1,
 def CDP2 : ABXI<0b1110, (outs), (ins p_imm:$cop, i32imm:$opc1,
                c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2),
                NoItinerary, "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
-               [/* For disassembly only; pattern left blank */]> {
+               [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
+                              imm:$CRm, imm:$opc2)]> {
   let Inst{31-28} = 0b1111;
   bits<4> opc1;
   bits<4> CRn;
@@ -3542,10 +3559,10 @@ defm STC2 : LdStCop<0b1111,    0, (ins),         "stc2", "">;
 // Move between coprocessor and ARM core register -- for disassembly only
 //
 
-class MovRCopro<string opc, bit direction, dag oops, dag iops>
+class MovRCopro<string opc, bit direction, dag oops, dag iops,
+                list<dag> pattern>
   : ABI<0b1110, oops, iops, NoItinerary, opc,
-        "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2",
-        [/* For disassembly only; pattern left blank */]> {
+        "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2", pattern> {
   let Inst{20} = direction;
   let Inst{4} = 1;
 
@@ -3565,17 +3582,23 @@ class MovRCopro<string opc, bit direction, dag oops, dag iops>
 }
 
 def MCR : MovRCopro<"mcr", 0 /* from ARM core register to coprocessor */,
-                    (outs), (ins p_imm:$cop, i32imm:$opc1,
-                                 GPR:$Rt, c_imm:$CRn, c_imm:$CRm,
-                                 i32imm:$opc2)>;
+                    (outs),
+                    (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, c_imm:$CRn,
+                         c_imm:$CRm, i32imm:$opc2),
+                    [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
+                                  imm:$CRm, imm:$opc2)]>;
 def MRC : MovRCopro<"mrc", 1 /* from coprocessor to ARM core register */,
-                    (outs GPR:$Rt), (ins p_imm:$cop, i32imm:$opc1,
-                                         c_imm:$CRn, c_imm:$CRm, i32imm:$opc2)>;
+                    (outs GPR:$Rt),
+                    (ins p_imm:$cop, i32imm:$opc1, c_imm:$CRn, c_imm:$CRm,
+                         i32imm:$opc2), []>;
 
-class MovRCopro2<string opc, bit direction, dag oops, dag iops>
+def : ARMPat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2),
+             (MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
+
+class MovRCopro2<string opc, bit direction, dag oops, dag iops,
+                 list<dag> pattern>
   : ABXI<0b1110, oops, iops, NoItinerary,
-         !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"),
-         [/* For disassembly only; pattern left blank */]> {
+         !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"), pattern> {
   let Inst{31-28} = 0b1111;
   let Inst{20} = direction;
   let Inst{4} = 1;
@@ -3596,19 +3619,25 @@ class MovRCopro2<string opc, bit direction, dag oops, dag iops>
 }
 
 def MCR2 : MovRCopro2<"mcr2", 0 /* from ARM core register to coprocessor */,
-                      (outs), (ins p_imm:$cop, i32imm:$opc1,
-                                   GPR:$Rt, c_imm:$CRn, c_imm:$CRm,
-                                   i32imm:$opc2)>;
+                      (outs),
+                      (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, c_imm:$CRn,
+                           c_imm:$CRm, i32imm:$opc2),
+                      [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
+                                     imm:$CRm, imm:$opc2)]>;
 def MRC2 : MovRCopro2<"mrc2", 1 /* from coprocessor to ARM core register */,
-                      (outs GPR:$Rt), (ins p_imm:$cop, i32imm:$opc1,
-                                           c_imm:$CRn, c_imm:$CRm,
-                                           i32imm:$opc2)>;
+                      (outs GPR:$Rt),
+                      (ins p_imm:$cop, i32imm:$opc1, c_imm:$CRn, c_imm:$CRm,
+                           i32imm:$opc2), []>;
+
+def : ARMV5TPat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn,
+                              imm:$CRm, imm:$opc2),
+                (MRC2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
 
-class MovRRCopro<string opc, bit direction>
+class MovRRCopro<string opc, bit direction,
+                 list<dag> pattern = [/* For disassembly only */]>
   : ABI<0b1100, (outs), (ins p_imm:$cop, i32imm:$opc1,
         GPR:$Rt, GPR:$Rt2, c_imm:$CRm),
-        NoItinerary, opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm",
-        [/* For disassembly only; pattern left blank */]> {
+        NoItinerary, opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm", pattern> {
   let Inst{23-21} = 0b010;
   let Inst{20} = direction;
 
@@ -3625,14 +3654,16 @@ class MovRRCopro<string opc, bit direction>
   let Inst{3-0}   = CRm;
 }
 
-def MCRR : MovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */>;
+def MCRR : MovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */,
+                      [(int_arm_mcrr imm:$cop, imm:$opc1, GPR:$Rt, GPR:$Rt2,
+                                     imm:$CRm)]>;
 def MRRC : MovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */>;
 
-class MovRRCopro2<string opc, bit direction>
+class MovRRCopro2<string opc, bit direction,
+                  list<dag> pattern = [/* For disassembly only */]>
   : ABXI<0b1100, (outs), (ins p_imm:$cop, i32imm:$opc1,
-         GPR:$Rt, GPR:$Rt2, c_imm:$CRm),
-         NoItinerary, !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"),
-         [/* For disassembly only; pattern left blank */]> {
+         GPR:$Rt, GPR:$Rt2, c_imm:$CRm), NoItinerary,
+         !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern> {
   let Inst{31-28} = 0b1111;
   let Inst{23-21} = 0b010;
   let Inst{20} = direction;
@@ -3650,7 +3681,9 @@ class MovRRCopro2<string opc, bit direction>
   let Inst{3-0}   = CRm;
 }
 
-def MCRR2 : MovRRCopro2<"mcrr2", 0 /* from ARM core register to coprocessor */>;
+def MCRR2 : MovRRCopro2<"mcrr2", 0 /* from ARM core register to coprocessor */,
+                        [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPR:$Rt, GPR:$Rt2,
+                                        imm:$CRm)]>;
 def MRRC2 : MovRRCopro2<"mrrc2", 1 /* from coprocessor to ARM core register */>;
 
 //===----------------------------------------------------------------------===//
@@ -3739,10 +3772,8 @@ let isCall = 1,
 // These are pseudo-instructions and are lowered to individual MC-insts, so
 // no encoding information is necessary.
 let Defs =
-  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR,  D0,
-    D1,  D2,  D3,  D4,  D5,  D6,  D7,  D8,  D9,  D10, D11, D12, D13, D14, D15,
-    D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30,
-    D31 ], hasSideEffects = 1, isBarrier = 1 in {
+  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR, CPSR,
+    QQQQ0, QQQQ1, QQQQ2, QQQQ3 ], hasSideEffects = 1, isBarrier = 1 in {
   def Int_eh_sjlj_setjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$val),
                                NoItinerary,
                          [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>,
@@ -3750,7 +3781,7 @@ let Defs =
 }
 
 let Defs =
-  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR ],
+  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR, CPSR ],
   hasSideEffects = 1, isBarrier = 1 in {
   def Int_eh_sjlj_setjmp_nofp : PseudoInst<(outs), (ins GPR:$src, GPR:$val),
                                    NoItinerary,
@@ -3773,8 +3804,8 @@ def Int_eh_sjlj_longjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$scratch),
 // that need the instruction size).
 let isBarrier = 1, hasSideEffects = 1 in
 def Int_eh_sjlj_dispatchsetup :
- PseudoInst<(outs), (ins), NoItinerary,
-            [(ARMeh_sjlj_dispatchsetup)]>,
+ PseudoInst<(outs), (ins GPR:$src), NoItinerary,
+            [(ARMeh_sjlj_dispatchsetup GPR:$src)]>,
               Requires<[IsDarwin]>;
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index e34d69a..79d95d9 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -531,6 +531,17 @@ class VLD1LN<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
                                          imm:$lane))]> {
   let Rm = 0b1111;
 }
+class VLD1LN32<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
+             PatFrag LoadOp>
+  : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd),
+          (ins addrmode6oneL32:$Rn, DPR:$src, nohash_imm:$lane),
+          IIC_VLD1ln, "vld1", Dt, "\\{$Vd[$lane]\\}, $Rn",
+          "$src = $Vd",
+          [(set DPR:$Vd, (vector_insert (Ty DPR:$src),
+                                         (i32 (LoadOp addrmode6oneL32:$Rn)),
+                                         imm:$lane))]> {
+  let Rm = 0b1111;
+}
 class VLD1QLNPseudo<ValueType Ty, PatFrag LoadOp> : VLDQLNPseudo<IIC_VLD1ln> {
   let Pattern = [(set QPR:$dst, (vector_insert (Ty QPR:$src),
                                                (i32 (LoadOp addrmode6:$addr)),
@@ -544,7 +555,7 @@ def VLD1LNd16 : VLD1LN<0b0100, {?,?,0,?}, "16", v4i16, extloadi16> {
   let Inst{7-6} = lane{1-0};
   let Inst{4}   = Rn{4};
 }
-def VLD1LNd32 : VLD1LN<0b1000, {?,0,?,?}, "32", v2i32, load> {
+def VLD1LNd32 : VLD1LN32<0b1000, {?,0,?,?}, "32", v2i32, load> {
   let Inst{7} = lane{0};
   let Inst{5} = Rn{4};
   let Inst{4} = Rn{4};
@@ -1371,6 +1382,14 @@ class VST1LN<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
           [(StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), addrmode6:$Rn)]> {
   let Rm = 0b1111;
 }
+class VST1LN32<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
+             PatFrag StoreOp, SDNode ExtractOp>
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs),
+          (ins addrmode6oneL32:$Rn, DPR:$Vd, nohash_imm:$lane),
+          IIC_VST1ln, "vst1", Dt, "\\{$Vd[$lane]\\}, $Rn", "",
+          [(StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), addrmode6oneL32:$Rn)]> {
+  let Rm = 0b1111;
+}
 class VST1QLNPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp>
   : VSTQLNPseudo<IIC_VST1ln> {
   let Pattern = [(StoreOp (ExtractOp (Ty QPR:$src), imm:$lane),
@@ -1386,7 +1405,8 @@ def VST1LNd16 : VST1LN<0b0100, {?,?,0,?}, "16", v4i16, truncstorei16,
   let Inst{7-6} = lane{1-0};
   let Inst{4}   = Rn{5};
 }
-def VST1LNd32 : VST1LN<0b1000, {?,0,?,?}, "32", v2i32, store, extractelt> {
+
+def VST1LNd32 : VST1LN32<0b1000, {?,0,?,?}, "32", v2i32, store, extractelt> {
   let Inst{7}   = lane{0};
   let Inst{5-4} = Rn{5-4};
 }
@@ -4683,8 +4703,9 @@ def VEXTd32 : VEXTd<"vext", "32", v2i32> {
   let Inst{9-8}    = 0b00;
 }
 def VEXTdf  : VEXTd<"vext", "32", v2f32> {
-  let Inst{11}    = index{0};
-  let Inst{10-8}  = 0b000;
+  let Inst{11-10}    = index{1-0};
+  let Inst{9-8}  = 0b00;
+
 }
 
 def VEXTq8  : VEXTq<"vext", "8",  v16i8> {
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index 8c542fe..4777189 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -369,15 +369,6 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
     let Inst{2-0} = 0b000;
   }
 
-  def tBX_Rm : TI<(outs), (ins pred:$p, GPR:$Rm), IIC_Br, "bx${p}\t$Rm",
-                  [/* for disassembly only */]>,
-               T1Special<{1,1,0,?}> {
-    // A6.2.3 & A8.6.25
-    bits<4> Rm;
-    let Inst{6-3} = Rm;
-    let Inst{2-0} = 0b000;
-  }
-
   // Alternative return instruction used by vararg functions.
   def tBX_RET_vararg : TI<(outs), (ins tGPR:$Rm),
                           IIC_Br, "bx\t$Rm",
@@ -392,6 +383,14 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
 
 // Indirect branches
 let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+  def tBX : TI<(outs), (ins GPR:$Rm, pred:$p), IIC_Br, "bx${p}\t$Rm", []>,
+            T1Special<{1,1,0,?}> {
+    // A6.2.3 & A8.6.25
+    bits<4> Rm;
+    let Inst{6-3} = Rm;
+    let Inst{2-0} = 0b000;
+  }
+
   def tBRIND : TI<(outs), (ins GPR:$Rm),
                   IIC_Br,
                   "mov\tpc, $Rm",
@@ -423,10 +422,7 @@ def tPOP_RET : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops),
 // potentially appearing dead.
 let isCall = 1,
   // On non-Darwin platforms R9 is callee-saved.
-  Defs = [R0,  R1,  R2,  R3,  R12, LR,
-          D0,  D1,  D2,  D3,  D4,  D5,  D6,  D7,
-          D16, D17, D18, D19, D20, D21, D22, D23,
-          D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR],
+  Defs = [R0,  R1,  R2,  R3,  R12, LR, QQQQ0, QQQQ2, QQQQ3, CPSR, FPSCR],
   Uses = [SP] in {
   // Also used for Thumb2
   def tBL  : TIx2<0b11110, 0b11, 1,
@@ -460,14 +456,15 @@ let isCall = 1,
                   "blx\t$func",
                   [(ARMtcall GPR:$func)]>,
               Requires<[IsThumb, HasV5T, IsNotDarwin]>,
-              T1Special<{1,1,1,?}>; // A6.2.3 & A8.6.24;
+              T1Special<{1,1,1,?}> { // A6.2.3 & A8.6.24;
+    bits<4> func;
+    let Inst{6-3} = func;
+    let Inst{2-0} = 0b000;
+  }
 
   // ARMv4T
-  // FIXME: Should be a pseudo.
-  let isCodeGenOnly = 1 in
-  def tBX : TIx2<{?,?,?,?,?}, {?,?}, ?,
-                  (outs), (ins tGPR:$func, variable_ops), IIC_Br,
-                  "mov\tlr, pc\n\tbx\t$func",
+  def tBX_CALL : tPseudoInst<(outs), (ins tGPR:$func, variable_ops),
+                  Size4Bytes, IIC_Br,
                   [(ARMcall_nolink tGPR:$func)]>,
             Requires<[IsThumb, IsThumb1Only, IsNotDarwin]>;
 }
@@ -476,10 +473,7 @@ let isCall = 1,
   // On Darwin R9 is call-clobbered.
   // R7 is marked as a use to prevent frame-pointer assignments from being
   // moved above / below calls.
-  Defs = [R0,  R1,  R2,  R3,  R9,  R12, LR,
-          D0,  D1,  D2,  D3,  D4,  D5,  D6,  D7,
-          D16, D17, D18, D19, D20, D21, D22, D23,
-          D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR],
+  Defs = [R0,  R1,  R2,  R3,  R9,  R12, LR, QQQQ0, QQQQ2, QQQQ3, CPSR, FPSCR],
   Uses = [R7, SP] in {
   // Also used for Thumb2
   def tBLr9 : TIx2<0b11110, 0b11, 1,
@@ -521,11 +515,8 @@ let isCall = 1,
   }
 
   // ARMv4T
-  let isCodeGenOnly = 1 in
-  // FIXME: Should be a pseudo.
-  def tBXr9 : TIx2<{?,?,?,?,?}, {?,?}, ?,
-                   (outs), (ins tGPR:$func, variable_ops), IIC_Br,
-                   "mov\tlr, pc\n\tbx\t$func",
+  def tBXr9_CALL : tPseudoInst<(outs), (ins tGPR:$func, variable_ops),
+                   Size4Bytes, IIC_Br,
                    [(ARMcall_nolink tGPR:$func)]>,
               Requires<[IsThumb, IsThumb1Only, IsDarwin]>;
 }
@@ -560,7 +551,7 @@ let isBranch = 1, isTerminator = 1 in
   def tBcc : T1I<(outs), (ins t_bcctarget:$target, pred:$p), IIC_Br,
                  "b${p}\t$target",
                  [/*(ARMbrcond bb:$target, imm:$cc)*/]>,
-             T1Encoding<{1,1,0,1,?,?}> {
+             T1BranchCond<{1,1,0,1}> {
   bits<4> p;
   bits<8> target;
   let Inst{11-8} = p;
@@ -606,7 +597,7 @@ def tSVC : T1pI<(outs), (ins i32imm:$imm), IIC_Br,
 
 // The assembler uses 0xDEFE for a trap instruction.
 let isBarrier = 1, isTerminator = 1 in
-def tTRAP : TI<(outs), (ins), IIC_Br, 
+def tTRAP : TI<(outs), (ins), IIC_Br,
                "trap", [(trap)]>, Encoding16 {
   let Inst = 0xdefe;
 }
@@ -813,7 +804,7 @@ defm tLDM : thumb_ldst_mult<"ldm", IIC_iLoad_m, IIC_iLoad_mu,
 let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
 defm tSTM : thumb_ldst_mult<"stm", IIC_iStore_m, IIC_iStore_mu,
                             {1,1,0,0,0,?}, 0>;
- 
+
 } // neverHasSideEffects
 
 let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1 in
@@ -1352,9 +1343,10 @@ def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd),
 // Move between coprocessor and ARM core register -- for disassembly only
 //
 
-class tMovRCopro<string opc, bit direction, dag oops, dag iops>
+class tMovRCopro<string opc, bit direction, dag oops, dag iops,
+                 list<dag> pattern>
   : T1Cop<oops, iops, !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"),
-          [/* For disassembly only; pattern left blank */]> {
+          pattern> {
   let Inst{27-24} = 0b1110;
   let Inst{20} = direction;
   let Inst{4} = 1;
@@ -1375,16 +1367,24 @@ class tMovRCopro<string opc, bit direction, dag oops, dag iops>
 }
 
 def tMCR : tMovRCopro<"mcr", 0 /* from ARM core register to coprocessor */,
-           (outs), (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, c_imm:$CRn,
-                        c_imm:$CRm, i32imm:$opc2)>;
+           (outs),
+           (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, c_imm:$CRn,
+                c_imm:$CRm, i32imm:$opc2),
+           [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
+                         imm:$CRm, imm:$opc2)]>;
 def tMRC : tMovRCopro<"mrc", 1 /* from coprocessor to ARM core register */,
-           (outs GPR:$Rt), (ins p_imm:$cop, i32imm:$opc1, c_imm:$CRn,
-                                c_imm:$CRm, i32imm:$opc2)>;
+           (outs GPR:$Rt),
+           (ins p_imm:$cop, i32imm:$opc1, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2),
+           []>;
 
-class tMovRRCopro<string opc, bit direction>
+def : Pat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2),
+          (tMRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>,
+          Requires<[IsThumb, HasV6T2]>;
+
+class tMovRRCopro<string opc, bit direction,
+                  list<dag> pattern = [/* For disassembly only */]>
   : T1Cop<(outs), (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, GPR:$Rt2, c_imm:$CRm),
-          !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"),
-          [/* For disassembly only; pattern left blank */]> {
+          !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern> {
   let Inst{27-24} = 0b1100;
   let Inst{23-21} = 0b010;
   let Inst{20} = direction;
@@ -1402,7 +1402,9 @@ class tMovRRCopro<string opc, bit direction>
   let Inst{3-0}   = CRm;
 }
 
-def tMCRR : tMovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */>;
+def tMCRR : tMovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */,
+                        [(int_arm_mcrr imm:$cop, imm:$opc1, GPR:$Rt, GPR:$Rt2,
+                                       imm:$CRm)]>;
 def tMRRC : tMovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */>;
 
 //===----------------------------------------------------------------------===//
@@ -1411,7 +1413,8 @@ def tMRRC : tMovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */>;
 def tCDP : T1Cop<(outs), (ins p_imm:$cop, i32imm:$opc1,
                  c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2),
                  "cdp\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
-                 [/* For disassembly only; pattern left blank */]> {
+                 [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
+                               imm:$CRm, imm:$opc2)]> {
   let Inst{27-24} = 0b1110;
 
   bits<4> opc1;
@@ -1445,7 +1448,7 @@ def tTPsoft : TIx2<0b11110, 0b11, 1, (outs), (ins), IIC_Br,
 
 //===----------------------------------------------------------------------===//
 // SJLJ Exception handling intrinsics
-// 
+//
 
 // eh_sjlj_setjmp() is an instruction sequence to store the return address and
 // save #0 in R0 for the non-longjmp case.  Since by its nature we may be coming
@@ -1457,7 +1460,7 @@ def tTPsoft : TIx2<0b11110, 0b11, 1, (outs), (ins), IIC_Br,
 // Defs. By doing so, we also cause the prologue/epilogue code to actively
 // preserve all of the callee-saved resgisters, which is exactly what we want.
 // $val is a scratch register for our use.
-let Defs = [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7, R12 ],
+let Defs = [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7, R12, CPSR ],
     hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1 in
 def tInt_eh_sjlj_setjmp : ThumbXI<(outs),(ins tGPR:$src, tGPR:$val),
                                   AddrModeNone, SizeSpecial, NoItinerary, "","",
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 600a121..598660c 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -158,7 +158,7 @@ def t2addrmode_so_reg : Operand<i32>,
 //
 def t2addrmode_reg : Operand<i32> {
   let PrintMethod = "printAddrMode7Operand";
-  let MIOperandInfo = (ops tGPR);
+  let MIOperandInfo = (ops GPR);
   let ParserMatchClass = MemMode7AsmOperand;
 }
 
@@ -1973,9 +1973,9 @@ class T2SatI<dag oops, dag iops, InstrItinClass itin,
 }
 
 def t2SSAT: T2SatI<
-                (outs rGPR:$Rd), (ins i32imm:$sat_imm, rGPR:$Rn, shift_imm:$sh),
-                NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh",
-                [/* For disassembly only; pattern left blank */]> {
+              (outs rGPR:$Rd), (ins ssat_imm:$sat_imm, rGPR:$Rn, shift_imm:$sh),
+              NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh",
+              [/* For disassembly only; pattern left blank */]> {
   let Inst{31-27} = 0b11110;
   let Inst{25-22} = 0b1100;
   let Inst{20} = 0;
@@ -1983,9 +1983,9 @@ def t2SSAT: T2SatI<
 }
 
 def t2SSAT16: T2SatI<
-                   (outs rGPR:$Rd), (ins i32imm:$sat_imm, rGPR:$Rn), NoItinerary,
-                   "ssat16", "\t$Rd, $sat_imm, $Rn",
-                   [/* For disassembly only; pattern left blank */]> {
+                (outs rGPR:$Rd), (ins ssat_imm:$sat_imm, rGPR:$Rn), NoItinerary,
+                "ssat16", "\t$Rd, $sat_imm, $Rn",
+                [/* For disassembly only; pattern left blank */]> {
   let Inst{31-27} = 0b11110;
   let Inst{25-22} = 0b1100;
   let Inst{20} = 0;
@@ -2881,7 +2881,9 @@ def t2LDREX  : Thumb2I<(outs rGPR:$Rt), (ins t2addrmode_reg:$addr), AddrModeNone
   let Inst{19-16} = addr;
   let Inst{15-12} = Rt;
 }
-def t2LDREXD : T2I_ldrex<0b11, (outs rGPR:$Rt, rGPR:$Rt2), (ins t2addrmode_reg:$addr),
+let hasExtraDefRegAllocReq = 1 in
+def t2LDREXD : T2I_ldrex<0b11, (outs rGPR:$Rt, rGPR:$Rt2),
+                         (ins t2addrmode_reg:$addr),
                          AddrModeNone, Size4Bytes, NoItinerary,
                          "ldrexd", "\t$Rt, $Rt2, $addr", "",
                          [], {?, ?, ?, ?}> {
@@ -2912,6 +2914,9 @@ def t2STREX  : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt, t2addrmode_reg:$addr),
   let Inst{19-16} = addr;
   let Inst{15-12} = Rt;
 }
+}
+
+let hasExtraSrcRegAllocReq = 1, Constraints = "@earlyclobber $Rd" in
 def t2STREXD : T2I_strex<0b11, (outs rGPR:$Rd),
                          (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_reg:$addr),
                          AddrModeNone, Size4Bytes, NoItinerary,
@@ -2920,7 +2925,6 @@ def t2STREXD : T2I_strex<0b11, (outs rGPR:$Rd),
   bits<4> Rt2;
   let Inst{11-8} = Rt2;
 }
-}
 
 // Clear-Exclusive is for disassembly only.
 def t2CLREX : T2XI<(outs), (ins), NoItinerary, "clrex",
@@ -2965,10 +2969,9 @@ let isCall = 1,
 //   all of the callee-saved resgisters, which is exactly what we want.
 //   $val is a scratch register for our use.
 let Defs =
-  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR,  D0,
-    D1,  D2,  D3,  D4,  D5,  D6,  D7,  D8,  D9,  D10, D11, D12, D13, D14, D15,
-    D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30,
-    D31 ], hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1 in {
+  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR, CPSR,
+    QQQQ0, QQQQ1, QQQQ2, QQQQ3 ],
+  hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1 in {
   def t2Int_eh_sjlj_setjmp : Thumb2XI<(outs), (ins tGPR:$src, tGPR:$val),
                                AddrModeNone, SizeSpecial, NoItinerary, "", "",
                           [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>,
@@ -2976,7 +2979,7 @@ let Defs =
 }
 
 let Defs =
-  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR ],
+  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR, CPSR ],
   hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1 in {
   def t2Int_eh_sjlj_setjmp_nofp : Thumb2XI<(outs), (ins tGPR:$src, tGPR:$val),
                                AddrModeNone, SizeSpecial, NoItinerary, "", "",
@@ -3347,9 +3350,10 @@ def t2MSR : T2SpecialReg<0b111100111000 /* op31-20 */, 0b10 /* op15-14 */,
 // Move between coprocessor and ARM core register -- for disassembly only
 //
 
-class t2MovRCopro<string opc, bit direction, dag oops, dag iops>
+class t2MovRCopro<string opc, bit direction, dag oops, dag iops,
+                  list<dag> pattern>
   : T2Cop<oops, iops, !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"),
-          [/* For disassembly only; pattern left blank */]> {
+          pattern> {
   let Inst{27-24} = 0b1110;
   let Inst{20} = direction;
   let Inst{4} = 1;
@@ -3371,15 +3375,21 @@ class t2MovRCopro<string opc, bit direction, dag oops, dag iops>
 
 def t2MCR2 : t2MovRCopro<"mcr2", 0 /* from ARM core register to coprocessor */,
              (outs), (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, c_imm:$CRn,
-                          c_imm:$CRm, i32imm:$opc2)>;
+                          c_imm:$CRm, i32imm:$opc2),
+             [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
+                            imm:$CRm, imm:$opc2)]>;
 def t2MRC2 : t2MovRCopro<"mrc2", 1 /* from coprocessor to ARM core register */,
              (outs GPR:$Rt), (ins p_imm:$cop, i32imm:$opc1, c_imm:$CRn,
-                                  c_imm:$CRm, i32imm:$opc2)>;
+                                  c_imm:$CRm, i32imm:$opc2), []>;
 
-class t2MovRRCopro<string opc, bit direction>
+def : T2v6Pat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn,
+                            imm:$CRm, imm:$opc2),
+              (t2MRC2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
+
+class t2MovRRCopro<string opc, bit direction,
+                   list<dag> pattern = [/* For disassembly only */]>
   : T2Cop<(outs), (ins p_imm:$cop, i32imm:$opc1, GPR:$Rt, GPR:$Rt2, c_imm:$CRm),
-          !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"),
-          [/* For disassembly only; pattern left blank */]> {
+          !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern> {
   let Inst{27-24} = 0b1100;
   let Inst{23-21} = 0b010;
   let Inst{20} = direction;
@@ -3398,7 +3408,9 @@ class t2MovRRCopro<string opc, bit direction>
 }
 
 def t2MCRR2 : t2MovRRCopro<"mcrr2",
-                           0 /* from ARM core register to coprocessor */>;
+                           0 /* from ARM core register to coprocessor */,
+                           [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPR:$Rt,
+                                           GPR:$Rt2, imm:$CRm)]>;
 def t2MRRC2 : t2MovRRCopro<"mrrc2",
                            1 /* from coprocessor to ARM core register */>;
 
@@ -3409,7 +3421,8 @@ def t2MRRC2 : t2MovRRCopro<"mrrc2",
 def t2CDP2 : T2Cop<(outs), (ins p_imm:$cop, i32imm:$opc1,
                    c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2),
                    "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
-                   [/* For disassembly only; pattern left blank */]> {
+                   [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
+                                  imm:$CRm, imm:$opc2)]> {
   let Inst{27-24} = 0b1110;
 
   bits<4> opc1;
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index df89fad..f4645f1 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -947,8 +947,8 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
   return true;
 }
 
-/// isMemoryOp - Returns true if instruction is a memory operations (that this
-/// pass is capable of operating on).
+/// isMemoryOp - Returns true if instruction is a memory operation that this
+/// pass is capable of operating on.
 static bool isMemoryOp(const MachineInstr *MI) {
   // When no memory operands are present, conservatively assume unaligned,
   // volatile, unfoldable.
@@ -1672,10 +1672,14 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
           Ops.pop_back();
           Ops.pop_back();
 
+          const TargetInstrDesc &TID = TII->get(NewOpc);
+          const TargetRegisterClass *TRC = TID.OpInfo[0].getRegClass(TRI);
+          MRI->constrainRegClass(EvenReg, TRC);
+          MRI->constrainRegClass(OddReg, TRC);
+
           // Form the pair instruction.
           if (isLd) {
-            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos,
-                                              dl, TII->get(NewOpc))
+            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, TID)
               .addReg(EvenReg, RegState::Define)
               .addReg(OddReg, RegState::Define)
               .addReg(BaseReg);
@@ -1687,8 +1691,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
             MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
             ++NumLDRDFormed;
           } else {
-            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos,
-                                              dl, TII->get(NewOpc))
+            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, TID)
               .addReg(EvenReg)
               .addReg(OddReg)
               .addReg(BaseReg);
diff --git a/lib/Target/ARM/ARMMCAsmInfo.cpp b/lib/Target/ARM/ARMMCAsmInfo.cpp
index a3f89e9..53b4c95 100644
--- a/lib/Target/ARM/ARMMCAsmInfo.cpp
+++ b/lib/Target/ARM/ARMMCAsmInfo.cpp
@@ -70,8 +70,6 @@ ARMELFMCAsmInfo::ARMELFMCAsmInfo() {
   WeakRefDirective = "\t.weak\t";
   HasLCOMMDirective = true;
 
-  DwarfRequiresFrameSection = false;
-
   SupportsDebugInformation = true;
 
   // Exceptions handling
diff --git a/lib/Target/ARM/ARMMCCodeEmitter.cpp b/lib/Target/ARM/ARMMCCodeEmitter.cpp
index 10607b1..c5f727d 100644
--- a/lib/Target/ARM/ARMMCCodeEmitter.cpp
+++ b/lib/Target/ARM/ARMMCCodeEmitter.cpp
@@ -269,10 +269,15 @@ public:
   unsigned getMsbOpValue(const MCInst &MI, unsigned Op,
                          SmallVectorImpl<MCFixup> &Fixups) const;
 
+  unsigned getSsatBitPosValue(const MCInst &MI, unsigned Op,
+                              SmallVectorImpl<MCFixup> &Fixups) const;
+
   unsigned getRegisterListOpValue(const MCInst &MI, unsigned Op,
                                   SmallVectorImpl<MCFixup> &Fixups) const;
   unsigned getAddrMode6AddressOpValue(const MCInst &MI, unsigned Op,
                                       SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getAddrMode6OneLane32AddressOpValue(const MCInst &MI, unsigned Op,
+                                        SmallVectorImpl<MCFixup> &Fixups) const;
   unsigned getAddrMode6DupAddressOpValue(const MCInst &MI, unsigned Op,
                                         SmallVectorImpl<MCFixup> &Fixups) const;
   unsigned getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op,
@@ -1122,6 +1127,13 @@ getMsbOpValue(const MCInst &MI, unsigned Op,
 }
 
 unsigned ARMMCCodeEmitter::
+getSsatBitPosValue(const MCInst &MI, unsigned Op,
+                   SmallVectorImpl<MCFixup> &Fixups) const {
+  // For ssat instructions, the bit position should be encoded decremented by 1
+  return MI.getOperand(Op).getImm()-1;
+}
+
+unsigned ARMMCCodeEmitter::
 getRegisterListOpValue(const MCInst &MI, unsigned Op,
                        SmallVectorImpl<MCFixup> &Fixups) const {
   // VLDM/VSTM:
@@ -1178,6 +1190,30 @@ getAddrMode6AddressOpValue(const MCInst &MI, unsigned Op,
   return RegNo | (Align << 4);
 }
 
+/// getAddrMode6OneLane32AddressOpValue - Encode an addrmode6 register number
+/// along  with the alignment operand for use in VST1 and VLD1 with size 32.
+unsigned ARMMCCodeEmitter::
+getAddrMode6OneLane32AddressOpValue(const MCInst &MI, unsigned Op,
+                                    SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &Reg = MI.getOperand(Op);
+  const MCOperand &Imm = MI.getOperand(Op + 1);
+
+  unsigned RegNo = getARMRegisterNumbering(Reg.getReg());
+  unsigned Align = 0;
+
+  switch (Imm.getImm()) {
+  default: break;
+  case 2:
+  case 4:
+  case 8:
+  case 16: Align = 0x00; break;
+  case 32: Align = 0x03; break;
+  }
+
+  return RegNo | (Align << 4);
+}
+
+
 /// getAddrMode6DupAddressOpValue - Encode an addrmode6 register number and
 /// alignment operand for use in VLD-dup instructions.  This is the same as
 /// getAddrMode6AddressOpValue except for the alignment encoding, which is
diff --git a/lib/Target/ARM/ARMPerfectShuffle.h b/lib/Target/ARM/ARMPerfectShuffle.h
index edecc4b..18e1620 100644
--- a/lib/Target/ARM/ARMPerfectShuffle.h
+++ b/lib/Target/ARM/ARMPerfectShuffle.h
@@ -21,6566 +21,6566 @@
 
 // This table is 6561*4 = 26244 bytes in size.
 static const unsigned PerfectShuffleTable[6561+1] = {
-   135053414U,  // <0,0,0,0>: Cost 1 vdup0 LHS
-  1543503974U,  // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS
-  2618572962U,  // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0>
-  2568054923U,  // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
-  1476398390U,  // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS
-  2550140624U,  // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3>
-  2550141434U,  // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3>
-  2591945711U,  // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
-   135053414U,  // <0,0,0,u>: Cost 1 vdup0 LHS
-  2886516736U,  // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0>
-  1812775014U,  // <0,0,1,1>: Cost 2 vzipl LHS, LHS
-  1618133094U,  // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
-  2625209292U,  // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0>
-  2886558034U,  // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5>
-  2617246864U,  // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7>
-  3659723031U,  // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1>
-  2591953904U,  // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1>
-  1812775581U,  // <0,0,1,u>: Cost 2 vzipl LHS, LHS
-  3020734464U,  // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0>
-  3020734474U,  // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1>
-  1946992742U,  // <0,0,2,2>: Cost 2 vtrnl LHS, LHS
-  2631181989U,  // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0>
-  3020734668U,  // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6>
-  3826550569U,  // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6>
-  2617247674U,  // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7>
-  2591962097U,  // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
-  1946992796U,  // <0,0,2,u>: Cost 2 vtrnl LHS, LHS
-  2635163787U,  // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
-  2686419196U,  // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0>
-  2686492933U,  // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0>
-  2617248156U,  // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3>
-  2617248258U,  // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6>
-  3826551298U,  // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6>
-  3690990200U,  // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7>
-  3713551042U,  // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0>
-  2635163787U,  // <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
-  2617248658U,  // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1>
-  2888450150U,  // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
-  3021570150U,  // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
-  3641829519U,  // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4>
-  3021570252U,  // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6>
-  1543507254U,  // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS
-  2752810294U,  // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS
-  3786998152U,  // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5>
-  1543507497U,  // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS
-  2684354972U,  // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7>
-  2617249488U,  // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3>
-  3765617070U,  // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7>
-  3635865780U,  // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5>
-  2617249734U,  // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6>
-  2617249796U,  // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5>
-  2718712274U,  // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7>
-  2617249960U,  // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7>
-  2720039396U,  // <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7>
-  2684355053U,  // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7>
-  3963609190U,  // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS
-  2617250298U,  // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3>
-  3796435464U,  // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7>
-  3659762998U,  // <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS
-  3659763810U,  // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0>
-  2617250616U,  // <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6>
-  2657727309U,  // <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0>
-  2658390942U,  // <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0>
-  2659054575U,  // <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
-  3635880854U,  // <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0>
-  3635881401U,  // <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7>
-  3734787298U,  // <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0>
-  2617251174U,  // <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6>
-  3659772002U,  // <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0>
-  3659772189U,  // <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7>
-  2617251436U,  // <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7>
-  2659054575U,  // <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
-   135053414U,  // <0,0,u,0>: Cost 1 vdup0 LHS
-  1817419878U,  // <0,0,u,1>: Cost 2 vzipl LHS, LHS
-  1947435110U,  // <0,0,u,2>: Cost 2 vtrnl LHS, LHS
-  2568120467U,  // <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u>
-  1476463926U,  // <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS
-  1543510170U,  // <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS
-  2752813210U,  // <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS
-  2592011255U,  // <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u>
-   135053414U,  // <0,0,u,u>: Cost 1 vdup0 LHS
-  2618581002U,  // <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1>
-  1557446758U,  // <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS
-  2618581155U,  // <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1>
-  2690548468U,  // <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0>
-  2626543954U,  // <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5>
-  4094985216U,  // <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7>
-  2592019278U,  // <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1>
-  2592019448U,  // <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0>
-  1557447325U,  // <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS
-  1476476938U,  // <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1>
-  2886517556U,  // <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1>
-  2886517654U,  // <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0>
-  2886517720U,  // <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3>
-  1476480310U,  // <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS
-  2886558864U,  // <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7>
-  2550223354U,  // <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3>
-  2550223856U,  // <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1>
-  1476482862U,  // <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS
-  1494401126U,  // <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS
-  3020735284U,  // <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1>
-  2562172349U,  // <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2>
-      835584U,  // <0,1,2,3>: Cost 0 copy LHS
-  1494404406U,  // <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS
-  3020735488U,  // <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7>
-  2631190458U,  // <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7>
-  1518294010U,  // <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2>
-      835584U,  // <0,1,2,u>: Cost 0 copy LHS
-  2692318156U,  // <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0>
-  2691875800U,  // <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3>
-  2691875806U,  // <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0>
-  2692539367U,  // <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0>
-  2562182454U,  // <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS
-  2691875840U,  // <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7>
-  2692760578U,  // <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0>
-  2639817411U,  // <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1>
-  2691875863U,  // <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3>
-  2568159334U,  // <0,1,4,0>: Cost 3 vext1 <3,0,1,4>, LHS
-  4095312692U,  // <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1>
-  2568160934U,  // <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1>
-  2568161432U,  // <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4>
-  2568162614U,  // <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS
-  1557450038U,  // <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS
-  2754235702U,  // <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS
-  2592052220U,  // <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4>
-  1557450281U,  // <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS
-  3765617775U,  // <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1>
-  2647781007U,  // <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1>
-  3704934138U,  // <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0>
-  2691875984U,  // <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7>
-  2657734598U,  // <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6>
-  2650435539U,  // <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1>
-  2651099172U,  // <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1>
-  2651762805U,  // <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1>
-  2691876029U,  // <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7>
-  2592063590U,  // <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS
-  3765617871U,  // <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7>
-  2654417337U,  // <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1>
-  3765617889U,  // <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7>
-  2592066870U,  // <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS
-  3765617907U,  // <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7>
-  2657071869U,  // <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1>
-  1583993678U,  // <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1>
-  1584657311U,  // <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1>
-  2657735672U,  // <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0>
-  2657735808U,  // <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1>
-  2631193772U,  // <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0>
-  2661053667U,  // <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1>
-  2657736038U,  // <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6>
-  3721524621U,  // <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0>
-  2657736158U,  // <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0>
-  2657736300U,  // <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7>
-  2657736322U,  // <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2>
-  1494450278U,  // <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS
-  1557452590U,  // <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS
-  2754238254U,  // <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS
-      835584U,  // <0,1,u,3>: Cost 0 copy LHS
-  1494453558U,  // <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS
-  1557452954U,  // <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS
-  2754238618U,  // <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS
-  1518343168U,  // <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u>
-      835584U,  // <0,1,u,u>: Cost 0 copy LHS
-  2752299008U,  // <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0>
-  1544847462U,  // <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS
-  1678557286U,  // <0,2,0,2>: Cost 2 vuzpl LHS, LHS
-  2696521165U,  // <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0>
-  2752340172U,  // <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6>
-  2691876326U,  // <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7>
-  2618589695U,  // <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7>
-  2592093185U,  // <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0>
-  1678557340U,  // <0,2,0,u>: Cost 2 vuzpl LHS, LHS
-  2618589942U,  // <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2>
-  2752299828U,  // <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1>
-  2886518376U,  // <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2>
-  2752299766U,  // <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
-  2550295862U,  // <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS
-  2752340992U,  // <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7>
-  2886559674U,  // <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7>
-  3934208106U,  // <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7>
-  2752340771U,  // <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2>
-  1476558868U,  // <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2>
-  2226628029U,  // <0,2,2,1>: Cost 3 vrev <2,0,1,2>
-  2752300648U,  // <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2>
-  3020736114U,  // <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3>
-  1476562230U,  // <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS
-  2550304464U,  // <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3>
-  2618591162U,  // <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7>
-  2550305777U,  // <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2>
-  1476564782U,  // <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS
-  2618591382U,  // <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2>
-  2752301206U,  // <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
-  3826043121U,  // <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3>
-  2752301468U,  // <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
-  2618591746U,  // <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6>
-  2752301570U,  // <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
-  3830688102U,  // <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3>
-  2698807012U,  // <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0>
-  2752301269U,  // <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
-  2562261094U,  // <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS
-  4095313828U,  // <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3>
-  2226718152U,  // <0,2,4,2>: Cost 3 vrev <2,0,2,4>
-  2568235169U,  // <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4>
-  2562264374U,  // <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS
-  1544850742U,  // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS
-  1678560566U,  // <0,2,4,6>: Cost 2 vuzpl LHS, RHS
-  2592125957U,  // <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
-  1678560584U,  // <0,2,4,u>: Cost 2 vuzpl LHS, RHS
-  2691876686U,  // <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7>
-  2618592976U,  // <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3>
-  3765618528U,  // <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7>
-  3765618536U,  // <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6>
-  2618593222U,  // <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6>
-  2752303108U,  // <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5>
-  2618593378U,  // <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0>
-  2824785206U,  // <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS
-  2824785207U,  // <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS
-  2752303950U,  // <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1>
-  3830690081U,  // <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2>
-  2618593786U,  // <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3>
-  2691876794U,  // <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7>
-  2752303990U,  // <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5>
-  3830690445U,  // <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6>
-  2752303928U,  // <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6>
-  2657743695U,  // <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2>
-  2691876839U,  // <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7>
-  2659070961U,  // <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
-  2659734594U,  // <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2>
-  3734140051U,  // <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2>
-  2701166596U,  // <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0>
-  2662389094U,  // <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6>
-  2662389126U,  // <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2>
-  3736794583U,  // <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2>
-  2752304748U,  // <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7>
-  2659070961U,  // <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
-  1476608026U,  // <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u>
-  1544853294U,  // <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS
-  1678563118U,  // <0,2,u,2>: Cost 2 vuzpl LHS, LHS
-  3021178482U,  // <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3>
-  1476611382U,  // <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS
-  1544853658U,  // <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS
-  1678563482U,  // <0,2,u,6>: Cost 2 vuzpl LHS, RHS
-  2824785449U,  // <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS
-  1678563172U,  // <0,2,u,u>: Cost 2 vuzpl LHS, LHS
-  2556329984U,  // <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0>
-  2686421142U,  // <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2>
-  2562303437U,  // <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0>
-  4094986652U,  // <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3>
-  2556333366U,  // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS
-  4094986754U,  // <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6>
-  3798796488U,  // <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7>
-  3776530634U,  // <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0>
-  2556335918U,  // <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS
-  2886518934U,  // <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2>
-  2556338933U,  // <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1>
-  2691877105U,  // <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3>
-  2886519196U,  // <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3>
-  2886519298U,  // <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6>
-  4095740418U,  // <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6>
-  3659944242U,  // <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1>
-  3769600286U,  // <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3>
-  2886519582U,  // <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2>
-  1482604646U,  // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS
-  1482605302U,  // <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2>
-  2556348008U,  // <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2>
-  3020736924U,  // <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3>
-  1482607926U,  // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS
-  3020737026U,  // <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6>
-  2598154746U,  // <0,3,2,6>: Cost 3 vext1 <u,0,3,2>, <6,2,7,3>
-  2598155258U,  // <0,3,2,7>: Cost 3 vext1 <u,0,3,2>, <7,0,1,2>
-  1482610478U,  // <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS
-  3692341398U,  // <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2>
-  2635851999U,  // <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3>
-  3636069840U,  // <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3>
-  2691877276U,  // <0,3,3,3>: Cost 3 vext3 <1,2,3,0>, <3,3,3,3>
-  3961522690U,  // <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6>
-  3826797058U,  // <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6>
-  3703622282U,  // <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7>
-  3769600452U,  // <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7>
-  2640497430U,  // <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3>
-  3962194070U,  // <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2>
-  2232617112U,  // <0,3,4,1>: Cost 3 vrev <3,0,1,4>
-  2232690849U,  // <0,3,4,2>: Cost 3 vrev <3,0,2,4>
-  4095314332U,  // <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3>
-  3962194434U,  // <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6>
-  2691877378U,  // <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6>
-  3826765110U,  // <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS
-  3665941518U,  // <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4>
-  2691877405U,  // <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6>
-  3630112870U,  // <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS
-  3630113526U,  // <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2>
-  4035199734U,  // <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2>
-  3769600578U,  // <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7>
-  2232846516U,  // <0,3,5,4>: Cost 3 vrev <3,0,4,5>
-  3779037780U,  // <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7>
-  2718714461U,  // <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7>
-  2706106975U,  // <0,3,5,7>: Cost 3 vext3 <3,5,7,0>, <3,5,7,0>
-  2233141464U,  // <0,3,5,u>: Cost 3 vrev <3,0,u,5>
-  2691877496U,  // <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7>
-  3727511914U,  // <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3>
-  3765619338U,  // <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7>
-  3765619347U,  // <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7>
-  3765987996U,  // <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7>
-  3306670270U,  // <0,3,6,5>: Cost 4 vrev <3,0,5,6>
-  3792456365U,  // <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6>
-  2706770608U,  // <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0>
-  2706844345U,  // <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0>
-  3769600707U,  // <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1>
-  2659742787U,  // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3>
-  3636102612U,  // <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7>
-  3769600740U,  // <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7>
-  3769600747U,  // <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5>
-  3769600758U,  // <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7>
-  3659993400U,  // <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7>
-  3781176065U,  // <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0>
-  2664388218U,  // <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3>
-  1482653798U,  // <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS
-  1482654460U,  // <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u>
-  2556397160U,  // <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2>
-  3021179292U,  // <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3>
-  1482657078U,  // <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS
-  3021179394U,  // <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6>
-  2598203898U,  // <0,3,u,6>: Cost 3 vext1 <u,0,3,u>, <6,2,7,3>
-  2708097874U,  // <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0>
-  1482659630U,  // <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS
-  2617278468U,  // <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4>
-  2618605670U,  // <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS
-  2618605734U,  // <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4>
-  3642091695U,  // <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0>
-  2753134796U,  // <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6>
-  2718714770U,  // <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1>
-  3021245750U,  // <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
-  3665982483U,  // <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, <7,0,4,0>
-  3021245768U,  // <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS
-  2568355942U,  // <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS
-  3692348212U,  // <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1>
-  3692348310U,  // <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0>
-  2568358064U,  // <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1>
-  2568359222U,  // <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS
-  1812778294U,  // <0,4,1,5>: Cost 2 vzipl LHS, RHS
-  3022671158U,  // <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS
-  2592248852U,  // <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
-  1812778537U,  // <0,4,1,u>: Cost 2 vzipl LHS, RHS
-  2568364134U,  // <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS
-  2238573423U,  // <0,4,2,1>: Cost 3 vrev <4,0,1,2>
-  3692349032U,  // <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2>
-  2631214761U,  // <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4>
-  2568367414U,  // <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS
-  2887028022U,  // <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS
-  1946996022U,  // <0,4,2,6>: Cost 2 vtrnl LHS, RHS
-  2592257045U,  // <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
-  1946996040U,  // <0,4,2,u>: Cost 2 vtrnl LHS, RHS
-  3692349590U,  // <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2>
-  3826878614U,  // <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2>
-  3826878625U,  // <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4>
-  3692349852U,  // <0,4,3,3>: Cost 4 vext2 <0,2,0,4>, <3,3,3,3>
-  3692349954U,  // <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6>
-  3826878978U,  // <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6>
-  4095200566U,  // <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS
-  3713583814U,  // <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4>
-  3692350238U,  // <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2>
-  2550464552U,  // <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4>
-  3962194914U,  // <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0>
-  3693677631U,  // <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3>
-  3642124467U,  // <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4>
-  2718715088U,  // <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4>
-  2618608950U,  // <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS
-  2753137974U,  // <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS
-  3666015255U,  // <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4>
-  2618609193U,  // <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS
-  2568388710U,  // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS
-  2568389526U,  // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0>
-  3636159963U,  // <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5>
-  2568390836U,  // <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5>
-  2568391990U,  // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS
-  2718715180U,  // <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6>
-  1618136374U,  // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
-  2592281624U,  // <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5>
-  1618136392U,  // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
-  2550480938U,  // <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6>
-  3826880801U,  // <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2>
-  2562426332U,  // <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6>
-  3786190181U,  // <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0>
-  2718715252U,  // <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6>
-  3826881165U,  // <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6>
-  2712669568U,  // <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0>
-  2657760081U,  // <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4>
-  2718715284U,  // <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2>
-  3654090854U,  // <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS
-  3934229326U,  // <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1>
-  3734156437U,  // <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4>
-  3734820070U,  // <0,4,7,3>: Cost 4 vext2 <7,3,0,4>, <7,3,0,4>
-  3654094134U,  // <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS
-  2713259464U,  // <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
-  2713333201U,  // <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0>
-  3654095866U,  // <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2>
-  2713259464U,  // <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
-  2568413286U,  // <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS
-  2618611502U,  // <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS
-  2753140526U,  // <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS
-  2568415415U,  // <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u>
-  2568416566U,  // <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS
-  1817423158U,  // <0,4,u,5>: Cost 2 vzipl LHS, RHS
-  1947438390U,  // <0,4,u,6>: Cost 2 vtrnl LHS, RHS
-  2592306203U,  // <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u>
-  1947438408U,  // <0,4,u,u>: Cost 2 vtrnl LHS, RHS
-  3630219264U,  // <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0>
-  2625912934U,  // <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS
-  3692355748U,  // <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2>
-  3693019384U,  // <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5>
-  3630222646U,  // <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS
-  3699655062U,  // <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1>
-  2718715508U,  // <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1>
-  3087011126U,  // <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS
-  2625913501U,  // <0,5,0,u>: Cost 3 vext2 <1,4,0,5>, LHS
-  1500659814U,  // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS
-  2886520528U,  // <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3>
-  2574403176U,  // <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2>
-  2574403734U,  // <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2>
-  1500662674U,  // <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1>
-  2886520836U,  // <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5>
-  2886520930U,  // <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0>
-  2718715600U,  // <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3>
-  1500665646U,  // <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS
-  2556493926U,  // <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS
-  2244546120U,  // <0,5,2,1>: Cost 3 vrev <5,0,1,2>
-  3692357256U,  // <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7>
-  2568439994U,  // <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2>
-  2556497206U,  // <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS
-  3020738564U,  // <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5>
-  4027877161U,  // <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6>
-  3093220662U,  // <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS
-  3093220663U,  // <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS
-  3699656854U,  // <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2>
-  3699656927U,  // <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3>
-  3699657006U,  // <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1>
-  3699657116U,  // <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3>
-  2637859284U,  // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5>
-  3790319453U,  // <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0>
-  3699657354U,  // <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7>
-  2716725103U,  // <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0>
-  2716798840U,  // <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0>
-  2661747602U,  // <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1>
-  3630252810U,  // <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4>
-  3636225507U,  // <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4>
-  3716910172U,  // <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5>
-  3962195892U,  // <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6>
-  2625916214U,  // <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS
-  3718901071U,  // <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5>
-  2718715846U,  // <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6>
-  2625916457U,  // <0,5,4,u>: Cost 3 vext2 <1,4,0,5>, RHS
-  3791278034U,  // <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0>
-  3791351771U,  // <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0>
-  3318386260U,  // <0,5,5,2>: Cost 4 vrev <5,0,2,5>
-  3791499245U,  // <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0>
-  3318533734U,  // <0,5,5,4>: Cost 4 vrev <5,0,4,5>
-  2718715908U,  // <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5>
-  2657767522U,  // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0>
-  2718715928U,  // <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7>
-  2718715937U,  // <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7>
-  2592358502U,  // <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS
-  3792015404U,  // <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0>
-  3731509754U,  // <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3>
-  3785748546U,  // <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4>
-  2592361782U,  // <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS
-  2592362594U,  // <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0>
-  3785748576U,  // <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7>
-  1644974178U,  // <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0>
-  1645047915U,  // <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0>
-  2562506854U,  // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS
-  2562507670U,  // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0>
-  2562508262U,  // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7>
-  3636250774U,  // <0,5,7,3>: Cost 4 vext1 <2,0,5,7>, <3,0,1,2>
-  2562510134U,  // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS
-  2718716072U,  // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7>
-  2718716074U,  // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0>
-  2719379635U,  // <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0>
-  2562512686U,  // <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS
-  1500717158U,  // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS
-  2625918766U,  // <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS
-  2719674583U,  // <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0>
-  2568489152U,  // <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u>
-  1500720025U,  // <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u>
-  2625919130U,  // <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS
-  2586407243U,  // <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u>
-  1646301444U,  // <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0>
-  1646375181U,  // <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0>
-  2586411110U,  // <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS
-  2619949158U,  // <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS
-  2619949220U,  // <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2>
-  3785748789U,  // <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4>
-  2619949386U,  // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6>
-  2586415202U,  // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0>
-  2586415436U,  // <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0>
-  2952793398U,  // <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS
-  2619949725U,  // <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS
-  2562531430U,  // <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS
-  3693691700U,  // <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1>
-  2886521338U,  // <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3>
-  3693691864U,  // <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3>
-  2562534710U,  // <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS
-  2580450932U,  // <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1>
-  2886521656U,  // <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6>
-  2966736182U,  // <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
-  2966736183U,  // <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS
-  1500741734U,  // <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS
-  2250518817U,  // <0,6,2,1>: Cost 3 vrev <6,0,1,2>
-  2574485096U,  // <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2>
-  2631894694U,  // <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1>
-  1500744604U,  // <0,6,2,4>: Cost 2 vext1 <4,0,6,2>, <4,0,6,2>
-  2574487248U,  // <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3>
-  3020739384U,  // <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6>
-  2954136886U,  // <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS
-  1500747566U,  // <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS
-  3693693078U,  // <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2>
-  3705637136U,  // <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7>
-  3705637192U,  // <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0>
-  3693693340U,  // <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3>
-  2637867477U,  // <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6>
-  3705637424U,  // <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7>
-  3666154056U,  // <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0>
-  2722697800U,  // <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0>
-  2722771537U,  // <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0>
-  2562556006U,  // <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS
-  4095316257U,  // <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2>
-  2562557420U,  // <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4>
-  3636299926U,  // <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2>
-  2562559286U,  // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS
-  2619952438U,  // <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS
-  2723287696U,  // <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0>
-  4027895094U,  // <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS
-  2619952681U,  // <0,6,4,u>: Cost 3 vext2 <0,4,0,6>, RHS
-  2718716594U,  // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
-  3648250774U,  // <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0>
-  3792458436U,  // <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7>
-  3705638767U,  // <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0>
-  3648252831U,  // <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5>
-  3797619416U,  // <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0>
-  3792458472U,  // <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7>
-  4035202358U,  // <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS
-  2718716594U,  // <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
-  3786412796U,  // <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0>
-  3792458504U,  // <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3>
-  3728200126U,  // <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6>
-  3798135575U,  // <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0>
-  3786412836U,  // <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4>
-  3792458543U,  // <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6>
-  2718716728U,  // <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6>
-  2718716738U,  // <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7>
-  2718716747U,  // <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7>
-  2718716750U,  // <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1>
-  2724909910U,  // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0>
-  3636323823U,  // <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7>
-  2725057384U,  // <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0>
-  2718716790U,  // <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5>
-  2718716800U,  // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6>
-  3792458629U,  // <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2>
-  2725352332U,  // <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0>
-  2718716822U,  // <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1>
-  1500790886U,  // <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS
-  2619954990U,  // <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS
-  2562590192U,  // <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u>
-  2725721017U,  // <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0>
-  1500793762U,  // <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u>
-  2619955354U,  // <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS
-  2725942228U,  // <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0>
-  2954186038U,  // <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS
-  1500796718U,  // <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS
-  2256401391U,  // <0,7,0,0>: Cost 3 vrev <7,0,0,0>
-  2632564838U,  // <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS
-  2256548865U,  // <0,7,0,2>: Cost 3 vrev <7,0,2,0>
-  3700998396U,  // <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0>
-  2718716952U,  // <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5>
-  2718716962U,  // <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6>
-  2621284845U,  // <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7>
-  3904685542U,  // <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7>
-  2632565405U,  // <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS
-  2256409584U,  // <0,7,1,0>: Cost 3 vrev <7,0,0,1>
-  3706307380U,  // <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1>
-  2632565654U,  // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0>
-  3769603168U,  // <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5>
-  2256704532U,  // <0,7,1,4>: Cost 3 vrev <7,0,4,1>
-  3769603184U,  // <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3>
-  3700999366U,  // <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7>
-  2886522476U,  // <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7>
-  2256999480U,  // <0,7,1,u>: Cost 3 vrev <7,0,u,1>
-  2586501222U,  // <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS
-  1182749690U,  // <0,7,2,1>: Cost 2 vrev <7,0,1,2>
-  3636356595U,  // <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2>
-  2727711916U,  // <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0>
-  2586504502U,  // <0,7,2,4>: Cost 3 vext1 <6,0,7,2>, RHS
-  2632566606U,  // <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7>
-  2586505559U,  // <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2>
-  3020740204U,  // <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7>
-  1183265849U,  // <0,7,2,u>: Cost 2 vrev <7,0,u,2>
-  3701000342U,  // <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2>
-  3706308849U,  // <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3>
-  3330315268U,  // <0,7,3,2>: Cost 4 vrev <7,0,2,3>
-  3706309020U,  // <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3>
-  3706309122U,  // <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6>
-  3712281127U,  // <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7>
-  2639202936U,  // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
-  3802412321U,  // <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0>
-  2640530202U,  // <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7>
-  3654287462U,  // <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS
-  2256507900U,  // <0,7,4,1>: Cost 3 vrev <7,0,1,4>
-  2256581637U,  // <0,7,4,2>: Cost 3 vrev <7,0,2,4>
-  3660262008U,  // <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7>
-  3786413405U,  // <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6>
-  2632568118U,  // <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS
-  3718917457U,  // <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7>
-  3787003255U,  // <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5>
-  2632568361U,  // <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS
-  3706310268U,  // <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0>
-  3792459156U,  // <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7>
-  3330331654U,  // <0,7,5,2>: Cost 4 vrev <7,0,2,5>
-  3722899255U,  // <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7>
-  2256737304U,  // <0,7,5,4>: Cost 3 vrev <7,0,4,5>
-  3724226521U,  // <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7>
-  2718717377U,  // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7>
-  2729997763U,  // <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0>
-  2720044499U,  // <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7>
-  3712946517U,  // <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0>
-  2256524286U,  // <0,7,6,1>: Cost 3 vrev <7,0,1,6>
-  3792459246U,  // <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7>
-  3796440567U,  // <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7>
-  3654307126U,  // <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS
-  2656457394U,  // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7>
-  3792459281U,  // <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6>
-  2730661396U,  // <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0>
-  2658448293U,  // <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7>
-  3787003431U,  // <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1>
-  3654312854U,  // <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0>
-  3654313446U,  // <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7>
-  3804771905U,  // <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0>
-  3654315318U,  // <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS
-  3654315651U,  // <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7>
-  3660288348U,  // <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7>
-  2718717548U,  // <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7>
-  2664420990U,  // <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7>
-  2256466935U,  // <0,7,u,0>: Cost 3 vrev <7,0,0,u>
-  1182798848U,  // <0,7,u,1>: Cost 2 vrev <7,0,1,u>
-  2256614409U,  // <0,7,u,2>: Cost 3 vrev <7,0,2,u>
-  2731693714U,  // <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0>
-  2256761883U,  // <0,7,u,4>: Cost 3 vrev <7,0,4,u>
-  2632571034U,  // <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS
-  2669066421U,  // <0,7,u,6>: Cost 3 vext2 <u,6,0,7>, <u,6,0,7>
-  2731988662U,  // <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0>
-  1183315007U,  // <0,7,u,u>: Cost 2 vrev <7,0,u,u>
-   135053414U,  // <0,u,0,0>: Cost 1 vdup0 LHS
-  1544896614U,  // <0,u,0,1>: Cost 2 vext2 <0,2,0,u>, LHS
-  1678999654U,  // <0,u,0,2>: Cost 2 vuzpl LHS, LHS
-  2691880677U,  // <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, <u,0,3,2>
-  1476988214U,  // <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS
-  2718791419U,  // <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, <u,0,5,6>
-  3021248666U,  // <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
-  2592535607U,  // <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0>
-   135053414U,  // <0,u,0,u>: Cost 1 vdup0 LHS
-  1476993097U,  // <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1>
-  1812780846U,  // <0,u,1,1>: Cost 2 vzipl LHS, LHS
-  1618138926U,  // <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
-  2752742134U,  // <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
-  1476996406U,  // <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS
-  1812781210U,  // <0,u,1,5>: Cost 2 vzipl LHS, RHS
-  2887006416U,  // <0,u,1,6>: Cost 3 vzipl LHS, <u,6,3,7>
-  2966736200U,  // <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
-  1812781413U,  // <0,u,1,u>: Cost 2 vzipl LHS, LHS
-  1482973286U,  // <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS
-  1482973987U,  // <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2>
-  1946998574U,  // <0,u,2,2>: Cost 2 vtrnl LHS, LHS
-      835584U,  // <0,u,2,3>: Cost 0 copy LHS
-  1482976566U,  // <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS
-  3020781631U,  // <0,u,2,5>: Cost 3 vtrnl LHS, <u,4,5,6>
-  1946998938U,  // <0,u,2,6>: Cost 2 vtrnl LHS, RHS
-  1518810169U,  // <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2>
-      835584U,  // <0,u,2,u>: Cost 0 copy LHS
-  2618640534U,  // <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2>
-  2752743574U,  // <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
-  2636556597U,  // <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u>
-  2752743836U,  // <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
-  2618640898U,  // <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6>
-  2752743938U,  // <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
-  2639202936U,  // <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
-  2639874762U,  // <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u>
-  2752743637U,  // <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
-  2562703462U,  // <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS
-  2888455982U,  // <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
-  3021575982U,  // <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
-  2568677591U,  // <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4>
-  2562706742U,  // <0,u,4,4>: Cost 3 vext1 <2,0,u,4>, RHS
-  1544899894U,  // <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS
-  1679002934U,  // <0,u,4,6>: Cost 2 vuzpl LHS, RHS
-  2718718033U,  // <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, <u,4,7,6>
-  1679002952U,  // <0,u,4,u>: Cost 2 vuzpl LHS, RHS
-  2568683622U,  // <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS
-  2568684438U,  // <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0>
-  3765622902U,  // <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, <u,5,2,7>
-  2691881087U,  // <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, <u,5,3,7>
-  2568686902U,  // <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS
-  2650492890U,  // <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u>
-  1618139290U,  // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
-  2824834358U,  // <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS
-  1618139308U,  // <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
-  2592579686U,  // <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS
-  2262496983U,  // <0,u,6,1>: Cost 3 vrev <u,0,1,6>
-  2654474688U,  // <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u>
-  2691881168U,  // <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, <u,6,3,7>
-  2592582966U,  // <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS
-  2656465587U,  // <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u>
-  2657129220U,  // <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u>
-  1584051029U,  // <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u>
-  1584714662U,  // <0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u>
-  2562728038U,  // <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS
-  2562728854U,  // <0,u,7,1>: Cost 3 vext1 <2,0,u,7>, <1,2,3,0>
-  2562729473U,  // <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7>
-  2661111018U,  // <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u>
-  2562731318U,  // <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS
-  2718718258U,  // <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, <u,7,5,6>
-  2586620261U,  // <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7>
-  2657793644U,  // <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7>
-  2562733870U,  // <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS
-   135053414U,  // <0,u,u,0>: Cost 1 vdup0 LHS
-  1544902446U,  // <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS
-  1679005486U,  // <0,u,u,2>: Cost 2 vuzpl LHS, LHS
-      835584U,  // <0,u,u,3>: Cost 0 copy LHS
-  1483025718U,  // <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS
-  1544902810U,  // <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS
-  1679005850U,  // <0,u,u,6>: Cost 2 vuzpl LHS, RHS
-  1518859327U,  // <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u>
-      835584U,  // <0,u,u,u>: Cost 0 copy LHS
-  2689744896U,  // <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0>
-  1610694666U,  // <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1>
-  2689744916U,  // <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2>
-  2619310332U,  // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0>
-  2684657701U,  // <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1>
-  2620637598U,  // <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0>
-  3708977654U,  // <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7>
-  3666351168U,  // <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0>
-  1611210825U,  // <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1>
-  2556780646U,  // <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS
-  2556781355U,  // <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1>
-  1616003174U,  // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
-  3693052888U,  // <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3>
-  2556783926U,  // <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS
-  2580672143U,  // <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1>
-  2724839566U,  // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7>
-  3654415354U,  // <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2>
-  1616003228U,  // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS
-  2685690019U,  // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1>
-  2685763756U,  // <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1>
-  2698297524U,  // <1,0,2,2>: Cost 3 vext3 <2,3,0,1>, <0,2,2,0>
-  2685911230U,  // <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1>
-  2689745100U,  // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6>
-  3764814038U,  // <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7>
-  2724839640U,  // <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0>
-  2592625658U,  // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2>
-  2686279915U,  // <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1>
-  3087843328U,  // <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0>
-  3087843338U,  // <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1>
-    67944550U,  // <1,0,3,2>: Cost 1 vrev LHS
-  2568743135U,  // <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3>
-  2562772278U,  // <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS
-  4099850454U,  // <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7>
-  3704998538U,  // <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7>
-  2592633923U,  // <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3>
-    68386972U,  // <1,0,3,u>: Cost 1 vrev LHS
-  2620640146U,  // <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1>
-  2689745234U,  // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5>
-  2689745244U,  // <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6>
-  3760980320U,  // <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1>
-  3761054057U,  // <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, <0,4,4,1>
-  2619313462U,  // <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS
-  3761201531U,  // <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1>
-  3666383940U,  // <1,0,4,7>: Cost 4 vext1 <7,1,0,4>, <7,1,0,4>
-  2619313705U,  // <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS
-  4029300736U,  // <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0>
-  2895249510U,  // <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS
-  3028287590U,  // <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
-  3642501345U,  // <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5>
-  2215592058U,  // <1,0,5,4>: Cost 3 vrev <0,1,4,5>
-  3724242907U,  // <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0>
-  3724906540U,  // <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0>
-  3911118134U,  // <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS
-  3028287644U,  // <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS
-  3762086375U,  // <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1>
-  2698297846U,  // <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7>
-  3760022015U,  // <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7>
-  3642509538U,  // <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6>
-  3762381323U,  // <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1>
-  3730215604U,  // <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0>
-  3730879237U,  // <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0>
-  2657801046U,  // <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0>
-  2658464679U,  // <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0>
-  2659128312U,  // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0>
-  4047898278U,  // <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1>
-  2215460970U,  // <1,0,7,2>: Cost 3 vrev <0,1,2,7>
-  3734861035U,  // <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0>
-  3731543398U,  // <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6>
-  3736188301U,  // <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0>
-  2663110110U,  // <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0>
-  3731543660U,  // <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7>
-  2664437376U,  // <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0>
-  3087884288U,  // <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0>
-  1616003730U,  // <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1>
-    67985515U,  // <1,0,u,2>: Cost 1 vrev LHS
-  2689893028U,  // <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1>
-  2689745586U,  // <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6>
-  2619316378U,  // <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS
-  2669082807U,  // <1,0,u,6>: Cost 3 vext2 <u,6,1,0>, <u,6,1,0>
-  2592674888U,  // <1,0,u,7>: Cost 3 vext1 <7,1,0,u>, <7,1,0,u>
-    68427937U,  // <1,0,u,u>: Cost 1 vrev LHS
-  1543585802U,  // <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1>
-  1548894310U,  // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS
-  2618654892U,  // <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1>
-  2689745654U,  // <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2>
-  2622636370U,  // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5>
-  2620645791U,  // <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1>
-  3696378367U,  // <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7>
-  3666424905U,  // <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0>
-  1548894866U,  // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1>
-  1483112550U,  // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
-   202162278U,  // <1,1,1,1>: Cost 1 vdup1 LHS
-  2622636950U,  // <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0>
-  2622637016U,  // <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3>
-  1483115830U,  // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
-  2622637200U,  // <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7>
-  2622637263U,  // <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7>
-  2592691274U,  // <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
-   202162278U,  // <1,1,1,u>: Cost 1 vdup1 LHS
-  2550890588U,  // <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, <0,1,1,2>
-  2617329183U,  // <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1>
-  2622637672U,  // <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2>
-  2622637734U,  // <1,1,2,3>: Cost 3 vext2 <0,u,1,1>, <2,3,0,1>
-  2550893878U,  // <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS
-  3696379744U,  // <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7>
-  2622638010U,  // <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7>
-  3804554170U,  // <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0>
-  2622638139U,  // <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1>
-  2622638230U,  // <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2>
-  3087844148U,  // <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1>
-  4161585244U,  // <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2>
-  2014101606U,  // <1,1,3,3>: Cost 2 vtrnr LHS, LHS
-  2622638594U,  // <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6>
-  2689745920U,  // <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7>
-  3763487753U,  // <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7>
-  2592707660U,  // <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3>
-  2014101611U,  // <1,1,3,u>: Cost 2 vtrnr LHS, LHS
-  2556878950U,  // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS
-  2221335351U,  // <1,1,4,1>: Cost 3 vrev <1,1,1,4>
-  3696380988U,  // <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0>
-  3763487805U,  // <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5>
-  2556882230U,  // <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS
-  1548897590U,  // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS
-  2758184246U,  // <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS
-  3666457677U,  // <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4>
-  1548897833U,  // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS
-  2693653615U,  // <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1>
-  2617331408U,  // <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3>
-  4029302934U,  // <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2>
-  2689746064U,  // <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7>
-  2221564755U,  // <1,1,5,4>: Cost 3 vrev <1,1,4,5>
-  2955559250U,  // <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5>
-  2617331810U,  // <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0>
-  2825293110U,  // <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS
-  2689746109U,  // <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7>
-  3696382241U,  // <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2>
-  2689746127U,  // <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7>
-  2617332218U,  // <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3>
-  3763487969U,  // <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, <1,6,3,7>
-  3696382605U,  // <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6>
-  4029309266U,  // <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5>
-  2617332536U,  // <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6>
-  2724840702U,  // <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0>
-  2725504263U,  // <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0>
-  2617332720U,  // <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1>
-  2659800138U,  // <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
-  3691074717U,  // <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3>
-  4167811174U,  // <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS
-  2617333094U,  // <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6>
-  3295396702U,  // <1,1,7,5>: Cost 4 vrev <1,1,5,7>
-  3803891014U,  // <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0>
-  2617333356U,  // <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7>
-  2659800138U,  // <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
-  1483112550U,  // <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS
-   202162278U,  // <1,1,u,1>: Cost 1 vdup1 LHS
-  2622642056U,  // <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, <u,2,3,3>
-  2014142566U,  // <1,1,u,3>: Cost 2 vtrnr LHS, LHS
-  1483115830U,  // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS
-  1548900506U,  // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS
-  2622642384U,  // <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, <u,6,3,7>
-  2825293353U,  // <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS
-   202162278U,  // <1,1,u,u>: Cost 1 vdup1 LHS
-  2635251712U,  // <1,2,0,0>: Cost 3 vext2 <3,0,1,2>, <0,0,0,0>
-  1561509990U,  // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS
-  2618663085U,  // <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2>
-  2696529358U,  // <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1>
-  2635252050U,  // <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5>
-  3769533926U,  // <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7>
-  2621317617U,  // <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2>
-  2659140170U,  // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1>
-  1561510557U,  // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS
-  2623308516U,  // <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2>
-  2635252532U,  // <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1>
-  2631271318U,  // <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0>
-  2958180454U,  // <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS
-  2550959414U,  // <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS
-  2635252880U,  // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7>
-  2635252952U,  // <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7>
-  3732882731U,  // <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0>
-  2958180459U,  // <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS
-  2629281213U,  // <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2>
-  2635253280U,  // <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2>
-  2618664552U,  // <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2>
-  2689746546U,  // <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3>
-  3764815485U,  // <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5>
-  3760023176U,  // <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7>
-  2635253690U,  // <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7>
-  2659141610U,  // <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1>
-  2689746591U,  // <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3>
-   403488870U,  // <1,2,3,0>: Cost 1 vext1 LHS, LHS
-  1477231350U,  // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
-  1477232232U,  // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
-  1477233052U,  // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3>
-   403492150U,  // <1,2,3,4>: Cost 1 vext1 LHS, RHS
-  1525010128U,  // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
-  1525010938U,  // <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
-  1525011450U,  // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
-   403494702U,  // <1,2,3,u>: Cost 1 vext1 LHS, LHS
-  2641226607U,  // <1,2,4,0>: Cost 3 vext2 <4,0,1,2>, <4,0,1,2>
-  3624723446U,  // <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6>
-  3301123609U,  // <1,2,4,2>: Cost 4 vrev <2,1,2,4>
-  2598759198U,  // <1,2,4,3>: Cost 3 vext1 <u,1,2,4>, <3,u,1,2>
-  2659142864U,  // <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4>
-  1561513270U,  // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS
-  2659143028U,  // <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6>
-  2659143112U,  // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0>
-  1561513513U,  // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS
-  2550988902U,  // <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS
-  2550989824U,  // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7>
-  3624732264U,  // <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2>
-  2955559014U,  // <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
-  2550992182U,  // <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS
-  2659143684U,  // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5>
-  2659143778U,  // <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0>
-  2659143848U,  // <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7>
-  2550994734U,  // <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS
-  2700289945U,  // <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1>
-  2635256232U,  // <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2>
-  2659144186U,  // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3>
-  2689746874U,  // <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7>
-  3763488705U,  // <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5>
-  3763488716U,  // <1,2,6,5>: Cost 4 vext3 <0,u,1,1>, <2,6,5,7>
-  2659144504U,  // <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6>
-  2657817432U,  // <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2>
-  2689746919U,  // <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7>
-  1585402874U,  // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2>
-  2659144770U,  // <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2>
-  3708998858U,  // <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3>
-  2635257059U,  // <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1>
-  2659145062U,  // <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6>
-  3732886916U,  // <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0>
-  3732886998U,  // <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1>
-  2659145255U,  // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1>
-  1590711938U,  // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2>
-   403529835U,  // <1,2,u,0>: Cost 1 vext1 LHS, LHS
-  1477272310U,  // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2>
-  1477273192U,  // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2>
-  1477273750U,  // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2>
-   403533110U,  // <1,2,u,4>: Cost 1 vext1 LHS, RHS
-  1561516186U,  // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS
-  1525051898U,  // <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3>
-  1525052410U,  // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
-   403535662U,  // <1,2,u,u>: Cost 1 vext1 LHS, LHS
-  2819407872U,  // <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0>
-  1551564902U,  // <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS
-  2819408630U,  // <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2>
-  2619334911U,  // <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3>
-  2625306962U,  // <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5>
-  3832725879U,  // <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6>
-  3699048959U,  // <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7>
-  3776538827U,  // <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1>
-  1551565469U,  // <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS
-  2618671862U,  // <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2>
-  2819408692U,  // <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1>
-  2624643975U,  // <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3>
-  1745666150U,  // <1,3,1,3>: Cost 2 vuzpr LHS, LHS
-  2557005110U,  // <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS
-  2625307792U,  // <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, <1,5,3,7>
-  3698386127U,  // <1,3,1,6>: Cost 4 vext2 <1,2,1,3>, <1,6,1,7>
-  2592838748U,  // <1,3,1,7>: Cost 3 vext1 <7,1,3,1>, <7,1,3,1>
-  1745666155U,  // <1,3,1,u>: Cost 2 vuzpr LHS, LHS
-  2819408790U,  // <1,3,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
-  2625308193U,  // <1,3,2,1>: Cost 3 vext2 <1,3,1,3>, <2,1,3,3>
-  2819408036U,  // <1,3,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
-  2819851890U,  // <1,3,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
-  2819408794U,  // <1,3,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
-  3893149890U,  // <1,3,2,5>: Cost 4 vuzpr LHS, <0,2,3,5>
-  2819408076U,  // <1,3,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
-  3772041583U,  // <1,3,2,7>: Cost 4 vext3 <2,3,0,1>, <3,2,7,3>
-  2819408042U,  // <1,3,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
-  1483276390U,  // <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS
-  1483277128U,  // <1,3,3,1>: Cost 2 vext1 <1,1,3,3>, <1,1,3,3>
-  2557019752U,  // <1,3,3,2>: Cost 3 vext1 <1,1,3,3>, <2,2,2,2>
-  2819408856U,  // <1,3,3,3>: Cost 3 vuzpr LHS, <1,3,1,3>
-  1483279670U,  // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS
-  2819409614U,  // <1,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
-  2598826490U,  // <1,3,3,6>: Cost 3 vext1 <u,1,3,3>, <6,2,7,3>
-  3087844352U,  // <1,3,3,7>: Cost 3 vtrnr LHS, <1,3,5,7>
-  1483282222U,  // <1,3,3,u>: Cost 2 vext1 <1,1,3,3>, LHS
-  2568970342U,  // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS
-  2568971224U,  // <1,3,4,1>: Cost 3 vext1 <3,1,3,4>, <1,3,1,3>
-  3832761290U,  // <1,3,4,2>: Cost 4 vuzpl <1,2,3,4>, <4,1,2,3>
-  2233428219U,  // <1,3,4,3>: Cost 3 vrev <3,1,3,4>
-  2568973622U,  // <1,3,4,4>: Cost 3 vext1 <3,1,3,4>, RHS
-  1551568182U,  // <1,3,4,5>: Cost 2 vext2 <1,3,1,3>, RHS
-  2819410434U,  // <1,3,4,6>: Cost 3 vuzpr LHS, <3,4,5,6>
-  3666605151U,  // <1,3,4,7>: Cost 4 vext1 <7,1,3,4>, <7,1,3,4>
-  1551568425U,  // <1,3,4,u>: Cost 2 vext2 <1,3,1,3>, RHS
-  2563006566U,  // <1,3,5,0>: Cost 3 vext1 <2,1,3,5>, LHS
-  2568979456U,  // <1,3,5,1>: Cost 3 vext1 <3,1,3,5>, <1,3,5,7>
-  2563008035U,  // <1,3,5,2>: Cost 3 vext1 <2,1,3,5>, <2,1,3,5>
-  2233436412U,  // <1,3,5,3>: Cost 3 vrev <3,1,3,5>
-  2563009846U,  // <1,3,5,4>: Cost 3 vext1 <2,1,3,5>, RHS
-  2867187716U,  // <1,3,5,5>: Cost 3 vuzpr LHS, <5,5,5,5>
-  2655834214U,  // <1,3,5,6>: Cost 3 vext2 <6,4,1,3>, <5,6,7,4>
-  1745669430U,  // <1,3,5,7>: Cost 2 vuzpr LHS, RHS
-  1745669431U,  // <1,3,5,u>: Cost 2 vuzpr LHS, RHS
-  2867187810U,  // <1,3,6,0>: Cost 3 vuzpr LHS, <5,6,7,0>
-  3699052931U,  // <1,3,6,1>: Cost 4 vext2 <1,3,1,3>, <6,1,3,1>
-  2654507460U,  // <1,3,6,2>: Cost 3 vext2 <6,2,1,3>, <6,2,1,3>
-  3766291091U,  // <1,3,6,3>: Cost 4 vext3 <1,3,3,1>, <3,6,3,7>
-  2655834726U,  // <1,3,6,4>: Cost 3 vext2 <6,4,1,3>, <6,4,1,3>
-  3923384562U,  // <1,3,6,5>: Cost 4 vuzpr <5,1,7,3>, <u,6,7,5>
-  2657161992U,  // <1,3,6,6>: Cost 3 vext2 <6,6,1,3>, <6,6,1,3>
-  2819852218U,  // <1,3,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
-  2819852219U,  // <1,3,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
-  2706926275U,  // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1>
-  2659816524U,  // <1,3,7,1>: Cost 3 vext2 <7,1,1,3>, <7,1,1,3>
-  3636766245U,  // <1,3,7,2>: Cost 4 vext1 <2,1,3,7>, <2,1,3,7>
-  2867187903U,  // <1,3,7,3>: Cost 3 vuzpr LHS, <5,7,u,3>
-  2625312102U,  // <1,3,7,4>: Cost 3 vext2 <1,3,1,3>, <7,4,5,6>
-  2867188598U,  // <1,3,7,5>: Cost 3 vuzpr LHS, <6,7,4,5>
-  3728250344U,  // <1,3,7,6>: Cost 4 vext2 <6,2,1,3>, <7,6,2,1>
-  2867187880U,  // <1,3,7,7>: Cost 3 vuzpr LHS, <5,7,5,7>
-  2707516171U,  // <1,3,7,u>: Cost 3 vext3 <3,7,u,1>, <3,7,u,1>
-  1483317350U,  // <1,3,u,0>: Cost 2 vext1 <1,1,3,u>, LHS
-  1483318093U,  // <1,3,u,1>: Cost 2 vext1 <1,1,3,u>, <1,1,3,u>
-  2819410718U,  // <1,3,u,2>: Cost 3 vuzpr LHS, <3,u,1,2>
-  1745666717U,  // <1,3,u,3>: Cost 2 vuzpr LHS, LHS
-  1483320630U,  // <1,3,u,4>: Cost 2 vext1 <1,1,3,u>, RHS
-  1551571098U,  // <1,3,u,5>: Cost 2 vext2 <1,3,1,3>, RHS
-  2819410758U,  // <1,3,u,6>: Cost 3 vuzpr LHS, <3,u,5,6>
-  1745669673U,  // <1,3,u,7>: Cost 2 vuzpr LHS, RHS
-  1745666722U,  // <1,3,u,u>: Cost 2 vuzpr LHS, LHS
-  2617352205U,  // <1,4,0,0>: Cost 3 vext2 <0,0,1,4>, <0,0,1,4>
-  2619342950U,  // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS
-  3692421295U,  // <1,4,0,2>: Cost 4 vext2 <0,2,1,4>, <0,2,1,4>
-  2619343104U,  // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
-  2617352530U,  // <1,4,0,4>: Cost 3 vext2 <0,0,1,4>, <0,4,1,5>
-  1634880402U,  // <1,4,0,5>: Cost 2 vext3 <4,0,5,1>, <4,0,5,1>
-  2713930652U,  // <1,4,0,6>: Cost 3 vext3 <4,u,5,1>, <4,0,6,2>
-  3732898396U,  // <1,4,0,7>: Cost 4 vext2 <7,0,1,4>, <0,7,4,1>
-  1635101613U,  // <1,4,0,u>: Cost 2 vext3 <4,0,u,1>, <4,0,u,1>
-  3693085430U,  // <1,4,1,0>: Cost 4 vext2 <0,3,1,4>, <1,0,3,2>
-  2623988535U,  // <1,4,1,1>: Cost 3 vext2 <1,1,1,4>, <1,1,1,4>
-  3693085590U,  // <1,4,1,2>: Cost 4 vext2 <0,3,1,4>, <1,2,3,0>
-  3692422134U,  // <1,4,1,3>: Cost 4 vext2 <0,2,1,4>, <1,3,4,6>
-  3693085726U,  // <1,4,1,4>: Cost 4 vext2 <0,3,1,4>, <1,4,0,1>
-  2892401974U,  // <1,4,1,5>: Cost 3 vzipl <1,1,1,1>, RHS
-  3026619702U,  // <1,4,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
-  3800206324U,  // <1,4,1,7>: Cost 4 vext3 <7,0,4,1>, <4,1,7,0>
-  2892402217U,  // <1,4,1,u>: Cost 3 vzipl <1,1,1,1>, RHS
-  3966978927U,  // <1,4,2,0>: Cost 4 vzipl <1,2,3,4>, <4,0,1,2>
-  3966979018U,  // <1,4,2,1>: Cost 4 vzipl <1,2,3,4>, <4,1,2,3>
-  3693086312U,  // <1,4,2,2>: Cost 4 vext2 <0,3,1,4>, <2,2,2,2>
-  2635269798U,  // <1,4,2,3>: Cost 3 vext2 <3,0,1,4>, <2,3,0,1>
-  3966979280U,  // <1,4,2,4>: Cost 4 vzipl <1,2,3,4>, <4,4,4,4>
-  2893204790U,  // <1,4,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
-  3693086650U,  // <1,4,2,6>: Cost 4 vext2 <0,3,1,4>, <2,6,3,7>
-  3666662502U,  // <1,4,2,7>: Cost 4 vext1 <7,1,4,2>, <7,1,4,2>
-  2893205033U,  // <1,4,2,u>: Cost 3 vzipl <1,2,3,0>, RHS
-  2563063910U,  // <1,4,3,0>: Cost 3 vext1 <2,1,4,3>, LHS
-  2563064730U,  // <1,4,3,1>: Cost 3 vext1 <2,1,4,3>, <1,2,3,4>
-  2563065386U,  // <1,4,3,2>: Cost 3 vext1 <2,1,4,3>, <2,1,4,3>
-  3693087132U,  // <1,4,3,3>: Cost 4 vext2 <0,3,1,4>, <3,3,3,3>
-  2619345410U,  // <1,4,3,4>: Cost 3 vext2 <0,3,1,4>, <3,4,5,6>
-  3087843666U,  // <1,4,3,5>: Cost 3 vtrnr LHS, <0,4,1,5>
-  3087843676U,  // <1,4,3,6>: Cost 3 vtrnr LHS, <0,4,2,6>
-  3666670695U,  // <1,4,3,7>: Cost 4 vext1 <7,1,4,3>, <7,1,4,3>
-  3087843669U,  // <1,4,3,u>: Cost 3 vtrnr LHS, <0,4,1,u>
-  2620672914U,  // <1,4,4,0>: Cost 3 vext2 <0,5,1,4>, <4,0,5,1>
-  3630842706U,  // <1,4,4,1>: Cost 4 vext1 <1,1,4,4>, <1,1,4,4>
-  3313069003U,  // <1,4,4,2>: Cost 4 vrev <4,1,2,4>
-  3642788100U,  // <1,4,4,3>: Cost 4 vext1 <3,1,4,4>, <3,1,4,4>
-  2713930960U,  // <1,4,4,4>: Cost 3 vext3 <4,u,5,1>, <4,4,4,4>
-  2619346230U,  // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS
-  2713930980U,  // <1,4,4,6>: Cost 3 vext3 <4,u,5,1>, <4,4,6,6>
-  3736882642U,  // <1,4,4,7>: Cost 4 vext2 <7,6,1,4>, <4,7,6,1>
-  2619346473U,  // <1,4,4,u>: Cost 3 vext2 <0,3,1,4>, RHS
-  2557108326U,  // <1,4,5,0>: Cost 3 vext1 <1,1,4,5>, LHS
-  2557109075U,  // <1,4,5,1>: Cost 3 vext1 <1,1,4,5>, <1,1,4,5>
-  2598913774U,  // <1,4,5,2>: Cost 3 vext1 <u,1,4,5>, <2,3,u,1>
-  3630852246U,  // <1,4,5,3>: Cost 4 vext1 <1,1,4,5>, <3,0,1,2>
-  2557111606U,  // <1,4,5,4>: Cost 3 vext1 <1,1,4,5>, RHS
-  2895252790U,  // <1,4,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
-  1616006454U,  // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
-  3899059510U,  // <1,4,5,7>: Cost 4 vuzpr <1,1,1,4>, RHS
-  1616006472U,  // <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS
-  2557116518U,  // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS
-  2557117236U,  // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,1,1>
-  3630859880U,  // <1,4,6,2>: Cost 4 vext1 <1,1,4,6>, <2,2,2,2>
-  2569062550U,  // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,0,1,2>
-  2557119798U,  // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS
-  3763490174U,  // <1,4,6,5>: Cost 4 vext3 <0,u,1,1>, <4,6,5,7>
-  3763490183U,  // <1,4,6,6>: Cost 4 vext3 <0,u,1,1>, <4,6,6,7>
-  2712751498U,  // <1,4,6,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
-  2557122350U,  // <1,4,6,u>: Cost 3 vext1 <1,1,4,6>, LHS
-  2659161084U,  // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4>
-  3732903040U,  // <1,4,7,1>: Cost 4 vext2 <7,0,1,4>, <7,1,7,1>
-  3734230174U,  // <1,4,7,2>: Cost 4 vext2 <7,2,1,4>, <7,2,1,4>
-  3734893807U,  // <1,4,7,3>: Cost 4 vext2 <7,3,1,4>, <7,3,1,4>
-  3660729654U,  // <1,4,7,4>: Cost 4 vext1 <6,1,4,7>, RHS
-  3786493384U,  // <1,4,7,5>: Cost 4 vext3 <4,6,7,1>, <4,7,5,0>
-  2713341394U,  // <1,4,7,6>: Cost 3 vext3 <4,7,6,1>, <4,7,6,1>
-  3660731386U,  // <1,4,7,7>: Cost 4 vext1 <6,1,4,7>, <7,0,1,2>
-  2664470148U,  // <1,4,7,u>: Cost 3 vext2 <7,u,1,4>, <7,u,1,4>
-  2557132902U,  // <1,4,u,0>: Cost 3 vext1 <1,1,4,u>, LHS
-  2619348782U,  // <1,4,u,1>: Cost 3 vext2 <0,3,1,4>, LHS
-  2563106351U,  // <1,4,u,2>: Cost 3 vext1 <2,1,4,u>, <2,1,4,u>
-  2713783816U,  // <1,4,u,3>: Cost 3 vext3 <4,u,3,1>, <4,u,3,1>
-  2622666815U,  // <1,4,u,4>: Cost 3 vext2 <0,u,1,4>, <u,4,5,6>
-  1640189466U,  // <1,4,u,5>: Cost 2 vext3 <4,u,5,1>, <4,u,5,1>
-  1616006697U,  // <1,4,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
-  2712751498U,  // <1,4,u,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
-  1616006715U,  // <1,4,u,u>: Cost 2 vext3 <0,u,1,1>, RHS
-  2620014592U,  // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0>
-  1546272870U,  // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS
-  2618687664U,  // <1,5,0,2>: Cost 3 vext2 <0,2,1,5>, <0,2,1,5>
-  3693093120U,  // <1,5,0,3>: Cost 4 vext2 <0,3,1,5>, <0,3,1,4>
-  1546273106U,  // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
-  2620678563U,  // <1,5,0,5>: Cost 3 vext2 <0,5,1,5>, <0,5,1,5>
-  2714668660U,  // <1,5,0,6>: Cost 3 vext3 <5,0,6,1>, <5,0,6,1>
-  3772042877U,  // <1,5,0,7>: Cost 4 vext3 <2,3,0,1>, <5,0,7,1>
-  1546273437U,  // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS
-  2620015350U,  // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2>
-  2620015412U,  // <1,5,1,1>: Cost 3 vext2 <0,4,1,5>, <1,1,1,1>
-  2620015510U,  // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0>
-  2618688512U,  // <1,5,1,3>: Cost 3 vext2 <0,2,1,5>, <1,3,5,7>
-  2620015677U,  // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5>
-  2620015727U,  // <1,5,1,5>: Cost 3 vext2 <0,4,1,5>, <1,5,0,1>
-  2620015859U,  // <1,5,1,6>: Cost 3 vext2 <0,4,1,5>, <1,6,5,7>
-  3093728566U,  // <1,5,1,7>: Cost 3 vtrnr <1,1,1,1>, RHS
-  2620015981U,  // <1,5,1,u>: Cost 3 vext2 <0,4,1,5>, <1,u,1,3>
-  3692430816U,  // <1,5,2,0>: Cost 4 vext2 <0,2,1,5>, <2,0,5,1>
-  2620016163U,  // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5>
-  2620016232U,  // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2>
-  2620016294U,  // <1,5,2,3>: Cost 3 vext2 <0,4,1,5>, <2,3,0,1>
-  3693758221U,  // <1,5,2,4>: Cost 4 vext2 <0,4,1,5>, <2,4,2,5>
-  3692431209U,  // <1,5,2,5>: Cost 4 vext2 <0,2,1,5>, <2,5,3,7>
-  2620016570U,  // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7>
-  4173598006U,  // <1,5,2,7>: Cost 4 vtrnr <2,1,3,2>, RHS
-  2620016699U,  // <1,5,2,u>: Cost 3 vext2 <0,4,1,5>, <2,u,0,1>
-  2620016790U,  // <1,5,3,0>: Cost 3 vext2 <0,4,1,5>, <3,0,1,2>
-  2569110672U,  // <1,5,3,1>: Cost 3 vext1 <3,1,5,3>, <1,5,3,7>
-  3693758785U,  // <1,5,3,2>: Cost 4 vext2 <0,4,1,5>, <3,2,2,2>
-  2620017052U,  // <1,5,3,3>: Cost 3 vext2 <0,4,1,5>, <3,3,3,3>
-  2620017154U,  // <1,5,3,4>: Cost 3 vext2 <0,4,1,5>, <3,4,5,6>
-  3135623172U,  // <1,5,3,5>: Cost 3 vtrnr LHS, <5,5,5,5>
-  4161587048U,  // <1,5,3,6>: Cost 4 vtrnr LHS, <2,5,3,6>
-  2014104886U,  // <1,5,3,7>: Cost 2 vtrnr LHS, RHS
-  2014104887U,  // <1,5,3,u>: Cost 2 vtrnr LHS, RHS
-  2620017554U,  // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1>
-  2620017634U,  // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
-  3693759551U,  // <1,5,4,2>: Cost 4 vext2 <0,4,1,5>, <4,2,6,3>
-  3642861837U,  // <1,5,4,3>: Cost 4 vext1 <3,1,5,4>, <3,1,5,4>
-  2575092710U,  // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4>
-  1546276150U,  // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS
-  2759855414U,  // <1,5,4,6>: Cost 3 vuzpl <1,3,5,7>, RHS
-  2713931718U,  // <1,5,4,7>: Cost 3 vext3 <4,u,5,1>, <5,4,7,6>
-  1546276393U,  // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS
-  2557182054U,  // <1,5,5,0>: Cost 3 vext1 <1,1,5,5>, LHS
-  2557182812U,  // <1,5,5,1>: Cost 3 vext1 <1,1,5,5>, <1,1,5,5>
-  3630925347U,  // <1,5,5,2>: Cost 4 vext1 <1,1,5,5>, <2,1,3,5>
-  4029301675U,  // <1,5,5,3>: Cost 4 vzipr <0,4,1,5>, <1,2,5,3>
-  2557185334U,  // <1,5,5,4>: Cost 3 vext1 <1,1,5,5>, RHS
-  2713931780U,  // <1,5,5,5>: Cost 3 vext3 <4,u,5,1>, <5,5,5,5>
-  2667794530U,  // <1,5,5,6>: Cost 3 vext2 <u,4,1,5>, <5,6,7,0>
-  2713931800U,  // <1,5,5,7>: Cost 3 vext3 <4,u,5,1>, <5,5,7,7>
-  2557187886U,  // <1,5,5,u>: Cost 3 vext1 <1,1,5,5>, LHS
-  2718208036U,  // <1,5,6,0>: Cost 3 vext3 <5,6,0,1>, <5,6,0,1>
-  2620019115U,  // <1,5,6,1>: Cost 3 vext2 <0,4,1,5>, <6,1,7,5>
-  2667794938U,  // <1,5,6,2>: Cost 3 vext2 <u,4,1,5>, <6,2,7,3>
-  3787673666U,  // <1,5,6,3>: Cost 4 vext3 <4,u,5,1>, <5,6,3,4>
-  3693761165U,  // <1,5,6,4>: Cost 4 vext2 <0,4,1,5>, <6,4,5,6>
-  3319279297U,  // <1,5,6,5>: Cost 4 vrev <5,1,5,6>
-  2667795256U,  // <1,5,6,6>: Cost 3 vext2 <u,4,1,5>, <6,6,6,6>
-  2713931874U,  // <1,5,6,7>: Cost 3 vext3 <4,u,5,1>, <5,6,7,0>
-  2713931883U,  // <1,5,6,u>: Cost 3 vext3 <4,u,5,1>, <5,6,u,0>
-  2557198438U,  // <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS
-  2557199156U,  // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,1,1>
-  2569143974U,  // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1>
-  2569144592U,  // <1,5,7,3>: Cost 3 vext1 <3,1,5,7>, <3,1,5,7>
-  2557201718U,  // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS
-  2713931944U,  // <1,5,7,5>: Cost 3 vext3 <4,u,5,1>, <5,7,5,7>
-  3787673770U,  // <1,5,7,6>: Cost 4 vext3 <4,u,5,1>, <5,7,6,0>
-  2719387828U,  // <1,5,7,7>: Cost 3 vext3 <5,7,7,1>, <5,7,7,1>
-  2557204270U,  // <1,5,7,u>: Cost 3 vext1 <1,1,5,7>, LHS
-  2620020435U,  // <1,5,u,0>: Cost 3 vext2 <0,4,1,5>, <u,0,1,2>
-  1546278702U,  // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS
-  2620020616U,  // <1,5,u,2>: Cost 3 vext2 <0,4,1,5>, <u,2,3,3>
-  2620020668U,  // <1,5,u,3>: Cost 3 vext2 <0,4,1,5>, <u,3,0,1>
-  1594054682U,  // <1,5,u,4>: Cost 2 vext2 <u,4,1,5>, <u,4,1,5>
-  1546279066U,  // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS
-  2620020944U,  // <1,5,u,6>: Cost 3 vext2 <0,4,1,5>, <u,6,3,7>
-  2014145846U,  // <1,5,u,7>: Cost 2 vtrnr LHS, RHS
-  2014145847U,  // <1,5,u,u>: Cost 2 vtrnr LHS, RHS
-  3692437504U,  // <1,6,0,0>: Cost 4 vext2 <0,2,1,6>, <0,0,0,0>
-  2618695782U,  // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS
-  2618695857U,  // <1,6,0,2>: Cost 3 vext2 <0,2,1,6>, <0,2,1,6>
-  3794161970U,  // <1,6,0,3>: Cost 4 vext3 <6,0,3,1>, <6,0,3,1>
-  2620023122U,  // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,5>
-  2620686756U,  // <1,6,0,5>: Cost 3 vext2 <0,5,1,6>, <0,5,1,6>
-  2621350389U,  // <1,6,0,6>: Cost 3 vext2 <0,6,1,6>, <0,6,1,6>
-  4028599606U,  // <1,6,0,7>: Cost 4 vzipr <0,3,1,0>, RHS
-  2618696349U,  // <1,6,0,u>: Cost 3 vext2 <0,2,1,6>, LHS
-  3692438262U,  // <1,6,1,0>: Cost 4 vext2 <0,2,1,6>, <1,0,3,2>
-  2625995572U,  // <1,6,1,1>: Cost 3 vext2 <1,4,1,6>, <1,1,1,1>
-  3692438422U,  // <1,6,1,2>: Cost 4 vext2 <0,2,1,6>, <1,2,3,0>
-  3692438488U,  // <1,6,1,3>: Cost 4 vext2 <0,2,1,6>, <1,3,1,3>
-  2625995820U,  // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6>
-  3692438672U,  // <1,6,1,5>: Cost 4 vext2 <0,2,1,6>, <1,5,3,7>
-  3692438720U,  // <1,6,1,6>: Cost 4 vext2 <0,2,1,6>, <1,6,0,1>
-  2958183734U,  // <1,6,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
-  2958183735U,  // <1,6,1,u>: Cost 3 vzipr <0,u,1,1>, RHS
-  2721526201U,  // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1>
-  3692439097U,  // <1,6,2,1>: Cost 4 vext2 <0,2,1,6>, <2,1,6,0>
-  3692439144U,  // <1,6,2,2>: Cost 4 vext2 <0,2,1,6>, <2,2,2,2>
-  3692439206U,  // <1,6,2,3>: Cost 4 vext2 <0,2,1,6>, <2,3,0,1>
-  3636948278U,  // <1,6,2,4>: Cost 4 vext1 <2,1,6,2>, RHS
-  3787674092U,  // <1,6,2,5>: Cost 4 vext3 <4,u,5,1>, <6,2,5,7>
-  2618697658U,  // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7>
-  2970799414U,  // <1,6,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
-  2970799415U,  // <1,6,2,u>: Cost 3 vzipr <3,0,1,2>, RHS
-  2563211366U,  // <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS
-  3699738854U,  // <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1>
-  2563212860U,  // <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3>
-  3692439964U,  // <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3>
-  2563214646U,  // <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS
-  4191820018U,  // <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>, <u,6,7,5>
-  2587103648U,  // <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3>
-  3087845306U,  // <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7>
-  3087845307U,  // <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u>
-  3693767570U,  // <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1>
-  3693767650U,  // <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0>
-  3636962877U,  // <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4>
-  3325088134U,  // <1,6,4,3>: Cost 4 vrev <6,1,3,4>
-  3693767898U,  // <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5>
-  2618699062U,  // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS
-  3833670966U,  // <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS
-  4028632374U,  // <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS
-  2618699305U,  // <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS
-  3693768264U,  // <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2>
-  3630998373U,  // <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5>
-  3636971070U,  // <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5>
-  3642943767U,  // <1,6,5,3>: Cost 4 vext1 <3,1,6,5>, <3,1,6,5>
-  3693768628U,  // <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6>
-  3732918276U,  // <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5>
-  2620690530U,  // <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0>
-  2955562294U,  // <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS
-  2955562295U,  // <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS
-  2724180733U,  // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1>
-  3631006566U,  // <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6>
-  3631007674U,  // <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7>
-  3692442184U,  // <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0>
-  3631009078U,  // <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS
-  3787674416U,  // <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7>
-  2713932600U,  // <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6>
-  2713932610U,  // <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7>
-  2713932619U,  // <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7>
-  1651102542U,  // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1>
-  2724918103U,  // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1>
-  2698302306U,  // <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3>
-  3642960153U,  // <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7>
-  2713932662U,  // <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, <6,7,4,5>
-  2725213051U,  // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1>
-  2724844426U,  // <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7>
-  4035956022U,  // <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS
-  1651692438U,  // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1>
-  1651766175U,  // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1>
-  2618701614U,  // <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS
-  3135663508U,  // <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2>
-  3692443580U,  // <1,6,u,3>: Cost 4 vext2 <0,2,1,6>, <u,3,0,1>
-  2713932743U,  // <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5>
-  2618701978U,  // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS
-  2622683344U,  // <1,6,u,6>: Cost 3 vext2 <0,u,1,6>, <u,6,3,7>
-  3087886266U,  // <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7>
-  1652356071U,  // <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1>
-  2726171632U,  // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1>
-  2626666598U,  // <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS
-  3695100067U,  // <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1>
-  3707044102U,  // <1,7,0,3>: Cost 4 vext2 <2,6,1,7>, <0,3,2,1>
-  2726466580U,  // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1>
-  3654921933U,  // <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0>
-  2621358582U,  // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7>
-  2622022215U,  // <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7>
-  2626667165U,  // <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS
-  2593128550U,  // <1,7,1,0>: Cost 3 vext1 <7,1,7,1>, LHS
-  2626667316U,  // <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1>
-  3700409238U,  // <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0>
-  2257294428U,  // <1,7,1,3>: Cost 3 vrev <7,1,3,1>
-  2593131830U,  // <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS
-  2626667646U,  // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7>
-  2627331279U,  // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7>
-  2593133696U,  // <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1>
-  2628658545U,  // <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7>
-  2587164774U,  // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS
-  3701073445U,  // <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7>
-  3700409960U,  // <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2>
-  2638612134U,  // <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1>
-  2587168054U,  // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS
-  3706382167U,  // <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7>
-  2587169192U,  // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2>
-  3660911610U,  // <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2>
-  2587170606U,  // <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS
-  1507459174U,  // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS
-  2569257984U,  // <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7>
-  2581202536U,  // <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2>
-  2569259294U,  // <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3>
-  1507462454U,  // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS
-  1507462864U,  // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3>
-  2581205498U,  // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3>
-  2581206010U,  // <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2>
-  1507465006U,  // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS
-  2728826164U,  // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1>
-  3654951732U,  // <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1>
-  3330987094U,  // <1,7,4,2>: Cost 4 vrev <7,1,2,4>
-  3331060831U,  // <1,7,4,3>: Cost 4 vrev <7,1,3,4>
-  3787674971U,  // <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4>
-  2626669878U,  // <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS
-  3785979241U,  // <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0>
-  3787085176U,  // <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6>
-  2626670121U,  // <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS
-  2569273446U,  // <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS
-  2569274368U,  // <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7>
-  3643016808U,  // <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2>
-  2569275680U,  // <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5>
-  2569276726U,  // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS
-  4102034790U,  // <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6>
-  2651222067U,  // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7>
-  3899378998U,  // <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS
-  2569279278U,  // <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS
-  2730153430U,  // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1>
-  2724845022U,  // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0>
-  3643025338U,  // <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7>
-  3643025697U,  // <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6>
-  3643026742U,  // <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS
-  3654971091U,  // <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6>
-  3787675153U,  // <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6>
-  2724845076U,  // <1,7,6,7>: Cost 3 vext3 <6,7,0,1>, <7,6,7,0>
-  2725508637U,  // <1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0>
-  2730817063U,  // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1>
-  3631088436U,  // <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1>
-  3660949158U,  // <1,7,7,2>: Cost 4 vext1 <6,1,7,7>, <2,3,0,1>
-  3801904705U,  // <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0>
-  3631090998U,  // <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS
-  2662503828U,  // <1,7,7,5>: Cost 3 vext2 <7,5,1,7>, <7,5,1,7>
-  3660951981U,  // <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7>
-  2713933420U,  // <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7>
-  2731406959U,  // <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1>
-  1507500134U,  // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS
-  2626672430U,  // <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS
-  2581243496U,  // <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2>
-  2569300259U,  // <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u>
-  1507503414U,  // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS
-  1507503829U,  // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u>
-  2581246458U,  // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3>
-  2581246970U,  // <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2>
-  1507505966U,  // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS
-  1543643153U,  // <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u>
-  1546297446U,  // <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS
-  2819448852U,  // <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2>
-  2619375876U,  // <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u>
-  1546297685U,  // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u>
-  1658771190U,  // <1,u,0,5>: Cost 2 vext3 <u,0,5,1>, <u,0,5,1>
-  2736789248U,  // <1,u,0,6>: Cost 3 vext3 <u,7,0,1>, <u,0,6,2>
-  2659189376U,  // <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1>
-  1546298013U,  // <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS
-  1483112550U,  // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
-   202162278U,  // <1,u,1,1>: Cost 1 vdup1 LHS
-  1616009006U,  // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
-  1745707110U,  // <1,u,1,3>: Cost 2 vuzpr LHS, LHS
-  1483115830U,  // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
-  2620040336U,  // <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7>
-  3026622618U,  // <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
-  2958183752U,  // <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
-   202162278U,  // <1,u,1,u>: Cost 1 vdup1 LHS
-  2819449750U,  // <1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
-  2893207342U,  // <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS
-  2819448996U,  // <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
-  2819450482U,  // <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
-  2819449754U,  // <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
-  2893207706U,  // <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
-  2819449036U,  // <1,u,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
-  2970799432U,  // <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
-  2819449002U,  // <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
-   403931292U,  // <1,u,3,0>: Cost 1 vext1 LHS, LHS
-  1477673718U,  // <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
-   115726126U,  // <1,u,3,2>: Cost 1 vrev LHS
-  2014102173U,  // <1,u,3,3>: Cost 2 vtrnr LHS, LHS
-   403934518U,  // <1,u,3,4>: Cost 1 vext1 LHS, RHS
-  1507536601U,  // <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3>
-  1525453306U,  // <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
-  2014105129U,  // <1,u,3,7>: Cost 2 vtrnr LHS, RHS
-   403937070U,  // <1,u,3,u>: Cost 1 vext1 LHS, LHS
-  2620042157U,  // <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1>
-  2620042237U,  // <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0>
-  2263217967U,  // <1,u,4,2>: Cost 3 vrev <u,1,2,4>
-  2569341224U,  // <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4>
-  2569342262U,  // <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS
-  1546300726U,  // <1,u,4,5>: Cost 2 vext2 <0,4,1,u>, RHS
-  2819449180U,  // <1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6>
-  2724845649U,  // <1,u,4,7>: Cost 3 vext3 <6,7,0,1>, <u,4,7,6>
-  1546300969U,  // <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS
-  2551431270U,  // <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS
-  2551432192U,  // <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7>
-  3028293422U,  // <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
-  2955559068U,  // <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
-  2551434550U,  // <1,u,5,4>: Cost 3 vext1 <0,1,u,5>, RHS
-  2895255706U,  // <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
-  1616009370U,  // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
-  1745710390U,  // <1,u,5,7>: Cost 2 vuzpr LHS, RHS
-  1745710391U,  // <1,u,5,u>: Cost 2 vuzpr LHS, RHS
-  2653221159U,  // <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u>
-  2725509303U,  // <1,u,6,1>: Cost 3 vext3 <6,u,0,1>, <u,6,1,0>
-  2659193338U,  // <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3>
-  2689751248U,  // <1,u,6,3>: Cost 3 vext3 <0,u,1,1>, <u,6,3,7>
-  2867228774U,  // <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4>
-  3764820194U,  // <1,u,6,5>: Cost 4 vext3 <1,1,1,1>, <u,6,5,7>
-  2657202957U,  // <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u>
-  2819450810U,  // <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
-  2819450811U,  // <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
-  1585452032U,  // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u>
-  2557420340U,  // <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1>
-  2569365158U,  // <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1>
-  2569365803U,  // <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7>
-  2557422902U,  // <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS
-  2662512021U,  // <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u>
-  2724845884U,  // <1,u,7,6>: Cost 3 vext3 <6,7,0,1>, <u,7,6,7>
-  2659194476U,  // <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7>
-  1590761096U,  // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u>
-   403972257U,  // <1,u,u,0>: Cost 1 vext1 LHS, LHS
-   202162278U,  // <1,u,u,1>: Cost 1 vdup1 LHS
-   115767091U,  // <1,u,u,2>: Cost 1 vrev LHS
-  1745707677U,  // <1,u,u,3>: Cost 2 vuzpr LHS, LHS
-   403975478U,  // <1,u,u,4>: Cost 1 vext1 LHS, RHS
-  1546303642U,  // <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS
-  1616009613U,  // <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
-  1745710633U,  // <1,u,u,7>: Cost 2 vuzpr LHS, RHS
-   403978030U,  // <1,u,u,u>: Cost 1 vext1 LHS, LHS
-  2551463936U,  // <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0>
-  2685698058U,  // <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1>
-  1610776596U,  // <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2>
-  2619384069U,  // <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0>
-  2551467318U,  // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS
-  3899836596U,  // <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5>
-  2621374968U,  // <2,0,0,6>: Cost 3 vext2 <0,6,2,0>, <0,6,2,0>
-  4168271334U,  // <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7>
-  1611219018U,  // <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2>
-  2551472138U,  // <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1>
-  2690564186U,  // <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0>
-  1611956326U,  // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS
-  2826092646U,  // <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS
-  2551475510U,  // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS
-  3692463248U,  // <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7>
-  2587308473U,  // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1>
-  3661050874U,  // <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2>
-  1611956380U,  // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS
-  1477738598U,  // <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS
-  2551481078U,  // <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2>
-  2551481796U,  // <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0>
-  2551482518U,  // <2,0,2,3>: Cost 3 vext1 <0,2,0,2>, <3,0,1,2>
-  1477741878U,  // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS
-  2551484112U,  // <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, <5,1,7,3>
-  2551484759U,  // <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2>
-  2551485434U,  // <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2>
-  1477744430U,  // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS
-  2953625600U,  // <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
-  2953627302U,  // <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1>
-  2953625764U,  // <2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2>
-  4027369695U,  // <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3>
-  3625233718U,  // <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS
-  3899836110U,  // <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5>
-  4032012618U,  // <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6>
-  3899835392U,  // <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7>
-  2953625770U,  // <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u>
-  2551496806U,  // <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS
-  2685698386U,  // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5>
-  2685698396U,  // <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6>
-  3625240726U,  // <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2>
-  2551500086U,  // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS
-  2618723638U,  // <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS
-  2765409590U,  // <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS
-  3799990664U,  // <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5>
-  2685698450U,  // <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6>
-  3625246822U,  // <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS
-  3289776304U,  // <2,0,5,1>: Cost 4 vrev <0,2,1,5>
-  2690564526U,  // <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7>
-  3289923778U,  // <2,0,5,3>: Cost 4 vrev <0,2,3,5>
-  2216255691U,  // <2,0,5,4>: Cost 3 vrev <0,2,4,5>
-  3726307332U,  // <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5>
-  3726307426U,  // <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0>
-  2826095926U,  // <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS
-  2216550639U,  // <2,0,5,u>: Cost 3 vrev <0,2,u,5>
-  4162420736U,  // <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0>
-  2901885030U,  // <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS
-  2685698559U,  // <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7>
-  3643173171U,  // <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6>
-  2216263884U,  // <2,0,6,4>: Cost 3 vrev <0,2,4,6>
-  3730289341U,  // <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0>
-  3726308152U,  // <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6>
-  3899836346U,  // <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7>
-  2216558832U,  // <2,0,6,u>: Cost 3 vrev <0,2,u,6>
-  2659202049U,  // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
-  3726308437U,  // <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3>
-  2726249034U,  // <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1>
-  3734934772U,  // <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0>
-  3726308710U,  // <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6>
-  3726308814U,  // <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2>
-  3736925671U,  // <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0>
-  3726308972U,  // <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7>
-  2659202049U,  // <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
-  1477787750U,  // <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS
-  2953668262U,  // <2,0,u,1>: Cost 3 vzipr LHS, <2,3,0,1>
-  1611956893U,  // <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS
-  2551531670U,  // <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2>
-  1477791030U,  // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS
-  2618726554U,  // <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS
-  2765412506U,  // <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS
-  2826096169U,  // <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS
-  1611956947U,  // <2,0,u,u>: Cost 2 vext3 <0,2,0,2>, LHS
-  2569453670U,  // <2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS
-  2619392102U,  // <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS
-  3759440619U,  // <2,1,0,2>: Cost 4 vext3 <0,2,0,2>, <1,0,2,0>
-  1616823030U,  // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2>
-  2569456950U,  // <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS
-  2690712328U,  // <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2>
-  3661115841U,  // <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, <6,2,1,0>
-  2622046794U,  // <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1>
-  1617191715U,  // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2>
-  2551545958U,  // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS
-  2685698868U,  // <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1>
-  2628682646U,  // <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0>
-  2685698888U,  // <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3>
-  2551549238U,  // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS
-  3693134992U,  // <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7>
-  3661124034U,  // <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1>
-  3625292794U,  // <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2>
-  2685698933U,  // <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3>
-  2551554150U,  // <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS
-  3893649571U,  // <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1>
-  2551555688U,  // <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2>
-  2685698966U,  // <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0>
-  2551557430U,  // <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS
-  3763422123U,  // <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3>
-  3693135802U,  // <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7>
-  2726249402U,  // <2,1,2,7>: Cost 3 vext3 <7,0,1,2>, <1,2,7,0>
-  2685699011U,  // <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0>
-  2551562342U,  // <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS
-  2953625610U,  // <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1>
-  2953627798U,  // <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2>
-  2953626584U,  // <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3>
-  2551565622U,  // <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS
-  2953625938U,  // <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5>
-  2587398596U,  // <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
-  4032013519U,  // <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7>
-  2953625617U,  // <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u>
-  2690565154U,  // <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5>
-  3625313270U,  // <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6>
-  3771532340U,  // <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5>
-  1148404634U,  // <2,1,4,3>: Cost 2 vrev <1,2,3,4>
-  3625315638U,  // <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS
-  2619395382U,  // <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS
-  3837242678U,  // <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS
-  3799991394U,  // <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6>
-  1148773319U,  // <2,1,4,u>: Cost 2 vrev <1,2,u,4>
-  2551578726U,  // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS
-  2551579648U,  // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7>
-  3625321952U,  // <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1>
-  2685699216U,  // <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7>
-  2551582006U,  // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS
-  3740913668U,  // <2,1,5,5>: Cost 4 vext2 <u,3,2,1>, <5,5,5,5>
-  3661156806U,  // <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5>
-  3893652790U,  // <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS
-  2685699261U,  // <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7>
-  2551586918U,  // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS
-  3625329398U,  // <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2>
-  2551588794U,  // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7>
-  3088679014U,  // <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS
-  2551590198U,  // <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS
-  4029382994U,  // <2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5>
-  3625333560U,  // <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6>
-  3731624800U,  // <2,1,6,7>: Cost 4 vext2 <6,7,2,1>, <6,7,2,1>
-  2551592750U,  // <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS
-  2622051322U,  // <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2>
-  3733615699U,  // <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1>
-  3795125538U,  // <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0>
-  2222171037U,  // <2,1,7,3>: Cost 3 vrev <1,2,3,7>
-  3740915046U,  // <2,1,7,4>: Cost 4 vext2 <u,3,2,1>, <7,4,5,6>
-  3296060335U,  // <2,1,7,5>: Cost 4 vrev <1,2,5,7>
-  3736933864U,  // <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1>
-  3805300055U,  // <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u>
-  2669827714U,  // <2,1,7,u>: Cost 3 vext2 <u,7,2,1>, <7,u,1,2>
-  2551603302U,  // <2,1,u,0>: Cost 3 vext1 <0,2,1,u>, LHS
-  2953666570U,  // <2,1,u,1>: Cost 3 vzipr LHS, <0,0,1,1>
-  2953668758U,  // <2,1,u,2>: Cost 3 vzipr LHS, <3,0,1,2>
-  1148437406U,  // <2,1,u,3>: Cost 2 vrev <1,2,3,u>
-  2551606582U,  // <2,1,u,4>: Cost 3 vext1 <0,2,1,u>, RHS
-  2953666898U,  // <2,1,u,5>: Cost 3 vzipr LHS, <0,4,1,5>
-  2587398596U,  // <2,1,u,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
-  2669828370U,  // <2,1,u,7>: Cost 3 vext2 <u,7,2,1>, <u,7,2,1>
-  1148806091U,  // <2,1,u,u>: Cost 2 vrev <1,2,u,u>
-  1543667732U,  // <2,2,0,0>: Cost 2 vext2 <0,0,2,2>, <0,0,2,2>
-  1548976230U,  // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS
-  2685699524U,  // <2,2,0,2>: Cost 3 vext3 <0,2,0,2>, <2,0,2,0>
-  2685699535U,  // <2,2,0,3>: Cost 3 vext3 <0,2,0,2>, <2,0,3,2>
-  2551614774U,  // <2,2,0,4>: Cost 3 vext1 <0,2,2,0>, RHS
-  3704422830U,  // <2,2,0,5>: Cost 4 vext2 <2,2,2,2>, <0,5,2,7>
-  3893657642U,  // <2,2,0,6>: Cost 4 vuzpr <0,2,0,2>, <0,0,4,6>
-  3770574323U,  // <2,2,0,7>: Cost 4 vext3 <2,0,7,2>, <2,0,7,2>
-  1548976796U,  // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2>
-  2622718710U,  // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2>
-  2622718772U,  // <2,2,1,1>: Cost 3 vext2 <0,u,2,2>, <1,1,1,1>
-  2622718870U,  // <2,2,1,2>: Cost 3 vext2 <0,u,2,2>, <1,2,3,0>
-  2819915878U,  // <2,2,1,3>: Cost 3 vuzpr <0,2,0,2>, LHS
-  3625364790U,  // <2,2,1,4>: Cost 4 vext1 <0,2,2,1>, RHS
-  2622719120U,  // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7>
-  3760031292U,  // <2,2,1,6>: Cost 4 vext3 <0,2,u,2>, <2,1,6,3>
-  3667170468U,  // <2,2,1,7>: Cost 4 vext1 <7,2,2,1>, <7,2,2,1>
-  2819915883U,  // <2,2,1,u>: Cost 3 vuzpr <0,2,0,2>, LHS
-  1489829990U,  // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS
-  2563572470U,  // <2,2,2,1>: Cost 3 vext1 <2,2,2,2>, <1,0,3,2>
-   269271142U,  // <2,2,2,2>: Cost 1 vdup2 LHS
-  2685699698U,  // <2,2,2,3>: Cost 3 vext3 <0,2,0,2>, <2,2,3,3>
-  1489833270U,  // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS
-  2685699720U,  // <2,2,2,5>: Cost 3 vext3 <0,2,0,2>, <2,2,5,7>
-  2622719930U,  // <2,2,2,6>: Cost 3 vext2 <0,u,2,2>, <2,6,3,7>
-  2593436837U,  // <2,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
-   269271142U,  // <2,2,2,u>: Cost 1 vdup2 LHS
-  2685699750U,  // <2,2,3,0>: Cost 3 vext3 <0,2,0,2>, <2,3,0,1>
-  2690565806U,  // <2,2,3,1>: Cost 3 vext3 <1,0,3,2>, <2,3,1,0>
-  2953627240U,  // <2,2,3,2>: Cost 3 vzipr LHS, <2,2,2,2>
-  1879883878U,  // <2,2,3,3>: Cost 2 vzipr LHS, LHS
-  2685699790U,  // <2,2,3,4>: Cost 3 vext3 <0,2,0,2>, <2,3,4,5>
-  3893659342U,  // <2,2,3,5>: Cost 4 vuzpr <0,2,0,2>, <2,3,4,5>
-  2958270812U,  // <2,2,3,6>: Cost 3 vzipr LHS, <0,4,2,6>
-  2593445030U,  // <2,2,3,7>: Cost 3 vext1 <7,2,2,3>, <7,2,2,3>
-  1879883883U,  // <2,2,3,u>: Cost 2 vzipr LHS, LHS
-  2551644262U,  // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, LHS
-  3625386742U,  // <2,2,4,1>: Cost 4 vext1 <0,2,2,4>, <1,0,3,2>
-  2551645902U,  // <2,2,4,2>: Cost 3 vext1 <0,2,2,4>, <2,3,4,5>
-  3759441686U,  // <2,2,4,3>: Cost 4 vext3 <0,2,0,2>, <2,4,3,5>
-  2551647542U,  // <2,2,4,4>: Cost 3 vext1 <0,2,2,4>, RHS
-  1548979510U,  // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS
-  2764901686U,  // <2,2,4,6>: Cost 3 vuzpl <2,2,2,2>, RHS
-  3667195047U,  // <2,2,4,7>: Cost 4 vext1 <7,2,2,4>, <7,2,2,4>
-  1548979753U,  // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS
-  3696463432U,  // <2,2,5,0>: Cost 4 vext2 <0,u,2,2>, <5,0,1,2>
-  2617413328U,  // <2,2,5,1>: Cost 3 vext2 <0,0,2,2>, <5,1,7,3>
-  2685699936U,  // <2,2,5,2>: Cost 3 vext3 <0,2,0,2>, <2,5,2,7>
-  4027383910U,  // <2,2,5,3>: Cost 4 vzipr <0,1,2,5>, LHS
-  2228201085U,  // <2,2,5,4>: Cost 3 vrev <2,2,4,5>
-  2617413636U,  // <2,2,5,5>: Cost 3 vext2 <0,0,2,2>, <5,5,5,5>
-  2617413730U,  // <2,2,5,6>: Cost 3 vext2 <0,0,2,2>, <5,6,7,0>
-  2819919158U,  // <2,2,5,7>: Cost 3 vuzpr <0,2,0,2>, RHS
-  2819919159U,  // <2,2,5,u>: Cost 3 vuzpr <0,2,0,2>, RHS
-  3625402554U,  // <2,2,6,0>: Cost 4 vext1 <0,2,2,6>, <0,2,2,6>
-  3760031652U,  // <2,2,6,1>: Cost 4 vext3 <0,2,u,2>, <2,6,1,3>
-  2617414138U,  // <2,2,6,2>: Cost 3 vext2 <0,0,2,2>, <6,2,7,3>
-  2685700026U,  // <2,2,6,3>: Cost 3 vext3 <0,2,0,2>, <2,6,3,7>
-  3625405750U,  // <2,2,6,4>: Cost 4 vext1 <0,2,2,6>, RHS
-  3760031692U,  // <2,2,6,5>: Cost 4 vext3 <0,2,u,2>, <2,6,5,7>
-  3088679116U,  // <2,2,6,6>: Cost 3 vtrnr <0,2,4,6>, <0,2,4,6>
-  2657891169U,  // <2,2,6,7>: Cost 3 vext2 <6,7,2,2>, <6,7,2,2>
-  2685700071U,  // <2,2,6,u>: Cost 3 vext3 <0,2,0,2>, <2,6,u,7>
-  2726250474U,  // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1>
-  3704427616U,  // <2,2,7,1>: Cost 4 vext2 <2,2,2,2>, <7,1,3,5>
-  2660545701U,  // <2,2,7,2>: Cost 3 vext2 <7,2,2,2>, <7,2,2,2>
-  4030718054U,  // <2,2,7,3>: Cost 4 vzipr <0,6,2,7>, LHS
-  2617415014U,  // <2,2,7,4>: Cost 3 vext2 <0,0,2,2>, <7,4,5,6>
-  3302033032U,  // <2,2,7,5>: Cost 4 vrev <2,2,5,7>
-  3661246929U,  // <2,2,7,6>: Cost 4 vext1 <6,2,2,7>, <6,2,2,7>
-  2617415276U,  // <2,2,7,7>: Cost 3 vext2 <0,0,2,2>, <7,7,7,7>
-  2731558962U,  // <2,2,7,u>: Cost 3 vext3 <7,u,1,2>, <2,7,u,1>
-  1489829990U,  // <2,2,u,0>: Cost 2 vext1 <2,2,2,2>, LHS
-  1548982062U,  // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS
-   269271142U,  // <2,2,u,2>: Cost 1 vdup2 LHS
-  1879924838U,  // <2,2,u,3>: Cost 2 vzipr LHS, LHS
-  1489833270U,  // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS
-  1548982426U,  // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS
-  2953666908U,  // <2,2,u,6>: Cost 3 vzipr LHS, <0,4,2,6>
-  2819919401U,  // <2,2,u,7>: Cost 3 vuzpr <0,2,0,2>, RHS
-   269271142U,  // <2,2,u,u>: Cost 1 vdup2 LHS
-  1544339456U,  // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
-   470597734U,  // <2,3,0,1>: Cost 1 vext2 LHS, LHS
-  1548984484U,  // <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
-  2619408648U,  // <2,3,0,3>: Cost 3 vext2 <0,3,2,3>, <0,3,2,3>
-  1548984658U,  // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
-  2665857454U,  // <2,3,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
-  2622726655U,  // <2,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
-  2593494188U,  // <2,3,0,7>: Cost 3 vext1 <7,2,3,0>, <7,2,3,0>
-   470598301U,  // <2,3,0,u>: Cost 1 vext2 LHS, LHS
-  1544340214U,  // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
-  1544340276U,  // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
-  1544340374U,  // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
-  1548985304U,  // <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
-  2551696694U,  // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS
-  1548985488U,  // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
-  2622727375U,  // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7>
-  2665858347U,  // <2,3,1,7>: Cost 3 vext2 LHS, <1,7,3,0>
-  1548985709U,  // <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
-  2622727613U,  // <2,3,2,0>: Cost 3 vext2 LHS, <2,0,1,2>
-  2622727711U,  // <2,3,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
-  1544341096U,  // <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
-  1544341158U,  // <2,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
-  2622727958U,  // <2,3,2,4>: Cost 3 vext2 LHS, <2,4,3,5>
-  2622728032U,  // <2,3,2,5>: Cost 3 vext2 LHS, <2,5,2,7>
-  1548986298U,  // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
-  2665859050U,  // <2,3,2,7>: Cost 3 vext2 LHS, <2,7,0,1>
-  1548986427U,  // <2,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
-  1548986518U,  // <2,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
-  2622728415U,  // <2,3,3,1>: Cost 3 vext2 LHS, <3,1,0,3>
-  1489913458U,  // <2,3,3,2>: Cost 2 vext1 <2,2,3,3>, <2,2,3,3>
-  1544341916U,  // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3>
-  1548986882U,  // <2,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
-  2665859632U,  // <2,3,3,5>: Cost 3 vext2 LHS, <3,5,1,7>
-  2234304870U,  // <2,3,3,6>: Cost 3 vrev <3,2,6,3>
-  2958271632U,  // <2,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
-  1548987166U,  // <2,3,3,u>: Cost 2 vext2 LHS, <3,u,1,2>
-  1483948134U,  // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS
-  1483948954U,  // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4>
-  2622729276U,  // <2,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
-  2557692054U,  // <2,3,4,3>: Cost 3 vext1 <1,2,3,4>, <3,0,1,2>
-  1483951414U,  // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS
-   470601014U,  // <2,3,4,5>: Cost 1 vext2 LHS, RHS
-  1592118644U,  // <2,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
-  2593526960U,  // <2,3,4,7>: Cost 3 vext1 <7,2,3,4>, <7,2,3,4>
-   470601257U,  // <2,3,4,u>: Cost 1 vext2 LHS, RHS
-  2551726182U,  // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, LHS
-  1592118992U,  // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
-  2665860862U,  // <2,3,5,2>: Cost 3 vext2 LHS, <5,2,3,4>
-  2551728642U,  // <2,3,5,3>: Cost 3 vext1 <0,2,3,5>, <3,4,5,6>
-  1592119238U,  // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
-  1592119300U,  // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
-  1592119394U,  // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
-  1592119464U,  // <2,3,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
-  1592119545U,  // <2,3,5,u>: Cost 2 vext2 LHS, <5,u,5,7>
-  2622730529U,  // <2,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
-  2557707164U,  // <2,3,6,1>: Cost 3 vext1 <1,2,3,6>, <1,2,3,6>
-  1592119802U,  // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
-  2665861682U,  // <2,3,6,3>: Cost 3 vext2 LHS, <6,3,4,5>
-  2622730893U,  // <2,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
-  2665861810U,  // <2,3,6,5>: Cost 3 vext2 LHS, <6,5,0,7>
-  1592120120U,  // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
-  1592120142U,  // <2,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
-  1592120223U,  // <2,3,6,u>: Cost 2 vext2 LHS, <6,u,0,1>
-  1592120314U,  // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
-  2659890261U,  // <2,3,7,1>: Cost 3 vext2 <7,1,2,3>, <7,1,2,3>
-  2660553894U,  // <2,3,7,2>: Cost 3 vext2 <7,2,2,3>, <7,2,2,3>
-  2665862371U,  // <2,3,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
-  1592120678U,  // <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
-  2665862534U,  // <2,3,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
-  2665862614U,  // <2,3,7,6>: Cost 3 vext2 LHS, <7,6,0,1>
-  1592120940U,  // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
-  1592120962U,  // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
-  1548990163U,  // <2,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
-   470603566U,  // <2,3,u,1>: Cost 1 vext2 LHS, LHS
-  1548990341U,  // <2,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
-  1548990396U,  // <2,3,u,3>: Cost 2 vext2 LHS, <u,3,0,1>
-  1548990527U,  // <2,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
-   470603930U,  // <2,3,u,5>: Cost 1 vext2 LHS, RHS
-  1548990672U,  // <2,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
-  1592121600U,  // <2,3,u,7>: Cost 2 vext2 LHS, <u,7,0,1>
-   470604133U,  // <2,3,u,u>: Cost 1 vext2 LHS, LHS
-  2617425942U,  // <2,4,0,0>: Cost 3 vext2 <0,0,2,4>, <0,0,2,4>
-  2618753126U,  // <2,4,0,1>: Cost 3 vext2 <0,2,2,4>, LHS
-  2618753208U,  // <2,4,0,2>: Cost 3 vext2 <0,2,2,4>, <0,2,2,4>
-  2619416841U,  // <2,4,0,3>: Cost 3 vext2 <0,3,2,4>, <0,3,2,4>
-  2587593628U,  // <2,4,0,4>: Cost 3 vext1 <6,2,4,0>, <4,0,6,2>
-  2712832914U,  // <2,4,0,5>: Cost 3 vext3 <4,6,u,2>, <4,0,5,1>
-  1634962332U,  // <2,4,0,6>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
-  3799993252U,  // <2,4,0,7>: Cost 4 vext3 <7,0,1,2>, <4,0,7,1>
-  1634962332U,  // <2,4,0,u>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
-  2619417334U,  // <2,4,1,0>: Cost 3 vext2 <0,3,2,4>, <1,0,3,2>
-  3692495668U,  // <2,4,1,1>: Cost 4 vext2 <0,2,2,4>, <1,1,1,1>
-  2625389466U,  // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4>
-  2826125414U,  // <2,4,1,3>: Cost 3 vuzpr <1,2,3,4>, LHS
-  3699794995U,  // <2,4,1,4>: Cost 4 vext2 <1,4,2,4>, <1,4,2,4>
-  3692496016U,  // <2,4,1,5>: Cost 4 vext2 <0,2,2,4>, <1,5,3,7>
-  3763424238U,  // <2,4,1,6>: Cost 4 vext3 <0,u,0,2>, <4,1,6,3>
-  3667317942U,  // <2,4,1,7>: Cost 4 vext1 <7,2,4,1>, <7,2,4,1>
-  2826125419U,  // <2,4,1,u>: Cost 3 vuzpr <1,2,3,4>, LHS
-  2629371336U,  // <2,4,2,0>: Cost 3 vext2 <2,0,2,4>, <2,0,2,4>
-  3699131946U,  // <2,4,2,1>: Cost 4 vext2 <1,3,2,4>, <2,1,4,3>
-  2630698602U,  // <2,4,2,2>: Cost 3 vext2 <2,2,2,4>, <2,2,2,4>
-  2618754766U,  // <2,4,2,3>: Cost 3 vext2 <0,2,2,4>, <2,3,4,5>
-  2826126234U,  // <2,4,2,4>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,4>
-  2899119414U,  // <2,4,2,5>: Cost 3 vzipl <2,2,2,2>, RHS
-  3033337142U,  // <2,4,2,6>: Cost 3 vtrnl <2,2,2,2>, RHS
-  3800214597U,  // <2,4,2,7>: Cost 4 vext3 <7,0,4,2>, <4,2,7,0>
-  2899119657U,  // <2,4,2,u>: Cost 3 vzipl <2,2,2,2>, RHS
-  2635344033U,  // <2,4,3,0>: Cost 3 vext2 <3,0,2,4>, <3,0,2,4>
-  4032012325U,  // <2,4,3,1>: Cost 4 vzipr LHS, <0,0,4,1>
-  3692497228U,  // <2,4,3,2>: Cost 4 vext2 <0,2,2,4>, <3,2,3,4>
-  3692497308U,  // <2,4,3,3>: Cost 4 vext2 <0,2,2,4>, <3,3,3,3>
-  3001404624U,  // <2,4,3,4>: Cost 3 vzipr LHS, <4,4,4,4>
-  2953627342U,  // <2,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
-  2953625804U,  // <2,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
-  3899868160U,  // <2,4,3,7>: Cost 4 vuzpr <1,2,3,4>, <1,3,5,7>
-  2953625806U,  // <2,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
-  2710916266U,  // <2,4,4,0>: Cost 3 vext3 <4,4,0,2>, <4,4,0,2>
-  3899869648U,  // <2,4,4,1>: Cost 4 vuzpr <1,2,3,4>, <3,4,0,1>
-  3899869658U,  // <2,4,4,2>: Cost 4 vuzpr <1,2,3,4>, <3,4,1,2>
-  3899868930U,  // <2,4,4,3>: Cost 4 vuzpr <1,2,3,4>, <2,4,1,3>
-  2712833232U,  // <2,4,4,4>: Cost 3 vext3 <4,6,u,2>, <4,4,4,4>
-  2618756406U,  // <2,4,4,5>: Cost 3 vext2 <0,2,2,4>, RHS
-  2765737270U,  // <2,4,4,6>: Cost 3 vuzpl <2,3,4,5>, RHS
-  4168304426U,  // <2,4,4,7>: Cost 4 vtrnr <1,2,3,4>, <2,4,5,7>
-  2618756649U,  // <2,4,4,u>: Cost 3 vext2 <0,2,2,4>, RHS
-  2551800011U,  // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5>
-  2569716470U,  // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2>
-  2563745405U,  // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5>
-  2569718102U,  // <2,4,5,3>: Cost 3 vext1 <3,2,4,5>, <3,2,4,5>
-  2551803190U,  // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS
-  3625545732U,  // <2,4,5,5>: Cost 4 vext1 <0,2,4,5>, <5,5,5,5>
-  1611959606U,  // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
-  2826128694U,  // <2,4,5,7>: Cost 3 vuzpr <1,2,3,4>, RHS
-  1611959624U,  // <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
-  1478066278U,  // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, LHS
-  2551808758U,  // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2>
-  2551809516U,  // <2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4>
-  2551810198U,  // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2>
-  1478069558U,  // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS
-  2901888310U,  // <2,4,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
-  2551812920U,  // <2,4,6,6>: Cost 3 vext1 <0,2,4,6>, <6,6,6,6>
-  2726251914U,  // <2,4,6,7>: Cost 3 vext3 <7,0,1,2>, <4,6,7,1>
-  1478072110U,  // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS
-  2659234821U,  // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
-  3786722726U,  // <2,4,7,1>: Cost 4 vext3 <4,7,1,2>, <4,7,1,2>
-  3734303911U,  // <2,4,7,2>: Cost 4 vext2 <7,2,2,4>, <7,2,2,4>
-  3734967544U,  // <2,4,7,3>: Cost 4 vext2 <7,3,2,4>, <7,3,2,4>
-  3727005030U,  // <2,4,7,4>: Cost 4 vext2 <6,0,2,4>, <7,4,5,6>
-  2726251976U,  // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0>
-  2726251986U,  // <2,4,7,6>: Cost 3 vext3 <7,0,1,2>, <4,7,6,1>
-  3727005292U,  // <2,4,7,7>: Cost 4 vext2 <6,0,2,4>, <7,7,7,7>
-  2659234821U,  // <2,4,7,u>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
-  1478082662U,  // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, LHS
-  2618758958U,  // <2,4,u,1>: Cost 3 vext2 <0,2,2,4>, LHS
-  2551826024U,  // <2,4,u,2>: Cost 3 vext1 <0,2,4,u>, <2,2,2,2>
-  2551826582U,  // <2,4,u,3>: Cost 3 vext1 <0,2,4,u>, <3,0,1,2>
-  1478085942U,  // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS
-  2953668302U,  // <2,4,u,5>: Cost 3 vzipr LHS, <2,3,4,5>
-  1611959849U,  // <2,4,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
-  2826128937U,  // <2,4,u,7>: Cost 3 vuzpr <1,2,3,4>, RHS
-  1611959867U,  // <2,4,u,u>: Cost 2 vext3 <0,2,0,2>, RHS
-  3691839488U,  // <2,5,0,0>: Cost 4 vext2 <0,1,2,5>, <0,0,0,0>
-  2618097766U,  // <2,5,0,1>: Cost 3 vext2 <0,1,2,5>, LHS
-  2620088484U,  // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2>
-  2619425034U,  // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5>
-  2620088667U,  // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5>
-  2620752300U,  // <2,5,0,5>: Cost 3 vext2 <0,5,2,5>, <0,5,2,5>
-  3693830655U,  // <2,5,0,6>: Cost 4 vext2 <0,4,2,5>, <0,6,2,7>
-  3094531382U,  // <2,5,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
-  2618098333U,  // <2,5,0,u>: Cost 3 vext2 <0,1,2,5>, LHS
-  3691840246U,  // <2,5,1,0>: Cost 4 vext2 <0,1,2,5>, <1,0,3,2>
-  3691840308U,  // <2,5,1,1>: Cost 4 vext2 <0,1,2,5>, <1,1,1,1>
-  2626061206U,  // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0>
-  2618098688U,  // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7>
-  2626061364U,  // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5>
-  3691840656U,  // <2,5,1,5>: Cost 4 vext2 <0,1,2,5>, <1,5,3,7>
-  3789082310U,  // <2,5,1,6>: Cost 4 vext3 <5,1,6,2>, <5,1,6,2>
-  2712833744U,  // <2,5,1,7>: Cost 3 vext3 <4,6,u,2>, <5,1,7,3>
-  2628715896U,  // <2,5,1,u>: Cost 3 vext2 <1,u,2,5>, <1,u,2,5>
-  3693831613U,  // <2,5,2,0>: Cost 4 vext2 <0,4,2,5>, <2,0,1,2>
-  4026698642U,  // <2,5,2,1>: Cost 4 vzipr <0,0,2,2>, <4,0,5,1>
-  2632033896U,  // <2,5,2,2>: Cost 3 vext2 <2,4,2,5>, <2,2,2,2>
-  3691841190U,  // <2,5,2,3>: Cost 4 vext2 <0,1,2,5>, <2,3,0,1>
-  2632034061U,  // <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5>
-  3691841352U,  // <2,5,2,5>: Cost 4 vext2 <0,1,2,5>, <2,5,0,1>
-  3691841466U,  // <2,5,2,6>: Cost 4 vext2 <0,1,2,5>, <2,6,3,7>
-  3088354614U,  // <2,5,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
-  3088354615U,  // <2,5,2,u>: Cost 3 vtrnr <0,2,0,2>, RHS
-  2557829222U,  // <2,5,3,0>: Cost 3 vext1 <1,2,5,3>, LHS
-  2557830059U,  // <2,5,3,1>: Cost 3 vext1 <1,2,5,3>, <1,2,5,3>
-  2575746766U,  // <2,5,3,2>: Cost 3 vext1 <4,2,5,3>, <2,3,4,5>
-  3691841948U,  // <2,5,3,3>: Cost 4 vext2 <0,1,2,5>, <3,3,3,3>
-  2619427330U,  // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6>
-  2581720847U,  // <2,5,3,5>: Cost 3 vext1 <5,2,5,3>, <5,2,5,3>
-  2953628162U,  // <2,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
-  2953626624U,  // <2,5,3,7>: Cost 3 vzipr LHS, <1,3,5,7>
-  2953626625U,  // <2,5,3,u>: Cost 3 vzipr LHS, <1,3,5,u>
-  2569781350U,  // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS
-  3631580076U,  // <2,5,4,1>: Cost 4 vext1 <1,2,5,4>, <1,2,5,4>
-  2569782990U,  // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5>
-  2569783646U,  // <2,5,4,3>: Cost 3 vext1 <3,2,5,4>, <3,2,5,4>
-  2569784630U,  // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS
-  2618101046U,  // <2,5,4,5>: Cost 3 vext2 <0,1,2,5>, RHS
-  3893905922U,  // <2,5,4,6>: Cost 4 vuzpr <0,2,3,5>, <3,4,5,6>
-  3094564150U,  // <2,5,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
-  2618101289U,  // <2,5,4,u>: Cost 3 vext2 <0,1,2,5>, RHS
-  2551873638U,  // <2,5,5,0>: Cost 3 vext1 <0,2,5,5>, LHS
-  3637560320U,  // <2,5,5,1>: Cost 4 vext1 <2,2,5,5>, <1,3,5,7>
-  3637560966U,  // <2,5,5,2>: Cost 4 vext1 <2,2,5,5>, <2,2,5,5>
-  3723030343U,  // <2,5,5,3>: Cost 4 vext2 <5,3,2,5>, <5,3,2,5>
-  2551876918U,  // <2,5,5,4>: Cost 3 vext1 <0,2,5,5>, RHS
-  2712834052U,  // <2,5,5,5>: Cost 3 vext3 <4,6,u,2>, <5,5,5,5>
-  4028713474U,  // <2,5,5,6>: Cost 4 vzipr <0,3,2,5>, <3,4,5,6>
-  2712834072U,  // <2,5,5,7>: Cost 3 vext3 <4,6,u,2>, <5,5,7,7>
-  2712834081U,  // <2,5,5,u>: Cost 3 vext3 <4,6,u,2>, <5,5,u,7>
-  2575769702U,  // <2,5,6,0>: Cost 3 vext1 <4,2,5,6>, LHS
-  3631596462U,  // <2,5,6,1>: Cost 4 vext1 <1,2,5,6>, <1,2,5,6>
-  2655924730U,  // <2,5,6,2>: Cost 3 vext2 <6,4,2,5>, <6,2,7,3>
-  3643541856U,  // <2,5,6,3>: Cost 4 vext1 <3,2,5,6>, <3,2,5,6>
-  2655924849U,  // <2,5,6,4>: Cost 3 vext2 <6,4,2,5>, <6,4,2,5>
-  3787755607U,  // <2,5,6,5>: Cost 4 vext3 <4,u,6,2>, <5,6,5,7>
-  4029385218U,  // <2,5,6,6>: Cost 4 vzipr <0,4,2,6>, <3,4,5,6>
-  3088682294U,  // <2,5,6,7>: Cost 3 vtrnr <0,2,4,6>, RHS
-  3088682295U,  // <2,5,6,u>: Cost 3 vtrnr <0,2,4,6>, RHS
-  2563833958U,  // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS
-  2551890678U,  // <2,5,7,1>: Cost 3 vext1 <0,2,5,7>, <1,0,3,2>
-  2563835528U,  // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7>
-  3637577878U,  // <2,5,7,3>: Cost 4 vext1 <2,2,5,7>, <3,0,1,2>
-  2563837238U,  // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS
-  2712834216U,  // <2,5,7,5>: Cost 3 vext3 <4,6,u,2>, <5,7,5,7>
-  2712834220U,  // <2,5,7,6>: Cost 3 vext3 <4,6,u,2>, <5,7,6,2>
-  4174449974U,  // <2,5,7,7>: Cost 4 vtrnr <2,2,5,7>, RHS
-  2563839790U,  // <2,5,7,u>: Cost 3 vext1 <2,2,5,7>, LHS
-  2563842150U,  // <2,5,u,0>: Cost 3 vext1 <2,2,5,u>, LHS
-  2618103598U,  // <2,5,u,1>: Cost 3 vext2 <0,1,2,5>, LHS
-  2563843721U,  // <2,5,u,2>: Cost 3 vext1 <2,2,5,u>, <2,2,5,u>
-  2569816418U,  // <2,5,u,3>: Cost 3 vext1 <3,2,5,u>, <3,2,5,u>
-  2622748735U,  // <2,5,u,4>: Cost 3 vext2 <0,u,2,5>, <u,4,5,6>
-  2618103962U,  // <2,5,u,5>: Cost 3 vext2 <0,1,2,5>, RHS
-  2953669122U,  // <2,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
-  2953667584U,  // <2,5,u,7>: Cost 3 vzipr LHS, <1,3,5,7>
-  2618104165U,  // <2,5,u,u>: Cost 3 vext2 <0,1,2,5>, LHS
-  2620096512U,  // <2,6,0,0>: Cost 3 vext2 <0,4,2,6>, <0,0,0,0>
-  1546354790U,  // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS
-  2620096676U,  // <2,6,0,2>: Cost 3 vext2 <0,4,2,6>, <0,2,0,2>
-  3693838588U,  // <2,6,0,3>: Cost 4 vext2 <0,4,2,6>, <0,3,1,0>
-  1546355036U,  // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6>
-  3694502317U,  // <2,6,0,5>: Cost 4 vext2 <0,5,2,6>, <0,5,2,6>
-  2551911246U,  // <2,6,0,6>: Cost 3 vext1 <0,2,6,0>, <6,7,0,1>
-  2720723287U,  // <2,6,0,7>: Cost 3 vext3 <6,0,7,2>, <6,0,7,2>
-  1546355357U,  // <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS
-  2620097270U,  // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2>
-  2620097332U,  // <2,6,1,1>: Cost 3 vext2 <0,4,2,6>, <1,1,1,1>
-  2620097430U,  // <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0>
-  2820243558U,  // <2,6,1,3>: Cost 3 vuzpr <0,2,4,6>, LHS
-  2620097598U,  // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6>
-  2620097680U,  // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7>
-  3693839585U,  // <2,6,1,6>: Cost 4 vext2 <0,4,2,6>, <1,6,3,7>
-  2721386920U,  // <2,6,1,7>: Cost 3 vext3 <6,1,7,2>, <6,1,7,2>
-  2820243563U,  // <2,6,1,u>: Cost 3 vuzpr <0,2,4,6>, LHS
-  2714014137U,  // <2,6,2,0>: Cost 3 vext3 <4,u,6,2>, <6,2,0,1>
-  2712834500U,  // <2,6,2,1>: Cost 3 vext3 <4,6,u,2>, <6,2,1,3>
-  2620098152U,  // <2,6,2,2>: Cost 3 vext2 <0,4,2,6>, <2,2,2,2>
-  2620098214U,  // <2,6,2,3>: Cost 3 vext2 <0,4,2,6>, <2,3,0,1>
-  2632042254U,  // <2,6,2,4>: Cost 3 vext2 <2,4,2,6>, <2,4,2,6>
-  2712834540U,  // <2,6,2,5>: Cost 3 vext3 <4,6,u,2>, <6,2,5,7>
-  2820243660U,  // <2,6,2,6>: Cost 3 vuzpr <0,2,4,6>, <0,2,4,6>
-  2958265654U,  // <2,6,2,7>: Cost 3 vzipr <0,u,2,2>, RHS
-  2620098619U,  // <2,6,2,u>: Cost 3 vext2 <0,4,2,6>, <2,u,0,1>
-  2620098710U,  // <2,6,3,0>: Cost 3 vext2 <0,4,2,6>, <3,0,1,2>
-  3893986982U,  // <2,6,3,1>: Cost 4 vuzpr <0,2,4,6>, <2,3,0,1>
-  2569848762U,  // <2,6,3,2>: Cost 3 vext1 <3,2,6,3>, <2,6,3,7>
-  2620098972U,  // <2,6,3,3>: Cost 3 vext2 <0,4,2,6>, <3,3,3,3>
-  2620099074U,  // <2,6,3,4>: Cost 3 vext2 <0,4,2,6>, <3,4,5,6>
-  3893987022U,  // <2,6,3,5>: Cost 4 vuzpr <0,2,4,6>, <2,3,4,5>
-  3001404644U,  // <2,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
-  1879887158U,  // <2,6,3,7>: Cost 2 vzipr LHS, RHS
-  1879887159U,  // <2,6,3,u>: Cost 2 vzipr LHS, RHS
-  2620099484U,  // <2,6,4,0>: Cost 3 vext2 <0,4,2,6>, <4,0,6,2>
-  2620099566U,  // <2,6,4,1>: Cost 3 vext2 <0,4,2,6>, <4,1,6,3>
-  2620099644U,  // <2,6,4,2>: Cost 3 vext2 <0,4,2,6>, <4,2,6,0>
-  3643599207U,  // <2,6,4,3>: Cost 4 vext1 <3,2,6,4>, <3,2,6,4>
-  2575830080U,  // <2,6,4,4>: Cost 3 vext1 <4,2,6,4>, <4,2,6,4>
-  1546358070U,  // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS
-  2667875700U,  // <2,6,4,6>: Cost 3 vext2 <u,4,2,6>, <4,6,4,6>
-  4028042550U,  // <2,6,4,7>: Cost 4 vzipr <0,2,2,4>, RHS
-  1546358313U,  // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS
-  3693841992U,  // <2,6,5,0>: Cost 4 vext2 <0,4,2,6>, <5,0,1,2>
-  2667876048U,  // <2,6,5,1>: Cost 3 vext2 <u,4,2,6>, <5,1,7,3>
-  2712834756U,  // <2,6,5,2>: Cost 3 vext3 <4,6,u,2>, <6,5,2,7>
-  3643607400U,  // <2,6,5,3>: Cost 4 vext1 <3,2,6,5>, <3,2,6,5>
-  2252091873U,  // <2,6,5,4>: Cost 3 vrev <6,2,4,5>
-  2667876356U,  // <2,6,5,5>: Cost 3 vext2 <u,4,2,6>, <5,5,5,5>
-  2667876450U,  // <2,6,5,6>: Cost 3 vext2 <u,4,2,6>, <5,6,7,0>
-  2820246838U,  // <2,6,5,7>: Cost 3 vuzpr <0,2,4,6>, RHS
-  2820246839U,  // <2,6,5,u>: Cost 3 vuzpr <0,2,4,6>, RHS
-  2563899494U,  // <2,6,6,0>: Cost 3 vext1 <2,2,6,6>, LHS
-  3893988683U,  // <2,6,6,1>: Cost 4 vuzpr <0,2,4,6>, <4,6,0,1>
-  2563901072U,  // <2,6,6,2>: Cost 3 vext1 <2,2,6,6>, <2,2,6,6>
-  3893987236U,  // <2,6,6,3>: Cost 4 vuzpr <0,2,4,6>, <2,6,1,3>
-  2563902774U,  // <2,6,6,4>: Cost 3 vext1 <2,2,6,6>, RHS
-  3893988723U,  // <2,6,6,5>: Cost 4 vuzpr <0,2,4,6>, <4,6,4,5>
-  2712834872U,  // <2,6,6,6>: Cost 3 vext3 <4,6,u,2>, <6,6,6,6>
-  2955644214U,  // <2,6,6,7>: Cost 3 vzipr <0,4,2,6>, RHS
-  2955644215U,  // <2,6,6,u>: Cost 3 vzipr <0,4,2,6>, RHS
-  2712834894U,  // <2,6,7,0>: Cost 3 vext3 <4,6,u,2>, <6,7,0,1>
-  2724926296U,  // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2>
-  2725000033U,  // <2,6,7,2>: Cost 3 vext3 <6,7,2,2>, <6,7,2,2>
-  2702365544U,  // <2,6,7,3>: Cost 3 vext3 <3,0,1,2>, <6,7,3,0>
-  2712834934U,  // <2,6,7,4>: Cost 3 vext3 <4,6,u,2>, <6,7,4,5>
-  3776107393U,  // <2,6,7,5>: Cost 4 vext3 <3,0,1,2>, <6,7,5,7>
-  2725294981U,  // <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2>
-  2726253452U,  // <2,6,7,7>: Cost 3 vext3 <7,0,1,2>, <6,7,7,0>
-  2712834966U,  // <2,6,7,u>: Cost 3 vext3 <4,6,u,2>, <6,7,u,1>
-  2620102355U,  // <2,6,u,0>: Cost 3 vext2 <0,4,2,6>, <u,0,1,2>
-  1546360622U,  // <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS
-  2620102536U,  // <2,6,u,2>: Cost 3 vext2 <0,4,2,6>, <u,2,3,3>
-  2820244125U,  // <2,6,u,3>: Cost 3 vuzpr <0,2,4,6>, LHS
-  1594136612U,  // <2,6,u,4>: Cost 2 vext2 <u,4,2,6>, <u,4,2,6>
-  1546360986U,  // <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS
-  2620102864U,  // <2,6,u,6>: Cost 3 vext2 <0,4,2,6>, <u,6,3,7>
-  1879928118U,  // <2,6,u,7>: Cost 2 vzipr LHS, RHS
-  1879928119U,  // <2,6,u,u>: Cost 2 vzipr LHS, RHS
-  2726179825U,  // <2,7,0,0>: Cost 3 vext3 <7,0,0,2>, <7,0,0,2>
-  1652511738U,  // <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2>
-  2621431972U,  // <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2>
-  2257949868U,  // <2,7,0,3>: Cost 3 vrev <7,2,3,0>
-  2726474773U,  // <2,7,0,4>: Cost 3 vext3 <7,0,4,2>, <7,0,4,2>
-  2620768686U,  // <2,7,0,5>: Cost 3 vext2 <0,5,2,7>, <0,5,2,7>
-  2621432319U,  // <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7>
-  2599760953U,  // <2,7,0,7>: Cost 3 vext1 <u,2,7,0>, <7,0,u,2>
-  1653027897U,  // <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2>
-  2639348470U,  // <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
-  3695174452U,  // <2,7,1,1>: Cost 4 vext2 <0,6,2,7>, <1,1,1,1>
-  3695174550U,  // <2,7,1,2>: Cost 4 vext2 <0,6,2,7>, <1,2,3,0>
-  3694511104U,  // <2,7,1,3>: Cost 4 vext2 <0,5,2,7>, <1,3,5,7>
-  3713090594U,  // <2,7,1,4>: Cost 4 vext2 <3,6,2,7>, <1,4,0,5>
-  3693184144U,  // <2,7,1,5>: Cost 4 vext2 <0,3,2,7>, <1,5,3,7>
-  2627405016U,  // <2,7,1,6>: Cost 3 vext2 <1,6,2,7>, <1,6,2,7>
-  3799995519U,  // <2,7,1,7>: Cost 4 vext3 <7,0,1,2>, <7,1,7,0>
-  2639348470U,  // <2,7,1,u>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
-  3695175101U,  // <2,7,2,0>: Cost 4 vext2 <0,6,2,7>, <2,0,1,2>
-  3643655168U,  // <2,7,2,1>: Cost 4 vext1 <3,2,7,2>, <1,3,5,7>
-  2257892517U,  // <2,7,2,2>: Cost 3 vrev <7,2,2,2>
-  3695175334U,  // <2,7,2,3>: Cost 4 vext2 <0,6,2,7>, <2,3,0,1>
-  3695175465U,  // <2,7,2,4>: Cost 4 vext2 <0,6,2,7>, <2,4,5,6>
-  2632714080U,  // <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7>
-  2633377713U,  // <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7>
-  3695175658U,  // <2,7,2,7>: Cost 4 vext2 <0,6,2,7>, <2,7,0,1>
-  2634704979U,  // <2,7,2,u>: Cost 3 vext2 <2,u,2,7>, <2,u,2,7>
-  1514094694U,  // <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS
-  2569921680U,  // <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7>
-  2587838056U,  // <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2>
-  2569922927U,  // <2,7,3,3>: Cost 3 vext1 <3,2,7,3>, <3,2,7,3>
-  1514097974U,  // <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS
-  2581868321U,  // <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3>
-  1514099194U,  // <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3>
-  2587841530U,  // <2,7,3,7>: Cost 3 vext1 <6,2,7,3>, <7,0,1,2>
-  1514100526U,  // <2,7,3,u>: Cost 2 vext1 <6,2,7,3>, LHS
-  2708706617U,  // <2,7,4,0>: Cost 3 vext3 <4,0,6,2>, <7,4,0,6>
-  3649643418U,  // <2,7,4,1>: Cost 4 vext1 <4,2,7,4>, <1,2,3,4>
-  3649644330U,  // <2,7,4,2>: Cost 4 vext1 <4,2,7,4>, <2,4,5,7>
-  2257982640U,  // <2,7,4,3>: Cost 3 vrev <7,2,3,4>
-  3649645641U,  // <2,7,4,4>: Cost 4 vext1 <4,2,7,4>, <4,2,7,4>
-  2621435190U,  // <2,7,4,5>: Cost 3 vext2 <0,6,2,7>, RHS
-  2712835441U,  // <2,7,4,6>: Cost 3 vext3 <4,6,u,2>, <7,4,6,u>
-  3799995762U,  // <2,7,4,7>: Cost 4 vext3 <7,0,1,2>, <7,4,7,0>
-  2621435433U,  // <2,7,4,u>: Cost 3 vext2 <0,6,2,7>, RHS
-  2729497990U,  // <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2>
-  3643679744U,  // <2,7,5,1>: Cost 4 vext1 <3,2,7,5>, <1,3,5,7>
-  3637708424U,  // <2,7,5,2>: Cost 4 vext1 <2,2,7,5>, <2,2,5,7>
-  3643681137U,  // <2,7,5,3>: Cost 4 vext1 <3,2,7,5>, <3,2,7,5>
-  2599800118U,  // <2,7,5,4>: Cost 3 vext1 <u,2,7,5>, RHS
-  3786577334U,  // <2,7,5,5>: Cost 4 vext3 <4,6,u,2>, <7,5,5,5>
-  3786577345U,  // <2,7,5,6>: Cost 4 vext3 <4,6,u,2>, <7,5,6,7>
-  2599802214U,  // <2,7,5,7>: Cost 3 vext1 <u,2,7,5>, <7,4,5,6>
-  2599802670U,  // <2,7,5,u>: Cost 3 vext1 <u,2,7,5>, LHS
-  2581889126U,  // <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS
-  3643687936U,  // <2,7,6,1>: Cost 4 vext1 <3,2,7,6>, <1,3,5,7>
-  2663240186U,  // <2,7,6,2>: Cost 3 vext2 <7,6,2,7>, <6,2,7,3>
-  3643689330U,  // <2,7,6,3>: Cost 4 vext1 <3,2,7,6>, <3,2,7,6>
-  2581892406U,  // <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS
-  2581892900U,  // <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6>
-  2587865597U,  // <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6>
-  3786577428U,  // <2,7,6,7>: Cost 4 vext3 <4,6,u,2>, <7,6,7,0>
-  2581894958U,  // <2,7,6,u>: Cost 3 vext1 <5,2,7,6>, LHS
-  2726254119U,  // <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1>
-  3804640817U,  // <2,7,7,1>: Cost 4 vext3 <7,7,1,2>, <7,7,1,2>
-  3637724826U,  // <2,7,7,2>: Cost 4 vext1 <2,2,7,7>, <2,2,7,7>
-  3734992123U,  // <2,7,7,3>: Cost 4 vext2 <7,3,2,7>, <7,3,2,7>
-  2552040758U,  // <2,7,7,4>: Cost 3 vext1 <0,2,7,7>, RHS
-  3799995992U,  // <2,7,7,5>: Cost 4 vext3 <7,0,1,2>, <7,7,5,5>
-  2663241198U,  // <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7>
-  2712835692U,  // <2,7,7,7>: Cost 3 vext3 <4,6,u,2>, <7,7,7,7>
-  2731562607U,  // <2,7,7,u>: Cost 3 vext3 <7,u,1,2>, <7,7,u,1>
-  1514135654U,  // <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS
-  1657820802U,  // <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2>
-  2587879016U,  // <2,7,u,2>: Cost 3 vext1 <6,2,7,u>, <2,2,2,2>
-  2569963892U,  // <2,7,u,3>: Cost 3 vext1 <3,2,7,u>, <3,2,7,u>
-  1514138934U,  // <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS
-  2621438106U,  // <2,7,u,5>: Cost 3 vext2 <0,6,2,7>, RHS
-  1514140159U,  // <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u>
-  2587882490U,  // <2,7,u,7>: Cost 3 vext1 <6,2,7,u>, <7,0,1,2>
-  1514141486U,  // <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS
-  1544380416U,  // <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
-   470638699U,  // <2,u,0,1>: Cost 1 vext2 LHS, LHS
-  1544380580U,  // <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
-  1658631909U,  // <2,u,0,3>: Cost 2 vext3 <u,0,3,2>, <u,0,3,2>
-  1544380754U,  // <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
-  2665898414U,  // <2,u,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
-  1658853120U,  // <2,u,0,6>: Cost 2 vext3 <u,0,6,2>, <u,0,6,2>
-  3094531625U,  // <2,u,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
-   470639261U,  // <2,u,0,u>: Cost 1 vext2 LHS, LHS
-  1544381174U,  // <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
-  1544381236U,  // <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
-  1544381334U,  // <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
-  1544381400U,  // <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
-  2618123325U,  // <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
-  1544381584U,  // <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
-  2618123489U,  // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
-  2726254427U,  // <2,u,1,7>: Cost 3 vext3 <7,0,1,2>, <u,1,7,3>
-  1544381823U,  // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3>
-  1478328422U,  // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, LHS
-  2618123807U,  // <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
-   269271142U,  // <2,u,2,2>: Cost 1 vdup2 LHS
-  1544382118U,  // <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
-  1478331702U,  // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS
-  2618124136U,  // <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
-  1544382394U,  // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
-  3088354857U,  // <2,u,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
-   269271142U,  // <2,u,2,u>: Cost 1 vdup2 LHS
-  1544382614U,  // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
-  2953627374U,  // <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1>
-  1490282143U,  // <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3>
-  1879883932U,  // <2,u,3,3>: Cost 2 vzipr LHS, LHS
-  1544382978U,  // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
-  2953627378U,  // <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5>
-  1514172931U,  // <2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3>
-  1879887176U,  // <2,u,3,7>: Cost 2 vzipr LHS, RHS
-  1879883937U,  // <2,u,3,u>: Cost 2 vzipr LHS, LHS
-  1484316774U,  // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS
-  1484317639U,  // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4>
-  2552088270U,  // <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5>
-  1190213513U,  // <2,u,4,3>: Cost 2 vrev <u,2,3,4>
-  1484320054U,  // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS
-   470641974U,  // <2,u,4,5>: Cost 1 vext2 LHS, RHS
-  1592159604U,  // <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
-  3094564393U,  // <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
-   470642217U,  // <2,u,4,u>: Cost 1 vext2 LHS, RHS
-  2552094959U,  // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5>
-  1592159952U,  // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
-  2564040353U,  // <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5>
-  2690275455U,  // <2,u,5,3>: Cost 3 vext3 <0,u,u,2>, <u,5,3,7>
-  1592160198U,  // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
-  1592160260U,  // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
-  1611962522U,  // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
-  1592160424U,  // <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
-  1611962540U,  // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
-  1478361190U,  // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS
-  2552103670U,  // <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2>
-  1592160762U,  // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
-  2685704400U,  // <2,u,6,3>: Cost 3 vext3 <0,2,0,2>, <u,6,3,7>
-  1478364470U,  // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS
-  2901891226U,  // <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
-  1592161080U,  // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
-  1592161102U,  // <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
-  1478367022U,  // <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS
-  1592161274U,  // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
-  2659931226U,  // <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u>
-  2564056739U,  // <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7>
-  2665903331U,  // <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
-  1592161638U,  // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
-  2665903494U,  // <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
-  2587947527U,  // <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7>
-  1592161900U,  // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
-  1592161922U,  // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
-  1478377574U,  // <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS
-   470644526U,  // <2,u,u,1>: Cost 1 vext2 LHS, LHS
-   269271142U,  // <2,u,u,2>: Cost 1 vdup2 LHS
-  1879924892U,  // <2,u,u,3>: Cost 2 vzipr LHS, LHS
-  1478380854U,  // <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS
-   470644890U,  // <2,u,u,5>: Cost 1 vext2 LHS, RHS
-  1611962765U,  // <2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
-  1879928136U,  // <2,u,u,7>: Cost 2 vzipr LHS, RHS
-   470645093U,  // <2,u,u,u>: Cost 1 vext2 LHS, LHS
-  1611448320U,  // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
-  1611890698U,  // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
-  1611890708U,  // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
-  3763576860U,  // <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1>
-  2689835045U,  // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1>
-  3698508206U,  // <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7>
-  3763576887U,  // <3,0,0,6>: Cost 4 vext3 LHS, <0,0,6,1>
-  3667678434U,  // <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0>
-  1616093258U,  // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2>
-  1490337894U,  // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS
-  2685632602U,  // <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0>
-   537706598U,  // <3,0,1,2>: Cost 1 vext3 LHS, LHS
-  2624766936U,  // <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3>
-  1490341174U,  // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS
-  2624767120U,  // <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7>
-  2732966030U,  // <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7>
-  2593944803U,  // <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1>
-   537706652U,  // <3,0,1,u>: Cost 1 vext3 LHS, LHS
-  1611890852U,  // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
-  2685632684U,  // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
-  2685632692U,  // <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0>
-  2685632702U,  // <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
-  1611890892U,  // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
-  2732966102U,  // <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7>
-  2624767930U,  // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7>
-  2685632744U,  // <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7>
-  1611890924U,  // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
-  2624768150U,  // <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2>
-  2685632764U,  // <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
-  2685632774U,  // <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1>
-  2624768412U,  // <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3>
-  2624768514U,  // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6>
-  3702491714U,  // <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7>
-  2624768632U,  // <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7>
-  3702491843U,  // <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1>
-  2686959934U,  // <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3>
-  2689835336U,  // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4>
-  1611891026U,  // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
-  1611891036U,  // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
-  3763577184U,  // <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1>
-  2689835374U,  // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6>
-  1551027510U,  // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS
-  2666573172U,  // <3,0,4,6>: Cost 3 vext2 <u,2,3,0>, <4,6,4,6>
-  3667711206U,  // <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4>
-  1616093586U,  // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
-  2685190556U,  // <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7>
-  2666573520U,  // <3,0,5,1>: Cost 3 vext2 <u,2,3,0>, <5,1,7,3>
-  3040886886U,  // <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS
-  3625912834U,  // <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, <3,4,5,6>
-  2666573766U,  // <3,0,5,4>: Cost 3 vext2 <u,2,3,0>, <5,4,7,6>
-  2666573828U,  // <3,0,5,5>: Cost 3 vext2 <u,2,3,0>, <5,5,5,5>
-  2732966354U,  // <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7>
-  2666573992U,  // <3,0,5,7>: Cost 3 vext2 <u,2,3,0>, <5,7,5,7>
-  3040886940U,  // <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS
-  2685190637U,  // <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7>
-  2732966390U,  // <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7>
-  2689835519U,  // <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7>
-  3667724438U,  // <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2>
-  3763577355U,  // <3,0,6,4>: Cost 4 vext3 LHS, <0,6,4,1>
-  3806708243U,  // <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0>
-  2666574648U,  // <3,0,6,6>: Cost 3 vext2 <u,2,3,0>, <6,6,6,6>
-  2657948520U,  // <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0>
-  2689835573U,  // <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7>
-  2666574842U,  // <3,0,7,0>: Cost 3 vext2 <u,2,3,0>, <7,0,1,2>
-  2685633095U,  // <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7>
-  2660603052U,  // <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0>
-  3643844997U,  // <3,0,7,3>: Cost 4 vext1 <3,3,0,7>, <3,3,0,7>
-  2666575206U,  // <3,0,7,4>: Cost 3 vext2 <u,2,3,0>, <7,4,5,6>
-  3655790391U,  // <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7>
-  3731690968U,  // <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3>
-  2666575468U,  // <3,0,7,7>: Cost 3 vext2 <u,2,3,0>, <7,7,7,7>
-  2664584850U,  // <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0>
-  1616093834U,  // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2>
-  1611891346U,  // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
-   537707165U,  // <3,0,u,2>: Cost 1 vext3 LHS, LHS
-  2689835684U,  // <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
-  1616093874U,  // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
-  1551030426U,  // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS
-  2624772304U,  // <3,0,u,6>: Cost 3 vext2 <1,2,3,0>, <u,6,3,7>
-  2594002154U,  // <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u>
-   537707219U,  // <3,0,u,u>: Cost 1 vext3 LHS, LHS
-  2552201318U,  // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS
-  2618802278U,  // <3,1,0,1>: Cost 3 vext2 <0,2,3,1>, LHS
-  2618802366U,  // <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1>
-  1611449078U,  // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
-  2552204598U,  // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS
-  2732966663U,  // <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1>
-  3906258396U,  // <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6>
-  3667752171U,  // <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0>
-  1611891491U,  // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
-  2689835819U,  // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1>
-  1611449140U,  // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1>
-  2624775063U,  // <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1>
-  1611891528U,  // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
-  2689835859U,  // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5>
-  2689835868U,  // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
-  3763577701U,  // <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5>
-  3765273452U,  // <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3>
-  1611891573U,  // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3>
-  2629420494U,  // <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1>
-  2689835911U,  // <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
-  2564163248U,  // <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2>
-  1611449238U,  // <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0>
-  2564164918U,  // <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS
-  2689835947U,  // <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
-  3692545978U,  // <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7>
-  2732966842U,  // <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0>
-  1611891651U,  // <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0>
-  1484456038U,  // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS
-  1611891672U,  // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
-  2685633502U,  // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
-  2685633512U,  // <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1>
-  1484459318U,  // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS
-  1611891712U,  // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
-  2689836041U,  // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
-  2733409294U,  // <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3>
-  1611891735U,  // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
-  2552234086U,  // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS
-  2732966955U,  // <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5>
-  2732966964U,  // <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5>
-  2685633597U,  // <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5>
-  2552237366U,  // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS
-  2618805558U,  // <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS
-  2769472822U,  // <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS
-  3667784943U,  // <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4>
-  2685633642U,  // <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5>
-  2689836143U,  // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1>
-  2564187280U,  // <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7>
-  2564187827U,  // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5>
-  1611891856U,  // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
-  2689836183U,  // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5>
-  3759375522U,  // <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7>
-  3720417378U,  // <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0>
-  2832518454U,  // <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS
-  1611891901U,  // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
-  3763578048U,  // <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1>
-  2689836239U,  // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
-  2732967128U,  // <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7>
-  2685633761U,  // <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
-  3763578088U,  // <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5>
-  2689836275U,  // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
-  3763578108U,  // <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7>
-  2732967166U,  // <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0>
-  2685633806U,  // <3,1,6,u>: Cost 3 vext3 LHS, <1,6,u,7>
-  3631972454U,  // <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS
-  2659947612U,  // <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1>
-  4036102294U,  // <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2>
-  3095396454U,  // <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
-  3631975734U,  // <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS
-  2222982144U,  // <3,1,7,5>: Cost 3 vrev <1,3,5,7>
-  3296797705U,  // <3,1,7,6>: Cost 4 vrev <1,3,6,7>
-  3720418924U,  // <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7>
-  3095396459U,  // <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS
-  1484496998U,  // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS
-  1611892077U,  // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3>
-  2685633907U,  // <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
-  1611892092U,  // <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0>
-  1484500278U,  // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS
-  1611892117U,  // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
-  2685633950U,  // <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
-  2832518697U,  // <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS
-  1611892140U,  // <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3>
-  2623455232U,  // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0>
-  1549713510U,  // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS
-  2689836484U,  // <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0>
-  2685633997U,  // <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
-  2623455570U,  // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5>
-  2732967398U,  // <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7>
-  2689836524U,  // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
-  2229044964U,  // <3,2,0,7>: Cost 3 vrev <2,3,7,0>
-  1549714077U,  // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS
-  1549714166U,  // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2>
-  2623456052U,  // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1>
-  2623456150U,  // <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0>
-  2685634079U,  // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
-  2552286518U,  // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS
-  2623456400U,  // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7>
-  2689836604U,  // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
-  3667834101U,  // <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1>
-  1155385070U,  // <3,2,1,u>: Cost 2 vrev <2,3,u,1>
-  2689836629U,  // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1>
-  2689836640U,  // <3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3>
-  1611449960U,  // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2>
-  1611892338U,  // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
-  2689836669U,  // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5>
-  2689836680U,  // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
-  2689836688U,  // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6>
-  3763578518U,  // <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3>
-  1611892383U,  // <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3>
-  1611450022U,  // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1>
-  2685191854U,  // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0>
-  2685191865U,  // <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2>
-  2685191875U,  // <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3>
-  1611450062U,  // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5>
-  2732967635U,  // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1>
-  2732967645U,  // <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2>
-  2732967652U,  // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0>
-  1611450094U,  // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1>
-  2558279782U,  // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS
-  2558280602U,  // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4>
-  2732967692U,  // <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4>
-  2685634326U,  // <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
-  2558283062U,  // <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS
-  1549716790U,  // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS
-  2689836844U,  // <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0>
-  2229077736U,  // <3,2,4,7>: Cost 3 vrev <2,3,7,4>
-  1549717033U,  // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS
-  2552316006U,  // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS
-  2228643507U,  // <3,2,5,1>: Cost 3 vrev <2,3,1,5>
-  2689836896U,  // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
-  2685634408U,  // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
-  1155122894U,  // <3,2,5,4>: Cost 2 vrev <2,3,4,5>
-  2665263108U,  // <3,2,5,5>: Cost 3 vext2 <u,0,3,2>, <5,5,5,5>
-  2689836932U,  // <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
-  2665263272U,  // <3,2,5,7>: Cost 3 vext2 <u,0,3,2>, <5,7,5,7>
-  1155417842U,  // <3,2,5,u>: Cost 2 vrev <2,3,u,5>
-  2689836953U,  // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1>
-  2689836964U,  // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3>
-  2689836976U,  // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6>
-  1611892666U,  // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
-  2689836993U,  // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5>
-  2689837004U,  // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
-  2689837013U,  // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
-  2665263950U,  // <3,2,6,7>: Cost 3 vext2 <u,0,3,2>, <6,7,0,1>
-  1611892711U,  // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
-  2665264122U,  // <3,2,7,0>: Cost 3 vext2 <u,0,3,2>, <7,0,1,2>
-  2623460419U,  // <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3>
-  4169138340U,  // <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2>
-  2962358374U,  // <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS
-  2665264486U,  // <3,2,7,4>: Cost 3 vext2 <u,0,3,2>, <7,4,5,6>
-  2228954841U,  // <3,2,7,5>: Cost 3 vrev <2,3,5,7>
-  2229028578U,  // <3,2,7,6>: Cost 3 vrev <2,3,6,7>
-  2665264748U,  // <3,2,7,7>: Cost 3 vext2 <u,0,3,2>, <7,7,7,7>
-  2962358379U,  // <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS
-  1611892795U,  // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1>
-  1549719342U,  // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS
-  1611449960U,  // <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2>
-  1611892824U,  // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
-  1611892835U,  // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5>
-  1549719706U,  // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS
-  2689837168U,  // <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0>
-  2665265408U,  // <3,2,u,7>: Cost 3 vext2 <u,0,3,2>, <u,7,0,1>
-  1611892867U,  // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1>
-  2685192331U,  // <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0>
-  1611450518U,  // <3,3,0,1>: Cost 2 vext3 LHS, <3,0,1,2>
-  2685634717U,  // <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0>
-  2564294806U,  // <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2>
-  2685634736U,  // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1>
-  2732968122U,  // <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2>
-  3763579075U,  // <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2>
-  4034053264U,  // <3,3,0,7>: Cost 4 vzipr <1,2,3,0>, <1,5,3,7>
-  1611450581U,  // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2>
-  2685192415U,  // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3>
-  1550385992U,  // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3>
-  2685192433U,  // <3,3,1,2>: Cost 3 vext3 LHS, <3,1,2,3>
-  2685634808U,  // <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1>
-  2558332214U,  // <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS
-  2685634828U,  // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3>
-  3759376661U,  // <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3>
-  2703477022U,  // <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3>
-  1555031423U,  // <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3>
-  2564309094U,  // <3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS
-  2630100513U,  // <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3>
-  1557022322U,  // <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3>
-  2685192520U,  // <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0>
-  2564312374U,  // <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS
-  2732968286U,  // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4>
-  2685634918U,  // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3>
-  2704140655U,  // <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3>
-  1561004120U,  // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3>
-  1496547430U,  // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS
-  2624129256U,  // <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3>
-  2630764866U,  // <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3>
-   336380006U,  // <3,3,3,3>: Cost 1 vdup3 LHS
-  1496550710U,  // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS
-  2732968368U,  // <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5>
-  2624129683U,  // <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7>
-  2594182400U,  // <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3>
-   336380006U,  // <3,3,3,u>: Cost 1 vdup3 LHS
-  2558353510U,  // <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS
-  2558354411U,  // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4>
-  2564327108U,  // <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4>
-  2564327938U,  // <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6>
-  2960343962U,  // <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4>
-  1611893250U,  // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6>
-  2771619126U,  // <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS
-  4034086032U,  // <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7>
-  1611893277U,  // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6>
-  2558361702U,  // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS
-  2558362604U,  // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5>
-  2558363342U,  // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5>
-  2732968512U,  // <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5>
-  2558364982U,  // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS
-  3101279950U,  // <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5>
-  2665934946U,  // <3,3,5,6>: Cost 3 vext2 <u,1,3,3>, <5,6,7,0>
-  2826636598U,  // <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS
-  2826636599U,  // <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS
-  2732968568U,  // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7>
-  3763579521U,  // <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7>
-  2732968586U,  // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7>
-  2732968595U,  // <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7>
-  2732968604U,  // <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7>
-  3763579557U,  // <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7>
-  2732968621U,  // <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6>
-  2657973099U,  // <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, <6,7,3,3>
-  2658636732U,  // <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3>
-  2558378086U,  // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS
-  2558378990U,  // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7>
-  2564351687U,  // <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7>
-  2661291264U,  // <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3>
-  2558381366U,  // <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS
-  2732968694U,  // <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7>
-  3781126907U,  // <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3>
-  3095397376U,  // <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7>
-  2558383918U,  // <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS
-  1496547430U,  // <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS
-  1611893534U,  // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2>
-  1592858504U,  // <3,3,u,2>: Cost 2 vext2 <u,2,3,3>, <u,2,3,3>
-   336380006U,  // <3,3,u,3>: Cost 1 vdup3 LHS
-  1496550710U,  // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS
-  1611893574U,  // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6>
-  2690280268U,  // <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3>
-  2826636841U,  // <3,3,u,7>: Cost 3 vuzpr <1,3,1,3>, RHS
-   336380006U,  // <3,3,u,u>: Cost 1 vdup3 LHS
-  2624798720U,  // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0>
-  1551056998U,  // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS
-  2624798884U,  // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2>
-  3693232384U,  // <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4>
-  2624799058U,  // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5>
-  1659227026U,  // <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1>
-  1659227036U,  // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2>
-  3667973382U,  // <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0>
-  1551057565U,  // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS
-  2624799478U,  // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2>
-  2624799540U,  // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1>
-  1551057818U,  // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4>
-  2624799704U,  // <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3>
-  2564377910U,  // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS
-  2689838050U,  // <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0>
-  2689838062U,  // <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
-  2628117807U,  // <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4>
-  1555039616U,  // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4>
-  3626180710U,  // <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS
-  2624800298U,  // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3>
-  2624800360U,  // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2>
-  2624800422U,  // <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1>
-  2624800514U,  // <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3>
-  2709965878U,  // <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3>
-  2689838140U,  // <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0>
-  2634090504U,  // <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4>
-  2689838158U,  // <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0>
-  2624800918U,  // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2>
-  2636081403U,  // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4>
-  2636745036U,  // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4>
-  2624801180U,  // <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3>
-  2624801232U,  // <3,4,3,4>: Cost 3 vext2 <1,2,3,4>, <3,4,0,1>
-  2905836854U,  // <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS
-  3040054582U,  // <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS
-  3702524611U,  // <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1>
-  2624801566U,  // <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2>
-  2564399206U,  // <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS
-  2564400026U,  // <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4>
-  2564400845U,  // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4>
-  2570373542U,  // <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
-  1659227344U,  // <3,4,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
-  1551060278U,  // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS
-  1659227364U,  // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6>
-  3668006154U,  // <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4>
-  1551060521U,  // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS
-  1490665574U,  // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS
-  2689838341U,  // <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
-  1490667214U,  // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5>
-  2564409494U,  // <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2>
-  1490668854U,  // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS
-  2689838381U,  // <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7>
-   537709878U,  // <3,4,5,6>: Cost 1 vext3 LHS, RHS
-  2594272523U,  // <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5>
-   537709896U,  // <3,4,5,u>: Cost 1 vext3 LHS, RHS
-  2689838411U,  // <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1>
-  2558444534U,  // <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6>
-  2666607098U,  // <3,4,6,2>: Cost 3 vext2 <u,2,3,4>, <6,2,7,3>
-  2558446082U,  // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6>
-  1659227508U,  // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
-  2689838462U,  // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
-  2689838471U,  // <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7>
-  2657981292U,  // <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4>
-  1659227540U,  // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2>
-  2666607610U,  // <3,4,7,0>: Cost 3 vext2 <u,2,3,4>, <7,0,1,2>
-  3702527072U,  // <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5>
-  2660635824U,  // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4>
-  3644139945U,  // <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7>
-  2666607974U,  // <3,4,7,4>: Cost 3 vext2 <u,2,3,4>, <7,4,5,6>
-  2732969416U,  // <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0>
-  2732969425U,  // <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0>
-  2666608236U,  // <3,4,7,7>: Cost 3 vext2 <u,2,3,4>, <7,7,7,7>
-  2664617622U,  // <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4>
-  1490690150U,  // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS
-  1551062830U,  // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS
-  1490691793U,  // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u>
-  2624804796U,  // <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, <u,3,0,1>
-  1490693430U,  // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS
-  1551063194U,  // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS
-   537710121U,  // <3,4,u,6>: Cost 1 vext3 LHS, RHS
-  2594297102U,  // <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u>
-   537710139U,  // <3,4,u,u>: Cost 1 vext3 LHS, RHS
-  3692576768U,  // <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0>
-  2618835046U,  // <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS
-  2618835138U,  // <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5>
-  3692577024U,  // <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4>
-  2689838690U,  // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1>
-  2732969579U,  // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
-  2732969588U,  // <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1>
-  2246963055U,  // <3,5,0,7>: Cost 3 vrev <5,3,7,0>
-  2618835613U,  // <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS
-  2594308198U,  // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS
-  3692577588U,  // <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1>
-  2624807835U,  // <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5>
-  2625471468U,  // <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5>
-  2626135101U,  // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5>
-  2594311888U,  // <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3>
-  3699877107U,  // <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7>
-  1641680592U,  // <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3>
-  1641754329U,  // <3,5,1,u>: Cost 2 vext3 <5,1,u,3>, <5,1,u,3>
-  3692578274U,  // <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, <2,0,5,3>
-  2630116899U,  // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5>
-  3692578408U,  // <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2>
-  2625472206U,  // <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5>
-  2632107798U,  // <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5>
-  2715938575U,  // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3>
-  3692578746U,  // <3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7>
-  2716086049U,  // <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3>
-  2634762330U,  // <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5>
-  3692578966U,  // <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2>
-  2636089596U,  // <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5>
-  3699214668U,  // <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4>
-  2638080412U,  // <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3>
-  2618837506U,  // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6>
-  2832844494U,  // <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5>
-  4033415682U,  // <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6>
-  3095072054U,  // <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS
-  3095072055U,  // <3,5,3,u>: Cost 3 vtrnr <1,3,1,3>, RHS
-  2600304742U,  // <3,5,4,0>: Cost 3 vext1 <u,3,5,4>, LHS
-  3763580815U,  // <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5>
-  2564474582U,  // <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4>
-  3699879044U,  // <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0>
-  2600308022U,  // <3,5,4,4>: Cost 3 vext1 <u,3,5,4>, RHS
-  2618838326U,  // <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS
-  2772454710U,  // <3,5,4,6>: Cost 3 vuzpl <3,4,5,6>, RHS
-  1659228102U,  // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6>
-  1659228111U,  // <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6>
-  2570453094U,  // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS
-  2624810704U,  // <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3>
-  2570454734U,  // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5>
-  2570455472U,  // <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5>
-  2570456374U,  // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS
-  1659228164U,  // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
-  2732969998U,  // <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6>
-  1659228184U,  // <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7>
-  1659228193U,  // <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7>
-  2732970020U,  // <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1>
-  2732970035U,  // <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7>
-  2564490968U,  // <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6>
-  2732970050U,  // <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4>
-  2732970060U,  // <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5>
-  2732970071U,  // <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7>
-  2732970080U,  // <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7>
-  1659228258U,  // <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0>
-  1659228267U,  // <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0>
-  1484783718U,  // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS
-  1484784640U,  // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7>
-  2558527080U,  // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2>
-  2558527638U,  // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2>
-  1484786998U,  // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS
-  1659228328U,  // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
-  2732970154U,  // <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0>
-  2558531180U,  // <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7>
-  1484789550U,  // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS
-  1484791910U,  // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS
-  1484792833U,  // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u>
-  2558535272U,  // <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2>
-  2558535830U,  // <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2>
-  1484795190U,  // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS
-  1659228409U,  // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7>
-  2772457626U,  // <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS
-  1646326023U,  // <3,5,u,7>: Cost 2 vext3 <5,u,7,3>, <5,u,7,3>
-  1484797742U,  // <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS
-  2558541926U,  // <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS
-  2689839393U,  // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2>
-  2689839404U,  // <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4>
-  3706519808U,  // <3,6,0,3>: Cost 4 vext2 <2,5,3,6>, <0,3,1,4>
-  2689839420U,  // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2>
-  2732970314U,  // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7>
-  2732970316U,  // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0>
-  2960313654U,  // <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
-  2689839456U,  // <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2>
-  3763581290U,  // <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3>
-  3763581297U,  // <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1>
-  2624816028U,  // <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6>
-  3763581315U,  // <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1>
-  2626143294U,  // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6>
-  3763581335U,  // <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3>
-  2721321376U,  // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3>
-  2721395113U,  // <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3>
-  2628797826U,  // <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6>
-  2594390118U,  // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS
-  2721616324U,  // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3>
-  2630788725U,  // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6>
-  3763581395U,  // <3,6,2,3>: Cost 4 vext3 LHS, <6,2,3,0>
-  2632115991U,  // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6>
-  2632779624U,  // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6>
-  2594394618U,  // <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3>
-  1648316922U,  // <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3>
-  1648390659U,  // <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3>
-  3693914262U,  // <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2>
-  3638281176U,  // <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3>
-  3696568678U,  // <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3>
-  2638088604U,  // <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3>
-  2632780290U,  // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6>
-  3712494145U,  // <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6>
-  3698559612U,  // <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2>
-  2959674678U,  // <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
-  2959674679U,  // <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS
-  3763581536U,  // <3,6,4,0>: Cost 4 vext3 LHS, <6,4,0,6>
-  2722943590U,  // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3>
-  2732970609U,  // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5>
-  3698560147U,  // <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6>
-  2732970628U,  // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6>
-  2689839757U,  // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6>
-  2732970640U,  // <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0>
-  2960346422U,  // <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS
-  2689839784U,  // <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6>
-  2576498790U,  // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS
-  3650241270U,  // <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2>
-  2732970692U,  // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7>
-  2576501250U,  // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
-  2576501906U,  // <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5>
-  3650244622U,  // <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6>
-  4114633528U,  // <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6>
-  2732970735U,  // <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5>
-  2576504622U,  // <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS
-  2732970749U,  // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1>
-  2724270856U,  // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3>
-  2624819706U,  // <3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3>
-  3656223234U,  // <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, <3,4,5,6>
-  2732970788U,  // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4>
-  2732970800U,  // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7>
-  1659228984U,  // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
-  1659228994U,  // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7>
-  1659229003U,  // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7>
-  1659229006U,  // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1>
-  2558600201U,  // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7>
-  2558601146U,  // <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7>
-  2725081963U,  // <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3>
-  1659229046U,  // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5>
-  2715423611U,  // <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1>
-  2722059141U,  // <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2>
-  2962361654U,  // <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS
-  1659229078U,  // <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1>
-  1659229087U,  // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1>
-  2689840041U,  // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2>
-  2558609339U,  // <3,6,u,2>: Cost 3 vext1 <1,3,6,u>, <2,6,3,u>
-  2576525853U,  // <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6>
-  1659229127U,  // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5>
-  2689840081U,  // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6>
-  1659228984U,  // <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6>
-  1652298720U,  // <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3>
-  1659229159U,  // <3,6,u,u>: Cost 2 vext3 LHS, <6,u,u,1>
-  2626813952U,  // <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0>
-  1553072230U,  // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS
-  2626814116U,  // <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2>
-  3700556028U,  // <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0>
-  2626814290U,  // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5>
-  2582507375U,  // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0>
-  2588480072U,  // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0>
-  2732971055U,  // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1>
-  1553072797U,  // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS
-  2626814710U,  // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2>
-  2626814772U,  // <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1>
-  2626814870U,  // <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0>
-  2625487854U,  // <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7>
-  2582514998U,  // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS
-  1553073296U,  // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7>
-  2627478753U,  // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7>
-  2727367810U,  // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3>
-  1555064195U,  // <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7>
-  2588491878U,  // <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS
-  3700557318U,  // <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3>
-  2626815592U,  // <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2>
-  2626815654U,  // <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1>
-  2588495158U,  // <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS
-  2632787817U,  // <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7>
-  1559709626U,  // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7>
-  2728031443U,  // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3>
-  1561036892U,  // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7>
-  2626816150U,  // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2>
-  2626816268U,  // <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3>
-  2633451878U,  // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3>
-  2626816412U,  // <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3>
-  2626816514U,  // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6>
-  2638760514U,  // <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7>
-  2639424147U,  // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7>
-  2826961920U,  // <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7>
-  2626816798U,  // <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2>
-  2582536294U,  // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS
-  2582537360U,  // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7>
-  2588510138U,  // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7>
-  3700558996U,  // <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7>
-  2582539574U,  // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS
-  1553075510U,  // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS
-  2588512844U,  // <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4>
-  2564625766U,  // <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6>
-  1553075753U,  // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS
-  2732971398U,  // <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2>
-  2626817744U,  // <3,7,5,1>: Cost 3 vext2 <1,5,3,7>, <5,1,7,3>
-  3700559649U,  // <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3>
-  2626817903U,  // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0>
-  2258728203U,  // <3,7,5,4>: Cost 3 vrev <7,3,4,5>
-  2732971446U,  // <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5>
-  2732971457U,  // <3,7,5,6>: Cost 3 vext3 LHS, <7,5,6,7>
-  2826964278U,  // <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS
-  2826964279U,  // <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS
-  2732971478U,  // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1>
-  2732971486U,  // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0>
-  2633454074U,  // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3>
-  2633454152U,  // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0>
-  2732971518U,  // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5>
-  2732971526U,  // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4>
-  2732971537U,  // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6>
-  2732971540U,  // <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0>
-  2726041124U,  // <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7>
-  2570616934U,  // <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS
-  2570617856U,  // <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7>
-  2564646635U,  // <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7>
-  2570619332U,  // <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7>
-  2570620214U,  // <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS
-  2582564726U,  // <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7>
-  2588537423U,  // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7>
-  1659229804U,  // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
-  1659229804U,  // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7>
-  2626819795U,  // <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, <u,0,1,2>
-  1553078062U,  // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS
-  2626819973U,  // <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, <u,2,3,0>
-  2826961565U,  // <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS
-  2626820159U,  // <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, <u,4,5,6>
-  1553078426U,  // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS
-  1595545808U,  // <3,7,u,6>: Cost 2 vext2 <u,6,3,7>, <u,6,3,7>
-  1659229804U,  // <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7>
-  1553078629U,  // <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS
-  1611448320U,  // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
-  1611896531U,  // <3,u,0,1>: Cost 2 vext3 LHS, <u,0,1,2>
-  1659672284U,  // <3,u,0,2>: Cost 2 vext3 LHS, <u,0,2,2>
-  1616099045U,  // <3,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
-  2685638381U,  // <3,u,0,4>: Cost 3 vext3 LHS, <u,0,4,1>
-  1663874806U,  // <3,u,0,5>: Cost 2 vext3 LHS, <u,0,5,1>
-  1663874816U,  // <3,u,0,6>: Cost 2 vext3 LHS, <u,0,6,2>
-  2960313672U,  // <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
-  1611896594U,  // <3,u,0,u>: Cost 2 vext3 LHS, <u,0,u,2>
-  1549763324U,  // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u>
-  1550426957U,  // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u>
-   537712430U,  // <3,u,1,2>: Cost 1 vext3 LHS, LHS
-  1616541495U,  // <3,u,1,3>: Cost 2 vext3 LHS, <u,1,3,3>
-  1490930998U,  // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS
-  1553081489U,  // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u>
-  2627486946U,  // <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u>
-  1659230043U,  // <3,u,1,7>: Cost 2 vext3 LHS, <u,1,7,3>
-   537712484U,  // <3,u,1,u>: Cost 1 vext3 LHS, LHS
-  1611890852U,  // <3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
-  2624833102U,  // <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3>
-  1557063287U,  // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u>
-  1616099205U,  // <3,u,2,3>: Cost 2 vext3 LHS, <u,2,3,0>
-  1611890892U,  // <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
-  2689841054U,  // <3,u,2,5>: Cost 3 vext3 LHS, <u,2,5,7>
-  1559717819U,  // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u>
-  1659230124U,  // <3,u,2,7>: Cost 2 vext3 LHS, <u,2,7,3>
-  1616541618U,  // <3,u,2,u>: Cost 2 vext3 LHS, <u,2,u,0>
-  1611896764U,  // <3,u,3,0>: Cost 2 vext3 LHS, <u,3,0,1>
-  1484973079U,  // <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3>
-  2685638607U,  // <3,u,3,2>: Cost 3 vext3 LHS, <u,3,2,2>
-   336380006U,  // <3,u,3,3>: Cost 1 vdup3 LHS
-  1611896804U,  // <3,u,3,4>: Cost 2 vext3 LHS, <u,3,4,5>
-  1616541679U,  // <3,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
-  2690283512U,  // <3,u,3,6>: Cost 3 vext3 LHS, <u,3,6,7>
-  2959674696U,  // <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
-   336380006U,  // <3,u,3,u>: Cost 1 vdup3 LHS
-  2558722150U,  // <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS
-  1659672602U,  // <3,u,4,1>: Cost 2 vext3 LHS, <u,4,1,5>
-  1659672612U,  // <3,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
-  2689841196U,  // <3,u,4,3>: Cost 3 vext3 LHS, <u,4,3,5>
-  1659227344U,  // <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
-  1611896895U,  // <3,u,4,5>: Cost 2 vext3 LHS, <u,4,5,6>
-  1663875144U,  // <3,u,4,6>: Cost 2 vext3 LHS, <u,4,6,6>
-  1659230289U,  // <3,u,4,7>: Cost 2 vext3 LHS, <u,4,7,6>
-  1611896922U,  // <3,u,4,u>: Cost 2 vext3 LHS, <u,4,u,6>
-  1490960486U,  // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS
-  2689841261U,  // <3,u,5,1>: Cost 3 vext3 LHS, <u,5,1,7>
-  1490962162U,  // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5>
-  1616541823U,  // <3,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
-  1490963766U,  // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS
-  1659228164U,  // <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
-   537712794U,  // <3,u,5,6>: Cost 1 vext3 LHS, RHS
-  1659230371U,  // <3,u,5,7>: Cost 2 vext3 LHS, <u,5,7,7>
-   537712812U,  // <3,u,5,u>: Cost 1 vext3 LHS, RHS
-  2689841327U,  // <3,u,6,0>: Cost 3 vext3 LHS, <u,6,0,1>
-  2558739482U,  // <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6>
-  2689841351U,  // <3,u,6,2>: Cost 3 vext3 LHS, <u,6,2,7>
-  1616099536U,  // <3,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
-  1659227508U,  // <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
-  2690283746U,  // <3,u,6,5>: Cost 3 vext3 LHS, <u,6,5,7>
-  1659228984U,  // <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
-  1659230445U,  // <3,u,6,7>: Cost 2 vext3 LHS, <u,6,7,0>
-  1616099581U,  // <3,u,6,u>: Cost 2 vext3 LHS, <u,6,u,7>
-  1485004902U,  // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS
-  1485005851U,  // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7>
-  2558748264U,  // <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2>
-  3095397021U,  // <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
-  1485008182U,  // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS
-  1659228328U,  // <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
-  2722060599U,  // <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, <u,7,6,2>
-  1659229804U,  // <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
-  1485010734U,  // <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS
-  1616099665U,  // <3,u,u,0>: Cost 2 vext3 LHS, <u,u,0,1>
-  1611897179U,  // <3,u,u,1>: Cost 2 vext3 LHS, <u,u,1,2>
-   537712997U,  // <3,u,u,2>: Cost 1 vext3 LHS, LHS
-   336380006U,  // <3,u,u,3>: Cost 1 vdup3 LHS
-  1616099705U,  // <3,u,u,4>: Cost 2 vext3 LHS, <u,u,4,5>
-  1611897219U,  // <3,u,u,5>: Cost 2 vext3 LHS, <u,u,5,6>
-   537713037U,  // <3,u,u,6>: Cost 1 vext3 LHS, RHS
-  1659230607U,  // <3,u,u,7>: Cost 2 vext3 LHS, <u,u,7,0>
-   537713051U,  // <3,u,u,u>: Cost 1 vext3 LHS, LHS
-  2691907584U,  // <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0>
-  2691907594U,  // <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1>
-  2691907604U,  // <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2>
-  3709862144U,  // <4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4>
-  2684682280U,  // <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4>
-  3694600633U,  // <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0>
-  3291431290U,  // <4,0,0,6>: Cost 4 vrev <0,4,6,0>
-  3668342067U,  // <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0>
-  2691907657U,  // <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1>
-  2570715238U,  // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS
-  2570716058U,  // <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4>
-  1618165862U,  // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
-  2570717648U,  // <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1>
-  2570718518U,  // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS
-  2594607206U,  // <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4>
-  3662377563U,  // <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1>
-  2594608436U,  // <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1>
-  1618165916U,  // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
-  2685714598U,  // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4>
-  3759530159U,  // <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4>
-  2685862072U,  // <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4>
-  2631476937U,  // <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0>
-  2685714636U,  // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6>
-  3765649622U,  // <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7>
-  2686157020U,  // <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
-  3668358453U,  // <4,0,2,7>: Cost 4 vext1 <7,4,0,2>, <7,4,0,2>
-  2686304494U,  // <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4>
-  3632529510U,  // <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS
-  2686451968U,  // <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
-  2686525705U,  // <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4>
-  3760341266U,  // <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4>
-  3632532790U,  // <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS
-  3913254606U,  // <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5>
-  3705219740U,  // <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7>
-  3713845990U,  // <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0>
-  2686451968U,  // <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
-  2552823910U,  // <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS
-  2691907922U,  // <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5>
-  2691907932U,  // <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6>
-  3626567830U,  // <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2>
-  2552827190U,  // <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS
-  2631478582U,  // <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS
-  3626570017U,  // <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2>
-  3668374839U,  // <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4>
-  2552829742U,  // <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS
-  2558804070U,  // <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS
-  1839644774U,  // <4,0,5,1>: Cost 2 vzipl RHS, LHS
-  2913386660U,  // <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2>
-  2570750420U,  // <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5>
-  2558807350U,  // <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS
-  3987128750U,  // <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7>
-  3987128822U,  // <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7>
-  2594641208U,  // <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5>
-  1839645341U,  // <4,0,5,u>: Cost 2 vzipl RHS, LHS
-  2552840294U,  // <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS
-  3047604234U,  // <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1>
-  1973862502U,  // <4,0,6,2>: Cost 2 vtrnl RHS, LHS
-  2570758613U,  // <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6>
-  2552843574U,  // <4,0,6,4>: Cost 3 vext1 <0,4,0,6>, RHS
-  2217664887U,  // <4,0,6,5>: Cost 3 vrev <0,4,5,6>
-  3662418528U,  // <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6>
-  2658022257U,  // <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0>
-  1973862556U,  // <4,0,6,u>: Cost 2 vtrnl RHS, LHS
-  3731764218U,  // <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2>
-  3988324454U,  // <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS
-  4122034278U,  // <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS
-  3735082246U,  // <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0>
-  3731764536U,  // <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5>
-  3937145718U,  // <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5>
-  3737073145U,  // <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0>
-  3731764844U,  // <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7>
-  4122034332U,  // <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS
-  2552856678U,  // <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS
-  1841635430U,  // <4,0,u,1>: Cost 2 vzipl RHS, LHS
-  1618166429U,  // <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
-  2570774999U,  // <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u>
-  2552859958U,  // <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS
-  2631481498U,  // <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS
-  2686157020U,  // <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
-  2594665787U,  // <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u>
-  1618166483U,  // <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS
-  2617548837U,  // <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1>
-  2622857318U,  // <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS
-  3693281484U,  // <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6>
-  2691908342U,  // <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2>
-  2622857554U,  // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5>
-  3764470538U,  // <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4>
-  3695272459U,  // <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1>
-  3733094980U,  // <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4>
-  2622857885U,  // <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS
-  3696599798U,  // <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2>
-  2691097399U,  // <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4>
-  2631484314U,  // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4>
-  2691908424U,  // <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3>
-  3696600125U,  // <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5>
-  3696600175U,  // <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1>
-  3696600307U,  // <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7>
-  3668423997U,  // <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1>
-  2691908469U,  // <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3>
-  2570797158U,  // <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS
-  2570797978U,  // <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4>
-  3696600680U,  // <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2>
-  1618166682U,  // <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4>
-  2570800438U,  // <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS
-  3765650347U,  // <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3>
-  3696601018U,  // <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7>
-  3668432190U,  // <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2>
-  1618535367U,  // <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4>
-  2564833382U,  // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS
-  2691908568U,  // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3>
-  2691908578U,  // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4>
-  2692572139U,  // <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4>
-  2564836662U,  // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS
-  2691908608U,  // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7>
-  2588725862U,  // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
-  3662468090U,  // <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2>
-  2691908631U,  // <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3>
-  3760194590U,  // <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1>
-  3693947874U,  // <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0>
-  3765650484U,  // <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5>
-  3113877606U,  // <4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS
-  3760194630U,  // <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5>
-  2622860598U,  // <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS
-  3297436759U,  // <4,1,4,6>: Cost 4 vrev <1,4,6,4>
-  3800007772U,  // <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0>
-  2622860841U,  // <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS
-  1479164006U,  // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
-  2552906486U,  // <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2>
-  2552907299U,  // <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5>
-  2552907926U,  // <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2>
-  1479167286U,  // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
-  2913387664U,  // <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7>
-  2600686074U,  // <4,1,5,6>: Cost 3 vext1 <u,4,1,5>, <6,2,7,3>
-  2600686586U,  // <4,1,5,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
-  1479169838U,  // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS
-  2552914022U,  // <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS
-  2558886708U,  // <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1>
-  4028205206U,  // <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2>
-  3089858662U,  // <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS
-  2552917302U,  // <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS
-  2223637584U,  // <4,1,6,5>: Cost 3 vrev <1,4,5,6>
-  4121347081U,  // <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7>
-  3721155406U,  // <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1>
-  2552919854U,  // <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS
-  2659357716U,  // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
-  3733763173U,  // <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1>
-  3734426806U,  // <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1>
-  2695226671U,  // <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4>
-  3721155942U,  // <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6>
-  3721155976U,  // <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4>
-  3662500458U,  // <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7>
-  3721156204U,  // <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7>
-  2659357716U,  // <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
-  1479188582U,  // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS
-  2552931062U,  // <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2>
-  2552931944U,  // <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2>
-  1622148480U,  // <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4>
-  1479191862U,  // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS
-  2622863514U,  // <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS
-  2588725862U,  // <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
-  2600686586U,  // <4,1,u,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
-  1479194414U,  // <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS
-  2617557030U,  // <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2>
-  2622865510U,  // <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS
-  2622865612U,  // <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6>
-  3693289753U,  // <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2>
-  2635473244U,  // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6>
-  3765650918U,  // <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7>
-  2696775148U,  // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4>
-  3695944285U,  // <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2>
-  2622866077U,  // <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS
-  3696607990U,  // <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2>
-  3696608052U,  // <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1>
-  3696608150U,  // <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0>
-  3895574630U,  // <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS
-  2691909162U,  // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
-  3696608400U,  // <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7>
-  3760784956U,  // <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3>
-  3773908549U,  // <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3>
-  2691909162U,  // <4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
-  3696608748U,  // <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4>
-  3696608828U,  // <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3>
-  2691909224U,  // <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2>
-  2691909234U,  // <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3>
-  3759605368U,  // <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0>
-  3696609156U,  // <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7>
-  3760785040U,  // <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6>
-  3668505927U,  // <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2>
-  2691909279U,  // <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3>
-  2691909286U,  // <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1>
-  3764840111U,  // <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1>
-  3765651129U,  // <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2>
-  2698544836U,  // <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4>
-  2685863630U,  // <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5>
-  2698692310U,  // <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4>
-  3772507871U,  // <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4>
-  2698839784U,  // <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4>
-  2691909358U,  // <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1>
-  2564915302U,  // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS
-  2564916122U,  // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4>
-  2564917004U,  // <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4>
-  2699208469U,  // <4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4>
-  2564918582U,  // <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS
-  2622868790U,  // <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS
-  2229667632U,  // <4,2,4,6>: Cost 3 vrev <2,4,6,4>
-  3800082229U,  // <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0>
-  2622869033U,  // <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS
-  2552979558U,  // <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS
-  2558952342U,  // <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0>
-  2564925032U,  // <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2>
-  2967060582U,  // <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS
-  2552982838U,  // <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS
-  3987130190U,  // <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7>
-  2913388474U,  // <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7>
-  3895577910U,  // <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS
-  2552985390U,  // <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS
-  1479245926U,  // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS
-  2552988406U,  // <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2>
-  2552989288U,  // <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2>
-  2954461286U,  // <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS
-  1479249206U,  // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS
-  2229610281U,  // <4,2,6,5>: Cost 3 vrev <2,4,5,6>
-  2600767994U,  // <4,2,6,6>: Cost 3 vext1 <u,4,2,6>, <6,2,7,3>
-  2600768506U,  // <4,2,6,7>: Cost 3 vext1 <u,4,2,6>, <7,0,1,2>
-  1479251758U,  // <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS
-  2659365909U,  // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
-  3733771366U,  // <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2>
-  3734434999U,  // <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2>
-  2701199368U,  // <4,2,7,3>: Cost 3 vext3 <2,7,3,4>, <2,7,3,4>
-  4175774618U,  // <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4>
-  3303360298U,  // <4,2,7,5>: Cost 4 vrev <2,4,5,7>
-  3727136217U,  // <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4>
-  3727136364U,  // <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7>
-  2659365909U,  // <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
-  1479262310U,  // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS
-  2553004790U,  // <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2>
-  2553005672U,  // <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2>
-  2954477670U,  // <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS
-  1479265590U,  // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS
-  2622871706U,  // <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS
-  2229700404U,  // <4,2,u,6>: Cost 3 vrev <2,4,6,u>
-  2600784890U,  // <4,2,u,7>: Cost 3 vext1 <u,4,2,u>, <7,0,1,2>
-  1479268142U,  // <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS
-  3765651595U,  // <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0>
-  2691909782U,  // <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2>
-  2702452897U,  // <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4>
-  3693297946U,  // <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3>
-  3760711856U,  // <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1>
-  2235533820U,  // <4,3,0,5>: Cost 3 vrev <3,4,5,0>
-  3309349381U,  // <4,3,0,6>: Cost 4 vrev <3,4,6,0>
-  3668563278U,  // <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0>
-  2691909845U,  // <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2>
-  2235173328U,  // <4,3,1,0>: Cost 3 vrev <3,4,0,1>
-  3764840678U,  // <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1>
-  2630173594U,  // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4>
-  2703190267U,  // <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4>
-  3760195840U,  // <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0>
-  3765651724U,  // <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3>
-  3309357574U,  // <4,3,1,6>: Cost 4 vrev <3,4,6,1>
-  3769633054U,  // <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3>
-  2703558952U,  // <4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4>
-  3626770534U,  // <4,3,2,0>: Cost 4 vext1 <0,4,3,2>, LHS
-  2630174250U,  // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3>
-  3765651777U,  // <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2>
-  2703853900U,  // <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4>
-  3626773814U,  // <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS
-  2704001374U,  // <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4>
-  3765651814U,  // <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3>
-  3769633135U,  // <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3>
-  2634819681U,  // <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3>
-  3765651839U,  // <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1>
-  3765651848U,  // <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1>
-  3710552404U,  // <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3>
-  2691910044U,  // <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
-  2704591270U,  // <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4>
-  3769633202U,  // <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7>
-  3703917212U,  // <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7>
-  3769633220U,  // <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7>
-  2691910044U,  // <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
-  2691910096U,  // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1>
-  2691910106U,  // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2>
-  2564990741U,  // <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4>
-  3765651946U,  // <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0>
-  2691910136U,  // <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5>
-  2686454274U,  // <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6>
-  2235640329U,  // <4,3,4,6>: Cost 3 vrev <3,4,6,4>
-  3801483792U,  // <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, <3,4,7,2>
-  2691910168U,  // <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1>
-  2559025254U,  // <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS
-  2559026237U,  // <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5>
-  2564998862U,  // <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5>
-  2570971548U,  // <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3>
-  2559028534U,  // <4,3,5,4>: Cost 3 vext1 <1,4,3,5>, RHS
-  4163519477U,  // <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5>
-  3309390346U,  // <4,3,5,6>: Cost 4 vrev <3,4,6,5>
-  2706139747U,  // <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4>
-  2559031086U,  // <4,3,5,u>: Cost 3 vext1 <1,4,3,5>, LHS
-  2559033446U,  // <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS
-  2559034430U,  // <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6>
-  2565007127U,  // <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, <2,4,3,6>
-  2570979740U,  // <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3>
-  2559036726U,  // <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS
-  1161841154U,  // <4,3,6,5>: Cost 2 vrev <3,4,5,6>
-  4028203932U,  // <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6>
-  2706803380U,  // <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4>
-  1162062365U,  // <4,3,6,u>: Cost 2 vrev <3,4,u,6>
-  3769633475U,  // <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1>
-  3769633488U,  // <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5>
-  3638757144U,  // <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7>
-  3769633508U,  // <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7>
-  3769633515U,  // <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5>
-  3769633526U,  // <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7>
-  3662647932U,  // <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7>
-  3781208837U,  // <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4>
-  3769633547U,  // <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1>
-  2559049830U,  // <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS
-  2691910430U,  // <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2>
-  2565023513U,  // <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u>
-  2707835698U,  // <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4>
-  2559053110U,  // <4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS
-  1161857540U,  // <4,3,u,5>: Cost 2 vrev <3,4,5,u>
-  2235673101U,  // <4,3,u,6>: Cost 3 vrev <3,4,6,u>
-  2708130646U,  // <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4>
-  1162078751U,  // <4,3,u,u>: Cost 2 vrev <3,4,u,u>
-  2617573416U,  // <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4>
-  1570373734U,  // <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS
-  2779676774U,  // <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS
-  3760196480U,  // <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1>
-  2576977100U,  // <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0>
-  2718747538U,  // <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1>
-  2718747548U,  // <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2>
-  3668637015U,  // <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0>
-  1570374301U,  // <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS
-  2644116214U,  // <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2>
-  2644116276U,  // <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1>
-  2691910602U,  // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3>
-  2644116440U,  // <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3>
-  2711227356U,  // <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3>
-  2709310438U,  // <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4>
-  3765652462U,  // <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3>
-  3768970231U,  // <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3>
-  2695891968U,  // <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3>
-  3703260634U,  // <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4>
-  3765652499U,  // <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4>
-  2644117096U,  // <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2>
-  2631509709U,  // <4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4>
-  2644117269U,  // <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4>
-  3705251698U,  // <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7>
-  2710047808U,  // <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4>
-  3783863369U,  // <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4>
-  2634827874U,  // <4,4,2,u>: Cost 3 vext2 <2,u,4,4>, <2,u,4,4>
-  2644117654U,  // <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2>
-  3638797210U,  // <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4>
-  3638798082U,  // <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3>
-  2637482406U,  // <4,4,3,3>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
-  2638146039U,  // <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4>
-  3913287374U,  // <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5>
-  3765652625U,  // <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, <4,3,6,4>
-  3713878762U,  // <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4>
-  2637482406U,  // <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
-  1503264870U,  // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS
-  2577007514U,  // <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4>
-  2577008232U,  // <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2>
-  2571037175U,  // <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4>
-   161926454U,  // <4,4,4,4>: Cost 1 vdup0 RHS
-  1570377014U,  // <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS
-  2779680054U,  // <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS
-  2594927963U,  // <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
-   161926454U,  // <4,4,4,u>: Cost 1 vdup0 RHS
-  2571042918U,  // <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS
-  2571043738U,  // <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4>
-  3638814495U,  // <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5>
-  2571045368U,  // <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5>
-  2571046198U,  // <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS
-  1839648054U,  // <4,4,5,5>: Cost 2 vzipl RHS, RHS
-  1618169142U,  // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
-  2594936156U,  // <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5>
-  1618169160U,  // <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
-  2553135206U,  // <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS
-  3626877686U,  // <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2>
-  2565080782U,  // <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5>
-  2571053561U,  // <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6>
-  2553138486U,  // <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS
-  2241555675U,  // <4,4,6,5>: Cost 3 vrev <4,4,5,6>
-  1973865782U,  // <4,4,6,6>: Cost 2 vtrnl RHS, RHS
-  2658055029U,  // <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4>
-  1973865800U,  // <4,4,6,u>: Cost 2 vtrnl RHS, RHS
-  2644120570U,  // <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2>
-  3638829978U,  // <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4>
-  3638830881U,  // <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7>
-  3735115018U,  // <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4>
-  2662036827U,  // <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
-  2713292236U,  // <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4>
-  2713365973U,  // <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4>
-  2644121196U,  // <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7>
-  2662036827U,  // <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
-  1503297638U,  // <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS
-  1570379566U,  // <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS
-  2779682606U,  // <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS
-  2571069947U,  // <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u>
-   161926454U,  // <4,4,u,4>: Cost 1 vdup0 RHS
-  1841638710U,  // <4,4,u,5>: Cost 2 vzipl RHS, RHS
-  1618169385U,  // <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
-  2594960735U,  // <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u>
-   161926454U,  // <4,4,u,u>: Cost 1 vdup0 RHS
-  2631516160U,  // <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0>
-  1557774438U,  // <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS
-  2618908875U,  // <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5>
-  2571078140U,  // <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0>
-  2626871634U,  // <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5>
-  3705258414U,  // <4,5,0,5>: Cost 4 vext2 <2,3,4,5>, <0,5,2,7>
-  2594968438U,  // <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5>
-  2594968928U,  // <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0>
-  1557775005U,  // <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS
-  2631516918U,  // <4,5,1,0>: Cost 3 vext2 <2,3,4,5>, <1,0,3,2>
-  2624217939U,  // <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5>
-  2631517078U,  // <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0>
-  2821341286U,  // <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS
-  3895086054U,  // <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4>
-  2626872471U,  // <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5>
-  3895083131U,  // <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6>
-  2718748368U,  // <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3>
-  2821341291U,  // <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS
-  2571092070U,  // <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS
-  3699287585U,  // <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3>
-  2630854269U,  // <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5>
-  1557776078U,  // <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5>
-  2631517974U,  // <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5>
-  3692652384U,  // <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7>
-  2631518138U,  // <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7>
-  4164013366U,  // <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS
-  1561094243U,  // <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5>
-  2631518358U,  // <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2>
-  3895084710U,  // <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1>
-  2631518540U,  // <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4>
-  2631518620U,  // <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3>
-  2631518716U,  // <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, <3,4,5,0>
-  2631518784U,  // <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5>
-  2658060980U,  // <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4>
-  2640145131U,  // <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5>
-  2631519006U,  // <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2>
-  2571108454U,  // <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS
-  3632907342U,  // <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4>
-  2571110094U,  // <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5>
-  2571110912U,  // <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4>
-  2571111734U,  // <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS
-  1557777718U,  // <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS
-  2645454195U,  // <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5>
-  2718748614U,  // <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6>
-  1557777961U,  // <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS
-  1503346790U,  // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS
-  2913398480U,  // <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3>
-  2631519998U,  // <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4>
-  2577090710U,  // <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2>
-  1503349978U,  // <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5>
-  2631520260U,  // <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5>
-  2913390690U,  // <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0>
-  2821344566U,  // <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS
-  1503352622U,  // <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS
-  1497383014U,  // <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS
-  2559181904U,  // <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6>
-  2565154601U,  // <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6>
-  1497385474U,  // <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6>
-  1497386294U,  // <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS
-  3047608324U,  // <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5>
-  2571129656U,  // <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6>
-    27705344U,  // <4,5,6,7>: Cost 0 copy RHS
-    27705344U,  // <4,5,6,u>: Cost 0 copy RHS
-  2565161062U,  // <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS
-  2565161882U,  // <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4>
-  2565162794U,  // <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7>
-  2661381387U,  // <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5>
-  2565164342U,  // <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS
-  2718748840U,  // <4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7>
-  2718748846U,  // <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4>
-  2719412407U,  // <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4>
-  2565166894U,  // <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS
-  1497399398U,  // <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS
-  1557780270U,  // <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS
-  2631522181U,  // <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, <u,2,3,0>
-  1497401860U,  // <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u>
-  1497402678U,  // <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS
-  1557780634U,  // <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS
-  2631522512U,  // <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, <u,6,3,7>
-    27705344U,  // <4,5,u,7>: Cost 0 copy RHS
-    27705344U,  // <4,5,u,u>: Cost 0 copy RHS
-  2618916864U,  // <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0>
-  1545175142U,  // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS
-  1545175244U,  // <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6>
-  3692658940U,  // <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0>
-  2618917202U,  // <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5>
-  3852910806U,  // <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7>
-  2253525648U,  // <4,6,0,6>: Cost 3 vrev <6,4,6,0>
-  4040764726U,  // <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS
-  1545175709U,  // <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS
-  2618917622U,  // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2>
-  2618917684U,  // <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1>
-  2618917782U,  // <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0>
-  2618917848U,  // <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3>
-  3692659773U,  // <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5>
-  2618918032U,  // <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7>
-  3692659937U,  // <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7>
-  4032146742U,  // <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS
-  2618918253U,  // <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3>
-  2618918380U,  // <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4>
-  2618918460U,  // <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3>
-  2618918504U,  // <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2>
-  2618918566U,  // <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1>
-  2618918679U,  // <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6>
-  2618918788U,  // <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7>
-  2618918842U,  // <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7>
-  2718749178U,  // <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3>
-  2618918971U,  // <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1>
-  2618919062U,  // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2>
-  2636171526U,  // <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6>
-  3692661057U,  // <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2>
-  2618919324U,  // <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3>
-  2618919426U,  // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6>
-  2638826058U,  // <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6>
-  3913303030U,  // <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6>
-  2722730572U,  // <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4>
-  2618919710U,  // <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2>
-  2565210214U,  // <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS
-  2718749286U,  // <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3>
-  2565211952U,  // <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4>
-  2571184649U,  // <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4>
-  2565213494U,  // <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS
-  1545178422U,  // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS
-  1705430326U,  // <4,6,4,6>: Cost 2 vuzpl RHS, RHS
-  2595075437U,  // <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4>
-  1545178665U,  // <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS
-  2565218406U,  // <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS
-  2645462736U,  // <4,6,5,1>: Cost 3 vext2 <4,6,4,6>, <5,1,7,3>
-  2913399290U,  // <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3>
-  3913305394U,  // <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3>
-  2645462982U,  // <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6>
-  2779172868U,  // <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5>
-  2913391416U,  // <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6>
-  2821426486U,  // <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS
-  2821426487U,  // <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS
-  1503428710U,  // <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS
-  2577171190U,  // <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2>
-  2645463546U,  // <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3>
-  2577172630U,  // <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2>
-  1503431908U,  // <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6>
-  2253501069U,  // <4,6,6,5>: Cost 3 vrev <6,4,5,6>
-  2618921784U,  // <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6>
-  2954464566U,  // <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS
-  1503434542U,  // <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS
-  2645464058U,  // <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2>
-  2779173882U,  // <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2>
-  3638978355U,  // <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7>
-  2725090156U,  // <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4>
-  2645464422U,  // <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6>
-  2779174246U,  // <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
-  3852915914U,  // <4,6,7,6>: Cost 4 vuzpl RHS, <7,2,6,3>
-  2779174508U,  // <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
-  2779173945U,  // <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2>
-  1503445094U,  // <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS
-  1545180974U,  // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS
-  1705432878U,  // <4,6,u,2>: Cost 2 vuzpl RHS, LHS
-  2618922940U,  // <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, <u,3,0,1>
-  1503448294U,  // <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u>
-  1545181338U,  // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS
-  1705433242U,  // <4,6,u,6>: Cost 2 vuzpl RHS, RHS
-  2954480950U,  // <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS
-  1545181541U,  // <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS
-  3706601472U,  // <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0>
-  2632859750U,  // <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS
-  2726343685U,  // <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4>
-  3701293312U,  // <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4>
-  3706601810U,  // <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5>
-  2259424608U,  // <4,7,0,5>: Cost 3 vrev <7,4,5,0>
-  3695321617U,  // <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7>
-  3800454194U,  // <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4>
-  2632860317U,  // <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS
-  2259064116U,  // <4,7,1,0>: Cost 3 vrev <7,4,0,1>
-  3700630324U,  // <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1>
-  2632860570U,  // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4>
-  3769635936U,  // <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5>
-  3656920374U,  // <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS
-  3700630681U,  // <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7>
-  3701294314U,  // <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7>
-  3793818754U,  // <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3>
-  2259654012U,  // <4,7,1,u>: Cost 3 vrev <7,4,u,1>
-  3656925286U,  // <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS
-  3706603050U,  // <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3>
-  3706603112U,  // <4,7,2,2>: Cost 4 vext2 <2,5,4,7>, <2,2,2,2>
-  2727744688U,  // <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4>
-  3705939745U,  // <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7>
-  2632861554U,  // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7>
-  3706603450U,  // <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7>
-  3792491731U,  // <4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3>
-  2634852453U,  // <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7>
-  3706603670U,  // <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2>
-  3662906266U,  // <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4>
-  3725183326U,  // <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4>
-  3706603932U,  // <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3>
-  3701295618U,  // <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6>
-  2638834251U,  // <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7>
-  2639497884U,  // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7>
-  3802445093U,  // <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4>
-  2640825150U,  // <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7>
-  2718750004U,  // <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1>
-  3706604490U,  // <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3>
-  3656943474U,  // <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7>
-  3779884371U,  // <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5>
-  2259383643U,  // <4,7,4,4>: Cost 3 vrev <7,4,4,4>
-  2632863030U,  // <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS
-  2259531117U,  // <4,7,4,6>: Cost 3 vrev <7,4,6,4>
-  3907340074U,  // <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7>
-  2632863273U,  // <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS
-  2913391610U,  // <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2>
-  3645006848U,  // <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7>
-  2589181646U,  // <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, <2,3,4,5>
-  3645008403U,  // <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5>
-  2913391974U,  // <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6>
-  2583211973U,  // <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5>
-  2589184670U,  // <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5>
-  2913392236U,  // <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7>
-  2913392258U,  // <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2>
-  1509474406U,  // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS
-  3047609338U,  // <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2>
-  2583217768U,  // <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2>
-  2583218326U,  // <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2>
-  1509477686U,  // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS
-  1509478342U,  // <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6>
-  2583220730U,  // <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3>
-  3047609964U,  // <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7>
-  1509480238U,  // <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS
-  3650994278U,  // <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS
-  3650995098U,  // <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4>
-  3650996010U,  // <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7>
-  3804804677U,  // <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4>
-  3650997486U,  // <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7>
-  2662725039U,  // <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7>
-  3662942880U,  // <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7>
-  2718750316U,  // <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7>
-  2664715938U,  // <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7>
-  1509490790U,  // <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS
-  2632865582U,  // <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS
-  2583234152U,  // <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2>
-  2583234710U,  // <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2>
-  1509494070U,  // <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS
-  1509494728U,  // <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u>
-  2583237114U,  // <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, <6,2,7,3>
-  3047757420U,  // <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7>
-  1509496622U,  // <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS
-  2618933248U,  // <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0>
-  1545191526U,  // <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS
-  1545191630U,  // <4,u,0,2>: Cost 2 vext2 <0,2,4,u>, <0,2,4,u>
-  2691913445U,  // <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, <u,0,3,2>
-  2618933586U,  // <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5>
-  2265397305U,  // <4,u,0,5>: Cost 3 vrev <u,4,5,0>
-  2595189625U,  // <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u>
-  2595190139U,  // <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0>
-  1545192093U,  // <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS
-  2618934006U,  // <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2>
-  2618934068U,  // <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1>
-  1618171694U,  // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
-  2618934232U,  // <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3>
-  2695894848U,  // <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, <u,1,4,3>
-  2618934416U,  // <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7>
-  3692676321U,  // <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7>
-  2718750555U,  // <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, <u,1,7,3>
-  1618171748U,  // <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
-  2553397350U,  // <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS
-  2630215215U,  // <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u>
-  2618934888U,  // <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2>
-  1557800657U,  // <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u>
-  2618935065U,  // <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u>
-  2733864859U,  // <4,u,2,5>: Cost 3 vext3 <u,2,5,4>, <u,2,5,4>
-  2618935226U,  // <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7>
-  2718750636U,  // <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, <u,2,7,3>
-  1561118822U,  // <4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u>
-  2618935446U,  // <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2>
-  2779318422U,  // <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2>
-  2636851545U,  // <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u>
-  2618935708U,  // <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3>
-  2618935810U,  // <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6>
-  2691913711U,  // <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, <u,3,5,7>
-  2588725862U,  // <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
-  2640169710U,  // <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u>
-  2618936094U,  // <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2>
-  1503559782U,  // <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS
-  2692282391U,  // <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, <u,4,1,2>
-  2565359426U,  // <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4>
-  2571332123U,  // <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4>
-   161926454U,  // <4,u,4,4>: Cost 1 vdup0 RHS
-  1545194806U,  // <4,u,4,5>: Cost 2 vext2 <0,2,4,u>, RHS
-  1705577782U,  // <4,u,4,6>: Cost 2 vuzpl RHS, RHS
-  2718750801U,  // <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, <u,4,7,6>
-   161926454U,  // <4,u,4,u>: Cost 1 vdup0 RHS
-  1479164006U,  // <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
-  1839650606U,  // <4,u,5,1>: Cost 2 vzipl RHS, LHS
-  2565367502U,  // <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5>
-  3089777309U,  // <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS
-  1479167286U,  // <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
-  1839650970U,  // <4,u,5,5>: Cost 2 vzipl RHS, RHS
-  1618172058U,  // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
-  3089780265U,  // <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS
-  1618172076U,  // <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
-  1479688294U,  // <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS
-  2553430774U,  // <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2>
-  1973868334U,  // <4,u,6,2>: Cost 2 vtrnl RHS, LHS
-  1497606685U,  // <4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6>
-  1479691574U,  // <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS
-  1509552079U,  // <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6>
-  1973868698U,  // <4,u,6,6>: Cost 2 vtrnl RHS, RHS
-    27705344U,  // <4,u,6,7>: Cost 0 copy RHS
-    27705344U,  // <4,u,6,u>: Cost 0 copy RHS
-  2565382246U,  // <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS
-  2565383066U,  // <4,u,7,1>: Cost 3 vext1 <2,4,u,7>, <1,2,3,4>
-  2565384005U,  // <4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7>
-  2661405966U,  // <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u>
-  2565385526U,  // <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS
-  2779321702U,  // <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
-  2589274793U,  // <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7>
-  2779321964U,  // <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
-  2565388078U,  // <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS
-  1479704678U,  // <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS
-  1545197358U,  // <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS
-  1618172261U,  // <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
-  1497623071U,  // <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u>
-   161926454U,  // <4,u,u,4>: Cost 1 vdup0 RHS
-  1545197722U,  // <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS
-  1618172301U,  // <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
-    27705344U,  // <4,u,u,7>: Cost 0 copy RHS
-    27705344U,  // <4,u,u,u>: Cost 0 copy RHS
-  2687123456U,  // <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0>
-  2687123466U,  // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1>
-  2687123476U,  // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2>
-  3710599434U,  // <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5>
-  2642166098U,  // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5>
-  3657060306U,  // <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0>
-  3292094923U,  // <5,0,0,6>: Cost 4 vrev <0,5,6,0>
-  3669005700U,  // <5,0,0,7>: Cost 4 vext1 <7,5,0,0>, <7,5,0,0>
-  2687123530U,  // <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2>
-  2559434854U,  // <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS
-  2559435887U,  // <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1>
-  1613381734U,  // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
-  3698656256U,  // <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7>
-  2559438134U,  // <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS
-  2583326675U,  // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1>
-  3715908851U,  // <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7>
-  3657069562U,  // <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2>
-  1613381788U,  // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
-  2686017700U,  // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2>
-  2685796528U,  // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
-  2698625208U,  // <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4>
-  2685944002U,  // <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5>
-  2686017739U,  // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5>
-  2686091476U,  // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5>
-  2725167324U,  // <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4>
-  2595280230U,  // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
-  2686312687U,  // <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5>
-  3760128248U,  // <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5>
-  3759685888U,  // <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4>
-  2686533898U,  // <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5>
-  3760349459U,  // <5,0,3,3>: Cost 4 vext3 <0,3,3,5>, <0,3,3,5>
-  2638187004U,  // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0>
-  3776348452U,  // <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4>
-  3713256094U,  // <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0>
-  3914064896U,  // <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7>
-  2686976320U,  // <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5>
-  2559459430U,  // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS
-  1613381970U,  // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
-  2687123804U,  // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6>
-  3761013092U,  // <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5>
-  2559462710U,  // <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS
-  2638187830U,  // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS
-  3761234303U,  // <5,0,4,6>: Cost 4 vext3 <0,4,6,5>, <0,4,6,5>
-  2646150600U,  // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
-  1613381970U,  // <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
-  3766763926U,  // <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1>
-  2919268454U,  // <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS
-  3053486182U,  // <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS
-  3723210589U,  // <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0>
-  3766763966U,  // <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5>
-  2650796031U,  // <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0>
-  3719893090U,  // <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0>
-  3914067254U,  // <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS
-  2919269021U,  // <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS
-  4047519744U,  // <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0>
-  2920038502U,  // <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
-  3759759871U,  // <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7>
-  3645164070U,  // <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6>
-  3762414095U,  // <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5>
-  3993780690U,  // <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7>
-  3719893816U,  // <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6>
-  2662077302U,  // <5,0,6,7>: Cost 3 vext2 <7,4,5,0>, <6,7,4,5>
-  2920039069U,  // <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS
-  2565455974U,  // <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS
-  2565456790U,  // <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0>
-  2565457742U,  // <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7>
-  3639199894U,  // <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2>
-  2565459254U,  // <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS
-  2589347938U,  // <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0>
-  2589348530U,  // <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7>
-  4188456422U,  // <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7>
-  2565461806U,  // <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS
-  2687124106U,  // <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2>
-  1616036502U,  // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5>
-  1613382301U,  // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
-  2689925800U,  // <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5>
-  2687124146U,  // <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6>
-  2638190746U,  // <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS
-  2589356723U,  // <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u>
-  2595280230U,  // <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
-  1613382355U,  // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS
-  2646818816U,  // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0>
-  1573077094U,  // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS
-  2646818980U,  // <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2>
-  2687124214U,  // <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2>
-  2641510738U,  // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5>
-  2641510814U,  // <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0>
-  3720561142U,  // <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7>
-  3298141357U,  // <5,1,0,7>: Cost 4 vrev <1,5,7,0>
-  1573077661U,  // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS
-  2223891567U,  // <5,1,1,0>: Cost 3 vrev <1,5,0,1>
-  2687124276U,  // <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1>
-  2646819734U,  // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0>
-  2687124296U,  // <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3>
-  2691326803U,  // <5,1,1,4>: Cost 3 vext3 <1,1,4,5>, <1,1,4,5>
-  2691400540U,  // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5>
-  3765216101U,  // <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5>
-  3765289838U,  // <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5>
-  2687124341U,  // <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3>
-  3297641584U,  // <5,1,2,0>: Cost 4 vrev <1,5,0,2>
-  3763520391U,  // <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3>
-  2646820456U,  // <5,1,2,2>: Cost 3 vext2 <4,u,5,1>, <2,2,2,2>
-  2687124374U,  // <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0>
-  2691990436U,  // <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5>
-  2687124395U,  // <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3>
-  2646820794U,  // <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7>
-  3808199610U,  // <5,1,2,7>: Cost 4 vext3 <u,3,4,5>, <1,2,7,0>
-  2687124419U,  // <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0>
-  2577440870U,  // <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS
-  2687124440U,  // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3>
-  3759686627U,  // <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5>
-  2692580332U,  // <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5>
-  2687124469U,  // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5>
-  2685207552U,  // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7>
-  3760866313U,  // <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7>
-  2692875280U,  // <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5>
-  2687124503U,  // <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3>
-  1567771538U,  // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1>
-  2693096491U,  // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5>
-  2693170228U,  // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5>
-  2687124541U,  // <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5>
-  2646822096U,  // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4>
-  1573080374U,  // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS
-  2646822260U,  // <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6>
-  3298174129U,  // <5,1,4,7>: Cost 4 vrev <1,5,7,4>
-  1573080602U,  // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1>
-  2687124591U,  // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1>
-  2646822543U,  // <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1>
-  3760866433U,  // <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1>
-  2687124624U,  // <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7>
-  2687124631U,  // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5>
-  2646822916U,  // <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5>
-  2646823010U,  // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0>
-  2646823080U,  // <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7>
-  2687124663U,  // <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1>
-  2553577574U,  // <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS
-  3763520719U,  // <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7>
-  2646823418U,  // <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3>
-  3760866529U,  // <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7>
-  2553580854U,  // <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS
-  2687124723U,  // <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7>
-  2646823736U,  // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6>
-  2646823758U,  // <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1>
-  2646823839U,  // <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1>
-  2559557734U,  // <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS
-  2559558452U,  // <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1>
-  2571503270U,  // <5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1>
-  2040971366U,  // <5,1,7,3>: Cost 2 vtrnr RHS, LHS
-  2559561014U,  // <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS
-  2595393232U,  // <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3>
-  4188455035U,  // <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6>
-  2646824556U,  // <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7>
-  2040971371U,  // <5,1,7,u>: Cost 2 vtrnr RHS, LHS
-  1591662326U,  // <5,1,u,0>: Cost 2 vext2 <u,0,5,1>, <u,0,5,1>
-  1573082926U,  // <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS
-  2695824760U,  // <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5>
-  2040979558U,  // <5,1,u,3>: Cost 2 vtrnr RHS, LHS
-  2687124874U,  // <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5>
-  1573083290U,  // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS
-  2646825168U,  // <5,1,u,6>: Cost 3 vext2 <4,u,5,1>, <u,6,3,7>
-  2646825216U,  // <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, <u,7,0,1>
-  2040979563U,  // <5,1,u,u>: Cost 2 vtrnr RHS, LHS
-  3702652928U,  // <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0>
-  2628911206U,  // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS
-  2641518756U,  // <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2>
-  3759760847U,  // <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2>
-  3760866775U,  // <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1>
-  3759539680U,  // <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1>
-  3760866796U,  // <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4>
-  3304114054U,  // <5,2,0,7>: Cost 4 vrev <2,5,7,0>
-  2628911773U,  // <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS
-  2623603464U,  // <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2>
-  3698008921U,  // <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2>
-  3633325603U,  // <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5>
-  2687125027U,  // <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5>
-  3633327414U,  // <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS
-  3759539760U,  // <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0>
-  3760866876U,  // <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3>
-  3304122247U,  // <5,2,1,7>: Cost 4 vrev <2,5,7,1>
-  2687125072U,  // <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5>
-  3633332326U,  // <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS
-  3759760992U,  // <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3>
-  2687125096U,  // <5,2,2,2>: Cost 3 vext3 <0,4,1,5>, <2,2,2,2>
-  2687125106U,  // <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3>
-  2697963133U,  // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5>
-  3759466120U,  // <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7>
-  3760866960U,  // <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6>
-  3771926168U,  // <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5>
-  2687125151U,  // <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3>
-  2687125158U,  // <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1>
-  2698405555U,  // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5>
-  2577516238U,  // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5>
-  3759687365U,  // <5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5>
-  1624884942U,  // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5>
-  2698700503U,  // <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5>
-  3772368608U,  // <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5>
-  3702655716U,  // <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7>
-  1625179890U,  // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5>
-  2641521555U,  // <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2>
-  3772368642U,  // <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3>
-  2699142925U,  // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5>
-  2698626838U,  // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5>
-  2698626848U,  // <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6>
-  2628914486U,  // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS
-  2645503353U,  // <5,2,4,6>: Cost 3 vext2 <4,6,5,2>, <4,6,5,2>
-  3304146826U,  // <5,2,4,7>: Cost 4 vrev <2,5,7,4>
-  2628914729U,  // <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS
-  2553643110U,  // <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS
-  3758950227U,  // <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3>
-  3759761248U,  // <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7>
-  2982396006U,  // <5,2,5,3>: Cost 3 vzipr <4,u,5,5>, LHS
-  2553646390U,  // <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS
-  2553647108U,  // <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5>
-  3760867204U,  // <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7>
-  3702657141U,  // <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1>
-  2982396011U,  // <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS
-  3627393126U,  // <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS
-  3760867236U,  // <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, <2,6,1,3>
-  2645504506U,  // <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3>
-  2687125434U,  // <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7>
-  2700617665U,  // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5>
-  3760867276U,  // <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7>
-  3763521493U,  // <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7>
-  3719246670U,  // <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1>
-  2687125479U,  // <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, <2,6,u,7>
-  2565603430U,  // <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS
-  2553660150U,  // <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2>
-  2565605216U,  // <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7>
-  2961178726U,  // <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS
-  2565606710U,  // <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS
-  4034920552U,  // <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5>
-  3114713292U,  // <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6>
-  3702658668U,  // <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7>
-  2961178731U,  // <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS
-  2687125563U,  // <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1>
-  2628917038U,  // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS
-  2565613409U,  // <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u>
-  2687125592U,  // <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3>
-  1628203107U,  // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5>
-  2628917402U,  // <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS
-  2702092405U,  // <5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5>
-  3304179598U,  // <5,2,u,7>: Cost 4 vrev <2,5,7,u>
-  1628498055U,  // <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5>
-  3760867467U,  // <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0>
-  2687125654U,  // <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2>
-  3759761565U,  // <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0>
-  3633391766U,  // <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2>
-  2687125680U,  // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1>
-  3760277690U,  // <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2>
-  3310013014U,  // <5,3,0,6>: Cost 4 vrev <3,5,6,0>
-  2236344927U,  // <5,3,0,7>: Cost 3 vrev <3,5,7,0>
-  2687125717U,  // <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2>
-  3760867551U,  // <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3>
-  3760867558U,  // <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1>
-  2624938923U,  // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3>
-  2703198460U,  // <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5>
-  3760867587U,  // <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3>
-  2636219536U,  // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7>
-  3698681075U,  // <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7>
-  2703493408U,  // <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5>
-  2628920721U,  // <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3>
-  3766765870U,  // <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1>
-  3698681379U,  // <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5>
-  3760867649U,  // <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2>
-  2698627404U,  // <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4>
-  2703935830U,  // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5>
-  2698627422U,  // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4>
-  3760867686U,  // <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3>
-  3769788783U,  // <5,3,2,7>: Cost 4 vext3 <1,u,5,5>, <3,2,7,3>
-  2701945209U,  // <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4>
-  3760867711U,  // <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1>
-  2636220684U,  // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3>
-  3772369298U,  // <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2>
-  2687125916U,  // <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3>
-  2704599463U,  // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5>
-  2704673200U,  // <5,3,3,5>: Cost 3 vext3 <3,3,5,5>, <3,3,5,5>
-  3709962935U,  // <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7>
-  3772369346U,  // <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5>
-  2704894411U,  // <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5>
-  2704968148U,  // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5>
-  3698682850U,  // <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0>
-  2642857014U,  // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3>
-  2705189359U,  // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5>
-  2705263096U,  // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5>
-  2685946370U,  // <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6>
-  3779152394U,  // <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5>
-  2236377699U,  // <5,3,4,7>: Cost 3 vrev <3,5,7,4>
-  2687126045U,  // <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6>
-  2571632742U,  // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS
-  2559689870U,  // <5,3,5,1>: Cost 3 vext1 <1,5,3,5>, <1,5,3,5>
-  2571634382U,  // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5>
-  2571635264U,  // <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5>
-  2571636022U,  // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS
-  2559692804U,  // <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5>
-  3720581218U,  // <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0>
-  2236385892U,  // <5,3,5,7>: Cost 3 vrev <3,5,7,5>
-  2571638574U,  // <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS
-  2565668966U,  // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS
-  3633439887U,  // <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6>
-  2565670760U,  // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6>
-  2565671426U,  // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6>
-  2565672246U,  // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS
-  3639414630U,  // <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0>
-  4047521640U,  // <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6>
-  2725169844U,  // <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4>
-  2565674798U,  // <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS
-  1485963366U,  // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS
-  1485964432U,  // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7>
-  2559706728U,  // <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2>
-  2559707286U,  // <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2>
-  1485966646U,  // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS
-  2559708880U,  // <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3>
-  2601513466U,  // <5,3,7,6>: Cost 3 vext1 <u,5,3,7>, <6,2,7,3>
-  3114714112U,  // <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7>
-  1485969198U,  // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS
-  1485971558U,  // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS
-  1485972625U,  // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u>
-  2559714920U,  // <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2>
-  2559715478U,  // <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2>
-  1485974838U,  // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS
-  2687126342U,  // <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, <3,u,5,6>
-  2601521658U,  // <5,3,u,6>: Cost 3 vext1 <u,5,3,u>, <6,2,7,3>
-  2236410471U,  // <5,3,u,7>: Cost 3 vrev <3,5,7,u>
-  1485977390U,  // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS
-  3627491430U,  // <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS
-  2636890214U,  // <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS
-  3703333028U,  // <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, <0,2,0,2>
-  3782249348U,  // <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5>
-  2642198866U,  // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5>
-  2687126418U,  // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1>
-  2242243887U,  // <5,4,0,6>: Cost 3 vrev <4,5,6,0>
-  3316059448U,  // <5,4,0,7>: Cost 4 vrev <4,5,7,0>
-  2636890781U,  // <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS
-  2241809658U,  // <5,4,1,0>: Cost 3 vrev <4,5,0,1>
-  3698025307U,  // <5,4,1,1>: Cost 4 vext2 <1,1,5,4>, <1,1,5,4>
-  3698688940U,  // <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4>
-  3698689024U,  // <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7>
-  3700016206U,  // <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4>
-  2687126498U,  // <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0>
-  3760868336U,  // <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5>
-  3316067641U,  // <5,4,1,7>: Cost 4 vrev <4,5,7,1>
-  2242399554U,  // <5,4,1,u>: Cost 3 vrev <4,5,u,1>
-  3703334371U,  // <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4>
-  3703998004U,  // <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4>
-  3704661637U,  // <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4>
-  2636891854U,  // <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5>
-  3705988903U,  // <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4>
-  2698628150U,  // <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3>
-  3760868415U,  // <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3>
-  3783871562U,  // <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5>
-  2666752099U,  // <5,4,2,u>: Cost 3 vext2 <u,2,5,4>, <2,u,4,5>
-  3639459942U,  // <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS
-  3709970701U,  // <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4>
-  2636892510U,  // <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4>
-  3710634396U,  // <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3>
-  2638219776U,  // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4>
-  3766987908U,  // <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0>
-  2710719634U,  // <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5>
-  3914097664U,  // <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7>
-  2640874308U,  // <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4>
-  2583642214U,  // <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS
-  2642201574U,  // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4>
-  3710635062U,  // <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3>
-  3717270664U,  // <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4>
-  2713963728U,  // <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4>
-  1637567706U,  // <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5>
-  2242276659U,  // <5,4,4,6>: Cost 3 vrev <4,5,6,4>
-  2646183372U,  // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4>
-  1637788917U,  // <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5>
-  2559762534U,  // <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS
-  2559763607U,  // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5>
-  2698628366U,  // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3>
-  3633506454U,  // <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2>
-  2559765814U,  // <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS
-  2583654395U,  // <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5>
-  1613385014U,  // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
-  3901639990U,  // <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS
-  1613385032U,  // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS
-  2559770726U,  // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS
-  2559771648U,  // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,3,5,7>
-  3633514088U,  // <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2>
-  2571717122U,  // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6>
-  2559774006U,  // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS
-  2712636796U,  // <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5>
-  3760868743U,  // <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7>
-  2712784270U,  // <5,4,6,7>: Cost 3 vext3 <4,6,7,5>, <4,6,7,5>
-  2559776558U,  // <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS
-  2565750886U,  // <5,4,7,0>: Cost 3 vext1 <2,5,4,7>, LHS
-  2565751706U,  // <5,4,7,1>: Cost 3 vext1 <2,5,4,7>, <1,2,3,4>
-  2565752690U,  // <5,4,7,2>: Cost 3 vext1 <2,5,4,7>, <2,5,4,7>
-  2571725387U,  // <5,4,7,3>: Cost 3 vext1 <3,5,4,7>, <3,5,4,7>
-  2565754166U,  // <5,4,7,4>: Cost 3 vext1 <2,5,4,7>, RHS
-  3114713426U,  // <5,4,7,5>: Cost 3 vtrnr RHS, <0,4,1,5>
-    94817590U,  // <5,4,7,6>: Cost 1 vrev RHS
-  2595616175U,  // <5,4,7,7>: Cost 3 vext1 <7,5,4,7>, <7,5,4,7>
-    94965064U,  // <5,4,7,u>: Cost 1 vrev RHS
-  2559787110U,  // <5,4,u,0>: Cost 3 vext1 <1,5,4,u>, LHS
-  2559788186U,  // <5,4,u,1>: Cost 3 vext1 <1,5,4,u>, <1,5,4,u>
-  2242014483U,  // <5,4,u,2>: Cost 3 vrev <4,5,2,u>
-  2667419628U,  // <5,4,u,3>: Cost 3 vext2 <u,3,5,4>, <u,3,5,4>
-  2559790390U,  // <5,4,u,4>: Cost 3 vext1 <1,5,4,u>, RHS
-  1640222238U,  // <5,4,u,5>: Cost 2 vext3 <4,u,5,5>, <4,u,5,5>
-    94825783U,  // <5,4,u,6>: Cost 1 vrev RHS
-  2714111536U,  // <5,4,u,7>: Cost 3 vext3 <4,u,7,5>, <4,u,7,5>
-    94973257U,  // <5,4,u,u>: Cost 1 vrev RHS
-  2646851584U,  // <5,5,0,0>: Cost 3 vext2 <4,u,5,5>, <0,0,0,0>
-  1573109862U,  // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS
-  2646851748U,  // <5,5,0,2>: Cost 3 vext2 <4,u,5,5>, <0,2,0,2>
-  3760279130U,  // <5,5,0,3>: Cost 4 vext3 <0,3,2,5>, <5,0,3,2>
-  2687127138U,  // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1>
-  2248142847U,  // <5,5,0,5>: Cost 3 vrev <5,5,5,0>
-  3720593910U,  // <5,5,0,6>: Cost 4 vext2 <4,u,5,5>, <0,6,1,7>
-  4182502710U,  // <5,5,0,7>: Cost 4 vtrnr <3,5,7,0>, RHS
-  1573110429U,  // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS
-  2646852342U,  // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2>
-  2624291676U,  // <5,5,1,1>: Cost 3 vext2 <1,1,5,5>, <1,1,5,5>
-  2646852502U,  // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0>
-  2646852568U,  // <5,5,1,3>: Cost 3 vext2 <4,u,5,5>, <1,3,1,3>
-  2715217591U,  // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5>
-  2628936848U,  // <5,5,1,5>: Cost 3 vext2 <1,u,5,5>, <1,5,3,7>
-  3698033907U,  // <5,5,1,6>: Cost 4 vext2 <1,1,5,5>, <1,6,5,7>
-  2713964240U,  // <5,5,1,7>: Cost 3 vext3 <4,u,5,5>, <5,1,7,3>
-  2628937107U,  // <5,5,1,u>: Cost 3 vext2 <1,u,5,5>, <1,u,5,5>
-  3645497446U,  // <5,5,2,0>: Cost 4 vext1 <3,5,5,2>, LHS
-  3760869099U,  // <5,5,2,1>: Cost 4 vext3 <0,4,1,5>, <5,2,1,3>
-  2646853224U,  // <5,5,2,2>: Cost 3 vext2 <4,u,5,5>, <2,2,2,2>
-  2698628862U,  // <5,5,2,3>: Cost 3 vext3 <2,3,4,5>, <5,2,3,4>
-  3772370694U,  // <5,5,2,4>: Cost 4 vext3 <2,3,4,5>, <5,2,4,3>
-  2713964303U,  // <5,5,2,5>: Cost 3 vext3 <4,u,5,5>, <5,2,5,3>
-  2646853562U,  // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7>
-  4038198272U,  // <5,5,2,7>: Cost 4 vzipr <1,u,5,2>, <1,3,5,7>
-  2701946667U,  // <5,5,2,u>: Cost 3 vext3 <2,u,4,5>, <5,2,u,4>
-  2646853782U,  // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2>
-  3698034922U,  // <5,5,3,1>: Cost 4 vext2 <1,1,5,5>, <3,1,1,5>
-  3702679919U,  // <5,5,3,2>: Cost 4 vext2 <1,u,5,5>, <3,2,7,3>
-  2637564336U,  // <5,5,3,3>: Cost 3 vext2 <3,3,5,5>, <3,3,5,5>
-  2646854146U,  // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6>
-  2638891602U,  // <5,5,3,5>: Cost 3 vext2 <3,5,5,5>, <3,5,5,5>
-  3702680247U,  // <5,5,3,6>: Cost 4 vext2 <1,u,5,5>, <3,6,7,7>
-  3702680259U,  // <5,5,3,7>: Cost 4 vext2 <1,u,5,5>, <3,7,0,1>
-  2646854430U,  // <5,5,3,u>: Cost 3 vext2 <4,u,5,5>, <3,u,1,2>
-  2646854546U,  // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1>
-  2642209767U,  // <5,5,4,1>: Cost 3 vext2 <4,1,5,5>, <4,1,5,5>
-  3711306806U,  // <5,5,4,2>: Cost 4 vext2 <3,3,5,5>, <4,2,5,3>
-  3645516369U,  // <5,5,4,3>: Cost 4 vext1 <3,5,5,4>, <3,5,5,4>
-  1570458842U,  // <5,5,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
-  1573113142U,  // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS
-  2645527932U,  // <5,5,4,6>: Cost 3 vext2 <4,6,5,5>, <4,6,5,5>
-  2713964486U,  // <5,5,4,7>: Cost 3 vext3 <4,u,5,5>, <5,4,7,6>
-  1573113374U,  // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5>
-  1509982310U,  // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
-  2646855376U,  // <5,5,5,1>: Cost 3 vext2 <4,u,5,5>, <5,1,7,3>
-  2583725672U,  // <5,5,5,2>: Cost 3 vext1 <5,5,5,5>, <2,2,2,2>
-  2583726230U,  // <5,5,5,3>: Cost 3 vext1 <5,5,5,5>, <3,0,1,2>
-  1509985590U,  // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
-   229035318U,  // <5,5,5,5>: Cost 1 vdup1 RHS
-  2646855778U,  // <5,5,5,6>: Cost 3 vext2 <4,u,5,5>, <5,6,7,0>
-  2646855848U,  // <5,5,5,7>: Cost 3 vext2 <4,u,5,5>, <5,7,5,7>
-   229035318U,  // <5,5,5,u>: Cost 1 vdup1 RHS
-  2577760358U,  // <5,5,6,0>: Cost 3 vext1 <4,5,5,6>, LHS
-  3633587361U,  // <5,5,6,1>: Cost 4 vext1 <1,5,5,6>, <1,5,5,6>
-  2646856186U,  // <5,5,6,2>: Cost 3 vext2 <4,u,5,5>, <6,2,7,3>
-  3633588738U,  // <5,5,6,3>: Cost 4 vext1 <1,5,5,6>, <3,4,5,6>
-  2718535756U,  // <5,5,6,4>: Cost 3 vext3 <5,6,4,5>, <5,6,4,5>
-  2644202223U,  // <5,5,6,5>: Cost 3 vext2 <4,4,5,5>, <6,5,7,5>
-  2973780482U,  // <5,5,6,6>: Cost 3 vzipr <3,4,5,6>, <3,4,5,6>
-  2646856526U,  // <5,5,6,7>: Cost 3 vext2 <4,u,5,5>, <6,7,0,1>
-  2646856607U,  // <5,5,6,u>: Cost 3 vext2 <4,u,5,5>, <6,u,0,1>
-  2571796582U,  // <5,5,7,0>: Cost 3 vext1 <3,5,5,7>, LHS
-  3633595392U,  // <5,5,7,1>: Cost 4 vext1 <1,5,5,7>, <1,3,5,7>
-  2571798222U,  // <5,5,7,2>: Cost 3 vext1 <3,5,5,7>, <2,3,4,5>
-  2571799124U,  // <5,5,7,3>: Cost 3 vext1 <3,5,5,7>, <3,5,5,7>
-  2571799862U,  // <5,5,7,4>: Cost 3 vext1 <3,5,5,7>, RHS
-  3114717188U,  // <5,5,7,5>: Cost 3 vtrnr RHS, <5,5,5,5>
-  4034923010U,  // <5,5,7,6>: Cost 4 vzipr <1,3,5,7>, <3,4,5,6>
-  2040974646U,  // <5,5,7,7>: Cost 2 vtrnr RHS, RHS
-  2040974647U,  // <5,5,7,u>: Cost 2 vtrnr RHS, RHS
-  1509982310U,  // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS
-  1573115694U,  // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS
-  2571806414U,  // <5,5,u,2>: Cost 3 vext1 <3,5,5,u>, <2,3,4,5>
-  2571807317U,  // <5,5,u,3>: Cost 3 vext1 <3,5,5,u>, <3,5,5,u>
-  1509985590U,  // <5,5,u,4>: Cost 2 vext1 <5,5,5,5>, RHS
-   229035318U,  // <5,5,u,5>: Cost 1 vdup1 RHS
-  2646857936U,  // <5,5,u,6>: Cost 3 vext2 <4,u,5,5>, <u,6,3,7>
-  2040982838U,  // <5,5,u,7>: Cost 2 vtrnr RHS, RHS
-   229035318U,  // <5,5,u,u>: Cost 1 vdup1 RHS
-  2638233600U,  // <5,6,0,0>: Cost 3 vext2 <3,4,5,6>, <0,0,0,0>
-  1564491878U,  // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS
-  2632261796U,  // <5,6,0,2>: Cost 3 vext2 <2,4,5,6>, <0,2,0,2>
-  2638233856U,  // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4>
-  2638233938U,  // <5,6,0,4>: Cost 3 vext2 <3,4,5,6>, <0,4,1,5>
-  3706003885U,  // <5,6,0,5>: Cost 4 vext2 <2,4,5,6>, <0,5,2,6>
-  3706003967U,  // <5,6,0,6>: Cost 4 vext2 <2,4,5,6>, <0,6,2,7>
-  4047473974U,  // <5,6,0,7>: Cost 4 vzipr <3,4,5,0>, RHS
-  1564492445U,  // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS
-  2638234358U,  // <5,6,1,0>: Cost 3 vext2 <3,4,5,6>, <1,0,3,2>
-  2638234420U,  // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1>
-  2638234518U,  // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0>
-  2638234584U,  // <5,6,1,3>: Cost 3 vext2 <3,4,5,6>, <1,3,1,3>
-  2626290768U,  // <5,6,1,4>: Cost 3 vext2 <1,4,5,6>, <1,4,5,6>
-  2638234768U,  // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7>
-  3700032719U,  // <5,6,1,6>: Cost 4 vext2 <1,4,5,6>, <1,6,1,7>
-  2982366518U,  // <5,6,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
-  2628945300U,  // <5,6,1,u>: Cost 3 vext2 <1,u,5,6>, <1,u,5,6>
-  3706004925U,  // <5,6,2,0>: Cost 4 vext2 <2,4,5,6>, <2,0,1,2>
-  3711976966U,  // <5,6,2,1>: Cost 4 vext2 <3,4,5,6>, <2,1,0,3>
-  2638235240U,  // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2>
-  2638235302U,  // <5,6,2,3>: Cost 3 vext2 <3,4,5,6>, <2,3,0,1>
-  2632263465U,  // <5,6,2,4>: Cost 3 vext2 <2,4,5,6>, <2,4,5,6>
-  2638235496U,  // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6>
-  2638235578U,  // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7>
-  2713965050U,  // <5,6,2,7>: Cost 3 vext3 <4,u,5,5>, <6,2,7,3>
-  2634917997U,  // <5,6,2,u>: Cost 3 vext2 <2,u,5,6>, <2,u,5,6>
-  2638235798U,  // <5,6,3,0>: Cost 3 vext2 <3,4,5,6>, <3,0,1,2>
-  3711977695U,  // <5,6,3,1>: Cost 4 vext2 <3,4,5,6>, <3,1,0,3>
-  3710650720U,  // <5,6,3,2>: Cost 4 vext2 <3,2,5,6>, <3,2,5,6>
-  2638236060U,  // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3>
-  1564494338U,  // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6>
-  2638236234U,  // <5,6,3,5>: Cost 3 vext2 <3,4,5,6>, <3,5,4,6>
-  3711978104U,  // <5,6,3,6>: Cost 4 vext2 <3,4,5,6>, <3,6,0,7>
-  4034227510U,  // <5,6,3,7>: Cost 4 vzipr <1,2,5,3>, RHS
-  1567148870U,  // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6>
-  2577817702U,  // <5,6,4,0>: Cost 3 vext1 <4,5,6,4>, LHS
-  3700034544U,  // <5,6,4,1>: Cost 4 vext2 <1,4,5,6>, <4,1,6,5>
-  2723033713U,  // <5,6,4,2>: Cost 3 vext3 <6,4,2,5>, <6,4,2,5>
-  2638236818U,  // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5>
-  2644208859U,  // <5,6,4,4>: Cost 3 vext2 <4,4,5,6>, <4,4,5,6>
-  1564495158U,  // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS
-  2645536125U,  // <5,6,4,6>: Cost 3 vext2 <4,6,5,6>, <4,6,5,6>
-  2723402398U,  // <5,6,4,7>: Cost 3 vext3 <6,4,7,5>, <6,4,7,5>
-  1564495401U,  // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS
-  2577825894U,  // <5,6,5,0>: Cost 3 vext1 <4,5,6,5>, LHS
-  2662125264U,  // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3>
-  3775836867U,  // <5,6,5,2>: Cost 4 vext3 <2,u,6,5>, <6,5,2,6>
-  3711979343U,  // <5,6,5,3>: Cost 4 vext2 <3,4,5,6>, <5,3,3,4>
-  2650181556U,  // <5,6,5,4>: Cost 3 vext2 <5,4,5,6>, <5,4,5,6>
-  2662125572U,  // <5,6,5,5>: Cost 3 vext2 <7,4,5,6>, <5,5,5,5>
-  2638237732U,  // <5,6,5,6>: Cost 3 vext2 <3,4,5,6>, <5,6,0,1>
-  2982399286U,  // <5,6,5,7>: Cost 3 vzipr <4,u,5,5>, RHS
-  2982399287U,  // <5,6,5,u>: Cost 3 vzipr <4,u,5,5>, RHS
-  2583806054U,  // <5,6,6,0>: Cost 3 vext1 <5,5,6,6>, LHS
-  3711979910U,  // <5,6,6,1>: Cost 4 vext2 <3,4,5,6>, <6,1,3,4>
-  2662126074U,  // <5,6,6,2>: Cost 3 vext2 <7,4,5,6>, <6,2,7,3>
-  2583808514U,  // <5,6,6,3>: Cost 3 vext1 <5,5,6,6>, <3,4,5,6>
-  2583809334U,  // <5,6,6,4>: Cost 3 vext1 <5,5,6,6>, RHS
-  2583810062U,  // <5,6,6,5>: Cost 3 vext1 <5,5,6,6>, <5,5,6,6>
-  2638238520U,  // <5,6,6,6>: Cost 3 vext2 <3,4,5,6>, <6,6,6,6>
-  2973781302U,  // <5,6,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
-  2973781303U,  // <5,6,6,u>: Cost 3 vzipr <3,4,5,6>, RHS
-   430358630U,  // <5,6,7,0>: Cost 1 vext1 RHS, LHS
-  1504101110U,  // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
-  1504101992U,  // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
-  1504102550U,  // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
-   430361910U,  // <5,6,7,4>: Cost 1 vext1 RHS, RHS
-  1504104390U,  // <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6>
-  1504105272U,  // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6>
-  1504106092U,  // <5,6,7,7>: Cost 2 vext1 RHS, <7,7,7,7>
-   430364462U,  // <5,6,7,u>: Cost 1 vext1 RHS, LHS
-   430366822U,  // <5,6,u,0>: Cost 1 vext1 RHS, LHS
-  1564497710U,  // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS
-  1504110184U,  // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
-  1504110742U,  // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
-   430370103U,  // <5,6,u,4>: Cost 1 vext1 RHS, RHS
-  1564498074U,  // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS
-  1504113146U,  // <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3>
-  1504113658U,  // <5,6,u,7>: Cost 2 vext1 RHS, <7,0,1,2>
-   430372654U,  // <5,6,u,u>: Cost 1 vext1 RHS, LHS
-  2625634304U,  // <5,7,0,0>: Cost 3 vext2 <1,3,5,7>, <0,0,0,0>
-  1551892582U,  // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS
-  2625634468U,  // <5,7,0,2>: Cost 3 vext2 <1,3,5,7>, <0,2,0,2>
-  2571889247U,  // <5,7,0,3>: Cost 3 vext1 <3,5,7,0>, <3,5,7,0>
-  2625634642U,  // <5,7,0,4>: Cost 3 vext2 <1,3,5,7>, <0,4,1,5>
-  2595778728U,  // <5,7,0,5>: Cost 3 vext1 <7,5,7,0>, <5,7,5,7>
-  3699376639U,  // <5,7,0,6>: Cost 4 vext2 <1,3,5,7>, <0,6,2,7>
-  2260235715U,  // <5,7,0,7>: Cost 3 vrev <7,5,7,0>
-  1551893149U,  // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS
-  2625635062U,  // <5,7,1,0>: Cost 3 vext2 <1,3,5,7>, <1,0,3,2>
-  2624308020U,  // <5,7,1,1>: Cost 3 vext2 <1,1,5,7>, <1,1,1,1>
-  2625635222U,  // <5,7,1,2>: Cost 3 vext2 <1,3,5,7>, <1,2,3,0>
-  1551893504U,  // <5,7,1,3>: Cost 2 vext2 <1,3,5,7>, <1,3,5,7>
-  2571898166U,  // <5,7,1,4>: Cost 3 vext1 <3,5,7,1>, RHS
-  2625635472U,  // <5,7,1,5>: Cost 3 vext2 <1,3,5,7>, <1,5,3,7>
-  2627626227U,  // <5,7,1,6>: Cost 3 vext2 <1,6,5,7>, <1,6,5,7>
-  3702031684U,  // <5,7,1,7>: Cost 4 vext2 <1,7,5,7>, <1,7,5,7>
-  1555211669U,  // <5,7,1,u>: Cost 2 vext2 <1,u,5,7>, <1,u,5,7>
-  2629617126U,  // <5,7,2,0>: Cost 3 vext2 <2,0,5,7>, <2,0,5,7>
-  3699377670U,  // <5,7,2,1>: Cost 4 vext2 <1,3,5,7>, <2,1,0,3>
-  2625635944U,  // <5,7,2,2>: Cost 3 vext2 <1,3,5,7>, <2,2,2,2>
-  2625636006U,  // <5,7,2,3>: Cost 3 vext2 <1,3,5,7>, <2,3,0,1>
-  2632271658U,  // <5,7,2,4>: Cost 3 vext2 <2,4,5,7>, <2,4,5,7>
-  2625636201U,  // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7>
-  2625636282U,  // <5,7,2,6>: Cost 3 vext2 <1,3,5,7>, <2,6,3,7>
-  3708004381U,  // <5,7,2,7>: Cost 4 vext2 <2,7,5,7>, <2,7,5,7>
-  2625636411U,  // <5,7,2,u>: Cost 3 vext2 <1,3,5,7>, <2,u,0,1>
-  2625636502U,  // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2>
-  2625636604U,  // <5,7,3,1>: Cost 3 vext2 <1,3,5,7>, <3,1,3,5>
-  3699378478U,  // <5,7,3,2>: Cost 4 vext2 <1,3,5,7>, <3,2,0,1>
-  2625636764U,  // <5,7,3,3>: Cost 3 vext2 <1,3,5,7>, <3,3,3,3>
-  2625636866U,  // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6>
-  2625636959U,  // <5,7,3,5>: Cost 3 vext2 <1,3,5,7>, <3,5,7,0>
-  3699378808U,  // <5,7,3,6>: Cost 4 vext2 <1,3,5,7>, <3,6,0,7>
-  2640235254U,  // <5,7,3,7>: Cost 3 vext2 <3,7,5,7>, <3,7,5,7>
-  2625637150U,  // <5,7,3,u>: Cost 3 vext2 <1,3,5,7>, <3,u,1,2>
-  2571919462U,  // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS
-  2571920384U,  // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7>
-  3699379260U,  // <5,7,4,2>: Cost 4 vext2 <1,3,5,7>, <4,2,6,0>
-  2571922019U,  // <5,7,4,3>: Cost 3 vext1 <3,5,7,4>, <3,5,7,4>
-  2571922742U,  // <5,7,4,4>: Cost 3 vext1 <3,5,7,4>, RHS
-  1551895862U,  // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS
-  2846277980U,  // <5,7,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
-  2646207951U,  // <5,7,4,7>: Cost 3 vext2 <4,7,5,7>, <4,7,5,7>
-  1551896105U,  // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS
-  2583871590U,  // <5,7,5,0>: Cost 3 vext1 <5,5,7,5>, LHS
-  2652180176U,  // <5,7,5,1>: Cost 3 vext2 <5,7,5,7>, <5,1,7,3>
-  2625638177U,  // <5,7,5,2>: Cost 3 vext2 <1,3,5,7>, <5,2,7,3>
-  2625638262U,  // <5,7,5,3>: Cost 3 vext2 <1,3,5,7>, <5,3,7,7>
-  2583874870U,  // <5,7,5,4>: Cost 3 vext1 <5,5,7,5>, RHS
-  2846281732U,  // <5,7,5,5>: Cost 3 vuzpr RHS, <5,5,5,5>
-  2651517015U,  // <5,7,5,6>: Cost 3 vext2 <5,6,5,7>, <5,6,5,7>
-  1772539190U,  // <5,7,5,7>: Cost 2 vuzpr RHS, RHS
-  1772539191U,  // <5,7,5,u>: Cost 2 vuzpr RHS, RHS
-  2846281826U,  // <5,7,6,0>: Cost 3 vuzpr RHS, <5,6,7,0>
-  3699380615U,  // <5,7,6,1>: Cost 4 vext2 <1,3,5,7>, <6,1,3,5>
-  2846281108U,  // <5,7,6,2>: Cost 3 vuzpr RHS, <4,6,u,2>
-  2589854210U,  // <5,7,6,3>: Cost 3 vext1 <6,5,7,6>, <3,4,5,6>
-  2846281830U,  // <5,7,6,4>: Cost 3 vuzpr RHS, <5,6,7,4>
-  2725467658U,  // <5,7,6,5>: Cost 3 vext3 <6,7,u,5>, <7,6,5,u>
-  2846281076U,  // <5,7,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
-  2846279610U,  // <5,7,6,7>: Cost 3 vuzpr RHS, <2,6,3,7>
-  2846279611U,  // <5,7,6,u>: Cost 3 vuzpr RHS, <2,6,3,u>
-  1510146150U,  // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS
-  2846282574U,  // <5,7,7,1>: Cost 3 vuzpr RHS, <6,7,0,1>
-  2583889512U,  // <5,7,7,2>: Cost 3 vext1 <5,5,7,7>, <2,2,2,2>
-  2846281919U,  // <5,7,7,3>: Cost 3 vuzpr RHS, <5,7,u,3>
-  1510149430U,  // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS
-  1510150168U,  // <5,7,7,5>: Cost 2 vext1 <5,5,7,7>, <5,5,7,7>
-  2583892474U,  // <5,7,7,6>: Cost 3 vext1 <5,5,7,7>, <6,2,7,3>
-  2625640044U,  // <5,7,7,7>: Cost 3 vext2 <1,3,5,7>, <7,7,7,7>
-  1510151982U,  // <5,7,7,u>: Cost 2 vext1 <5,5,7,7>, LHS
-  1510154342U,  // <5,7,u,0>: Cost 2 vext1 <5,5,7,u>, LHS
-  1551898414U,  // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS
-  2625640325U,  // <5,7,u,2>: Cost 3 vext2 <1,3,5,7>, <u,2,3,0>
-  1772536477U,  // <5,7,u,3>: Cost 2 vuzpr RHS, LHS
-  1510157622U,  // <5,7,u,4>: Cost 2 vext1 <5,5,7,u>, RHS
-  1551898778U,  // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS
-  2625640656U,  // <5,7,u,6>: Cost 3 vext2 <1,3,5,7>, <u,6,3,7>
-  1772539433U,  // <5,7,u,7>: Cost 2 vuzpr RHS, RHS
-  1551898981U,  // <5,7,u,u>: Cost 2 vext2 <1,3,5,7>, LHS
-  2625642496U,  // <5,u,0,0>: Cost 3 vext2 <1,3,5,u>, <0,0,0,0>
-  1551900774U,  // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS
-  2625642660U,  // <5,u,0,2>: Cost 3 vext2 <1,3,5,u>, <0,2,0,2>
-  2698630885U,  // <5,u,0,3>: Cost 3 vext3 <2,3,4,5>, <u,0,3,2>
-  2687129325U,  // <5,u,0,4>: Cost 3 vext3 <0,4,1,5>, <u,0,4,1>
-  2689783542U,  // <5,u,0,5>: Cost 3 vext3 <0,u,1,5>, <u,0,5,1>
-  2266134675U,  // <5,u,0,6>: Cost 3 vrev <u,5,6,0>
-  2595853772U,  // <5,u,0,7>: Cost 3 vext1 <7,5,u,0>, <7,5,u,0>
-  1551901341U,  // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS
-  2625643254U,  // <5,u,1,0>: Cost 3 vext2 <1,3,5,u>, <1,0,3,2>
-  2625643316U,  // <5,u,1,1>: Cost 3 vext2 <1,3,5,u>, <1,1,1,1>
-  1613387566U,  // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
-  1551901697U,  // <5,u,1,3>: Cost 2 vext2 <1,3,5,u>, <1,3,5,u>
-  2626307154U,  // <5,u,1,4>: Cost 3 vext2 <1,4,5,u>, <1,4,5,u>
-  2689783622U,  // <5,u,1,5>: Cost 3 vext3 <0,u,1,5>, <u,1,5,0>
-  2627634420U,  // <5,u,1,6>: Cost 3 vext2 <1,6,5,u>, <1,6,5,u>
-  2982366536U,  // <5,u,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
-  1613387620U,  // <5,u,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
-  2846286742U,  // <5,u,2,0>: Cost 3 vuzpr RHS, <1,2,3,0>
-  2685796528U,  // <5,u,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
-  2625644136U,  // <5,u,2,2>: Cost 3 vext2 <1,3,5,u>, <2,2,2,2>
-  2687129480U,  // <5,u,2,3>: Cost 3 vext3 <0,4,1,5>, <u,2,3,3>
-  2632279851U,  // <5,u,2,4>: Cost 3 vext2 <2,4,5,u>, <2,4,5,u>
-  2625644394U,  // <5,u,2,5>: Cost 3 vext2 <1,3,5,u>, <2,5,3,u>
-  2625644474U,  // <5,u,2,6>: Cost 3 vext2 <1,3,5,u>, <2,6,3,7>
-  2713966508U,  // <5,u,2,7>: Cost 3 vext3 <4,u,5,5>, <u,2,7,3>
-  2625644603U,  // <5,u,2,u>: Cost 3 vext2 <1,3,5,u>, <2,u,0,1>
-  2687129532U,  // <5,u,3,0>: Cost 3 vext3 <0,4,1,5>, <u,3,0,1>
-  2636261649U,  // <5,u,3,1>: Cost 3 vext2 <3,1,5,u>, <3,1,5,u>
-  2636925282U,  // <5,u,3,2>: Cost 3 vext2 <3,2,5,u>, <3,2,5,u>
-  2625644956U,  // <5,u,3,3>: Cost 3 vext2 <1,3,5,u>, <3,3,3,3>
-  1564510724U,  // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u>
-  2625645160U,  // <5,u,3,5>: Cost 3 vext2 <1,3,5,u>, <3,5,u,0>
-  2734610422U,  // <5,u,3,6>: Cost 3 vext3 <u,3,6,5>, <u,3,6,5>
-  2640243447U,  // <5,u,3,7>: Cost 3 vext2 <3,7,5,u>, <3,7,5,u>
-  1567165256U,  // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u>
-  1567828889U,  // <5,u,4,0>: Cost 2 vext2 <4,0,5,u>, <4,0,5,u>
-  1661163546U,  // <5,u,4,1>: Cost 2 vext3 <u,4,1,5>, <u,4,1,5>
-  2734463012U,  // <5,u,4,2>: Cost 3 vext3 <u,3,4,5>, <u,4,2,6>
-  2698631212U,  // <5,u,4,3>: Cost 3 vext3 <2,3,4,5>, <u,4,3,5>
-  1570458842U,  // <5,u,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
-  1551904054U,  // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS
-  2846286172U,  // <5,u,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
-  2646216144U,  // <5,u,4,7>: Cost 3 vext2 <4,7,5,u>, <4,7,5,u>
-  1551904297U,  // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS
-  1509982310U,  // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
-  2560058555U,  // <5,u,5,1>: Cost 3 vext1 <1,5,u,5>, <1,5,u,5>
-  2698926194U,  // <5,u,5,2>: Cost 3 vext3 <2,3,u,5>, <u,5,2,3>
-  2698631295U,  // <5,u,5,3>: Cost 3 vext3 <2,3,4,5>, <u,5,3,7>
-  1509985590U,  // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
-   229035318U,  // <5,u,5,5>: Cost 1 vdup1 RHS
-  1613387930U,  // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
-  1772547382U,  // <5,u,5,7>: Cost 2 vuzpr RHS, RHS
-   229035318U,  // <5,u,5,u>: Cost 1 vdup1 RHS
-  2566037606U,  // <5,u,6,0>: Cost 3 vext1 <2,5,u,6>, LHS
-  2920044334U,  // <5,u,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
-  2566039445U,  // <5,u,6,2>: Cost 3 vext1 <2,5,u,6>, <2,5,u,6>
-  2687129808U,  // <5,u,6,3>: Cost 3 vext3 <0,4,1,5>, <u,6,3,7>
-  2566040886U,  // <5,u,6,4>: Cost 3 vext1 <2,5,u,6>, RHS
-  2920044698U,  // <5,u,6,5>: Cost 3 vzipl <5,6,7,0>, RHS
-  2846289268U,  // <5,u,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
-  2973781320U,  // <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
-  2687129853U,  // <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, <u,6,u,7>
-   430506086U,  // <5,u,7,0>: Cost 1 vext1 RHS, LHS
-  1486333117U,  // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7>
-  1504249448U,  // <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
-  2040971933U,  // <5,u,7,3>: Cost 2 vtrnr RHS, LHS
-   430509384U,  // <5,u,7,4>: Cost 1 vext1 RHS, RHS
-  1504251600U,  // <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
-   118708378U,  // <5,u,7,6>: Cost 1 vrev RHS
-  2040974889U,  // <5,u,7,7>: Cost 2 vtrnr RHS, RHS
-   430511918U,  // <5,u,7,u>: Cost 1 vext1 RHS, LHS
-   430514278U,  // <5,u,u,0>: Cost 1 vext1 RHS, LHS
-  1551906606U,  // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS
-  1613388133U,  // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
-  1772544669U,  // <5,u,u,3>: Cost 2 vuzpr RHS, LHS
-   430517577U,  // <5,u,u,4>: Cost 1 vext1 RHS, RHS
-   229035318U,  // <5,u,u,5>: Cost 1 vdup1 RHS
-   118716571U,  // <5,u,u,6>: Cost 1 vrev RHS
-  1772547625U,  // <5,u,u,7>: Cost 2 vuzpr RHS, RHS
-   430520110U,  // <5,u,u,u>: Cost 1 vext1 RHS, LHS
-  2686025728U,  // <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0>
-  2686025738U,  // <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1>
-  2686025748U,  // <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2>
-  3779084320U,  // <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5>
-  2642903388U,  // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6>
-  3657723939U,  // <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0>
-  3926676514U,  // <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6>
-  3926675786U,  // <6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7>
-  2686025802U,  // <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2>
-  2566070374U,  // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS
-  3759767642U,  // <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0>
-  1612284006U,  // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
-  2583988738U,  // <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6>
-  2566073654U,  // <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS
-  2583990308U,  // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1>
-  2589963005U,  // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1>
-  2595935702U,  // <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1>
-  1612284060U,  // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
-  2686025892U,  // <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2>
-  2685804721U,  // <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6>
-  3759620282U,  // <6,0,2,2>: Cost 4 vext3 <0,2,2,6>, <0,2,2,6>
-  2705342658U,  // <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5>
-  1612284108U,  // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6>
-  3706029956U,  // <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7>
-  2686173406U,  // <6,0,2,6>: Cost 3 vext3 <0,2,6,6>, <0,2,6,6>
-  3651769338U,  // <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2>
-  1612579056U,  // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6>
-  3706030230U,  // <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2>
-  2705342720U,  // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4>
-  2705342730U,  // <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5>
-  3706030492U,  // <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3>
-  2644896258U,  // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6>
-  3718638154U,  // <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6>
-  3729918619U,  // <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6>
-  3926672384U,  // <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7>
-  2705342784U,  // <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5>
-  2687058250U,  // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6>
-  2686026066U,  // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5>
-  1613463900U,  // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6>
-  3761021285U,  // <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6>
-  2687353198U,  // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6>
-  2632289590U,  // <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS
-  2645560704U,  // <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0>
-  2646224337U,  // <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0>
-  1613906322U,  // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6>
-  3651788902U,  // <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS
-  2687795620U,  // <6,0,5,1>: Cost 3 vext3 <0,5,1,6>, <0,5,1,6>
-  3761611181U,  // <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6>
-  3723284326U,  // <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0>
-  2646224838U,  // <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6>
-  3718639630U,  // <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6>
-  2652196962U,  // <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0>
-  2852932918U,  // <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS
-  2852932919U,  // <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS
-  2852933730U,  // <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0>
-  2925985894U,  // <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS
-  3060203622U,  // <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS
-  3718640178U,  // <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5>
-  2656178832U,  // <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0>
-  3725939378U,  // <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7>
-  2657506098U,  // <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0>
-  2619020110U,  // <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1>
-  2925986461U,  // <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS
-  2572091494U,  // <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS
-  2572092310U,  // <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, <1,2,3,0>
-  2980495524U,  // <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2>
-  2572094072U,  // <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7>
-  2572094774U,  // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS
-  4054238242U,  // <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5>
-  3645837653U,  // <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0>
-  4054239054U,  // <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7>
-  2572097326U,  // <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS
-  2686026378U,  // <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2>
-  2686026386U,  // <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1>
-  1612284573U,  // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
-  2705343144U,  // <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5>
-  1616265906U,  // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6>
-  2632292506U,  // <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS
-  2590020356U,  // <6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u>
-  2852933161U,  // <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS
-  1612284627U,  // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS
-  2595995750U,  // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS
-  2646229094U,  // <6,1,0,1>: Cost 3 vext2 <4,7,6,1>, LHS
-  3694092492U,  // <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6>
-  2686026486U,  // <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2>
-  2595999030U,  // <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS
-  3767730952U,  // <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2>
-  2596000590U,  // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1>
-  2596001246U,  // <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0>
-  2686026531U,  // <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2>
-  3763602219U,  // <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1>
-  2686026548U,  // <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1>
-  3764929346U,  // <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6>
-  2686026568U,  // <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3>
-  2691334996U,  // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6>
-  3760874332U,  // <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5>
-  3765224294U,  // <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6>
-  3669751263U,  // <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1>
-  2686026613U,  // <6,1,1,u>: Cost 3 vext3 <0,2,4,6>, <1,1,u,3>
-  2554208358U,  // <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS
-  3763602311U,  // <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3>
-  3639895971U,  // <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2>
-  2686026646U,  // <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0>
-  2554211638U,  // <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS
-  3760874411U,  // <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, <1,2,5,3>
-  2554212858U,  // <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3>
-  3802973114U,  // <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0>
-  2686026691U,  // <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0>
-  2566160486U,  // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS
-  2686026712U,  // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3>
-  2686026724U,  // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6>
-  3759768552U,  // <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1>
-  2692662262U,  // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6>
-  2686026752U,  // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7>
-  2590053128U,  // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3>
-  3663795194U,  // <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2>
-  2686026775U,  // <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3>
-  2641587099U,  // <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1>
-  2693104684U,  // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6>
-  3639912357U,  // <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4>
-  2687206462U,  // <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6>
-  3633941814U,  // <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS
-  2693399632U,  // <6,1,4,5>: Cost 3 vext3 <1,4,5,6>, <1,4,5,6>
-  3765077075U,  // <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0>
-  2646232530U,  // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1>
-  2687206507U,  // <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6>
-  2647559796U,  // <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1>
-  3765077118U,  // <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7>
-  3767583878U,  // <6,1,5,2>: Cost 4 vext3 <1,5,2,6>, <1,5,2,6>
-  2686026896U,  // <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7>
-  2693989528U,  // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6>
-  3767805089U,  // <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6>
-  2652868706U,  // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0>
-  3908250934U,  // <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS
-  2686026941U,  // <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7>
-  2554241126U,  // <6,1,6,0>: Cost 3 vext1 <0,6,1,6>, LHS
-  3763602639U,  // <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7>
-  3759547607U,  // <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6>
-  3115221094U,  // <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS
-  2554244406U,  // <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS
-  3760874739U,  // <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7>
-  2554245944U,  // <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6>
-  3719975758U,  // <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1>
-  3115221099U,  // <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS
-  2560221286U,  // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS
-  2560222415U,  // <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7>
-  2980497558U,  // <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2>
-  3103211622U,  // <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS
-  2560224566U,  // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS
-  2980495698U,  // <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5>
-  3633967526U,  // <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0>
-  4054237686U,  // <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7>
-  2560227118U,  // <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS
-  2560229478U,  // <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS
-  2686027117U,  // <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3>
-  2686027129U,  // <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6>
-  2686027132U,  // <6,1,u,3>: Cost 3 vext3 <0,2,4,6>, <1,u,3,0>
-  2687206795U,  // <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6>
-  2686027157U,  // <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7>
-  2590094093U,  // <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u>
-  2596066790U,  // <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u>
-  2686027177U,  // <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0>
-  2646900736U,  // <6,2,0,0>: Cost 3 vext2 <4,u,6,2>, <0,0,0,0>
-  1573159014U,  // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS
-  2646900900U,  // <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2>
-  3759769037U,  // <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0>
-  2641592668U,  // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6>
-  3779085794U,  // <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3>
-  2686027244U,  // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4>
-  3669816807U,  // <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0>
-  1573159581U,  // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS
-  2230527897U,  // <6,2,1,0>: Cost 3 vrev <2,6,0,1>
-  2646901556U,  // <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1>
-  2646901654U,  // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0>
-  2847047782U,  // <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS
-  3771049517U,  // <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6>
-  2646901904U,  // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7>
-  2686027324U,  // <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3>
-  3669825000U,  // <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1>
-  2231117793U,  // <6,2,1,u>: Cost 3 vrev <2,6,u,1>
-  3763603029U,  // <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1>
-  3759769184U,  // <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3>
-  2686027368U,  // <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2>
-  2686027378U,  // <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3>
-  2697971326U,  // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6>
-  3759769224U,  // <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7>
-  2698118800U,  // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6>
-  3920794092U,  // <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7>
-  2686027423U,  // <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3>
-  2686027430U,  // <6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1>
-  3759769262U,  // <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0>
-  2698487485U,  // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6>
-  2705344196U,  // <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4>
-  2686027470U,  // <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, <2,3,4,5>
-  2698708696U,  // <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6>
-  2724660961U,  // <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6>
-  2729232104U,  // <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4>
-  2686027502U,  // <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, <2,3,u,1>
-  1567853468U,  // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
-  3759769351U,  // <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u>
-  2699151118U,  // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6>
-  2686027543U,  // <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6>
-  2699298592U,  // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6>
-  1573162294U,  // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS
-  2686027564U,  // <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0>
-  3719982547U,  // <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2>
-  1573162532U,  // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2>
-  3779086154U,  // <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3>
-  2646904528U,  // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3>
-  3759769440U,  // <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7>
-  2699888488U,  // <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6>
-  2230855617U,  // <6,2,5,4>: Cost 3 vrev <2,6,4,5>
-  2646904836U,  // <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5>
-  2646904930U,  // <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, <5,6,7,0>
-  2847051062U,  // <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS
-  2700257173U,  // <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6>
-  2687207321U,  // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1>
-  2686027684U,  // <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3>
-  2566260656U,  // <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6>
-  2685806522U,  // <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7>
-  2687207361U,  // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5>
-  2686027724U,  // <6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7>
-  2646905656U,  // <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6>
-  2646905678U,  // <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1>
-  2686027751U,  // <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7>
-  2554323046U,  // <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS
-  2572239606U,  // <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2>
-  2566268849U,  // <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7>
-  1906753638U,  // <6,2,7,3>: Cost 2 vzipr RHS, LHS
-  2554326326U,  // <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS
-  3304687564U,  // <6,2,7,5>: Cost 4 vrev <2,6,5,7>
-  2980495708U,  // <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
-  2646906476U,  // <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7>
-  1906753643U,  // <6,2,7,u>: Cost 2 vzipr RHS, LHS
-  1591744256U,  // <6,2,u,0>: Cost 2 vext2 <u,0,6,2>, <u,0,6,2>
-  1573164846U,  // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS
-  2701805650U,  // <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6>
-  1906761830U,  // <6,2,u,3>: Cost 2 vzipr RHS, LHS
-  2686027875U,  // <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, <2,u,4,5>
-  1573165210U,  // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS
-  2686322800U,  // <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0>
-  2847051305U,  // <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS
-  1906761835U,  // <6,2,u,u>: Cost 2 vzipr RHS, LHS
-  3759769739U,  // <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0>
-  2686027926U,  // <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2>
-  2686027937U,  // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4>
-  3640027286U,  // <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2>
-  2687207601U,  // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2>
-  2705344698U,  // <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2>
-  3663917847U,  // <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0>
-  2237008560U,  // <6,3,0,7>: Cost 3 vrev <3,6,7,0>
-  2686027989U,  // <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2>
-  3759769823U,  // <6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3>
-  3759769830U,  // <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1>
-  3759769841U,  // <6,3,1,2>: Cost 4 vext3 <0,2,4,6>, <3,1,2,3>
-  3759769848U,  // <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1>
-  2703280390U,  // <6,3,1,4>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
-  3759769868U,  // <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3>
-  3704063194U,  // <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0>
-  3767732510U,  // <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3>
-  2703280390U,  // <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
-  3704063468U,  // <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4>
-  2630321724U,  // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
-  3759769921U,  // <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2>
-  3759769928U,  // <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0>
-  3704063767U,  // <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6>
-  3704063876U,  // <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7>
-  2636957626U,  // <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7>
-  3777907058U,  // <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6>
-  2630321724U,  // <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
-  3759769983U,  // <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1>
-  3710036245U,  // <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3>
-  2636958054U,  // <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3>
-  2686028188U,  // <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3>
-  2704607656U,  // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6>
-  3773041072U,  // <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5>
-  3711363731U,  // <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7>
-  3767732676U,  // <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7>
-  2707999179U,  // <6,3,3,u>: Cost 3 vext3 <3,u,5,6>, <3,3,u,5>
-  2584232038U,  // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS
-  2642267118U,  // <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3>
-  2642930751U,  // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3>
-  2705197552U,  // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6>
-  2584235318U,  // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS
-  1631603202U,  // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6>
-  2654211444U,  // <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6>
-  2237041332U,  // <6,3,4,7>: Cost 3 vrev <3,6,7,4>
-  1631824413U,  // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6>
-  3640066150U,  // <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS
-  3772746288U,  // <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7>
-  3640067790U,  // <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5>
-  3773041216U,  // <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5>
-  2705934922U,  // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6>
-  3773041236U,  // <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7>
-  3779086940U,  // <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6>
-  3767732831U,  // <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0>
-  2706229870U,  // <6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6>
-  2602164326U,  // <6,3,6,0>: Cost 3 vext1 <u,6,3,6>, LHS
-  2654212512U,  // <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3>
-  2566334393U,  // <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6>
-  3704066588U,  // <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1>
-  2602167524U,  // <6,3,6,4>: Cost 3 vext1 <u,6,3,6>, <4,4,6,6>
-  3710702321U,  // <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7>
-  2724661933U,  // <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6>
-  3710702465U,  // <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7>
-  2602170158U,  // <6,3,6,u>: Cost 3 vext1 <u,6,3,6>, LHS
-  1492598886U,  // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS
-  2560369889U,  // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7>
-  1492600762U,  // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7>
-  2566342806U,  // <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2>
-  1492602166U,  // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS
-  2602176208U,  // <6,3,7,5>: Cost 3 vext1 <u,6,3,7>, <5,1,7,3>
-  2566345210U,  // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3>
-  2980496528U,  // <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7>
-  1492604718U,  // <6,3,7,u>: Cost 2 vext1 <2,6,3,7>, LHS
-  1492607078U,  // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS
-  2686028574U,  // <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2>
-  1492608955U,  // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u>
-  2566350998U,  // <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2>
-  1492610358U,  // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS
-  1634257734U,  // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6>
-  2566353489U,  // <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0>
-  2980504720U,  // <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7>
-  1492612910U,  // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS
-  3703406592U,  // <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0>
-  2629664870U,  // <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS
-  2629664972U,  // <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6>
-  3779087232U,  // <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1>
-  2642936156U,  // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6>
-  2712570770U,  // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1>
-  2687208348U,  // <6,4,0,6>: Cost 3 vext3 <0,4,2,6>, <4,0,6,2>
-  3316723081U,  // <6,4,0,7>: Cost 4 vrev <4,6,7,0>
-  2629665437U,  // <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS
-  2242473291U,  // <6,4,1,0>: Cost 3 vrev <4,6,0,1>
-  3700089652U,  // <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1>
-  3703407510U,  // <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0>
-  2852962406U,  // <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS
-  3628166454U,  // <6,4,1,4>: Cost 4 vext1 <0,6,4,1>, RHS
-  3760876514U,  // <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0>
-  2687208430U,  // <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3>
-  3316731274U,  // <6,4,1,7>: Cost 4 vrev <4,6,7,1>
-  2243063187U,  // <6,4,1,u>: Cost 3 vrev <4,6,u,1>
-  2629666284U,  // <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4>
-  3703408188U,  // <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3>
-  3703408232U,  // <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2>
-  3703408294U,  // <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1>
-  2632320816U,  // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4>
-  2923384118U,  // <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS
-  2687208508U,  // <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0>
-  3760950341U,  // <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0>
-  2634975348U,  // <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4>
-  3703408790U,  // <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2>
-  3316305238U,  // <6,4,3,1>: Cost 4 vrev <4,6,1,3>
-  3703408947U,  // <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6>
-  3703409052U,  // <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3>
-  2644929026U,  // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6>
-  3718670922U,  // <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6>
-  2705345682U,  // <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5>
-  3926705152U,  // <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7>
-  2668817222U,  // <6,4,3,u>: Cost 3 vext2 <u,5,6,4>, <3,u,5,6>
-  2590277734U,  // <6,4,4,0>: Cost 3 vext1 <6,6,4,4>, LHS
-  3716017135U,  // <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4>
-  2642938944U,  // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4>
-  3717344401U,  // <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4>
-  2712571088U,  // <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4>
-  2629668150U,  // <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS
-  1637649636U,  // <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
-  2646257109U,  // <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4>
-  1637649636U,  // <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
-  2566398054U,  // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS
-  3760876805U,  // <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3>
-  2566399937U,  // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5>
-  2584316418U,  // <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6>
-  2566401334U,  // <6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS
-  2584318028U,  // <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5>
-  1612287286U,  // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
-  2852965686U,  // <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS
-  1612287304U,  // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
-  1504608358U,  // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS
-  2578350838U,  // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2>
-  2578351720U,  // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2>
-  2578352278U,  // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2>
-  1504611638U,  // <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS
-  2578353872U,  // <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3>
-  2578354682U,  // <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3>
-  2578355194U,  // <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2>
-  1504614190U,  // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS
-  2572386406U,  // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS
-  2572387226U,  // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4>
-  3640157902U,  // <6,4,7,2>: Cost 4 vext1 <2,6,4,7>, <2,3,4,5>
-  2572389020U,  // <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7>
-  2572389686U,  // <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS
-  2980497102U,  // <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5>
-  2980495564U,  // <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6>
-  4054239090U,  // <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7>
-  2572392238U,  // <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS
-  1504608358U,  // <6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS
-  2629670702U,  // <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS
-  2566424516U,  // <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u>
-  2584340994U,  // <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6>
-  1640156694U,  // <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6>
-  2629671066U,  // <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS
-  1612287529U,  // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS
-  2852965929U,  // <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS
-  1612287547U,  // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS
-  3708723200U,  // <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0>
-  2634981478U,  // <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS
-  3694125260U,  // <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6>
-  3779087962U,  // <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2>
-  3760877154U,  // <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1>
-  4195110916U,  // <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5>
-  3696779775U,  // <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7>
-  1175212130U,  // <6,5,0,7>: Cost 2 vrev <5,6,7,0>
-  1175285867U,  // <6,5,0,u>: Cost 2 vrev <5,6,u,0>
-  2248445988U,  // <6,5,1,0>: Cost 3 vrev <5,6,0,1>
-  3698107237U,  // <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5>
-  3708724118U,  // <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0>
-  3908575334U,  // <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS
-  3716023376U,  // <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, <1,4,5,6>
-  3708724368U,  // <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, <1,5,3,7>
-  3767733960U,  // <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4>
-  2712571600U,  // <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3>
-  2712571609U,  // <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3>
-  2578391142U,  // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS
-  3704079934U,  // <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5>
-  3708724840U,  // <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2>
-  3705407182U,  // <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5>
-  2578394422U,  // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS
-  3717351272U,  // <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6>
-  2634983354U,  // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7>
-  3115486518U,  // <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS
-  2634983541U,  // <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5>
-  3708725398U,  // <6,5,3,0>: Cost 4 vext2 <2,u,6,5>, <3,0,1,2>
-  3710052631U,  // <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5>
-  3708725606U,  // <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3>
-  3708725660U,  // <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3>
-  2643610114U,  // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6>
-  3717352010U,  // <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6>
-  3773632358U,  // <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0>
-  2248978533U,  // <6,5,3,7>: Cost 3 vrev <5,6,7,3>
-  2249052270U,  // <6,5,3,u>: Cost 3 vrev <5,6,u,3>
-  2596323430U,  // <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS
-  3716025328U,  // <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5>
-  3716688961U,  // <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5>
-  2643610770U,  // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5>
-  2596326710U,  // <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS
-  2634984758U,  // <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS
-  3767734199U,  // <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0>
-  1643696070U,  // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6>
-  1643769807U,  // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6>
-  2578415718U,  // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS
-  3652158198U,  // <6,5,5,1>: Cost 4 vext1 <4,6,5,5>, <1,0,3,2>
-  3652159080U,  // <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2>
-  3652159638U,  // <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2>
-  2578418998U,  // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS
-  2712571908U,  // <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5>
-  2718027790U,  // <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6>
-  2712571928U,  // <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7>
-  2712571937U,  // <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7>
-  2705346596U,  // <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1>
-  3767144496U,  // <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4>
-  3773116473U,  // <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4>
-  2705346626U,  // <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4>
-  2705346636U,  // <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5>
-  3908577217U,  // <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5>
-  2578428728U,  // <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6>
-  2712572002U,  // <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0>
-  2705346668U,  // <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1>
-  2560516198U,  // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS
-  2560517363U,  // <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7>
-  2566490060U,  // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7>
-  3634260118U,  // <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2>
-  2560519478U,  // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS
-  2980498650U,  // <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5>
-  2980497922U,  // <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6>
-  3103214902U,  // <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS
-  2560522030U,  // <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS
-  2560524390U,  // <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS
-  2560525556U,  // <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u>
-  2566498253U,  // <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u>
-  2646931439U,  // <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, <u,3,5,7>
-  2560527670U,  // <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS
-  2634987674U,  // <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS
-  2980506114U,  // <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6>
-  1175277674U,  // <6,5,u,7>: Cost 2 vrev <5,6,7,u>
-  1175351411U,  // <6,5,u,u>: Cost 2 vrev <5,6,u,u>
-  2578448486U,  // <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS
-  1573191782U,  // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS
-  2686030124U,  // <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4>
-  3779088690U,  // <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1>
-  2687209788U,  // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2>
-  3652194000U,  // <6,6,0,5>: Cost 4 vext1 <4,6,6,0>, <5,1,7,3>
-  2254852914U,  // <6,6,0,6>: Cost 3 vrev <6,6,6,0>
-  4041575734U,  // <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS
-  1573192349U,  // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS
-  2646934262U,  // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2>
-  2646934324U,  // <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1>
-  2646934422U,  // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0>
-  2846785638U,  // <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS
-  3760951694U,  // <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3>
-  2646934672U,  // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7>
-  2712572320U,  // <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3>
-  3775549865U,  // <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3>
-  2846785643U,  // <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS
-  3759772094U,  // <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6>
-  3704751676U,  // <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3>
-  2631009936U,  // <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6>
-  2646935206U,  // <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1>
-  3759772127U,  // <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3>
-  3704752004U,  // <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7>
-  2646935482U,  // <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7>
-  2712572410U,  // <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3>
-  2712572419U,  // <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3>
-  2646935702U,  // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2>
-  3777024534U,  // <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4>
-  3704752453U,  // <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6>
-  2646935964U,  // <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3>
-  2705347122U,  // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5>
-  3779678778U,  // <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4>
-  2657553069U,  // <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6>
-  4039609654U,  // <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS
-  2708001366U,  // <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5>
-  2578481254U,  // <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS
-  3652223734U,  // <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2>
-  3760951922U,  // <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6>
-  3779089019U,  // <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6>
-  1570540772U,  // <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
-  1573195062U,  // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS
-  2712572560U,  // <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0>
-  2723410591U,  // <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6>
-  1573195304U,  // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6>
-  3640287334U,  // <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS
-  2646937296U,  // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3>
-  3640289235U,  // <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5>
-  3720679279U,  // <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0>
-  2646937542U,  // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6>
-  2646937604U,  // <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5>
-  2646937698U,  // <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0>
-  2846788918U,  // <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS
-  2846788919U,  // <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS
-  1516699750U,  // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS
-  2590442230U,  // <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2>
-  2646938106U,  // <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3>
-  2590443670U,  // <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2>
-  1516703030U,  // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS
-  2590445264U,  // <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3>
-   296144182U,  // <6,6,6,6>: Cost 1 vdup2 RHS
-  2712572738U,  // <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7>
-   296144182U,  // <6,6,6,u>: Cost 1 vdup2 RHS
-  2566561894U,  // <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, LHS
-  3634332924U,  // <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7>
-  2566563797U,  // <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7>
-  2584480258U,  // <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6>
-  2566565174U,  // <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS
-  2717438846U,  // <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4>
-  2980500280U,  // <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6>
-  1906756918U,  // <6,6,7,7>: Cost 2 vzipr RHS, RHS
-  1906756919U,  // <6,6,7,u>: Cost 2 vzipr RHS, RHS
-  1516699750U,  // <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS
-  1573197614U,  // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS
-  2566571990U,  // <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u>
-  2846786205U,  // <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS
-  1516703030U,  // <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS
-  1573197978U,  // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS
-   296144182U,  // <6,6,u,6>: Cost 1 vdup2 RHS
-  1906765110U,  // <6,6,u,7>: Cost 2 vzipr RHS, RHS
-   296144182U,  // <6,6,u,u>: Cost 1 vdup2 RHS
-  1571209216U,  // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
-   497467494U,  // <6,7,0,1>: Cost 1 vext2 RHS, LHS
-  1571209380U,  // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
-  2644951292U,  // <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0>
-  1571209554U,  // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
-  1510756450U,  // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0>
-  2644951542U,  // <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
-  2584499194U,  // <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2>
-   497468061U,  // <6,7,0,u>: Cost 1 vext2 RHS, LHS
-  1571209974U,  // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
-  1571210036U,  // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
-  1571210134U,  // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
-  1571210200U,  // <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
-  2644952098U,  // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5>
-  1571210384U,  // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
-  2644952271U,  // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
-  2578535418U,  // <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2>
-  1571210605U,  // <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3>
-  2644952509U,  // <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2>
-  2644952582U,  // <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
-  1571210856U,  // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
-  1571210918U,  // <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
-  2644952828U,  // <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6>
-  2633009028U,  // <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7>
-  1571211194U,  // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
-  2668840938U,  // <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1>
-  1571211323U,  // <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
-  1571211414U,  // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
-  2644953311U,  // <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
-  2644953390U,  // <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1>
-  1571211676U,  // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
-  1571211778U,  // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
-  2644953648U,  // <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7>
-  2644953720U,  // <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7>
-  2644953795U,  // <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
-  1571212062U,  // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
-  1573202834U,  // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
-  2644954058U,  // <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
-  2644954166U,  // <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
-  2644954258U,  // <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5>
-  1571212496U,  // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
-   497470774U,  // <6,7,4,5>: Cost 1 vext2 RHS, RHS
-  1573203316U,  // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
-  2646281688U,  // <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7>
-   497471017U,  // <6,7,4,u>: Cost 1 vext2 RHS, RHS
-  2644954696U,  // <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
-  1573203664U,  // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
-  2644954878U,  // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
-  2644954991U,  // <6,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
-  1571213254U,  // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
-  1571213316U,  // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
-  1571213410U,  // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
-  1573204136U,  // <6,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
-  1573204217U,  // <6,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
-  2644955425U,  // <6,7,6,0>: Cost 3 vext2 RHS, <6,0,1,2>
-  2644955561U,  // <6,7,6,1>: Cost 3 vext2 RHS, <6,1,7,3>
-  1573204474U,  // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
-  2644955698U,  // <6,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
-  2644955789U,  // <6,7,6,4>: Cost 3 vext2 RHS, <6,4,5,6>
-  2644955889U,  // <6,7,6,5>: Cost 3 vext2 RHS, <6,5,7,7>
-  1571214136U,  // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
-  1571214158U,  // <6,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
-  1573204895U,  // <6,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
-  1573204986U,  // <6,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
-  2572608656U,  // <6,7,7,1>: Cost 3 vext1 <3,6,7,7>, <1,5,3,7>
-  2644956362U,  // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3>
-  2572610231U,  // <6,7,7,3>: Cost 3 vext1 <3,6,7,7>, <3,6,7,7>
-  1573205350U,  // <6,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
-  2646947220U,  // <6,7,7,5>: Cost 3 vext2 RHS, <7,5,1,7>
-  1516786498U,  // <6,7,7,6>: Cost 2 vext1 <6,6,7,7>, <6,6,7,7>
-  1571214956U,  // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7>
-  1573205634U,  // <6,7,7,u>: Cost 2 vext2 RHS, <7,u,1,2>
-  1571215059U,  // <6,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
-   497473326U,  // <6,7,u,1>: Cost 1 vext2 RHS, LHS
-  1571215237U,  // <6,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
-  1571215292U,  // <6,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
-  1571215423U,  // <6,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
-   497473690U,  // <6,7,u,5>: Cost 1 vext2 RHS, RHS
-  1571215568U,  // <6,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
-  1573206272U,  // <6,7,u,7>: Cost 2 vext2 RHS, <u,7,0,1>
-   497473893U,  // <6,7,u,u>: Cost 1 vext2 RHS, LHS
-  1571217408U,  // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
-   497475686U,  // <6,u,0,1>: Cost 1 vext2 RHS, LHS
-  1571217572U,  // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
-  2689865445U,  // <6,u,0,3>: Cost 3 vext3 <0,u,2,6>, <u,0,3,2>
-  1571217746U,  // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
-  1510830187U,  // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0>
-  2644959734U,  // <6,u,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
-  1193130221U,  // <6,u,0,7>: Cost 2 vrev <u,6,7,0>
-   497476253U,  // <6,u,0,u>: Cost 1 vext2 RHS, LHS
-  1571218166U,  // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
-  1571218228U,  // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
-  1612289838U,  // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
-  1571218392U,  // <6,u,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
-  2566663478U,  // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS
-  1571218576U,  // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
-  2644960463U,  // <6,u,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
-  2717439835U,  // <6,u,1,7>: Cost 3 vext3 <5,4,7,6>, <u,1,7,3>
-  1612289892U,  // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
-  1504870502U,  // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS
-  2644960774U,  // <6,u,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
-  1571219048U,  // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
-  1571219110U,  // <6,u,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
-  1504873782U,  // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, RHS
-  2633017221U,  // <6,u,2,5>: Cost 3 vext2 <2,5,6,u>, <2,5,6,u>
-  1571219386U,  // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
-  2712573868U,  // <6,u,2,7>: Cost 3 vext3 <4,6,4,6>, <u,2,7,3>
-  1571219515U,  // <6,u,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
-  1571219606U,  // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
-  2644961503U,  // <6,u,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
-  2566678499U,  // <6,u,3,2>: Cost 3 vext1 <2,6,u,3>, <2,6,u,3>
-  1571219868U,  // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
-  1571219970U,  // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
-  2689865711U,  // <6,u,3,5>: Cost 3 vext3 <0,u,2,6>, <u,3,5,7>
-  2708002806U,  // <6,u,3,6>: Cost 3 vext3 <3,u,5,6>, <u,3,6,5>
-  2644961987U,  // <6,u,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
-  1571220254U,  // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
-  1571220370U,  // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
-  2644962250U,  // <6,u,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
-  1661245476U,  // <6,u,4,2>: Cost 2 vext3 <u,4,2,6>, <u,4,2,6>
-  2686031917U,  // <6,u,4,3>: Cost 3 vext3 <0,2,4,6>, <u,4,3,6>
-  1571220688U,  // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
-   497478967U,  // <6,u,4,5>: Cost 1 vext2 RHS, RHS
-  1571220852U,  // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
-  1661614161U,  // <6,u,4,7>: Cost 2 vext3 <u,4,7,6>, <u,4,7,6>
-   497479209U,  // <6,u,4,u>: Cost 1 vext2 RHS, RHS
-  2566692966U,  // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS
-  1571221200U,  // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
-  2566694885U,  // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5>
-  2689865855U,  // <6,u,5,3>: Cost 3 vext3 <0,u,2,6>, <u,5,3,7>
-  1571221446U,  // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
-  1571221508U,  // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
-  1612290202U,  // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
-  1571221672U,  // <6,u,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
-  1612290220U,  // <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
-  1504903270U,  // <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS
-  2644963752U,  // <6,u,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
-  1571222010U,  // <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
-  2686032080U,  // <6,u,6,3>: Cost 3 vext3 <0,2,4,6>, <u,6,3,7>
-  1504906550U,  // <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, RHS
-  2644964079U,  // <6,u,6,5>: Cost 3 vext2 RHS, <6,5,7,5>
-   296144182U,  // <6,u,6,6>: Cost 1 vdup2 RHS
-  1571222350U,  // <6,u,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
-   296144182U,  // <6,u,6,u>: Cost 1 vdup2 RHS
-  1492967526U,  // <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS
-  2560738574U,  // <6,u,7,1>: Cost 3 vext1 <1,6,u,7>, <1,6,u,7>
-  1492969447U,  // <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7>
-  1906753692U,  // <6,u,7,3>: Cost 2 vzipr RHS, LHS
-  1492970806U,  // <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS
-  2980495761U,  // <6,u,7,5>: Cost 3 vzipr RHS, <0,4,u,5>
-  1516860235U,  // <6,u,7,6>: Cost 2 vext1 <6,6,u,7>, <6,6,u,7>
-  1906756936U,  // <6,u,7,7>: Cost 2 vzipr RHS, RHS
-  1492973358U,  // <6,u,7,u>: Cost 2 vext1 <2,6,u,7>, LHS
-  1492975718U,  // <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS
-   497481518U,  // <6,u,u,1>: Cost 1 vext2 RHS, LHS
-  1612290405U,  // <6,u,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
-  1571223484U,  // <6,u,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
-  1492978998U,  // <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS
-   497481882U,  // <6,u,u,5>: Cost 1 vext2 RHS, RHS
-   296144182U,  // <6,u,u,6>: Cost 1 vdup2 RHS
-  1906765128U,  // <6,u,u,7>: Cost 2 vzipr RHS, RHS
-   497482085U,  // <6,u,u,u>: Cost 1 vext2 RHS, LHS
-  1638318080U,  // <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
-  1638318090U,  // <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1>
-  1638318100U,  // <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2>
-  3646442178U,  // <7,0,0,3>: Cost 4 vext1 <3,7,0,0>, <3,7,0,0>
-  2712059941U,  // <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1>
-  2651603364U,  // <7,0,0,5>: Cost 3 vext2 <5,6,7,0>, <0,5,1,6>
-  2590618445U,  // <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0>
-  3785801798U,  // <7,0,0,7>: Cost 4 vext3 RHS, <0,0,7,7>
-  1638318153U,  // <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1>
-  1516879974U,  // <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS
-  2693922911U,  // <7,0,1,1>: Cost 3 vext3 <1,5,3,7>, <0,1,1,5>
-   564576358U,  // <7,0,1,2>: Cost 1 vext3 RHS, LHS
-  2638996480U,  // <7,0,1,3>: Cost 3 vext2 <3,5,7,0>, <1,3,5,7>
-  1516883254U,  // <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS
-  2649613456U,  // <7,0,1,5>: Cost 3 vext2 <5,3,7,0>, <1,5,3,7>
-  1516884814U,  // <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1>
-  2590626808U,  // <7,0,1,7>: Cost 3 vext1 <6,7,0,1>, <7,0,1,0>
-   564576412U,  // <7,0,1,u>: Cost 1 vext3 RHS, LHS
-  1638318244U,  // <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
-  2692743344U,  // <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5>
-  2712060084U,  // <7,0,2,2>: Cost 3 vext3 RHS, <0,2,2,0>
-  2712060094U,  // <7,0,2,3>: Cost 3 vext3 RHS, <0,2,3,1>
-  1638318284U,  // <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
-  2712060118U,  // <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
-  2651604922U,  // <7,0,2,6>: Cost 3 vext2 <5,6,7,0>, <2,6,3,7>
-  2686255336U,  // <7,0,2,7>: Cost 3 vext3 <0,2,7,7>, <0,2,7,7>
-  1638318316U,  // <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2>
-  2651605142U,  // <7,0,3,0>: Cost 3 vext2 <5,6,7,0>, <3,0,1,2>
-  2712060156U,  // <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0>
-  2712060165U,  // <7,0,3,2>: Cost 3 vext3 RHS, <0,3,2,0>
-  2651605404U,  // <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3>
-  2651605506U,  // <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6>
-  2638998111U,  // <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0>
-  2639661744U,  // <7,0,3,6>: Cost 3 vext2 <3,6,7,0>, <3,6,7,0>
-  3712740068U,  // <7,0,3,7>: Cost 4 vext2 <3,5,7,0>, <3,7,3,7>
-  2640989010U,  // <7,0,3,u>: Cost 3 vext2 <3,u,7,0>, <3,u,7,0>
-  2712060232U,  // <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,4>
-  1638318418U,  // <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5>
-  1638318428U,  // <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6>
-  3646474950U,  // <7,0,4,3>: Cost 4 vext1 <3,7,0,4>, <3,7,0,4>
-  2712060270U,  // <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,6>
-  1577864502U,  // <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS
-  2651606388U,  // <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,4,6>
-  3787792776U,  // <7,0,4,7>: Cost 4 vext3 RHS, <0,4,7,5>
-  1638318481U,  // <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5>
-  2590654566U,  // <7,0,5,0>: Cost 3 vext1 <6,7,0,5>, LHS
-  2651606736U,  // <7,0,5,1>: Cost 3 vext2 <5,6,7,0>, <5,1,7,3>
-  2712060334U,  // <7,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
-  2649616239U,  // <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0>
-  2651606982U,  // <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6>
-  2651607044U,  // <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5>
-  1577865314U,  // <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0>
-  2651607208U,  // <7,0,5,7>: Cost 3 vext2 <5,6,7,0>, <5,7,5,7>
-  1579192580U,  // <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0>
-  2688393709U,  // <7,0,6,0>: Cost 3 vext3 <0,6,0,7>, <0,6,0,7>
-  2712060406U,  // <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
-  2688541183U,  // <7,0,6,2>: Cost 3 vext3 <0,6,2,7>, <0,6,2,7>
-  2655588936U,  // <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0>
-  3762430481U,  // <7,0,6,4>: Cost 4 vext3 <0,6,4,7>, <0,6,4,7>
-  2651607730U,  // <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7>
-  2651607864U,  // <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6>
-  2651607886U,  // <7,0,6,7>: Cost 3 vext2 <5,6,7,0>, <6,7,0,1>
-  2688983605U,  // <7,0,6,u>: Cost 3 vext3 <0,6,u,7>, <0,6,u,7>
-  2651608058U,  // <7,0,7,0>: Cost 3 vext2 <5,6,7,0>, <7,0,1,2>
-  2932703334U,  // <7,0,7,1>: Cost 3 vzipl <7,7,7,7>, LHS
-  3066921062U,  // <7,0,7,2>: Cost 3 vtrnl <7,7,7,7>, LHS
-  3712742678U,  // <7,0,7,3>: Cost 4 vext2 <3,5,7,0>, <7,3,5,7>
-  2651608422U,  // <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6>
-  2651608513U,  // <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7>
-  2663552532U,  // <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0>
-  2651608684U,  // <7,0,7,7>: Cost 3 vext2 <5,6,7,0>, <7,7,7,7>
-  2651608706U,  // <7,0,7,u>: Cost 3 vext2 <5,6,7,0>, <7,u,1,2>
-  1638318730U,  // <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2>
-  1638318738U,  // <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1>
-   564576925U,  // <7,0,u,2>: Cost 1 vext3 RHS, LHS
-  2572765898U,  // <7,0,u,3>: Cost 3 vext1 <3,7,0,u>, <3,7,0,u>
-  1638318770U,  // <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6>
-  1577867418U,  // <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS
-  1516942165U,  // <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u>
-  2651609344U,  // <7,0,u,7>: Cost 3 vext2 <5,6,7,0>, <u,7,0,1>
-   564576979U,  // <7,0,u,u>: Cost 1 vext3 RHS, LHS
-  2590687334U,  // <7,1,0,0>: Cost 3 vext1 <6,7,1,0>, LHS
-  2639003750U,  // <7,1,0,1>: Cost 3 vext2 <3,5,7,1>, LHS
-  2793357414U,  // <7,1,0,2>: Cost 3 vuzpl <7,0,1,2>, LHS
-  1638318838U,  // <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2>
-  2590690614U,  // <7,1,0,4>: Cost 3 vext1 <6,7,1,0>, RHS
-  2712060679U,  // <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
-  2590692182U,  // <7,1,0,6>: Cost 3 vext1 <6,7,1,0>, <6,7,1,0>
-  3785802521U,  // <7,1,0,7>: Cost 4 vext3 RHS, <1,0,7,1>
-  1638318883U,  // <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2>
-  2712060715U,  // <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,1>
-  1638318900U,  // <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
-  3774300994U,  // <7,1,1,2>: Cost 4 vext3 <2,6,3,7>, <1,1,2,6>
-  1638318920U,  // <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3>
-  2712060755U,  // <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5>
-  2691416926U,  // <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7>
-  2590700375U,  // <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1>
-  3765158766U,  // <7,1,1,7>: Cost 4 vext3 <1,1,5,7>, <1,1,7,5>
-  1638318965U,  // <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3>
-  2712060796U,  // <7,1,2,0>: Cost 3 vext3 RHS, <1,2,0,1>
-  2712060807U,  // <7,1,2,1>: Cost 3 vext3 RHS, <1,2,1,3>
-  3712747112U,  // <7,1,2,2>: Cost 4 vext2 <3,5,7,1>, <2,2,2,2>
-  1638318998U,  // <7,1,2,3>: Cost 2 vext3 RHS, <1,2,3,0>
-  2712060836U,  // <7,1,2,4>: Cost 3 vext3 RHS, <1,2,4,5>
-  2712060843U,  // <7,1,2,5>: Cost 3 vext3 RHS, <1,2,5,3>
-  2590708568U,  // <7,1,2,6>: Cost 3 vext1 <6,7,1,2>, <6,7,1,2>
-  2735948730U,  // <7,1,2,7>: Cost 3 vext3 RHS, <1,2,7,0>
-  1638319043U,  // <7,1,2,u>: Cost 2 vext3 RHS, <1,2,u,0>
-  2712060876U,  // <7,1,3,0>: Cost 3 vext3 RHS, <1,3,0,0>
-  1638319064U,  // <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
-  2712060894U,  // <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0>
-  2692596718U,  // <7,1,3,3>: Cost 3 vext3 <1,3,3,7>, <1,3,3,7>
-  2712060917U,  // <7,1,3,4>: Cost 3 vext3 RHS, <1,3,4,5>
-  1619002368U,  // <7,1,3,5>: Cost 2 vext3 <1,3,5,7>, <1,3,5,7>
-  2692817929U,  // <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7>
-  2735948814U,  // <7,1,3,7>: Cost 3 vext3 RHS, <1,3,7,3>
-  1619223579U,  // <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7>
-  2712060962U,  // <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5>
-  2712060971U,  // <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5>
-  2712060980U,  // <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5>
-  2712060989U,  // <7,1,4,3>: Cost 3 vext3 RHS, <1,4,3,5>
-  3785802822U,  // <7,1,4,4>: Cost 4 vext3 RHS, <1,4,4,5>
-  2639007030U,  // <7,1,4,5>: Cost 3 vext2 <3,5,7,1>, RHS
-  2645642634U,  // <7,1,4,6>: Cost 3 vext2 <4,6,7,1>, <4,6,7,1>
-  3719384520U,  // <7,1,4,7>: Cost 4 vext2 <4,6,7,1>, <4,7,5,0>
-  2639007273U,  // <7,1,4,u>: Cost 3 vext2 <3,5,7,1>, RHS
-  2572812390U,  // <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS
-  2693776510U,  // <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7>
-  3774301318U,  // <7,1,5,2>: Cost 4 vext3 <2,6,3,7>, <1,5,2,6>
-  1620182160U,  // <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7>
-  2572815670U,  // <7,1,5,4>: Cost 3 vext1 <3,7,1,5>, RHS
-  3766486178U,  // <7,1,5,5>: Cost 4 vext3 <1,3,5,7>, <1,5,5,7>
-  2651615331U,  // <7,1,5,6>: Cost 3 vext2 <5,6,7,1>, <5,6,7,1>
-  2652278964U,  // <7,1,5,7>: Cost 3 vext2 <5,7,7,1>, <5,7,7,1>
-  1620550845U,  // <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7>
-  3768108230U,  // <7,1,6,0>: Cost 4 vext3 <1,6,0,7>, <1,6,0,7>
-  2694440143U,  // <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7>
-  2712061144U,  // <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
-  2694587617U,  // <7,1,6,3>: Cost 3 vext3 <1,6,3,7>, <1,6,3,7>
-  3768403178U,  // <7,1,6,4>: Cost 4 vext3 <1,6,4,7>, <1,6,4,7>
-  2694735091U,  // <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7>
-  3768550652U,  // <7,1,6,6>: Cost 4 vext3 <1,6,6,7>, <1,6,6,7>
-  2652279630U,  // <7,1,6,7>: Cost 3 vext2 <5,7,7,1>, <6,7,0,1>
-  2694956302U,  // <7,1,6,u>: Cost 3 vext3 <1,6,u,7>, <1,6,u,7>
-  2645644282U,  // <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2>
-  2859062094U,  // <7,1,7,1>: Cost 3 vuzpr <6,7,0,1>, <6,7,0,1>
-  3779462437U,  // <7,1,7,2>: Cost 4 vext3 <3,5,1,7>, <1,7,2,3>
-  3121938534U,  // <7,1,7,3>: Cost 3 vtrnr <5,7,5,7>, LHS
-  2554916150U,  // <7,1,7,4>: Cost 3 vext1 <0,7,1,7>, RHS
-  3769140548U,  // <7,1,7,5>: Cost 4 vext3 <1,7,5,7>, <1,7,5,7>
-  3726022164U,  // <7,1,7,6>: Cost 4 vext2 <5,7,7,1>, <7,6,7,0>
-  2554918508U,  // <7,1,7,7>: Cost 3 vext1 <0,7,1,7>, <7,7,7,7>
-  3121938539U,  // <7,1,7,u>: Cost 3 vtrnr <5,7,5,7>, LHS
-  2572836966U,  // <7,1,u,0>: Cost 3 vext1 <3,7,1,u>, LHS
-  1638319469U,  // <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3>
-  2712061299U,  // <7,1,u,2>: Cost 3 vext3 RHS, <1,u,2,0>
-  1622173059U,  // <7,1,u,3>: Cost 2 vext3 <1,u,3,7>, <1,u,3,7>
-  2572840246U,  // <7,1,u,4>: Cost 3 vext1 <3,7,1,u>, RHS
-  1622320533U,  // <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7>
-  2696136094U,  // <7,1,u,6>: Cost 3 vext3 <1,u,6,7>, <1,u,6,7>
-  2859060777U,  // <7,1,u,7>: Cost 3 vuzpr <6,7,0,1>, RHS
-  1622541744U,  // <7,1,u,u>: Cost 2 vext3 <1,u,u,7>, <1,u,u,7>
-  2712061364U,  // <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2>
-  2712061373U,  // <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2>
-  2712061380U,  // <7,2,0,2>: Cost 3 vext3 RHS, <2,0,2,0>
-  2712061389U,  // <7,2,0,3>: Cost 3 vext3 RHS, <2,0,3,0>
-  2712061404U,  // <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,6>
-  2696725990U,  // <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7>
-  2712061417U,  // <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1>
-  3785803251U,  // <7,2,0,7>: Cost 4 vext3 RHS, <2,0,7,2>
-  2696947201U,  // <7,2,0,u>: Cost 3 vext3 <2,0,u,7>, <2,0,u,7>
-  2712061446U,  // <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3>
-  3785803276U,  // <7,2,1,1>: Cost 4 vext3 RHS, <2,1,1,0>
-  3785803285U,  // <7,2,1,2>: Cost 4 vext3 RHS, <2,1,2,0>
-  2712061471U,  // <7,2,1,3>: Cost 3 vext3 RHS, <2,1,3,1>
-  2712061482U,  // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3>
-  3766486576U,  // <7,2,1,5>: Cost 4 vext3 <1,3,5,7>, <2,1,5,0>
-  2712061500U,  // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3>
-  2602718850U,  // <7,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
-  2712061516U,  // <7,2,1,u>: Cost 3 vext3 RHS, <2,1,u,1>
-  2712061525U,  // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,1>
-  2712061536U,  // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3>
-  1638319720U,  // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
-  1638319730U,  // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3>
-  2712061565U,  // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,5>
-  2698053256U,  // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7>
-  2712061584U,  // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,6>
-  3771795096U,  // <7,2,2,7>: Cost 4 vext3 <2,2,5,7>, <2,2,7,5>
-  1638319775U,  // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3>
-  1638319782U,  // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1>
-  2693924531U,  // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5>
-  2700560061U,  // <7,2,3,2>: Cost 3 vext3 <2,6,3,7>, <2,3,2,6>
-  2693924551U,  // <7,2,3,3>: Cost 3 vext3 <1,5,3,7>, <2,3,3,7>
-  1638319822U,  // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5>
-  2698716889U,  // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7>
-  2712061665U,  // <7,2,3,6>: Cost 3 vext3 RHS, <2,3,6,6>
-  2735949540U,  // <7,2,3,7>: Cost 3 vext3 RHS, <2,3,7,0>
-  1638319854U,  // <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1>
-  2712061692U,  // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,6>
-  2712061698U,  // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3>
-  2712061708U,  // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4>
-  2712061718U,  // <7,2,4,3>: Cost 3 vext3 RHS, <2,4,3,5>
-  2712061728U,  // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6>
-  2699380522U,  // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7>
-  2712061740U,  // <7,2,4,6>: Cost 3 vext3 RHS, <2,4,6,0>
-  3809691445U,  // <7,2,4,7>: Cost 4 vext3 RHS, <2,4,7,0>
-  2699601733U,  // <7,2,4,u>: Cost 3 vext3 <2,4,u,7>, <2,4,u,7>
-  2699675470U,  // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7>
-  3766486867U,  // <7,2,5,1>: Cost 4 vext3 <1,3,5,7>, <2,5,1,3>
-  2699822944U,  // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7>
-  2692745065U,  // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7>
-  2699970418U,  // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7>
-  3766486907U,  // <7,2,5,5>: Cost 4 vext3 <1,3,5,7>, <2,5,5,7>
-  2700117892U,  // <7,2,5,6>: Cost 3 vext3 <2,5,6,7>, <2,5,6,7>
-  3771795334U,  // <7,2,5,7>: Cost 4 vext3 <2,2,5,7>, <2,5,7,0>
-  2692745110U,  // <7,2,5,u>: Cost 3 vext3 <1,3,5,7>, <2,5,u,7>
-  2572894310U,  // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS
-  2712061860U,  // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3>
-  2700486577U,  // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7>
-  1626818490U,  // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7>
-  2572897590U,  // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS
-  2700707788U,  // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7>
-  2700781525U,  // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7>
-  3774597086U,  // <7,2,6,7>: Cost 4 vext3 <2,6,7,7>, <2,6,7,7>
-  1627187175U,  // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7>
-  2735949802U,  // <7,2,7,0>: Cost 3 vext3 RHS, <2,7,0,1>
-  3780200434U,  // <7,2,7,1>: Cost 4 vext3 <3,6,2,7>, <2,7,1,0>
-  3773564928U,  // <7,2,7,2>: Cost 4 vext3 <2,5,2,7>, <2,7,2,5>
-  2986541158U,  // <7,2,7,3>: Cost 3 vzipr <5,5,7,7>, LHS
-  2554989878U,  // <7,2,7,4>: Cost 3 vext1 <0,7,2,7>, RHS
-  3775113245U,  // <7,2,7,5>: Cost 4 vext3 <2,7,5,7>, <2,7,5,7>
-  4060283228U,  // <7,2,7,6>: Cost 4 vzipr <5,5,7,7>, <0,4,2,6>
-  2554992236U,  // <7,2,7,7>: Cost 3 vext1 <0,7,2,7>, <7,7,7,7>
-  2986541163U,  // <7,2,7,u>: Cost 3 vzipr <5,5,7,7>, LHS
-  1638320187U,  // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1>
-  2693924936U,  // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5>
-  1638319720U,  // <7,2,u,2>: Cost 2 vext3 RHS, <2,2,2,2>
-  1628145756U,  // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7>
-  1638320227U,  // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5>
-  2702035054U,  // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7>
-  2702108791U,  // <7,2,u,6>: Cost 3 vext3 <2,u,6,7>, <2,u,6,7>
-  2735949945U,  // <7,2,u,7>: Cost 3 vext3 RHS, <2,u,7,0>
-  1628514441U,  // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7>
-  2712062091U,  // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0>
-  1638320278U,  // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2>
-  2712062109U,  // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0>
-  2590836886U,  // <7,3,0,3>: Cost 3 vext1 <6,7,3,0>, <3,0,1,2>
-  2712062128U,  // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1>
-  2712062138U,  // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2>
-  2590839656U,  // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0>
-  3311414017U,  // <7,3,0,7>: Cost 4 vrev <3,7,7,0>
-  1638320341U,  // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2>
-  2237164227U,  // <7,3,1,0>: Cost 3 vrev <3,7,0,1>
-  2712062182U,  // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1>
-  2712062193U,  // <7,3,1,2>: Cost 3 vext3 RHS, <3,1,2,3>
-  2692745468U,  // <7,3,1,3>: Cost 3 vext3 <1,3,5,7>, <3,1,3,5>
-  2712062214U,  // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6>
-  2693925132U,  // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3>
-  3768183059U,  // <7,3,1,6>: Cost 4 vext3 <1,6,1,7>, <3,1,6,1>
-  2692745504U,  // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5>
-  2696063273U,  // <7,3,1,u>: Cost 3 vext3 <1,u,5,7>, <3,1,u,5>
-  2712062254U,  // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1>
-  2712062262U,  // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0>
-  2712062273U,  // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2>
-  2712062280U,  // <7,3,2,3>: Cost 3 vext3 RHS, <3,2,3,0>
-  2712062294U,  // <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,5>
-  2712062302U,  // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4>
-  2700560742U,  // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3>
-  2712062319U,  // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3>
-  2712062325U,  // <7,3,2,u>: Cost 3 vext3 RHS, <3,2,u,0>
-  2712062335U,  // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,1>
-  2636368158U,  // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3>
-  2637031791U,  // <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3>
-  1638320540U,  // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
-  2712062374U,  // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4>
-  2704689586U,  // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7>
-  2590864235U,  // <7,3,3,6>: Cost 3 vext1 <6,7,3,3>, <6,7,3,3>
-  2704837060U,  // <7,3,3,7>: Cost 3 vext3 <3,3,7,7>, <3,3,7,7>
-  1638320540U,  // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3>
-  2712062416U,  // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1>
-  2712062426U,  // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2>
-  2566981640U,  // <7,3,4,2>: Cost 3 vext1 <2,7,3,4>, <2,7,3,4>
-  2712062447U,  // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5>
-  2712062456U,  // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,5>
-  1638320642U,  // <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6>
-  2648313204U,  // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,4,6>
-  3311446789U,  // <7,3,4,7>: Cost 4 vrev <3,7,7,4>
-  1638320669U,  // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6>
-  2602819686U,  // <7,3,5,0>: Cost 3 vext1 <u,7,3,5>, LHS
-  1574571728U,  // <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3>
-  2648977185U,  // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3>
-  2705869378U,  // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7>
-  2237491947U,  // <7,3,5,4>: Cost 3 vrev <3,7,4,5>
-  2706016852U,  // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7>
-  2648313954U,  // <7,3,5,6>: Cost 3 vext2 <5,1,7,3>, <5,6,7,0>
-  2692745823U,  // <7,3,5,7>: Cost 3 vext3 <1,3,5,7>, <3,5,7,0>
-  1579217159U,  // <7,3,5,u>: Cost 2 vext2 <5,u,7,3>, <5,u,7,3>
-  2706311800U,  // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7>
-  2654286249U,  // <7,3,6,1>: Cost 3 vext2 <6,1,7,3>, <6,1,7,3>
-  1581208058U,  // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3>
-  2706533011U,  // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7>
-  2706606748U,  // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7>
-  3780422309U,  // <7,3,6,5>: Cost 4 vext3 <3,6,5,7>, <3,6,5,7>
-  2712062637U,  // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6>
-  2706827959U,  // <7,3,6,7>: Cost 3 vext3 <3,6,7,7>, <3,6,7,7>
-  1585189856U,  // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3>
-  2693925571U,  // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1>
-  2693925584U,  // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5>
-  2700561114U,  // <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6>
-  2572978916U,  // <7,3,7,3>: Cost 3 vext1 <3,7,3,7>, <3,7,3,7>
-  2693925611U,  // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5>
-  2707344118U,  // <7,3,7,5>: Cost 3 vext3 <3,7,5,7>, <3,7,5,7>
-  2654950894U,  // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7>
-  2648315500U,  // <7,3,7,7>: Cost 3 vext2 <5,1,7,3>, <7,7,7,7>
-  2693925643U,  // <7,3,7,u>: Cost 3 vext3 <1,5,3,7>, <3,7,u,1>
-  2237221578U,  // <7,3,u,0>: Cost 3 vrev <3,7,0,u>
-  1638320926U,  // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2>
-  1593153452U,  // <7,3,u,2>: Cost 2 vext2 <u,2,7,3>, <u,2,7,3>
-  1638320540U,  // <7,3,u,3>: Cost 2 vext3 RHS, <3,3,3,3>
-  2237516526U,  // <7,3,u,4>: Cost 3 vrev <3,7,4,u>
-  1638320966U,  // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6>
-  2712062796U,  // <7,3,u,6>: Cost 3 vext3 RHS, <3,u,6,3>
-  2692967250U,  // <7,3,u,7>: Cost 3 vext3 <1,3,u,7>, <3,u,7,0>
-  1638320989U,  // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2>
-  2651635712U,  // <7,4,0,0>: Cost 3 vext2 <5,6,7,4>, <0,0,0,0>
-  1577893990U,  // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS
-  2651635876U,  // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2>
-  3785804672U,  // <7,4,0,3>: Cost 4 vext3 RHS, <4,0,3,1>
-  2651636050U,  // <7,4,0,4>: Cost 3 vext2 <5,6,7,4>, <0,4,1,5>
-  1638468498U,  // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
-  1638468508U,  // <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
-  3787795364U,  // <7,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
-  1640459181U,  // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,1>
-  2651636470U,  // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2>
-  2651636532U,  // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1>
-  2712062922U,  // <7,4,1,2>: Cost 3 vext3 RHS, <4,1,2,3>
-  2639029248U,  // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7>
-  2712062940U,  // <7,4,1,4>: Cost 3 vext3 RHS, <4,1,4,3>
-  2712062946U,  // <7,4,1,5>: Cost 3 vext3 RHS, <4,1,5,0>
-  2712062958U,  // <7,4,1,6>: Cost 3 vext3 RHS, <4,1,6,3>
-  3785804791U,  // <7,4,1,7>: Cost 4 vext3 RHS, <4,1,7,3>
-  2712062973U,  // <7,4,1,u>: Cost 3 vext3 RHS, <4,1,u,0>
-  3785804807U,  // <7,4,2,0>: Cost 4 vext3 RHS, <4,2,0,1>
-  3785804818U,  // <7,4,2,1>: Cost 4 vext3 RHS, <4,2,1,3>
-  2651637352U,  // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2>
-  2651637414U,  // <7,4,2,3>: Cost 3 vext2 <5,6,7,4>, <2,3,0,1>
-  3716753194U,  // <7,4,2,4>: Cost 4 vext2 <4,2,7,4>, <2,4,5,7>
-  2712063030U,  // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
-  2712063036U,  // <7,4,2,6>: Cost 3 vext3 RHS, <4,2,6,0>
-  3773123658U,  // <7,4,2,7>: Cost 4 vext3 <2,4,5,7>, <4,2,7,5>
-  2712063054U,  // <7,4,2,u>: Cost 3 vext3 RHS, <4,2,u,0>
-  2651637910U,  // <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2>
-  3712772348U,  // <7,4,3,1>: Cost 4 vext2 <3,5,7,4>, <3,1,3,5>
-  3785804906U,  // <7,4,3,2>: Cost 4 vext3 RHS, <4,3,2,1>
-  2651638172U,  // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3>
-  2651638274U,  // <7,4,3,4>: Cost 3 vext2 <5,6,7,4>, <3,4,5,6>
-  2639030883U,  // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4>
-  2712063122U,  // <7,4,3,6>: Cost 3 vext3 RHS, <4,3,6,5>
-  3712772836U,  // <7,4,3,7>: Cost 4 vext2 <3,5,7,4>, <3,7,3,7>
-  2641021782U,  // <7,4,3,u>: Cost 3 vext2 <3,u,7,4>, <3,u,7,4>
-  2714053802U,  // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,2>
-  3785804978U,  // <7,4,4,1>: Cost 4 vext3 RHS, <4,4,1,1>
-  3716754505U,  // <7,4,4,2>: Cost 4 vext2 <4,2,7,4>, <4,2,7,4>
-  3785804998U,  // <7,4,4,3>: Cost 4 vext3 RHS, <4,4,3,3>
-  1638321360U,  // <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
-  1638468826U,  // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5>
-  1638468836U,  // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
-  3785215214U,  // <7,4,4,7>: Cost 4 vext3 <4,4,7,7>, <4,4,7,7>
-  1640459509U,  // <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,5>
-  1517207654U,  // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS
-  2573034640U,  // <7,4,5,1>: Cost 3 vext1 <3,7,4,5>, <1,5,3,7>
-  2712063246U,  // <7,4,5,2>: Cost 3 vext3 RHS, <4,5,2,3>
-  2573036267U,  // <7,4,5,3>: Cost 3 vext1 <3,7,4,5>, <3,7,4,5>
-  1517210934U,  // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS
-  2711989549U,  // <7,4,5,5>: Cost 3 vext3 <4,5,5,7>, <4,5,5,7>
-   564579638U,  // <7,4,5,6>: Cost 1 vext3 RHS, RHS
-  2651639976U,  // <7,4,5,7>: Cost 3 vext2 <5,6,7,4>, <5,7,5,7>
-   564579656U,  // <7,4,5,u>: Cost 1 vext3 RHS, RHS
-  2712063307U,  // <7,4,6,0>: Cost 3 vext3 RHS, <4,6,0,1>
-  3767668056U,  // <7,4,6,1>: Cost 4 vext3 <1,5,3,7>, <4,6,1,5>
-  2651640314U,  // <7,4,6,2>: Cost 3 vext2 <5,6,7,4>, <6,2,7,3>
-  2655621708U,  // <7,4,6,3>: Cost 3 vext2 <6,3,7,4>, <6,3,7,4>
-  1638468980U,  // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
-  2712063358U,  // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7>
-  2712063367U,  // <7,4,6,6>: Cost 3 vext3 RHS, <4,6,6,7>
-  2712210826U,  // <7,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
-  1638469012U,  // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2>
-  2651640826U,  // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2>
-  3773713830U,  // <7,4,7,1>: Cost 4 vext3 <2,5,4,7>, <4,7,1,2>
-  3773713842U,  // <7,4,7,2>: Cost 4 vext3 <2,5,4,7>, <4,7,2,5>
-  3780349372U,  // <7,4,7,3>: Cost 4 vext3 <3,6,4,7>, <4,7,3,6>
-  2651641140U,  // <7,4,7,4>: Cost 3 vext2 <5,6,7,4>, <7,4,0,1>
-  2712210888U,  // <7,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
-  2712210898U,  // <7,4,7,6>: Cost 3 vext3 RHS, <4,7,6,1>
-  2651641452U,  // <7,4,7,7>: Cost 3 vext2 <5,6,7,4>, <7,7,7,7>
-  2713538026U,  // <7,4,7,u>: Cost 3 vext3 <4,7,u,7>, <4,7,u,7>
-  1517232230U,  // <7,4,u,0>: Cost 2 vext1 <6,7,4,u>, LHS
-  1577899822U,  // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS
-  2712063489U,  // <7,4,u,2>: Cost 3 vext3 RHS, <4,u,2,3>
-  2573060846U,  // <7,4,u,3>: Cost 3 vext1 <3,7,4,u>, <3,7,4,u>
-  1640312342U,  // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6>
-  1638469146U,  // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1>
-   564579881U,  // <7,4,u,6>: Cost 1 vext3 RHS, RHS
-  2714054192U,  // <7,4,u,7>: Cost 3 vext3 RHS, <4,u,7,5>
-   564579899U,  // <7,4,u,u>: Cost 1 vext3 RHS, RHS
-  2579038310U,  // <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS
-  2636382310U,  // <7,5,0,1>: Cost 3 vext2 <3,1,7,5>, LHS
-  2796339302U,  // <7,5,0,2>: Cost 3 vuzpl <7,4,5,6>, LHS
-  3646810719U,  // <7,5,0,3>: Cost 4 vext1 <3,7,5,0>, <3,5,7,0>
-  2712063586U,  // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1>
-  2735951467U,  // <7,5,0,5>: Cost 3 vext3 RHS, <5,0,5,1>
-  2735951476U,  // <7,5,0,6>: Cost 3 vext3 RHS, <5,0,6,1>
-  2579043322U,  // <7,5,0,7>: Cost 3 vext1 <4,7,5,0>, <7,0,1,2>
-  2636382877U,  // <7,5,0,u>: Cost 3 vext2 <3,1,7,5>, LHS
-  2712211087U,  // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1>
-  3698180916U,  // <7,5,1,1>: Cost 4 vext2 <1,1,7,5>, <1,1,1,1>
-  3710124950U,  // <7,5,1,2>: Cost 4 vext2 <3,1,7,5>, <1,2,3,0>
-  2636383232U,  // <7,5,1,3>: Cost 3 vext2 <3,1,7,5>, <1,3,5,7>
-  2712211127U,  // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5>
-  2590994128U,  // <7,5,1,5>: Cost 3 vext1 <6,7,5,1>, <5,1,7,3>
-  2590995323U,  // <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1>
-  1638469328U,  // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
-  1638469337U,  // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
-  3785805536U,  // <7,5,2,0>: Cost 4 vext3 RHS, <5,2,0,1>
-  3785805544U,  // <7,5,2,1>: Cost 4 vext3 RHS, <5,2,1,0>
-  3704817288U,  // <7,5,2,2>: Cost 4 vext2 <2,2,7,5>, <2,2,5,7>
-  2712063742U,  // <7,5,2,3>: Cost 3 vext3 RHS, <5,2,3,4>
-  3716761386U,  // <7,5,2,4>: Cost 4 vext2 <4,2,7,5>, <2,4,5,7>
-  2714054415U,  // <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
-  3774304024U,  // <7,5,2,6>: Cost 4 vext3 <2,6,3,7>, <5,2,6,3>
-  2712063777U,  // <7,5,2,7>: Cost 3 vext3 RHS, <5,2,7,3>
-  2712063787U,  // <7,5,2,u>: Cost 3 vext3 RHS, <5,2,u,4>
-  3634888806U,  // <7,5,3,0>: Cost 4 vext1 <1,7,5,3>, LHS
-  2636384544U,  // <7,5,3,1>: Cost 3 vext2 <3,1,7,5>, <3,1,7,5>
-  3710790001U,  // <7,5,3,2>: Cost 4 vext2 <3,2,7,5>, <3,2,7,5>
-  3710126492U,  // <7,5,3,3>: Cost 4 vext2 <3,1,7,5>, <3,3,3,3>
-  3634892086U,  // <7,5,3,4>: Cost 4 vext1 <1,7,5,3>, RHS
-  2639039076U,  // <7,5,3,5>: Cost 3 vext2 <3,5,7,5>, <3,5,7,5>
-  3713444533U,  // <7,5,3,6>: Cost 4 vext2 <3,6,7,5>, <3,6,7,5>
-  2693926767U,  // <7,5,3,7>: Cost 3 vext3 <1,5,3,7>, <5,3,7,0>
-  2712063864U,  // <7,5,3,u>: Cost 3 vext3 RHS, <5,3,u,0>
-  2579071078U,  // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS
-  3646841856U,  // <7,5,4,1>: Cost 4 vext1 <3,7,5,4>, <1,3,5,7>
-  3716762698U,  // <7,5,4,2>: Cost 4 vext2 <4,2,7,5>, <4,2,7,5>
-  3646843491U,  // <7,5,4,3>: Cost 4 vext1 <3,7,5,4>, <3,5,7,4>
-  2579074358U,  // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, RHS
-  2636385590U,  // <7,5,4,5>: Cost 3 vext2 <3,1,7,5>, RHS
-  2645675406U,  // <7,5,4,6>: Cost 3 vext2 <4,6,7,5>, <4,6,7,5>
-  1638322118U,  // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
-  1638469583U,  // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6>
-  2714054611U,  // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1>
-  2652974800U,  // <7,5,5,1>: Cost 3 vext2 <5,u,7,5>, <5,1,7,3>
-  3710127905U,  // <7,5,5,2>: Cost 4 vext2 <3,1,7,5>, <5,2,7,3>
-  3785805808U,  // <7,5,5,3>: Cost 4 vext3 RHS, <5,5,3,3>
-  2712211450U,  // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,4>
-  1638322180U,  // <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5>
-  2712064014U,  // <7,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
-  1638469656U,  // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
-  1638469665U,  // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7>
-  2712064036U,  // <7,5,6,0>: Cost 3 vext3 RHS, <5,6,0,1>
-  2714054707U,  // <7,5,6,1>: Cost 3 vext3 RHS, <5,6,1,7>
-  3785805879U,  // <7,5,6,2>: Cost 4 vext3 RHS, <5,6,2,2>
-  2712064066U,  // <7,5,6,3>: Cost 3 vext3 RHS, <5,6,3,4>
-  2712064076U,  // <7,5,6,4>: Cost 3 vext3 RHS, <5,6,4,5>
-  2714054743U,  // <7,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
-  2712064096U,  // <7,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
-  1638322274U,  // <7,5,6,7>: Cost 2 vext3 RHS, <5,6,7,0>
-  1638469739U,  // <7,5,6,u>: Cost 2 vext3 RHS, <5,6,u,0>
-  1511325798U,  // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS
-  2692747392U,  // <7,5,7,1>: Cost 3 vext3 <1,3,5,7>, <5,7,1,3>
-  2585069160U,  // <7,5,7,2>: Cost 3 vext1 <5,7,5,7>, <2,2,2,2>
-  2573126390U,  // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7>
-  1511329078U,  // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS
-  1638469800U,  // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
-  2712211626U,  // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
-  2712211636U,  // <7,5,7,7>: Cost 3 vext3 RHS, <5,7,7,1>
-  1638469823U,  // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3>
-  1511333990U,  // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS
-  2636388142U,  // <7,5,u,1>: Cost 3 vext2 <3,1,7,5>, LHS
-  2712211671U,  // <7,5,u,2>: Cost 3 vext3 RHS, <5,u,2,0>
-  2573134583U,  // <7,5,u,3>: Cost 3 vext1 <3,7,5,u>, <3,7,5,u>
-  1511337270U,  // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS
-  1638469881U,  // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7>
-  2712064258U,  // <7,5,u,6>: Cost 3 vext3 RHS, <5,u,6,7>
-  1638469892U,  // <7,5,u,7>: Cost 2 vext3 RHS, <5,u,7,0>
-  1638469904U,  // <7,5,u,u>: Cost 2 vext3 RHS, <5,u,u,3>
-  2650324992U,  // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0>
-  1576583270U,  // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS
-  2712064300U,  // <7,6,0,2>: Cost 3 vext3 RHS, <6,0,2,4>
-  2255295336U,  // <7,6,0,3>: Cost 3 vrev <6,7,3,0>
-  2712064316U,  // <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2>
-  2585088098U,  // <7,6,0,5>: Cost 3 vext1 <5,7,6,0>, <5,6,7,0>
-  2735952204U,  // <7,6,0,6>: Cost 3 vext3 RHS, <6,0,6,0>
-  2712211799U,  // <7,6,0,7>: Cost 3 vext3 RHS, <6,0,7,2>
-  1576583837U,  // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS
-  1181340494U,  // <7,6,1,0>: Cost 2 vrev <6,7,0,1>
-  2650325812U,  // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1>
-  2650325910U,  // <7,6,1,2>: Cost 3 vext2 <5,4,7,6>, <1,2,3,0>
-  2650325976U,  // <7,6,1,3>: Cost 3 vext2 <5,4,7,6>, <1,3,1,3>
-  2579123510U,  // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, RHS
-  2650326160U,  // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7>
-  2714055072U,  // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
-  2712064425U,  // <7,6,1,7>: Cost 3 vext3 RHS, <6,1,7,3>
-  1181930390U,  // <7,6,1,u>: Cost 2 vrev <6,7,u,1>
-  2712211897U,  // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1>
-  2714055108U,  // <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3>
-  2650326632U,  // <7,6,2,2>: Cost 3 vext2 <5,4,7,6>, <2,2,2,2>
-  2650326694U,  // <7,6,2,3>: Cost 3 vext2 <5,4,7,6>, <2,3,0,1>
-  2714055137U,  // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5>
-  2714055148U,  // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7>
-  2650326970U,  // <7,6,2,6>: Cost 3 vext2 <5,4,7,6>, <2,6,3,7>
-  1638470138U,  // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
-  1638470147U,  // <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
-  2650327190U,  // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2>
-  2255172441U,  // <7,6,3,1>: Cost 3 vrev <6,7,1,3>
-  2255246178U,  // <7,6,3,2>: Cost 3 vrev <6,7,2,3>
-  2650327452U,  // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3>
-  2712064562U,  // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5>
-  2650327627U,  // <7,6,3,5>: Cost 3 vext2 <5,4,7,6>, <3,5,4,7>
-  3713452726U,  // <7,6,3,6>: Cost 4 vext2 <3,6,7,6>, <3,6,7,6>
-  2700563016U,  // <7,6,3,7>: Cost 3 vext3 <2,6,3,7>, <6,3,7,0>
-  2712064593U,  // <7,6,3,u>: Cost 3 vext3 RHS, <6,3,u,0>
-  2650327954U,  // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1>
-  2735952486U,  // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3>
-  2735952497U,  // <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,5>
-  2255328108U,  // <7,6,4,3>: Cost 3 vrev <6,7,3,4>
-  2712212100U,  // <7,6,4,4>: Cost 3 vext3 RHS, <6,4,4,6>
-  1576586550U,  // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS
-  2714055312U,  // <7,6,4,6>: Cost 3 vext3 RHS, <6,4,6,0>
-  2712212126U,  // <7,6,4,7>: Cost 3 vext3 RHS, <6,4,7,5>
-  1576586793U,  // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS
-  2579152998U,  // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS
-  2650328784U,  // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3>
-  2714055364U,  // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7>
-  3785806538U,  // <7,6,5,3>: Cost 4 vext3 RHS, <6,5,3,4>
-  1576587206U,  // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6>
-  2650329092U,  // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5>
-  2650329186U,  // <7,6,5,6>: Cost 3 vext2 <5,4,7,6>, <5,6,7,0>
-  2712064753U,  // <7,6,5,7>: Cost 3 vext3 RHS, <6,5,7,7>
-  1181963162U,  // <7,6,5,u>: Cost 2 vrev <6,7,u,5>
-  2714055421U,  // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1>
-  2714055432U,  // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3>
-  2650329594U,  // <7,6,6,2>: Cost 3 vext2 <5,4,7,6>, <6,2,7,3>
-  3785806619U,  // <7,6,6,3>: Cost 4 vext3 RHS, <6,6,3,4>
-  2712212260U,  // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,4>
-  2714055472U,  // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7>
-  1638323000U,  // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6>
-  1638470466U,  // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
-  1638470475U,  // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7>
-  1638323022U,  // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1>
-  2712064854U,  // <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0>
-  2712064865U,  // <7,6,7,2>: Cost 3 vext3 RHS, <6,7,2,2>
-  2712064872U,  // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0>
-  1638323062U,  // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5>
-  2712064894U,  // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4>
-  2712064905U,  // <7,6,7,6>: Cost 3 vext3 RHS, <6,7,6,6>
-  2712064915U,  // <7,6,7,7>: Cost 3 vext3 RHS, <6,7,7,7>
-  1638323094U,  // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1>
-  1638470559U,  // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1>
-  1576589102U,  // <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS
-  2712212402U,  // <7,6,u,2>: Cost 3 vext3 RHS, <6,u,2,2>
-  2712212409U,  // <7,6,u,3>: Cost 3 vext3 RHS, <6,u,3,0>
-  1638470599U,  // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5>
-  1576589466U,  // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS
-  1638323000U,  // <7,6,u,6>: Cost 2 vext3 RHS, <6,6,6,6>
-  1638470624U,  // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3>
-  1638470631U,  // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1>
-  2712065007U,  // <7,7,0,0>: Cost 3 vext3 RHS, <7,0,0,0>
-  1638323194U,  // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2>
-  2712065025U,  // <7,7,0,2>: Cost 3 vext3 RHS, <7,0,2,0>
-  3646958337U,  // <7,7,0,3>: Cost 4 vext1 <3,7,7,0>, <3,7,7,0>
-  2712065044U,  // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1>
-  2585161907U,  // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0>
-  2591134604U,  // <7,7,0,6>: Cost 3 vext1 <6,7,7,0>, <6,7,7,0>
-  2591134714U,  // <7,7,0,7>: Cost 3 vext1 <6,7,7,0>, <7,0,1,2>
-  1638323257U,  // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2>
-  2712065091U,  // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3>
-  2712065098U,  // <7,7,1,1>: Cost 3 vext3 RHS, <7,1,1,1>
-  2712065109U,  // <7,7,1,2>: Cost 3 vext3 RHS, <7,1,2,3>
-  2692748384U,  // <7,7,1,3>: Cost 3 vext3 <1,3,5,7>, <7,1,3,5>
-  2585169206U,  // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS
-  2693928048U,  // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3>
-  2585170766U,  // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1>
-  2735953024U,  // <7,7,1,7>: Cost 3 vext3 RHS, <7,1,7,1>
-  2695918731U,  // <7,7,1,u>: Cost 3 vext3 <1,u,3,7>, <7,1,u,3>
-  3770471574U,  // <7,7,2,0>: Cost 4 vext3 <2,0,5,7>, <7,2,0,5>
-  3785807002U,  // <7,7,2,1>: Cost 4 vext3 RHS, <7,2,1,0>
-  2712065189U,  // <7,7,2,2>: Cost 3 vext3 RHS, <7,2,2,2>
-  2712065196U,  // <7,7,2,3>: Cost 3 vext3 RHS, <7,2,3,0>
-  3773125818U,  // <7,7,2,4>: Cost 4 vext3 <2,4,5,7>, <7,2,4,5>
-  3766490305U,  // <7,7,2,5>: Cost 4 vext3 <1,3,5,7>, <7,2,5,3>
-  2700563658U,  // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3>
-  2735953107U,  // <7,7,2,7>: Cost 3 vext3 RHS, <7,2,7,3>
-  2701890780U,  // <7,7,2,u>: Cost 3 vext3 <2,u,3,7>, <7,2,u,3>
-  2712065251U,  // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1>
-  3766490350U,  // <7,7,3,1>: Cost 4 vext3 <1,3,5,7>, <7,3,1,3>
-  3774305530U,  // <7,7,3,2>: Cost 4 vext3 <2,6,3,7>, <7,3,2,6>
-  2637728196U,  // <7,7,3,3>: Cost 3 vext2 <3,3,7,7>, <3,3,7,7>
-  2712065291U,  // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5>
-  2585186486U,  // <7,7,3,5>: Cost 3 vext1 <5,7,7,3>, <5,7,7,3>
-  2639719095U,  // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7>
-  2640382728U,  // <7,7,3,7>: Cost 3 vext2 <3,7,7,7>, <3,7,7,7>
-  2641046361U,  // <7,7,3,u>: Cost 3 vext2 <3,u,7,7>, <3,u,7,7>
-  2712212792U,  // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5>
-  3646989312U,  // <7,7,4,1>: Cost 4 vext1 <3,7,7,4>, <1,3,5,7>
-  3785807176U,  // <7,7,4,2>: Cost 4 vext3 RHS, <7,4,2,3>
-  3646991109U,  // <7,7,4,3>: Cost 4 vext1 <3,7,7,4>, <3,7,7,4>
-  2712065371U,  // <7,7,4,4>: Cost 3 vext3 RHS, <7,4,4,4>
-  1638323558U,  // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6>
-  2712212845U,  // <7,7,4,6>: Cost 3 vext3 RHS, <7,4,6,4>
-  2591167846U,  // <7,7,4,7>: Cost 3 vext1 <6,7,7,4>, <7,4,5,6>
-  1638323585U,  // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6>
-  2585198694U,  // <7,7,5,0>: Cost 3 vext1 <5,7,7,5>, LHS
-  2712212884U,  // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7>
-  3711471393U,  // <7,7,5,2>: Cost 4 vext2 <3,3,7,7>, <5,2,7,3>
-  2649673590U,  // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7>
-  2712065455U,  // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7>
-  1577259032U,  // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7>
-  2712065473U,  // <7,7,5,6>: Cost 3 vext3 RHS, <7,5,6,7>
-  2712212936U,  // <7,7,5,7>: Cost 3 vext3 RHS, <7,5,7,5>
-  1579249931U,  // <7,7,5,u>: Cost 2 vext2 <5,u,7,7>, <5,u,7,7>
-  2591178854U,  // <7,7,6,0>: Cost 3 vext1 <6,7,7,6>, LHS
-  2735953374U,  // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0>
-  2712212974U,  // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7>
-  2655646287U,  // <7,7,6,3>: Cost 3 vext2 <6,3,7,7>, <6,3,7,7>
-  2591182134U,  // <7,7,6,4>: Cost 3 vext1 <6,7,7,6>, RHS
-  2656973553U,  // <7,7,6,5>: Cost 3 vext2 <6,5,7,7>, <6,5,7,7>
-  1583895362U,  // <7,7,6,6>: Cost 2 vext2 <6,6,7,7>, <6,6,7,7>
-  2712065556U,  // <7,7,6,7>: Cost 3 vext3 RHS, <7,6,7,0>
-  1585222628U,  // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7>
-  1523417190U,  // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS
-  2597159670U,  // <7,7,7,1>: Cost 3 vext1 <7,7,7,7>, <1,0,3,2>
-  2597160552U,  // <7,7,7,2>: Cost 3 vext1 <7,7,7,7>, <2,2,2,2>
-  2597161110U,  // <7,7,7,3>: Cost 3 vext1 <7,7,7,7>, <3,0,1,2>
-  1523420470U,  // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS
-  2651002296U,  // <7,7,7,5>: Cost 3 vext2 <5,5,7,7>, <7,5,5,7>
-  2657637906U,  // <7,7,7,6>: Cost 3 vext2 <6,6,7,7>, <7,6,6,7>
-   363253046U,  // <7,7,7,7>: Cost 1 vdup3 RHS
-   363253046U,  // <7,7,7,u>: Cost 1 vdup3 RHS
-  1523417190U,  // <7,7,u,0>: Cost 2 vext1 <7,7,7,7>, LHS
-  1638471298U,  // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2>
-  2712213132U,  // <7,7,u,2>: Cost 3 vext3 RHS, <7,u,2,3>
-  2712213138U,  // <7,7,u,3>: Cost 3 vext3 RHS, <7,u,3,0>
-  1523420470U,  // <7,7,u,4>: Cost 2 vext1 <7,7,7,7>, RHS
-  1638471338U,  // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6>
-  1595840756U,  // <7,7,u,6>: Cost 2 vext2 <u,6,7,7>, <u,6,7,7>
-   363253046U,  // <7,7,u,7>: Cost 1 vdup3 RHS
-   363253046U,  // <7,7,u,u>: Cost 1 vdup3 RHS
-  1638318080U,  // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
-  1638323923U,  // <7,u,0,1>: Cost 2 vext3 RHS, <u,0,1,2>
-  1662211804U,  // <7,u,0,2>: Cost 2 vext3 RHS, <u,0,2,2>
-  1638323941U,  // <7,u,0,3>: Cost 2 vext3 RHS, <u,0,3,2>
-  2712065773U,  // <7,u,0,4>: Cost 3 vext3 RHS, <u,0,4,1>
-  1662359286U,  // <7,u,0,5>: Cost 2 vext3 RHS, <u,0,5,1>
-  1662359296U,  // <7,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
-  2987150664U,  // <7,u,0,7>: Cost 3 vzipr <5,6,7,0>, RHS
-  1638323986U,  // <7,u,0,u>: Cost 2 vext3 RHS, <u,0,u,2>
-  1517469798U,  // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS
-  1638318900U,  // <7,u,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
-   564582190U,  // <7,u,1,2>: Cost 1 vext3 RHS, LHS
-  1638324023U,  // <7,u,1,3>: Cost 2 vext3 RHS, <u,1,3,3>
-  1517473078U,  // <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS
-  2693928777U,  // <7,u,1,5>: Cost 3 vext3 <1,5,3,7>, <u,1,5,3>
-  1517474710U,  // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1>
-  1640462171U,  // <7,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
-   564582244U,  // <7,u,1,u>: Cost 1 vext3 RHS, LHS
-  1638318244U,  // <7,u,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
-  2712065907U,  // <7,u,2,1>: Cost 3 vext3 RHS, <u,2,1,0>
-  1638319720U,  // <7,u,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
-  1638324101U,  // <7,u,2,3>: Cost 2 vext3 RHS, <u,2,3,0>
-  1638318284U,  // <7,u,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
-  2712065947U,  // <7,u,2,5>: Cost 3 vext3 RHS, <u,2,5,4>
-  2700564387U,  // <7,u,2,6>: Cost 3 vext3 <2,6,3,7>, <u,2,6,3>
-  1640314796U,  // <7,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
-  1638324146U,  // <7,u,2,u>: Cost 2 vext3 RHS, <u,2,u,0>
-  1638324156U,  // <7,u,3,0>: Cost 2 vext3 RHS, <u,3,0,1>
-  1638319064U,  // <7,u,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
-  2700564435U,  // <7,u,3,2>: Cost 3 vext3 <2,6,3,7>, <u,3,2,6>
-  1638320540U,  // <7,u,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
-  1638324196U,  // <7,u,3,4>: Cost 2 vext3 RHS, <u,3,4,5>
-  1638324207U,  // <7,u,3,5>: Cost 2 vext3 RHS, <u,3,5,7>
-  2700564472U,  // <7,u,3,6>: Cost 3 vext3 <2,6,3,7>, <u,3,6,7>
-  2695919610U,  // <7,u,3,7>: Cost 3 vext3 <1,u,3,7>, <u,3,7,0>
-  1638324228U,  // <7,u,3,u>: Cost 2 vext3 RHS, <u,3,u,1>
-  2712066061U,  // <7,u,4,0>: Cost 3 vext3 RHS, <u,4,0,1>
-  1662212122U,  // <7,u,4,1>: Cost 2 vext3 RHS, <u,4,1,5>
-  1662212132U,  // <7,u,4,2>: Cost 2 vext3 RHS, <u,4,2,6>
-  2712066092U,  // <7,u,4,3>: Cost 3 vext3 RHS, <u,4,3,5>
-  1638321360U,  // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
-  1638324287U,  // <7,u,4,5>: Cost 2 vext3 RHS, <u,4,5,6>
-  1662359624U,  // <7,u,4,6>: Cost 2 vext3 RHS, <u,4,6,6>
-  1640314961U,  // <7,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
-  1638324314U,  // <7,u,4,u>: Cost 2 vext3 RHS, <u,4,u,6>
-  1517502566U,  // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS
-  1574612693U,  // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u>
-  2712066162U,  // <7,u,5,2>: Cost 3 vext3 RHS, <u,5,2,3>
-  1638324351U,  // <7,u,5,3>: Cost 2 vext3 RHS, <u,5,3,7>
-  1576603592U,  // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u>
-  1577267225U,  // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u>
-   564582554U,  // <7,u,5,6>: Cost 1 vext3 RHS, RHS
-  1640462499U,  // <7,u,5,7>: Cost 2 vext3 RHS, <u,5,7,7>
-   564582572U,  // <7,u,5,u>: Cost 1 vext3 RHS, RHS
-  2712066223U,  // <7,u,6,0>: Cost 3 vext3 RHS, <u,6,0,1>
-  2712066238U,  // <7,u,6,1>: Cost 3 vext3 RHS, <u,6,1,7>
-  1581249023U,  // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u>
-  1638324432U,  // <7,u,6,3>: Cost 2 vext3 RHS, <u,6,3,7>
-  1638468980U,  // <7,u,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
-  2712066274U,  // <7,u,6,5>: Cost 3 vext3 RHS, <u,6,5,7>
-  1583903555U,  // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u>
-  1640315117U,  // <7,u,6,7>: Cost 2 vext3 RHS, <u,6,7,0>
-  1638324477U,  // <7,u,6,u>: Cost 2 vext3 RHS, <u,6,u,7>
-  1638471936U,  // <7,u,7,0>: Cost 2 vext3 RHS, <u,7,0,1>
-  2692970763U,  // <7,u,7,1>: Cost 3 vext3 <1,3,u,7>, <u,7,1,3>
-  2700933399U,  // <7,u,7,2>: Cost 3 vext3 <2,6,u,7>, <u,7,2,6>
-  2573347601U,  // <7,u,7,3>: Cost 3 vext1 <3,7,u,7>, <3,7,u,7>
-  1638471976U,  // <7,u,7,4>: Cost 2 vext3 RHS, <u,7,4,5>
-  1511551171U,  // <7,u,7,5>: Cost 2 vext1 <5,7,u,7>, <5,7,u,7>
-  2712213815U,  // <7,u,7,6>: Cost 3 vext3 RHS, <u,7,6,2>
-   363253046U,  // <7,u,7,7>: Cost 1 vdup3 RHS
-   363253046U,  // <7,u,7,u>: Cost 1 vdup3 RHS
-  1638324561U,  // <7,u,u,0>: Cost 2 vext3 RHS, <u,u,0,1>
-  1638324571U,  // <7,u,u,1>: Cost 2 vext3 RHS, <u,u,1,2>
-   564582757U,  // <7,u,u,2>: Cost 1 vext3 RHS, LHS
-  1638324587U,  // <7,u,u,3>: Cost 2 vext3 RHS, <u,u,3,0>
-  1638324601U,  // <7,u,u,4>: Cost 2 vext3 RHS, <u,u,4,5>
-  1638324611U,  // <7,u,u,5>: Cost 2 vext3 RHS, <u,u,5,6>
-   564582797U,  // <7,u,u,6>: Cost 1 vext3 RHS, RHS
-   363253046U,  // <7,u,u,7>: Cost 1 vdup3 RHS
-   564582811U,  // <7,u,u,u>: Cost 1 vext3 RHS, LHS
-   135053414U,  // <u,0,0,0>: Cost 1 vdup0 LHS
-  1611489290U,  // <u,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
-  1611489300U,  // <u,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
-  2568054923U,  // <u,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
-  1481706806U,  // <u,0,0,4>: Cost 2 vext1 <0,u,0,0>, RHS
-  2555449040U,  // <u,0,0,5>: Cost 3 vext1 <0,u,0,0>, <5,1,7,3>
-  2591282078U,  // <u,0,0,6>: Cost 3 vext1 <6,u,0,0>, <6,u,0,0>
-  2591945711U,  // <u,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
-   135053414U,  // <u,0,0,u>: Cost 1 vdup0 LHS
-  1493655654U,  // <u,0,1,0>: Cost 2 vext1 <2,u,0,1>, LHS
-  1860550758U,  // <u,0,1,1>: Cost 2 vzipl LHS, LHS
-   537747563U,  // <u,0,1,2>: Cost 1 vext3 LHS, LHS
-  2625135576U,  // <u,0,1,3>: Cost 3 vext2 <1,2,u,0>, <1,3,1,3>
-  1493658934U,  // <u,0,1,4>: Cost 2 vext1 <2,u,0,1>, RHS
-  2625135760U,  // <u,0,1,5>: Cost 3 vext2 <1,2,u,0>, <1,5,3,7>
-  1517548447U,  // <u,0,1,6>: Cost 2 vext1 <6,u,0,1>, <6,u,0,1>
-  2591290362U,  // <u,0,1,7>: Cost 3 vext1 <6,u,0,1>, <7,0,1,2>
-   537747612U,  // <u,0,1,u>: Cost 1 vext3 LHS, LHS
-  1611489444U,  // <u,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
-  2685231276U,  // <u,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
-  1994768486U,  // <u,0,2,2>: Cost 2 vtrnl LHS, LHS
-  2685231294U,  // <u,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
-  1611489484U,  // <u,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
-  2712068310U,  // <u,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
-  2625136570U,  // <u,0,2,6>: Cost 3 vext2 <1,2,u,0>, <2,6,3,7>
-  2591962097U,  // <u,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
-  1611489516U,  // <u,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
-  2954067968U,  // <u,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
-  2685231356U,  // <u,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
-    72589981U,  // <u,0,3,2>: Cost 1 vrev LHS
-  2625137052U,  // <u,0,3,3>: Cost 3 vext2 <1,2,u,0>, <3,3,3,3>
-  2625137154U,  // <u,0,3,4>: Cost 3 vext2 <1,2,u,0>, <3,4,5,6>
-  2639071848U,  // <u,0,3,5>: Cost 3 vext2 <3,5,u,0>, <3,5,u,0>
-  2639735481U,  // <u,0,3,6>: Cost 3 vext2 <3,6,u,0>, <3,6,u,0>
-  2597279354U,  // <u,0,3,7>: Cost 3 vext1 <7,u,0,3>, <7,u,0,3>
-    73032403U,  // <u,0,3,u>: Cost 1 vrev LHS
-  2687074636U,  // <u,0,4,0>: Cost 3 vext3 <0,4,0,u>, <0,4,0,u>
-  1611489618U,  // <u,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
-  1611489628U,  // <u,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
-  3629222038U,  // <u,0,4,3>: Cost 4 vext1 <0,u,0,4>, <3,0,1,2>
-  2555481398U,  // <u,0,4,4>: Cost 3 vext1 <0,u,0,4>, RHS
-  1551396150U,  // <u,0,4,5>: Cost 2 vext2 <1,2,u,0>, RHS
-  2651680116U,  // <u,0,4,6>: Cost 3 vext2 <5,6,u,0>, <4,6,4,6>
-  2646150600U,  // <u,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
-  1611932050U,  // <u,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
-  2561458278U,  // <u,0,5,0>: Cost 3 vext1 <1,u,0,5>, LHS
-  1863532646U,  // <u,0,5,1>: Cost 2 vzipl RHS, LHS
-  2712068526U,  // <u,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
-  2649689976U,  // <u,0,5,3>: Cost 3 vext2 <5,3,u,0>, <5,3,u,0>
-  2220237489U,  // <u,0,5,4>: Cost 3 vrev <0,u,4,5>
-  2651680772U,  // <u,0,5,5>: Cost 3 vext2 <5,6,u,0>, <5,5,5,5>
-  1577939051U,  // <u,0,5,6>: Cost 2 vext2 <5,6,u,0>, <5,6,u,0>
-  2830077238U,  // <u,0,5,7>: Cost 3 vuzpr <1,u,3,0>, RHS
-  1579266317U,  // <u,0,5,u>: Cost 2 vext2 <5,u,u,0>, <5,u,u,0>
-  2555494502U,  // <u,0,6,0>: Cost 3 vext1 <0,u,0,6>, LHS
-  2712068598U,  // <u,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
-  1997750374U,  // <u,0,6,2>: Cost 2 vtrnl RHS, LHS
-  2655662673U,  // <u,0,6,3>: Cost 3 vext2 <6,3,u,0>, <6,3,u,0>
-  2555497782U,  // <u,0,6,4>: Cost 3 vext1 <0,u,0,6>, RHS
-  2651681459U,  // <u,0,6,5>: Cost 3 vext2 <5,6,u,0>, <6,5,0,u>
-  2651681592U,  // <u,0,6,6>: Cost 3 vext2 <5,6,u,0>, <6,6,6,6>
-  2651681614U,  // <u,0,6,7>: Cost 3 vext2 <5,6,u,0>, <6,7,0,1>
-  1997750428U,  // <u,0,6,u>: Cost 2 vtrnl RHS, LHS
-  2567446630U,  // <u,0,7,0>: Cost 3 vext1 <2,u,0,7>, LHS
-  2567447446U,  // <u,0,7,1>: Cost 3 vext1 <2,u,0,7>, <1,2,3,0>
-  2567448641U,  // <u,0,7,2>: Cost 3 vext1 <2,u,0,7>, <2,u,0,7>
-  2573421338U,  // <u,0,7,3>: Cost 3 vext1 <3,u,0,7>, <3,u,0,7>
-  2567449910U,  // <u,0,7,4>: Cost 3 vext1 <2,u,0,7>, RHS
-  2651682242U,  // <u,0,7,5>: Cost 3 vext2 <5,6,u,0>, <7,5,6,u>
-  2591339429U,  // <u,0,7,6>: Cost 3 vext1 <6,u,0,7>, <6,u,0,7>
-  2651682412U,  // <u,0,7,7>: Cost 3 vext2 <5,6,u,0>, <7,7,7,7>
-  2567452462U,  // <u,0,7,u>: Cost 3 vext1 <2,u,0,7>, LHS
-   135053414U,  // <u,0,u,0>: Cost 1 vdup0 LHS
-  1611489938U,  // <u,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
-   537748125U,  // <u,0,u,2>: Cost 1 vext3 LHS, LHS
-  2685674148U,  // <u,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
-  1611932338U,  // <u,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
-  1551399066U,  // <u,0,u,5>: Cost 2 vext2 <1,2,u,0>, RHS
-  1517605798U,  // <u,0,u,6>: Cost 2 vext1 <6,u,0,u>, <6,u,0,u>
-  2830077481U,  // <u,0,u,7>: Cost 3 vuzpr <1,u,3,0>, RHS
-   537748179U,  // <u,0,u,u>: Cost 1 vext3 LHS, LHS
-  1544101961U,  // <u,1,0,0>: Cost 2 vext2 <0,0,u,1>, <0,0,u,1>
-  1558036582U,  // <u,1,0,1>: Cost 2 vext2 <2,3,u,1>, LHS
-  2619171051U,  // <u,1,0,2>: Cost 3 vext2 <0,2,u,1>, <0,2,u,1>
-  1611490038U,  // <u,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
-  2555522358U,  // <u,1,0,4>: Cost 3 vext1 <0,u,1,0>, RHS
-  2712068871U,  // <u,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
-  2591355815U,  // <u,1,0,6>: Cost 3 vext1 <6,u,1,0>, <6,u,1,0>
-  2597328512U,  // <u,1,0,7>: Cost 3 vext1 <7,u,1,0>, <7,u,1,0>
-  1611490083U,  // <u,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
-  1481785446U,  // <u,1,1,0>: Cost 2 vext1 <0,u,1,1>, LHS
-   202162278U,  // <u,1,1,1>: Cost 1 vdup1 LHS
-  2555528808U,  // <u,1,1,2>: Cost 3 vext1 <0,u,1,1>, <2,2,2,2>
-  1611490120U,  // <u,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
-  1481788726U,  // <u,1,1,4>: Cost 2 vext1 <0,u,1,1>, RHS
-  2689876828U,  // <u,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
-  2591364008U,  // <u,1,1,6>: Cost 3 vext1 <6,u,1,1>, <6,u,1,1>
-  2592691274U,  // <u,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
-   202162278U,  // <u,1,1,u>: Cost 1 vdup1 LHS
-  1499709542U,  // <u,1,2,0>: Cost 2 vext1 <3,u,1,2>, LHS
-  2689876871U,  // <u,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
-  2631116445U,  // <u,1,2,2>: Cost 3 vext2 <2,2,u,1>, <2,2,u,1>
-      835584U,  // <u,1,2,3>: Cost 0 copy LHS
-  1499712822U,  // <u,1,2,4>: Cost 2 vext1 <3,u,1,2>, RHS
-  2689876907U,  // <u,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
-  2631780282U,  // <u,1,2,6>: Cost 3 vext2 <2,3,u,1>, <2,6,3,7>
-  1523603074U,  // <u,1,2,7>: Cost 2 vext1 <7,u,1,2>, <7,u,1,2>
-      835584U,  // <u,1,2,u>: Cost 0 copy LHS
-  1487773798U,  // <u,1,3,0>: Cost 2 vext1 <1,u,1,3>, LHS
-  1611490264U,  // <u,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
-  2685232094U,  // <u,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
-  2018746470U,  // <u,1,3,3>: Cost 2 vtrnr LHS, LHS
-  1487777078U,  // <u,1,3,4>: Cost 2 vext1 <1,u,1,3>, RHS
-  1611490304U,  // <u,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
-  2685674505U,  // <u,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
-  2640407307U,  // <u,1,3,7>: Cost 3 vext2 <3,7,u,1>, <3,7,u,1>
-  1611490327U,  // <u,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
-  1567992749U,  // <u,1,4,0>: Cost 2 vext2 <4,0,u,1>, <4,0,u,1>
-  2693121070U,  // <u,1,4,1>: Cost 3 vext3 <1,4,1,u>, <1,4,1,u>
-  2693194807U,  // <u,1,4,2>: Cost 3 vext3 <1,4,2,u>, <1,4,2,u>
-  1152386432U,  // <u,1,4,3>: Cost 2 vrev <1,u,3,4>
-  2555555126U,  // <u,1,4,4>: Cost 3 vext1 <0,u,1,4>, RHS
-  1558039862U,  // <u,1,4,5>: Cost 2 vext2 <2,3,u,1>, RHS
-  2645716371U,  // <u,1,4,6>: Cost 3 vext2 <4,6,u,1>, <4,6,u,1>
-  2597361284U,  // <u,1,4,7>: Cost 3 vext1 <7,u,1,4>, <7,u,1,4>
-  1152755117U,  // <u,1,4,u>: Cost 2 vrev <1,u,u,4>
-  1481818214U,  // <u,1,5,0>: Cost 2 vext1 <0,u,1,5>, LHS
-  2555560694U,  // <u,1,5,1>: Cost 3 vext1 <0,u,1,5>, <1,0,3,2>
-  2555561576U,  // <u,1,5,2>: Cost 3 vext1 <0,u,1,5>, <2,2,2,2>
-  1611490448U,  // <u,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
-  1481821494U,  // <u,1,5,4>: Cost 2 vext1 <0,u,1,5>, RHS
-  2651025435U,  // <u,1,5,5>: Cost 3 vext2 <5,5,u,1>, <5,5,u,1>
-  2651689068U,  // <u,1,5,6>: Cost 3 vext2 <5,6,u,1>, <5,6,u,1>
-  2823966006U,  // <u,1,5,7>: Cost 3 vuzpr <0,u,1,1>, RHS
-  1611932861U,  // <u,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
-  2555568230U,  // <u,1,6,0>: Cost 3 vext1 <0,u,1,6>, LHS
-  2689877199U,  // <u,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
-  2712069336U,  // <u,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
-  2685232353U,  // <u,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
-  2555571510U,  // <u,1,6,4>: Cost 3 vext1 <0,u,1,6>, RHS
-  2689877235U,  // <u,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
-  2657661765U,  // <u,1,6,6>: Cost 3 vext2 <6,6,u,1>, <6,6,u,1>
-  1584583574U,  // <u,1,6,7>: Cost 2 vext2 <6,7,u,1>, <6,7,u,1>
-  1585247207U,  // <u,1,6,u>: Cost 2 vext2 <6,u,u,1>, <6,u,u,1>
-  2561548390U,  // <u,1,7,0>: Cost 3 vext1 <1,u,1,7>, LHS
-  2561549681U,  // <u,1,7,1>: Cost 3 vext1 <1,u,1,7>, <1,u,1,7>
-  2573493926U,  // <u,1,7,2>: Cost 3 vext1 <3,u,1,7>, <2,3,0,1>
-  2042962022U,  // <u,1,7,3>: Cost 2 vtrnr RHS, LHS
-  2561551670U,  // <u,1,7,4>: Cost 3 vext1 <1,u,1,7>, RHS
-  2226300309U,  // <u,1,7,5>: Cost 3 vrev <1,u,5,7>
-  2658325990U,  // <u,1,7,6>: Cost 3 vext2 <6,7,u,1>, <7,6,1,u>
-  2658326124U,  // <u,1,7,7>: Cost 3 vext2 <6,7,u,1>, <7,7,7,7>
-  2042962027U,  // <u,1,7,u>: Cost 2 vtrnr RHS, LHS
-  1481842790U,  // <u,1,u,0>: Cost 2 vext1 <0,u,1,u>, LHS
-   202162278U,  // <u,1,u,1>: Cost 1 vdup1 LHS
-  2685674867U,  // <u,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
-      835584U,  // <u,1,u,3>: Cost 0 copy LHS
-  1481846070U,  // <u,1,u,4>: Cost 2 vext1 <0,u,1,u>, RHS
-  1611933077U,  // <u,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
-  2685674910U,  // <u,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
-  1523652232U,  // <u,1,u,7>: Cost 2 vext1 <7,u,1,u>, <7,u,1,u>
-      835584U,  // <u,1,u,u>: Cost 0 copy LHS
-  1544110154U,  // <u,2,0,0>: Cost 2 vext2 <0,0,u,2>, <0,0,u,2>
-  1545437286U,  // <u,2,0,1>: Cost 2 vext2 <0,2,u,2>, LHS
-  1545437420U,  // <u,2,0,2>: Cost 2 vext2 <0,2,u,2>, <0,2,u,2>
-  2685232589U,  // <u,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
-  2619179346U,  // <u,2,0,4>: Cost 3 vext2 <0,2,u,2>, <0,4,1,5>
-  2712069606U,  // <u,2,0,5>: Cost 3 vext3 RHS, <2,0,5,7>
-  2689877484U,  // <u,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
-  2659656273U,  // <u,2,0,7>: Cost 3 vext2 <7,0,u,2>, <0,7,2,u>
-  1545437853U,  // <u,2,0,u>: Cost 2 vext2 <0,2,u,2>, LHS
-  1550082851U,  // <u,2,1,0>: Cost 2 vext2 <1,0,u,2>, <1,0,u,2>
-  2619179828U,  // <u,2,1,1>: Cost 3 vext2 <0,2,u,2>, <1,1,1,1>
-  2619179926U,  // <u,2,1,2>: Cost 3 vext2 <0,2,u,2>, <1,2,3,0>
-  2685232671U,  // <u,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
-  2555604278U,  // <u,2,1,4>: Cost 3 vext1 <0,u,2,1>, RHS
-  2619180176U,  // <u,2,1,5>: Cost 3 vext2 <0,2,u,2>, <1,5,3,7>
-  2689877564U,  // <u,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
-  2602718850U,  // <u,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
-  1158703235U,  // <u,2,1,u>: Cost 2 vrev <2,u,u,1>
-  1481867366U,  // <u,2,2,0>: Cost 2 vext1 <0,u,2,2>, LHS
-  2555609846U,  // <u,2,2,1>: Cost 3 vext1 <0,u,2,2>, <1,0,3,2>
-   269271142U,  // <u,2,2,2>: Cost 1 vdup2 LHS
-  1611490930U,  // <u,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
-  1481870646U,  // <u,2,2,4>: Cost 2 vext1 <0,u,2,2>, RHS
-  2689877640U,  // <u,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
-  2619180986U,  // <u,2,2,6>: Cost 3 vext2 <0,2,u,2>, <2,6,3,7>
-  2593436837U,  // <u,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
-   269271142U,  // <u,2,2,u>: Cost 1 vdup2 LHS
-   408134301U,  // <u,2,3,0>: Cost 1 vext1 LHS, LHS
-  1481876214U,  // <u,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
-  1481877096U,  // <u,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
-  1880326246U,  // <u,2,3,3>: Cost 2 vzipr LHS, LHS
-   408137014U,  // <u,2,3,4>: Cost 1 vext1 LHS, RHS
-  1529654992U,  // <u,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
-  1529655802U,  // <u,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
-  1529656314U,  // <u,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
-   408139566U,  // <u,2,3,u>: Cost 1 vext1 LHS, LHS
-  1567853468U,  // <u,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
-  2561598362U,  // <u,2,4,1>: Cost 3 vext1 <1,u,2,4>, <1,2,3,4>
-  2555627214U,  // <u,2,4,2>: Cost 3 vext1 <0,u,2,4>, <2,3,4,5>
-  2685232918U,  // <u,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
-  2555628854U,  // <u,2,4,4>: Cost 3 vext1 <0,u,2,4>, RHS
-  1545440566U,  // <u,2,4,5>: Cost 2 vext2 <0,2,u,2>, RHS
-  1571982740U,  // <u,2,4,6>: Cost 2 vext2 <4,6,u,2>, <4,6,u,2>
-  2592125957U,  // <u,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
-  1545440809U,  // <u,2,4,u>: Cost 2 vext2 <0,2,u,2>, RHS
-  2555633766U,  // <u,2,5,0>: Cost 3 vext1 <0,u,2,5>, LHS
-  2561606550U,  // <u,2,5,1>: Cost 3 vext1 <1,u,2,5>, <1,2,3,0>
-  2689877856U,  // <u,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
-  2685233000U,  // <u,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
-  1158441059U,  // <u,2,5,4>: Cost 2 vrev <2,u,4,5>
-  2645725188U,  // <u,2,5,5>: Cost 3 vext2 <4,6,u,2>, <5,5,5,5>
-  2689877892U,  // <u,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
-  2823900470U,  // <u,2,5,7>: Cost 3 vuzpr <0,u,0,2>, RHS
-  1158736007U,  // <u,2,5,u>: Cost 2 vrev <2,u,u,5>
-  1481900134U,  // <u,2,6,0>: Cost 2 vext1 <0,u,2,6>, LHS
-  2555642614U,  // <u,2,6,1>: Cost 3 vext1 <0,u,2,6>, <1,0,3,2>
-  2555643496U,  // <u,2,6,2>: Cost 3 vext1 <0,u,2,6>, <2,2,2,2>
-  1611491258U,  // <u,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
-  1481903414U,  // <u,2,6,4>: Cost 2 vext1 <0,u,2,6>, RHS
-  2689877964U,  // <u,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
-  2689877973U,  // <u,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
-  2645726030U,  // <u,2,6,7>: Cost 3 vext2 <4,6,u,2>, <6,7,0,1>
-  1611933671U,  // <u,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
-  1585919033U,  // <u,2,7,0>: Cost 2 vext2 <7,0,u,2>, <7,0,u,2>
-  2573566710U,  // <u,2,7,1>: Cost 3 vext1 <3,u,2,7>, <1,0,3,2>
-  2567596115U,  // <u,2,7,2>: Cost 3 vext1 <2,u,2,7>, <2,u,2,7>
-  1906901094U,  // <u,2,7,3>: Cost 2 vzipr RHS, LHS
-  2555653430U,  // <u,2,7,4>: Cost 3 vext1 <0,u,2,7>, RHS
-  2800080230U,  // <u,2,7,5>: Cost 3 vuzpl LHS, <7,4,5,6>
-  2980643164U,  // <u,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
-  2645726828U,  // <u,2,7,7>: Cost 3 vext2 <4,6,u,2>, <7,7,7,7>
-  1906901099U,  // <u,2,7,u>: Cost 2 vzipr RHS, LHS
-   408175266U,  // <u,2,u,0>: Cost 1 vext1 LHS, LHS
-  1545443118U,  // <u,2,u,1>: Cost 2 vext2 <0,2,u,2>, LHS
-   269271142U,  // <u,2,u,2>: Cost 1 vdup2 LHS
-  1611491416U,  // <u,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
-   408177974U,  // <u,2,u,4>: Cost 1 vext1 LHS, RHS
-  1545443482U,  // <u,2,u,5>: Cost 2 vext2 <0,2,u,2>, RHS
-  1726339226U,  // <u,2,u,6>: Cost 2 vuzpl LHS, RHS
-  1529697274U,  // <u,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
-   408180526U,  // <u,2,u,u>: Cost 1 vext1 LHS, LHS
-  1544781824U,  // <u,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
-   471040156U,  // <u,3,0,1>: Cost 1 vext2 LHS, LHS
-  1544781988U,  // <u,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
-  2618523900U,  // <u,3,0,3>: Cost 3 vext2 LHS, <0,3,1,0>
-  1544782162U,  // <u,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
-  2238188352U,  // <u,3,0,5>: Cost 3 vrev <3,u,5,0>
-  2623169023U,  // <u,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
-  2238335826U,  // <u,3,0,7>: Cost 3 vrev <3,u,7,0>
-   471040669U,  // <u,3,0,u>: Cost 1 vext2 LHS, LHS
-  1544782582U,  // <u,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
-  1544782644U,  // <u,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
-  1544782742U,  // <u,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
-  1544782808U,  // <u,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
-  2618524733U,  // <u,3,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
-  1544782992U,  // <u,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
-  2618524897U,  // <u,3,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
-  2703517987U,  // <u,3,1,7>: Cost 3 vext3 <3,1,7,u>, <3,1,7,u>
-  1544783213U,  // <u,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
-  1529716838U,  // <u,3,2,0>: Cost 2 vext1 <u,u,3,2>, LHS
-  1164167966U,  // <u,3,2,1>: Cost 2 vrev <3,u,1,2>
-  1544783464U,  // <u,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
-  1544783526U,  // <u,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
-  1529720118U,  // <u,3,2,4>: Cost 2 vext1 <u,u,3,2>, RHS
-  2618525544U,  // <u,3,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
-  1544783802U,  // <u,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
-  2704181620U,  // <u,3,2,7>: Cost 3 vext3 <3,2,7,u>, <3,2,7,u>
-  1544783931U,  // <u,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
-  1544784022U,  // <u,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
-  1487922559U,  // <u,3,3,1>: Cost 2 vext1 <1,u,3,3>, <1,u,3,3>
-  1493895256U,  // <u,3,3,2>: Cost 2 vext1 <2,u,3,3>, <2,u,3,3>
-   336380006U,  // <u,3,3,3>: Cost 1 vdup3 LHS
-  1544784386U,  // <u,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
-  2824054478U,  // <u,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
-  2238286668U,  // <u,3,3,6>: Cost 3 vrev <3,u,6,3>
-  2954069136U,  // <u,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
-   336380006U,  // <u,3,3,u>: Cost 1 vdup3 LHS
-  1487929446U,  // <u,3,4,0>: Cost 2 vext1 <1,u,3,4>, LHS
-  1487930752U,  // <u,3,4,1>: Cost 2 vext1 <1,u,3,4>, <1,u,3,4>
-  2623171644U,  // <u,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
-  2561673366U,  // <u,3,4,3>: Cost 3 vext1 <1,u,3,4>, <3,0,1,2>
-  1487932726U,  // <u,3,4,4>: Cost 2 vext1 <1,u,3,4>, RHS
-   471043382U,  // <u,3,4,5>: Cost 1 vext2 LHS, RHS
-  1592561012U,  // <u,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
-  2238368598U,  // <u,3,4,7>: Cost 3 vrev <3,u,7,4>
-   471043625U,  // <u,3,4,u>: Cost 1 vext2 LHS, RHS
-  2555707494U,  // <u,3,5,0>: Cost 3 vext1 <0,u,3,5>, LHS
-  1574645465U,  // <u,3,5,1>: Cost 2 vext2 <5,1,u,3>, <5,1,u,3>
-  2567653106U,  // <u,3,5,2>: Cost 3 vext1 <2,u,3,5>, <2,3,u,5>
-  2555709954U,  // <u,3,5,3>: Cost 3 vext1 <0,u,3,5>, <3,4,5,6>
-  1592561606U,  // <u,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
-  1592561668U,  // <u,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
-  1592561762U,  // <u,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
-  1750314294U,  // <u,3,5,7>: Cost 2 vuzpr LHS, RHS
-  1750314295U,  // <u,3,5,u>: Cost 2 vuzpr LHS, RHS
-  2623172897U,  // <u,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
-  2561688962U,  // <u,3,6,1>: Cost 3 vext1 <1,u,3,6>, <1,u,3,6>
-  1581281795U,  // <u,3,6,2>: Cost 2 vext2 <6,2,u,3>, <6,2,u,3>
-  2706541204U,  // <u,3,6,3>: Cost 3 vext3 <3,6,3,u>, <3,6,3,u>
-  2623173261U,  // <u,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
-  1164495686U,  // <u,3,6,5>: Cost 2 vrev <3,u,5,6>
-  1592562488U,  // <u,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
-  1592562510U,  // <u,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
-  1164716897U,  // <u,3,6,u>: Cost 2 vrev <3,u,u,6>
-  1487954022U,  // <u,3,7,0>: Cost 2 vext1 <1,u,3,7>, LHS
-  1487955331U,  // <u,3,7,1>: Cost 2 vext1 <1,u,3,7>, <1,u,3,7>
-  1493928028U,  // <u,3,7,2>: Cost 2 vext1 <2,u,3,7>, <2,u,3,7>
-  2561697942U,  // <u,3,7,3>: Cost 3 vext1 <1,u,3,7>, <3,0,1,2>
-  1487957302U,  // <u,3,7,4>: Cost 2 vext1 <1,u,3,7>, RHS
-  2707352311U,  // <u,3,7,5>: Cost 3 vext3 <3,7,5,u>, <3,7,5,u>
-  2655024623U,  // <u,3,7,6>: Cost 3 vext2 <6,2,u,3>, <7,6,2,u>
-  1592563308U,  // <u,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
-  1487959854U,  // <u,3,7,u>: Cost 2 vext1 <1,u,3,7>, LHS
-  1544787667U,  // <u,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
-   471045934U,  // <u,3,u,1>: Cost 1 vext2 LHS, LHS
-  1549432709U,  // <u,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
-   336380006U,  // <u,3,u,3>: Cost 1 vdup3 LHS
-  1544788031U,  // <u,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
-   471046298U,  // <u,3,u,5>: Cost 1 vext2 LHS, RHS
-  1549433040U,  // <u,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
-  1750314537U,  // <u,3,u,7>: Cost 2 vuzpr LHS, RHS
-   471046501U,  // <u,3,u,u>: Cost 1 vext2 LHS, LHS
-  2625167360U,  // <u,4,0,0>: Cost 3 vext2 <1,2,u,4>, <0,0,0,0>
-  1551425638U,  // <u,4,0,1>: Cost 2 vext2 <1,2,u,4>, LHS
-  2619195630U,  // <u,4,0,2>: Cost 3 vext2 <0,2,u,4>, <0,2,u,4>
-  2619343104U,  // <u,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
-  2625167698U,  // <u,4,0,4>: Cost 3 vext2 <1,2,u,4>, <0,4,1,5>
-  1638329234U,  // <u,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
-  1638329244U,  // <u,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
-  3787803556U,  // <u,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
-  1551426205U,  // <u,4,0,u>: Cost 2 vext2 <1,2,u,4>, LHS
-  2555748454U,  // <u,4,1,0>: Cost 3 vext1 <0,u,4,1>, LHS
-  2625168180U,  // <u,4,1,1>: Cost 3 vext2 <1,2,u,4>, <1,1,1,1>
-  1551426503U,  // <u,4,1,2>: Cost 2 vext2 <1,2,u,4>, <1,2,u,4>
-  2625168344U,  // <u,4,1,3>: Cost 3 vext2 <1,2,u,4>, <1,3,1,3>
-  2555751734U,  // <u,4,1,4>: Cost 3 vext1 <0,u,4,1>, RHS
-  1860554038U,  // <u,4,1,5>: Cost 2 vzipl LHS, RHS
-  2689879022U,  // <u,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
-  2592248852U,  // <u,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
-  1555408301U,  // <u,4,1,u>: Cost 2 vext2 <1,u,u,4>, <1,u,u,4>
-  2555756646U,  // <u,4,2,0>: Cost 3 vext1 <0,u,4,2>, LHS
-  2625168943U,  // <u,4,2,1>: Cost 3 vext2 <1,2,u,4>, <2,1,4,u>
-  2625169000U,  // <u,4,2,2>: Cost 3 vext2 <1,2,u,4>, <2,2,2,2>
-  2619197134U,  // <u,4,2,3>: Cost 3 vext2 <0,2,u,4>, <2,3,4,5>
-  2555759926U,  // <u,4,2,4>: Cost 3 vext1 <0,u,4,2>, RHS
-  2712071222U,  // <u,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
-  1994771766U,  // <u,4,2,6>: Cost 2 vtrnl LHS, RHS
-  2592257045U,  // <u,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
-  1994771784U,  // <u,4,2,u>: Cost 2 vtrnl LHS, RHS
-  2625169558U,  // <u,4,3,0>: Cost 3 vext2 <1,2,u,4>, <3,0,1,2>
-  2567709594U,  // <u,4,3,1>: Cost 3 vext1 <2,u,4,3>, <1,2,3,4>
-  2567710817U,  // <u,4,3,2>: Cost 3 vext1 <2,u,4,3>, <2,u,4,3>
-  2625169820U,  // <u,4,3,3>: Cost 3 vext2 <1,2,u,4>, <3,3,3,3>
-  2625169922U,  // <u,4,3,4>: Cost 3 vext2 <1,2,u,4>, <3,4,5,6>
-  2954069710U,  // <u,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
-  2954068172U,  // <u,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
-  3903849472U,  // <u,4,3,7>: Cost 4 vuzpr <1,u,3,4>, <1,3,5,7>
-  2954068174U,  // <u,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
-  1505919078U,  // <u,4,4,0>: Cost 2 vext1 <4,u,4,4>, LHS
-  2567717831U,  // <u,4,4,1>: Cost 3 vext1 <2,u,4,4>, <1,2,u,4>
-  2567719010U,  // <u,4,4,2>: Cost 3 vext1 <2,u,4,4>, <2,u,4,4>
-  2570373542U,  // <u,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
-   161926454U,  // <u,4,4,4>: Cost 1 vdup0 RHS
-  1551428918U,  // <u,4,4,5>: Cost 2 vext2 <1,2,u,4>, RHS
-  1638329572U,  // <u,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
-  2594927963U,  // <u,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
-   161926454U,  // <u,4,4,u>: Cost 1 vdup0 RHS
-  1493983334U,  // <u,4,5,0>: Cost 2 vext1 <2,u,4,5>, LHS
-  2689879301U,  // <u,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
-  1493985379U,  // <u,4,5,2>: Cost 2 vext1 <2,u,4,5>, <2,u,4,5>
-  2567727254U,  // <u,4,5,3>: Cost 3 vext1 <2,u,4,5>, <3,0,1,2>
-  1493986614U,  // <u,4,5,4>: Cost 2 vext1 <2,u,4,5>, RHS
-  1863535926U,  // <u,4,5,5>: Cost 2 vzipl RHS, RHS
-   537750838U,  // <u,4,5,6>: Cost 1 vext3 LHS, RHS
-  2830110006U,  // <u,4,5,7>: Cost 3 vuzpr <1,u,3,4>, RHS
-   537750856U,  // <u,4,5,u>: Cost 1 vext3 LHS, RHS
-  1482047590U,  // <u,4,6,0>: Cost 2 vext1 <0,u,4,6>, LHS
-  2555790070U,  // <u,4,6,1>: Cost 3 vext1 <0,u,4,6>, <1,0,3,2>
-  2555790952U,  // <u,4,6,2>: Cost 3 vext1 <0,u,4,6>, <2,2,2,2>
-  2555791510U,  // <u,4,6,3>: Cost 3 vext1 <0,u,4,6>, <3,0,1,2>
-  1482050870U,  // <u,4,6,4>: Cost 2 vext1 <0,u,4,6>, RHS
-  2689879422U,  // <u,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
-  1997753654U,  // <u,4,6,6>: Cost 2 vtrnl RHS, RHS
-  2712071562U,  // <u,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
-  1482053422U,  // <u,4,6,u>: Cost 2 vext1 <0,u,4,6>, LHS
-  2567741542U,  // <u,4,7,0>: Cost 3 vext1 <2,u,4,7>, LHS
-  2567742362U,  // <u,4,7,1>: Cost 3 vext1 <2,u,4,7>, <1,2,3,4>
-  2567743589U,  // <u,4,7,2>: Cost 3 vext1 <2,u,4,7>, <2,u,4,7>
-  2573716286U,  // <u,4,7,3>: Cost 3 vext1 <3,u,4,7>, <3,u,4,7>
-  2567744822U,  // <u,4,7,4>: Cost 3 vext1 <2,u,4,7>, RHS
-  2712071624U,  // <u,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
-    96808489U,  // <u,4,7,6>: Cost 1 vrev RHS
-  2651715180U,  // <u,4,7,7>: Cost 3 vext2 <5,6,u,4>, <7,7,7,7>
-    96955963U,  // <u,4,7,u>: Cost 1 vrev RHS
-  1482063974U,  // <u,4,u,0>: Cost 2 vext1 <0,u,4,u>, LHS
-  1551431470U,  // <u,4,u,1>: Cost 2 vext2 <1,2,u,4>, LHS
-  1494009958U,  // <u,4,u,2>: Cost 2 vext1 <2,u,4,u>, <2,u,4,u>
-  2555807894U,  // <u,4,u,3>: Cost 3 vext1 <0,u,4,u>, <3,0,1,2>
-   161926454U,  // <u,4,u,4>: Cost 1 vdup0 RHS
-  1551431834U,  // <u,4,u,5>: Cost 2 vext2 <1,2,u,4>, RHS
-   537751081U,  // <u,4,u,6>: Cost 1 vext3 LHS, RHS
-  2830110249U,  // <u,4,u,7>: Cost 3 vuzpr <1,u,3,4>, RHS
-   537751099U,  // <u,4,u,u>: Cost 1 vext3 LHS, RHS
-  2631811072U,  // <u,5,0,0>: Cost 3 vext2 <2,3,u,5>, <0,0,0,0>
-  1558069350U,  // <u,5,0,1>: Cost 2 vext2 <2,3,u,5>, LHS
-  2619203823U,  // <u,5,0,2>: Cost 3 vext2 <0,2,u,5>, <0,2,u,5>
-  2619867456U,  // <u,5,0,3>: Cost 3 vext2 <0,3,u,5>, <0,3,u,5>
-  1546273106U,  // <u,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
-  2733010539U,  // <u,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
-  2597622682U,  // <u,5,0,6>: Cost 3 vext1 <7,u,5,0>, <6,7,u,5>
-  1176539396U,  // <u,5,0,7>: Cost 2 vrev <5,u,7,0>
-  1558069917U,  // <u,5,0,u>: Cost 2 vext2 <2,3,u,5>, LHS
-  1505968230U,  // <u,5,1,0>: Cost 2 vext1 <4,u,5,1>, LHS
-  2624512887U,  // <u,5,1,1>: Cost 3 vext2 <1,1,u,5>, <1,1,u,5>
-  2631811990U,  // <u,5,1,2>: Cost 3 vext2 <2,3,u,5>, <1,2,3,0>
-  2618541056U,  // <u,5,1,3>: Cost 3 vext2 <0,1,u,5>, <1,3,5,7>
-  1505971510U,  // <u,5,1,4>: Cost 2 vext1 <4,u,5,1>, RHS
-  2627167419U,  // <u,5,1,5>: Cost 3 vext2 <1,5,u,5>, <1,5,u,5>
-  2579714554U,  // <u,5,1,6>: Cost 3 vext1 <4,u,5,1>, <6,2,7,3>
-  1638330064U,  // <u,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
-  1638477529U,  // <u,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
-  2561802342U,  // <u,5,2,0>: Cost 3 vext1 <1,u,5,2>, LHS
-  2561803264U,  // <u,5,2,1>: Cost 3 vext1 <1,u,5,2>, <1,3,5,7>
-  2631149217U,  // <u,5,2,2>: Cost 3 vext2 <2,2,u,5>, <2,2,u,5>
-  1558071026U,  // <u,5,2,3>: Cost 2 vext2 <2,3,u,5>, <2,3,u,5>
-  2561805622U,  // <u,5,2,4>: Cost 3 vext1 <1,u,5,2>, RHS
-  2714062607U,  // <u,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
-  2631813050U,  // <u,5,2,6>: Cost 3 vext2 <2,3,u,5>, <2,6,3,7>
-  3092335926U,  // <u,5,2,7>: Cost 3 vtrnr <0,u,0,2>, RHS
-  1561389191U,  // <u,5,2,u>: Cost 2 vext2 <2,u,u,5>, <2,u,u,5>
-  2561810534U,  // <u,5,3,0>: Cost 3 vext1 <1,u,5,3>, LHS
-  2561811857U,  // <u,5,3,1>: Cost 3 vext1 <1,u,5,3>, <1,u,5,3>
-  2631813474U,  // <u,5,3,2>: Cost 3 vext2 <2,3,u,5>, <3,2,5,u>
-  2631813532U,  // <u,5,3,3>: Cost 3 vext2 <2,3,u,5>, <3,3,3,3>
-  2619869698U,  // <u,5,3,4>: Cost 3 vext2 <0,3,u,5>, <3,4,5,6>
-  3001847002U,  // <u,5,3,5>: Cost 3 vzipr LHS, <4,4,5,5>
-  2954070530U,  // <u,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
-  2018749750U,  // <u,5,3,7>: Cost 2 vtrnr LHS, RHS
-  2018749751U,  // <u,5,3,u>: Cost 2 vtrnr LHS, RHS
-  2573762662U,  // <u,5,4,0>: Cost 3 vext1 <3,u,5,4>, LHS
-  2620017634U,  // <u,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
-  2573764338U,  // <u,5,4,2>: Cost 3 vext1 <3,u,5,4>, <2,3,u,5>
-  2573765444U,  // <u,5,4,3>: Cost 3 vext1 <3,u,5,4>, <3,u,5,4>
-  1570680053U,  // <u,5,4,4>: Cost 2 vext2 <4,4,u,5>, <4,4,u,5>
-  1558072630U,  // <u,5,4,5>: Cost 2 vext2 <2,3,u,5>, RHS
-  2645749143U,  // <u,5,4,6>: Cost 3 vext2 <4,6,u,5>, <4,6,u,5>
-  1638330310U,  // <u,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
-  1558072873U,  // <u,5,4,u>: Cost 2 vext2 <2,3,u,5>, RHS
-  1506000998U,  // <u,5,5,0>: Cost 2 vext1 <4,u,5,5>, LHS
-  2561827984U,  // <u,5,5,1>: Cost 3 vext1 <1,u,5,5>, <1,5,3,7>
-  2579744360U,  // <u,5,5,2>: Cost 3 vext1 <4,u,5,5>, <2,2,2,2>
-  2579744918U,  // <u,5,5,3>: Cost 3 vext1 <4,u,5,5>, <3,0,1,2>
-  1506004278U,  // <u,5,5,4>: Cost 2 vext1 <4,u,5,5>, RHS
-   229035318U,  // <u,5,5,5>: Cost 1 vdup1 RHS
-  2712072206U,  // <u,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
-  1638330392U,  // <u,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
-   229035318U,  // <u,5,5,u>: Cost 1 vdup1 RHS
-  1500037222U,  // <u,5,6,0>: Cost 2 vext1 <3,u,5,6>, LHS
-  2561836436U,  // <u,5,6,1>: Cost 3 vext1 <1,u,5,6>, <1,u,5,6>
-  2567809133U,  // <u,5,6,2>: Cost 3 vext1 <2,u,5,6>, <2,u,5,6>
-  1500040006U,  // <u,5,6,3>: Cost 2 vext1 <3,u,5,6>, <3,u,5,6>
-  1500040502U,  // <u,5,6,4>: Cost 2 vext1 <3,u,5,6>, RHS
-  2714062935U,  // <u,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
-  2712072288U,  // <u,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
-    27705344U,  // <u,5,6,7>: Cost 0 copy RHS
-    27705344U,  // <u,5,6,u>: Cost 0 copy RHS
-  1488101478U,  // <u,5,7,0>: Cost 2 vext1 <1,u,5,7>, LHS
-  1488102805U,  // <u,5,7,1>: Cost 2 vext1 <1,u,5,7>, <1,u,5,7>
-  2561844840U,  // <u,5,7,2>: Cost 3 vext1 <1,u,5,7>, <2,2,2,2>
-  2561845398U,  // <u,5,7,3>: Cost 3 vext1 <1,u,5,7>, <3,0,1,2>
-  1488104758U,  // <u,5,7,4>: Cost 2 vext1 <1,u,5,7>, RHS
-  1638330536U,  // <u,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
-  2712072362U,  // <u,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
-  2042965302U,  // <u,5,7,7>: Cost 2 vtrnr RHS, RHS
-  1488107310U,  // <u,5,7,u>: Cost 2 vext1 <1,u,5,7>, LHS
-  1488109670U,  // <u,5,u,0>: Cost 2 vext1 <1,u,5,u>, LHS
-  1488110998U,  // <u,5,u,1>: Cost 2 vext1 <1,u,5,u>, <1,u,5,u>
-  2561853032U,  // <u,5,u,2>: Cost 3 vext1 <1,u,5,u>, <2,2,2,2>
-  1500056392U,  // <u,5,u,3>: Cost 2 vext1 <3,u,5,u>, <3,u,5,u>
-  1488112950U,  // <u,5,u,4>: Cost 2 vext1 <1,u,5,u>, RHS
-   229035318U,  // <u,5,u,5>: Cost 1 vdup1 RHS
-  2954111490U,  // <u,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
-    27705344U,  // <u,5,u,7>: Cost 0 copy RHS
-    27705344U,  // <u,5,u,u>: Cost 0 copy RHS
-  2619211776U,  // <u,6,0,0>: Cost 3 vext2 <0,2,u,6>, <0,0,0,0>
-  1545470054U,  // <u,6,0,1>: Cost 2 vext2 <0,2,u,6>, LHS
-  1545470192U,  // <u,6,0,2>: Cost 2 vext2 <0,2,u,6>, <0,2,u,6>
-  2255958969U,  // <u,6,0,3>: Cost 3 vrev <6,u,3,0>
-  1546797458U,  // <u,6,0,4>: Cost 2 vext2 <0,4,u,6>, <0,4,u,6>
-  2720624971U,  // <u,6,0,5>: Cost 3 vext3 <6,0,5,u>, <6,0,5,u>
-  2256180180U,  // <u,6,0,6>: Cost 3 vrev <6,u,6,0>
-  2960682294U,  // <u,6,0,7>: Cost 3 vzipr <1,2,u,0>, RHS
-  1545470621U,  // <u,6,0,u>: Cost 2 vext2 <0,2,u,6>, LHS
-  1182004127U,  // <u,6,1,0>: Cost 2 vrev <6,u,0,1>
-  2619212596U,  // <u,6,1,1>: Cost 3 vext2 <0,2,u,6>, <1,1,1,1>
-  2619212694U,  // <u,6,1,2>: Cost 3 vext2 <0,2,u,6>, <1,2,3,0>
-  2619212760U,  // <u,6,1,3>: Cost 3 vext2 <0,2,u,6>, <1,3,1,3>
-  2626511979U,  // <u,6,1,4>: Cost 3 vext2 <1,4,u,6>, <1,4,u,6>
-  2619212944U,  // <u,6,1,5>: Cost 3 vext2 <0,2,u,6>, <1,5,3,7>
-  2714063264U,  // <u,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
-  2967326006U,  // <u,6,1,7>: Cost 3 vzipr <2,3,u,1>, RHS
-  1182594023U,  // <u,6,1,u>: Cost 2 vrev <6,u,u,1>
-  1506050150U,  // <u,6,2,0>: Cost 2 vext1 <4,u,6,2>, LHS
-  2579792630U,  // <u,6,2,1>: Cost 3 vext1 <4,u,6,2>, <1,0,3,2>
-  2619213416U,  // <u,6,2,2>: Cost 3 vext2 <0,2,u,6>, <2,2,2,2>
-  2619213478U,  // <u,6,2,3>: Cost 3 vext2 <0,2,u,6>, <2,3,0,1>
-  1506053430U,  // <u,6,2,4>: Cost 2 vext1 <4,u,6,2>, RHS
-  2633148309U,  // <u,6,2,5>: Cost 3 vext2 <2,5,u,6>, <2,5,u,6>
-  2619213754U,  // <u,6,2,6>: Cost 3 vext2 <0,2,u,6>, <2,6,3,7>
-  1638330874U,  // <u,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
-  1638478339U,  // <u,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
-  2619213974U,  // <u,6,3,0>: Cost 3 vext2 <0,2,u,6>, <3,0,1,2>
-  2255836074U,  // <u,6,3,1>: Cost 3 vrev <6,u,1,3>
-  2255909811U,  // <u,6,3,2>: Cost 3 vrev <6,u,2,3>
-  2619214236U,  // <u,6,3,3>: Cost 3 vext2 <0,2,u,6>, <3,3,3,3>
-  1564715549U,  // <u,6,3,4>: Cost 2 vext2 <3,4,u,6>, <3,4,u,6>
-  2639121006U,  // <u,6,3,5>: Cost 3 vext2 <3,5,u,6>, <3,5,u,6>
-  3001847012U,  // <u,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
-  1880329526U,  // <u,6,3,7>: Cost 2 vzipr LHS, RHS
-  1880329527U,  // <u,6,3,u>: Cost 2 vzipr LHS, RHS
-  2567864422U,  // <u,6,4,0>: Cost 3 vext1 <2,u,6,4>, LHS
-  2733011558U,  // <u,6,4,1>: Cost 3 vext3 LHS, <6,4,1,3>
-  2567866484U,  // <u,6,4,2>: Cost 3 vext1 <2,u,6,4>, <2,u,6,4>
-  2638458005U,  // <u,6,4,3>: Cost 3 vext2 <3,4,u,6>, <4,3,6,u>
-  1570540772U,  // <u,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
-  1545473334U,  // <u,6,4,5>: Cost 2 vext2 <0,2,u,6>, RHS
-  1572015512U,  // <u,6,4,6>: Cost 2 vext2 <4,6,u,6>, <4,6,u,6>
-  2960715062U,  // <u,6,4,7>: Cost 3 vzipr <1,2,u,4>, RHS
-  1545473577U,  // <u,6,4,u>: Cost 2 vext2 <0,2,u,6>, RHS
-  2567872614U,  // <u,6,5,0>: Cost 3 vext1 <2,u,6,5>, LHS
-  2645757648U,  // <u,6,5,1>: Cost 3 vext2 <4,6,u,6>, <5,1,7,3>
-  2567874490U,  // <u,6,5,2>: Cost 3 vext1 <2,u,6,5>, <2,6,3,7>
-  2576501250U,  // <u,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
-  1576660943U,  // <u,6,5,4>: Cost 2 vext2 <5,4,u,6>, <5,4,u,6>
-  2645757956U,  // <u,6,5,5>: Cost 3 vext2 <4,6,u,6>, <5,5,5,5>
-  2645758050U,  // <u,6,5,6>: Cost 3 vext2 <4,6,u,6>, <5,6,7,0>
-  2824080694U,  // <u,6,5,7>: Cost 3 vuzpr <0,u,2,6>, RHS
-  1182626795U,  // <u,6,5,u>: Cost 2 vrev <6,u,u,5>
-  1506082918U,  // <u,6,6,0>: Cost 2 vext1 <4,u,6,6>, LHS
-  2579825398U,  // <u,6,6,1>: Cost 3 vext1 <4,u,6,6>, <1,0,3,2>
-  2645758458U,  // <u,6,6,2>: Cost 3 vext2 <4,6,u,6>, <6,2,7,3>
-  2579826838U,  // <u,6,6,3>: Cost 3 vext1 <4,u,6,6>, <3,0,1,2>
-  1506086198U,  // <u,6,6,4>: Cost 2 vext1 <4,u,6,6>, RHS
-  2579828432U,  // <u,6,6,5>: Cost 3 vext1 <4,u,6,6>, <5,1,7,3>
-   296144182U,  // <u,6,6,6>: Cost 1 vdup2 RHS
-  1638331202U,  // <u,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
-   296144182U,  // <u,6,6,u>: Cost 1 vdup2 RHS
-   432349286U,  // <u,6,7,0>: Cost 1 vext1 RHS, LHS
-  1506091766U,  // <u,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
-  1506092648U,  // <u,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
-  1506093206U,  // <u,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
-   432352809U,  // <u,6,7,4>: Cost 1 vext1 RHS, RHS
-  1506094800U,  // <u,6,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
-  1506095610U,  // <u,6,7,6>: Cost 2 vext1 RHS, <6,2,7,3>
-  1906904374U,  // <u,6,7,7>: Cost 2 vzipr RHS, RHS
-   432355118U,  // <u,6,7,u>: Cost 1 vext1 RHS, LHS
-   432357478U,  // <u,6,u,0>: Cost 1 vext1 RHS, LHS
-  1545475886U,  // <u,6,u,1>: Cost 2 vext2 <0,2,u,6>, LHS
-  1506100840U,  // <u,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
-  1506101398U,  // <u,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
-   432361002U,  // <u,6,u,4>: Cost 1 vext1 RHS, RHS
-  1545476250U,  // <u,6,u,5>: Cost 2 vext2 <0,2,u,6>, RHS
-   296144182U,  // <u,6,u,6>: Cost 1 vdup2 RHS
-  1880370486U,  // <u,6,u,7>: Cost 2 vzipr LHS, RHS
-   432363310U,  // <u,6,u,u>: Cost 1 vext1 RHS, LHS
-  1571356672U,  // <u,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
-   497614950U,  // <u,7,0,1>: Cost 1 vext2 RHS, LHS
-  1571356836U,  // <u,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
-  2573880146U,  // <u,7,0,3>: Cost 3 vext1 <3,u,7,0>, <3,u,7,0>
-  1571357010U,  // <u,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
-  1512083716U,  // <u,7,0,5>: Cost 2 vext1 <5,u,7,0>, <5,u,7,0>
-  2621874741U,  // <u,7,0,6>: Cost 3 vext2 <0,6,u,7>, <0,6,u,7>
-  2585826298U,  // <u,7,0,7>: Cost 3 vext1 <5,u,7,0>, <7,0,1,2>
-   497615517U,  // <u,7,0,u>: Cost 1 vext2 RHS, LHS
-  1571357430U,  // <u,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
-  1571357492U,  // <u,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
-  1571357590U,  // <u,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
-  1552114715U,  // <u,7,1,3>: Cost 2 vext2 <1,3,u,7>, <1,3,u,7>
-  2573888822U,  // <u,7,1,4>: Cost 3 vext1 <3,u,7,1>, RHS
-  1553441981U,  // <u,7,1,5>: Cost 2 vext2 <1,5,u,7>, <1,5,u,7>
-  2627847438U,  // <u,7,1,6>: Cost 3 vext2 <1,6,u,7>, <1,6,u,7>
-  2727408775U,  // <u,7,1,7>: Cost 3 vext3 <7,1,7,u>, <7,1,7,u>
-  1555432880U,  // <u,7,1,u>: Cost 2 vext2 <1,u,u,7>, <1,u,u,7>
-  2629838337U,  // <u,7,2,0>: Cost 3 vext2 <2,0,u,7>, <2,0,u,7>
-  1188058754U,  // <u,7,2,1>: Cost 2 vrev <7,u,1,2>
-  1571358312U,  // <u,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
-  1571358374U,  // <u,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
-  2632492869U,  // <u,7,2,4>: Cost 3 vext2 <2,4,u,7>, <2,4,u,7>
-  2633156502U,  // <u,7,2,5>: Cost 3 vext2 <2,5,u,7>, <2,5,u,7>
-  1560078311U,  // <u,7,2,6>: Cost 2 vext2 <2,6,u,7>, <2,6,u,7>
-  2728072408U,  // <u,7,2,7>: Cost 3 vext3 <7,2,7,u>, <7,2,7,u>
-  1561405577U,  // <u,7,2,u>: Cost 2 vext2 <2,u,u,7>, <2,u,u,7>
-  1571358870U,  // <u,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
-  2627184913U,  // <u,7,3,1>: Cost 3 vext2 <1,5,u,7>, <3,1,5,u>
-  2633820523U,  // <u,7,3,2>: Cost 3 vext2 <2,6,u,7>, <3,2,6,u>
-  1571359132U,  // <u,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
-  1571359234U,  // <u,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
-  1512108295U,  // <u,7,3,5>: Cost 2 vext1 <5,u,7,3>, <5,u,7,3>
-  1518080992U,  // <u,7,3,6>: Cost 2 vext1 <6,u,7,3>, <6,u,7,3>
-  2640456465U,  // <u,7,3,7>: Cost 3 vext2 <3,7,u,7>, <3,7,u,7>
-  1571359518U,  // <u,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
-  1571359634U,  // <u,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
-  2573911067U,  // <u,7,4,1>: Cost 3 vext1 <3,u,7,4>, <1,3,u,7>
-  2645101622U,  // <u,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
-  2573912918U,  // <u,7,4,3>: Cost 3 vext1 <3,u,7,4>, <3,u,7,4>
-  1571359952U,  // <u,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
-   497618248U,  // <u,7,4,5>: Cost 1 vext2 RHS, RHS
-  1571360116U,  // <u,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
-  2645102024U,  // <u,7,4,7>: Cost 3 vext2 RHS, <4,7,5,0>
-   497618473U,  // <u,7,4,u>: Cost 1 vext2 RHS, RHS
-  2645102152U,  // <u,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
-  1571360464U,  // <u,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
-  2645102334U,  // <u,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
-  2645102447U,  // <u,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
-  1571360710U,  // <u,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
-  1571360772U,  // <u,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
-  1571360866U,  // <u,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
-  1571360936U,  // <u,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
-  1571361017U,  // <u,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
-  1530044518U,  // <u,7,6,0>: Cost 2 vext1 <u,u,7,6>, LHS
-  2645103016U,  // <u,7,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
-  1571361274U,  // <u,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
-  2645103154U,  // <u,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
-  1530047798U,  // <u,7,6,4>: Cost 2 vext1 <u,u,7,6>, RHS
-  1188386474U,  // <u,7,6,5>: Cost 2 vrev <7,u,5,6>
-  1571361592U,  // <u,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
-  1571361614U,  // <u,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
-  1571361695U,  // <u,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
-  1571361786U,  // <u,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
-  2573935616U,  // <u,7,7,1>: Cost 3 vext1 <3,u,7,7>, <1,3,5,7>
-  2645103781U,  // <u,7,7,2>: Cost 3 vext2 RHS, <7,2,2,2>
-  2573937497U,  // <u,7,7,3>: Cost 3 vext1 <3,u,7,7>, <3,u,7,7>
-  1571362150U,  // <u,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
-  1512141067U,  // <u,7,7,5>: Cost 2 vext1 <5,u,7,7>, <5,u,7,7>
-  1518113764U,  // <u,7,7,6>: Cost 2 vext1 <6,u,7,7>, <6,u,7,7>
-   363253046U,  // <u,7,7,7>: Cost 1 vdup3 RHS
-   363253046U,  // <u,7,7,u>: Cost 1 vdup3 RHS
-  1571362515U,  // <u,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
-   497620782U,  // <u,7,u,1>: Cost 1 vext2 RHS, LHS
-  1571362693U,  // <u,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
-  1571362748U,  // <u,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
-  1571362879U,  // <u,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
-   497621146U,  // <u,7,u,5>: Cost 1 vext2 RHS, RHS
-  1571363024U,  // <u,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
-   363253046U,  // <u,7,u,7>: Cost 1 vdup3 RHS
-   497621349U,  // <u,7,u,u>: Cost 1 vext2 RHS, LHS
-   135053414U,  // <u,u,0,0>: Cost 1 vdup0 LHS
-   471081121U,  // <u,u,0,1>: Cost 1 vext2 LHS, LHS
-  1544822948U,  // <u,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
-  1616140005U,  // <u,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
-  1544823122U,  // <u,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
-  1512157453U,  // <u,u,0,5>: Cost 2 vext1 <5,u,u,0>, <5,u,u,0>
-  1662220032U,  // <u,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
-  1194457487U,  // <u,u,0,7>: Cost 2 vrev <u,u,7,0>
-   471081629U,  // <u,u,0,u>: Cost 1 vext2 LHS, LHS
-  1544823542U,  // <u,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
-   202162278U,  // <u,u,1,1>: Cost 1 vdup1 LHS
-   537753390U,  // <u,u,1,2>: Cost 1 vext3 LHS, LHS
-  1544823768U,  // <u,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
-  1494248758U,  // <u,u,1,4>: Cost 2 vext1 <2,u,u,1>, RHS
-  1544823952U,  // <u,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
-  1518138343U,  // <u,u,1,6>: Cost 2 vext1 <6,u,u,1>, <6,u,u,1>
-  1640322907U,  // <u,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
-   537753444U,  // <u,u,1,u>: Cost 1 vext3 LHS, LHS
-  1482309734U,  // <u,u,2,0>: Cost 2 vext1 <0,u,u,2>, LHS
-  1194031451U,  // <u,u,2,1>: Cost 2 vrev <u,u,1,2>
-   269271142U,  // <u,u,2,2>: Cost 1 vdup2 LHS
-      835584U,  // <u,u,2,3>: Cost 0 copy LHS
-  1482313014U,  // <u,u,2,4>: Cost 2 vext1 <0,u,u,2>, RHS
-  2618566504U,  // <u,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
-  1544824762U,  // <u,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
-  1638479788U,  // <u,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
-      835584U,  // <u,u,2,u>: Cost 0 copy LHS
-   408576723U,  // <u,u,3,0>: Cost 1 vext1 LHS, LHS
-  1482318582U,  // <u,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
-   120371557U,  // <u,u,3,2>: Cost 1 vrev LHS
-   336380006U,  // <u,u,3,3>: Cost 1 vdup3 LHS
-   408579382U,  // <u,u,3,4>: Cost 1 vext1 LHS, RHS
-  1616140271U,  // <u,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
-  1530098170U,  // <u,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
-  1880329544U,  // <u,u,3,7>: Cost 2 vzipr LHS, RHS
-   408581934U,  // <u,u,3,u>: Cost 1 vext1 LHS, LHS
-  1488298086U,  // <u,u,4,0>: Cost 2 vext1 <1,u,u,4>, LHS
-  1488299437U,  // <u,u,4,1>: Cost 2 vext1 <1,u,u,4>, <1,u,u,4>
-  1659271204U,  // <u,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
-  1194195311U,  // <u,u,4,3>: Cost 2 vrev <u,u,3,4>
-   161926454U,  // <u,u,4,4>: Cost 1 vdup0 RHS
-   471084342U,  // <u,u,4,5>: Cost 1 vext2 LHS, RHS
-  1571368308U,  // <u,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
-  1640323153U,  // <u,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
-   471084585U,  // <u,u,4,u>: Cost 1 vext2 LHS, RHS
-  1494278246U,  // <u,u,5,0>: Cost 2 vext1 <2,u,u,5>, LHS
-  1571368656U,  // <u,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
-  1494280327U,  // <u,u,5,2>: Cost 2 vext1 <2,u,u,5>, <2,u,u,5>
-  1616140415U,  // <u,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
-  1494281526U,  // <u,u,5,4>: Cost 2 vext1 <2,u,u,5>, RHS
-   229035318U,  // <u,u,5,5>: Cost 1 vdup1 RHS
-   537753754U,  // <u,u,5,6>: Cost 1 vext3 LHS, RHS
-  1750355254U,  // <u,u,5,7>: Cost 2 vuzpr LHS, RHS
-   537753772U,  // <u,u,5,u>: Cost 1 vext3 LHS, RHS
-  1482342502U,  // <u,u,6,0>: Cost 2 vext1 <0,u,u,6>, LHS
-  2556084982U,  // <u,u,6,1>: Cost 3 vext1 <0,u,u,6>, <1,0,3,2>
-  1571369466U,  // <u,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
-  1611938000U,  // <u,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
-  1482345782U,  // <u,u,6,4>: Cost 2 vext1 <0,u,u,6>, RHS
-  1194359171U,  // <u,u,6,5>: Cost 2 vrev <u,u,5,6>
-   296144182U,  // <u,u,6,6>: Cost 1 vdup2 RHS
-    27705344U,  // <u,u,6,7>: Cost 0 copy RHS
-    27705344U,  // <u,u,6,u>: Cost 0 copy RHS
-   432496742U,  // <u,u,7,0>: Cost 1 vext1 RHS, LHS
-  1488324016U,  // <u,u,7,1>: Cost 2 vext1 <1,u,u,7>, <1,u,u,7>
-  1494296713U,  // <u,u,7,2>: Cost 2 vext1 <2,u,u,7>, <2,u,u,7>
-  1906901148U,  // <u,u,7,3>: Cost 2 vzipr RHS, LHS
-   432500283U,  // <u,u,7,4>: Cost 1 vext1 RHS, RHS
-  1506242256U,  // <u,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
-   120699277U,  // <u,u,7,6>: Cost 1 vrev RHS
-   363253046U,  // <u,u,7,7>: Cost 1 vdup3 RHS
-   432502574U,  // <u,u,7,u>: Cost 1 vext1 RHS, LHS
-   408617688U,  // <u,u,u,0>: Cost 1 vext1 LHS, LHS
-   471086894U,  // <u,u,u,1>: Cost 1 vext2 LHS, LHS
-   537753957U,  // <u,u,u,2>: Cost 1 vext3 LHS, LHS
-      835584U,  // <u,u,u,3>: Cost 0 copy LHS
-   408620342U,  // <u,u,u,4>: Cost 1 vext1 LHS, RHS
-   471087258U,  // <u,u,u,5>: Cost 1 vext2 LHS, RHS
-   537753997U,  // <u,u,u,6>: Cost 1 vext3 LHS, RHS
-    27705344U,  // <u,u,u,7>: Cost 0 copy RHS
-      835584U,  // <u,u,u,u>: Cost 0 copy LHS
+  135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS
+  1543503974U, // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS
+  2618572962U, // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0>
+  2568054923U, // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
+  1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS
+  2550140624U, // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3>
+  2550141434U, // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3>
+  2591945711U, // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
+  135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS
+  2886516736U, // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0>
+  1812775014U, // <0,0,1,1>: Cost 2 vzipl LHS, LHS
+  1618133094U, // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
+  2625209292U, // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0>
+  2886558034U, // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5>
+  2617246864U, // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7>
+  3659723031U, // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1>
+  2591953904U, // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1>
+  1812775581U, // <0,0,1,u>: Cost 2 vzipl LHS, LHS
+  3020734464U, // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0>
+  3020734474U, // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1>
+  1946992742U, // <0,0,2,2>: Cost 2 vtrnl LHS, LHS
+  2631181989U, // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0>
+  3020734668U, // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6>
+  3826550569U, // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6>
+  2617247674U, // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7>
+  2591962097U, // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
+  1946992796U, // <0,0,2,u>: Cost 2 vtrnl LHS, LHS
+  2635163787U, // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
+  2686419196U, // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0>
+  2686492933U, // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0>
+  2617248156U, // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3>
+  2617248258U, // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6>
+  3826551298U, // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6>
+  3690990200U, // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7>
+  3713551042U, // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0>
+  2635163787U, // <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
+  2617248658U, // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1>
+  2888450150U, // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
+  3021570150U, // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
+  3641829519U, // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4>
+  3021570252U, // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6>
+  1543507254U, // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS
+  2752810294U, // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS
+  3786998152U, // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5>
+  1543507497U, // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS
+  2684354972U, // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7>
+  2617249488U, // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3>
+  3765617070U, // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7>
+  3635865780U, // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5>
+  2617249734U, // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6>
+  2617249796U, // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5>
+  2718712274U, // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7>
+  2617249960U, // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7>
+  2720039396U, // <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7>
+  2684355053U, // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7>
+  3963609190U, // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS
+  2617250298U, // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3>
+  3796435464U, // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7>
+  3659762998U, // <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS
+  3659763810U, // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0>
+  2617250616U, // <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6>
+  2657727309U, // <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0>
+  2658390942U, // <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0>
+  2659054575U, // <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
+  3635880854U, // <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0>
+  3635881401U, // <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7>
+  3734787298U, // <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0>
+  2617251174U, // <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6>
+  3659772002U, // <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0>
+  3659772189U, // <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7>
+  2617251436U, // <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7>
+  2659054575U, // <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
+  135053414U, // <0,0,u,0>: Cost 1 vdup0 LHS
+  1817419878U, // <0,0,u,1>: Cost 2 vzipl LHS, LHS
+  1947435110U, // <0,0,u,2>: Cost 2 vtrnl LHS, LHS
+  2568120467U, // <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u>
+  1476463926U, // <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS
+  1543510170U, // <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS
+  2752813210U, // <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS
+  2592011255U, // <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u>
+  135053414U, // <0,0,u,u>: Cost 1 vdup0 LHS
+  2618581002U, // <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1>
+  1557446758U, // <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS
+  2618581155U, // <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1>
+  2690548468U, // <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0>
+  2626543954U, // <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5>
+  4094985216U, // <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7>
+  2592019278U, // <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1>
+  2592019448U, // <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0>
+  1557447325U, // <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS
+  1476476938U, // <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1>
+  2886517556U, // <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1>
+  2886517654U, // <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0>
+  2886517720U, // <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3>
+  1476480310U, // <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS
+  2886558864U, // <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7>
+  2550223354U, // <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3>
+  2550223856U, // <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1>
+  1476482862U, // <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS
+  1494401126U, // <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS
+  3020735284U, // <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1>
+  2562172349U, // <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2>
+  835584U, // <0,1,2,3>: Cost 0 copy LHS
+  1494404406U, // <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS
+  3020735488U, // <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7>
+  2631190458U, // <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7>
+  1518294010U, // <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2>
+  835584U, // <0,1,2,u>: Cost 0 copy LHS
+  2692318156U, // <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0>
+  2691875800U, // <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3>
+  2691875806U, // <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0>
+  2692539367U, // <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0>
+  2562182454U, // <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS
+  2691875840U, // <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7>
+  2692760578U, // <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0>
+  2639817411U, // <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1>
+  2691875863U, // <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3>
+  2568159334U, // <0,1,4,0>: Cost 3 vext1 <3,0,1,4>, LHS
+  4095312692U, // <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1>
+  2568160934U, // <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1>
+  2568161432U, // <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4>
+  2568162614U, // <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS
+  1557450038U, // <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS
+  2754235702U, // <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS
+  2592052220U, // <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4>
+  1557450281U, // <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS
+  3765617775U, // <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1>
+  2647781007U, // <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1>
+  3704934138U, // <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0>
+  2691875984U, // <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7>
+  2657734598U, // <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6>
+  2650435539U, // <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1>
+  2651099172U, // <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1>
+  2651762805U, // <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1>
+  2691876029U, // <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7>
+  2592063590U, // <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS
+  3765617871U, // <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7>
+  2654417337U, // <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1>
+  3765617889U, // <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7>
+  2592066870U, // <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS
+  3765617907U, // <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7>
+  2657071869U, // <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1>
+  1583993678U, // <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1>
+  1584657311U, // <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1>
+  2657735672U, // <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0>
+  2657735808U, // <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1>
+  2631193772U, // <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0>
+  2661053667U, // <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1>
+  2657736038U, // <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6>
+  3721524621U, // <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0>
+  2657736158U, // <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0>
+  2657736300U, // <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7>
+  2657736322U, // <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2>
+  1494450278U, // <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS
+  1557452590U, // <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS
+  2754238254U, // <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS
+  835584U, // <0,1,u,3>: Cost 0 copy LHS
+  1494453558U, // <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS
+  1557452954U, // <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS
+  2754238618U, // <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS
+  1518343168U, // <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u>
+  835584U, // <0,1,u,u>: Cost 0 copy LHS
+  2752299008U, // <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0>
+  1544847462U, // <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS
+  1678557286U, // <0,2,0,2>: Cost 2 vuzpl LHS, LHS
+  2696521165U, // <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0>
+  2752340172U, // <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6>
+  2691876326U, // <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7>
+  2618589695U, // <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7>
+  2592093185U, // <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0>
+  1678557340U, // <0,2,0,u>: Cost 2 vuzpl LHS, LHS
+  2618589942U, // <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2>
+  2752299828U, // <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1>
+  2886518376U, // <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2>
+  2752299766U, // <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
+  2550295862U, // <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS
+  2752340992U, // <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7>
+  2886559674U, // <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7>
+  3934208106U, // <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7>
+  2752340771U, // <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2>
+  1476558868U, // <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2>
+  2226628029U, // <0,2,2,1>: Cost 3 vrev <2,0,1,2>
+  2752300648U, // <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2>
+  3020736114U, // <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3>
+  1476562230U, // <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS
+  2550304464U, // <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3>
+  2618591162U, // <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7>
+  2550305777U, // <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2>
+  1476564782U, // <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS
+  2618591382U, // <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2>
+  2752301206U, // <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
+  3826043121U, // <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3>
+  2752301468U, // <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
+  2618591746U, // <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6>
+  2752301570U, // <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
+  3830688102U, // <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3>
+  2698807012U, // <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0>
+  2752301269U, // <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
+  2562261094U, // <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS
+  4095313828U, // <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3>
+  2226718152U, // <0,2,4,2>: Cost 3 vrev <2,0,2,4>
+  2568235169U, // <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4>
+  2562264374U, // <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS
+  1544850742U, // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS
+  1678560566U, // <0,2,4,6>: Cost 2 vuzpl LHS, RHS
+  2592125957U, // <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
+  1678560584U, // <0,2,4,u>: Cost 2 vuzpl LHS, RHS
+  2691876686U, // <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7>
+  2618592976U, // <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3>
+  3765618528U, // <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7>
+  3765618536U, // <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6>
+  2618593222U, // <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6>
+  2752303108U, // <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5>
+  2618593378U, // <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0>
+  2824785206U, // <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS
+  2824785207U, // <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS
+  2752303950U, // <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1>
+  3830690081U, // <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2>
+  2618593786U, // <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3>
+  2691876794U, // <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7>
+  2752303990U, // <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5>
+  3830690445U, // <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6>
+  2752303928U, // <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6>
+  2657743695U, // <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2>
+  2691876839U, // <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7>
+  2659070961U, // <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
+  2659734594U, // <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2>
+  3734140051U, // <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2>
+  2701166596U, // <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0>
+  2662389094U, // <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6>
+  2662389126U, // <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2>
+  3736794583U, // <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2>
+  2752304748U, // <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7>
+  2659070961U, // <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
+  1476608026U, // <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u>
+  1544853294U, // <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS
+  1678563118U, // <0,2,u,2>: Cost 2 vuzpl LHS, LHS
+  3021178482U, // <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3>
+  1476611382U, // <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS
+  1544853658U, // <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS
+  1678563482U, // <0,2,u,6>: Cost 2 vuzpl LHS, RHS
+  2824785449U, // <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS
+  1678563172U, // <0,2,u,u>: Cost 2 vuzpl LHS, LHS
+  2556329984U, // <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0>
+  2686421142U, // <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2>
+  2562303437U, // <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0>
+  4094986652U, // <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3>
+  2556333366U, // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS
+  4094986754U, // <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6>
+  3798796488U, // <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7>
+  3776530634U, // <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0>
+  2556335918U, // <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS
+  2886518934U, // <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2>
+  2556338933U, // <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1>
+  2691877105U, // <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3>
+  2886519196U, // <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3>
+  2886519298U, // <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6>
+  4095740418U, // <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6>
+  3659944242U, // <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1>
+  3769600286U, // <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3>
+  2886519582U, // <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2>
+  1482604646U, // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS
+  1482605302U, // <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2>
+  2556348008U, // <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2>
+  3020736924U, // <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3>
+  1482607926U, // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS
+  3020737026U, // <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6>
+  2598154746U, // <0,3,2,6>: Cost 3 vext1 <u,0,3,2>, <6,2,7,3>
+  2598155258U, // <0,3,2,7>: Cost 3 vext1 <u,0,3,2>, <7,0,1,2>
+  1482610478U, // <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS
+  3692341398U, // <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2>
+  2635851999U, // <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3>
+  3636069840U, // <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3>
+  2691877276U, // <0,3,3,3>: Cost 3 vext3 <1,2,3,0>, <3,3,3,3>
+  3961522690U, // <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6>
+  3826797058U, // <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6>
+  3703622282U, // <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7>
+  3769600452U, // <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7>
+  2640497430U, // <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3>
+  3962194070U, // <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2>
+  2232617112U, // <0,3,4,1>: Cost 3 vrev <3,0,1,4>
+  2232690849U, // <0,3,4,2>: Cost 3 vrev <3,0,2,4>
+  4095314332U, // <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3>
+  3962194434U, // <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6>
+  2691877378U, // <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6>
+  3826765110U, // <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS
+  3665941518U, // <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4>
+  2691877405U, // <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6>
+  3630112870U, // <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS
+  3630113526U, // <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2>
+  4035199734U, // <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2>
+  3769600578U, // <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7>
+  2232846516U, // <0,3,5,4>: Cost 3 vrev <3,0,4,5>
+  3779037780U, // <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7>
+  2718714461U, // <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7>
+  2706106975U, // <0,3,5,7>: Cost 3 vext3 <3,5,7,0>, <3,5,7,0>
+  2233141464U, // <0,3,5,u>: Cost 3 vrev <3,0,u,5>
+  2691877496U, // <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7>
+  3727511914U, // <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3>
+  3765619338U, // <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7>
+  3765619347U, // <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7>
+  3765987996U, // <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7>
+  3306670270U, // <0,3,6,5>: Cost 4 vrev <3,0,5,6>
+  3792456365U, // <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6>
+  2706770608U, // <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0>
+  2706844345U, // <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0>
+  3769600707U, // <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1>
+  2659742787U, // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3>
+  3636102612U, // <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7>
+  3769600740U, // <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7>
+  3769600747U, // <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5>
+  3769600758U, // <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7>
+  3659993400U, // <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7>
+  3781176065U, // <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0>
+  2664388218U, // <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3>
+  1482653798U, // <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS
+  1482654460U, // <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u>
+  2556397160U, // <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2>
+  3021179292U, // <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3>
+  1482657078U, // <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS
+  3021179394U, // <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6>
+  2598203898U, // <0,3,u,6>: Cost 3 vext1 <u,0,3,u>, <6,2,7,3>
+  2708097874U, // <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0>
+  1482659630U, // <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS
+  2617278468U, // <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4>
+  2618605670U, // <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS
+  2618605734U, // <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4>
+  3642091695U, // <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0>
+  2753134796U, // <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6>
+  2718714770U, // <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1>
+  3021245750U, // <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
+  3665982483U, // <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, <7,0,4,0>
+  3021245768U, // <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS
+  2568355942U, // <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS
+  3692348212U, // <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1>
+  3692348310U, // <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0>
+  2568358064U, // <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1>
+  2568359222U, // <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS
+  1812778294U, // <0,4,1,5>: Cost 2 vzipl LHS, RHS
+  3022671158U, // <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS
+  2592248852U, // <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
+  1812778537U, // <0,4,1,u>: Cost 2 vzipl LHS, RHS
+  2568364134U, // <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS
+  2238573423U, // <0,4,2,1>: Cost 3 vrev <4,0,1,2>
+  3692349032U, // <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2>
+  2631214761U, // <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4>
+  2568367414U, // <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS
+  2887028022U, // <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS
+  1946996022U, // <0,4,2,6>: Cost 2 vtrnl LHS, RHS
+  2592257045U, // <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
+  1946996040U, // <0,4,2,u>: Cost 2 vtrnl LHS, RHS
+  3692349590U, // <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2>
+  3826878614U, // <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2>
+  3826878625U, // <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4>
+  3692349852U, // <0,4,3,3>: Cost 4 vext2 <0,2,0,4>, <3,3,3,3>
+  3692349954U, // <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6>
+  3826878978U, // <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6>
+  4095200566U, // <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS
+  3713583814U, // <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4>
+  3692350238U, // <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2>
+  2550464552U, // <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4>
+  3962194914U, // <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0>
+  3693677631U, // <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3>
+  3642124467U, // <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4>
+  2718715088U, // <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4>
+  2618608950U, // <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS
+  2753137974U, // <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS
+  3666015255U, // <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4>
+  2618609193U, // <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS
+  2568388710U, // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS
+  2568389526U, // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0>
+  3636159963U, // <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5>
+  2568390836U, // <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5>
+  2568391990U, // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS
+  2718715180U, // <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6>
+  1618136374U, // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
+  2592281624U, // <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5>
+  1618136392U, // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
+  2550480938U, // <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6>
+  3826880801U, // <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2>
+  2562426332U, // <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6>
+  3786190181U, // <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0>
+  2718715252U, // <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6>
+  3826881165U, // <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6>
+  2712669568U, // <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0>
+  2657760081U, // <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4>
+  2718715284U, // <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2>
+  3654090854U, // <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS
+  3934229326U, // <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1>
+  3734156437U, // <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4>
+  3734820070U, // <0,4,7,3>: Cost 4 vext2 <7,3,0,4>, <7,3,0,4>
+  3654094134U, // <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS
+  2713259464U, // <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
+  2713333201U, // <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0>
+  3654095866U, // <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2>
+  2713259464U, // <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
+  2568413286U, // <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS
+  2618611502U, // <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS
+  2753140526U, // <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS
+  2568415415U, // <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u>
+  2568416566U, // <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS
+  1817423158U, // <0,4,u,5>: Cost 2 vzipl LHS, RHS
+  1947438390U, // <0,4,u,6>: Cost 2 vtrnl LHS, RHS
+  2592306203U, // <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u>
+  1947438408U, // <0,4,u,u>: Cost 2 vtrnl LHS, RHS
+  3630219264U, // <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0>
+  2625912934U, // <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS
+  3692355748U, // <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2>
+  3693019384U, // <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5>
+  3630222646U, // <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS
+  3699655062U, // <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1>
+  2718715508U, // <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1>
+  3087011126U, // <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS
+  2625913501U, // <0,5,0,u>: Cost 3 vext2 <1,4,0,5>, LHS
+  1500659814U, // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS
+  2886520528U, // <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3>
+  2574403176U, // <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2>
+  2574403734U, // <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2>
+  1500662674U, // <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1>
+  2886520836U, // <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5>
+  2886520930U, // <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0>
+  2718715600U, // <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3>
+  1500665646U, // <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS
+  2556493926U, // <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS
+  2244546120U, // <0,5,2,1>: Cost 3 vrev <5,0,1,2>
+  3692357256U, // <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7>
+  2568439994U, // <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2>
+  2556497206U, // <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS
+  3020738564U, // <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5>
+  4027877161U, // <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6>
+  3093220662U, // <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS
+  3093220663U, // <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS
+  3699656854U, // <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2>
+  3699656927U, // <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3>
+  3699657006U, // <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1>
+  3699657116U, // <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3>
+  2637859284U, // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5>
+  3790319453U, // <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0>
+  3699657354U, // <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7>
+  2716725103U, // <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0>
+  2716798840U, // <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0>
+  2661747602U, // <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1>
+  3630252810U, // <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4>
+  3636225507U, // <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4>
+  3716910172U, // <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5>
+  3962195892U, // <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6>
+  2625916214U, // <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS
+  3718901071U, // <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5>
+  2718715846U, // <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6>
+  2625916457U, // <0,5,4,u>: Cost 3 vext2 <1,4,0,5>, RHS
+  3791278034U, // <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0>
+  3791351771U, // <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0>
+  3318386260U, // <0,5,5,2>: Cost 4 vrev <5,0,2,5>
+  3791499245U, // <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0>
+  3318533734U, // <0,5,5,4>: Cost 4 vrev <5,0,4,5>
+  2718715908U, // <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5>
+  2657767522U, // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0>
+  2718715928U, // <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7>
+  2718715937U, // <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7>
+  2592358502U, // <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS
+  3792015404U, // <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0>
+  3731509754U, // <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3>
+  3785748546U, // <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4>
+  2592361782U, // <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS
+  2592362594U, // <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0>
+  3785748576U, // <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7>
+  1644974178U, // <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0>
+  1645047915U, // <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0>
+  2562506854U, // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS
+  2562507670U, // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0>
+  2562508262U, // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7>
+  3636250774U, // <0,5,7,3>: Cost 4 vext1 <2,0,5,7>, <3,0,1,2>
+  2562510134U, // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS
+  2718716072U, // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7>
+  2718716074U, // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0>
+  2719379635U, // <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0>
+  2562512686U, // <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS
+  1500717158U, // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS
+  2625918766U, // <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS
+  2719674583U, // <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0>
+  2568489152U, // <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u>
+  1500720025U, // <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u>
+  2625919130U, // <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS
+  2586407243U, // <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u>
+  1646301444U, // <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0>
+  1646375181U, // <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0>
+  2586411110U, // <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS
+  2619949158U, // <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS
+  2619949220U, // <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2>
+  3785748789U, // <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4>
+  2619949386U, // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6>
+  2586415202U, // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0>
+  2586415436U, // <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0>
+  2952793398U, // <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS
+  2619949725U, // <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS
+  2562531430U, // <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS
+  3693691700U, // <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1>
+  2886521338U, // <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3>
+  3693691864U, // <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3>
+  2562534710U, // <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS
+  2580450932U, // <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1>
+  2886521656U, // <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6>
+  2966736182U, // <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
+  2966736183U, // <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS
+  1500741734U, // <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS
+  2250518817U, // <0,6,2,1>: Cost 3 vrev <6,0,1,2>
+  2574485096U, // <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2>
+  2631894694U, // <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1>
+  1500744604U, // <0,6,2,4>: Cost 2 vext1 <4,0,6,2>, <4,0,6,2>
+  2574487248U, // <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3>
+  3020739384U, // <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6>
+  2954136886U, // <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS
+  1500747566U, // <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS
+  3693693078U, // <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2>
+  3705637136U, // <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7>
+  3705637192U, // <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0>
+  3693693340U, // <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3>
+  2637867477U, // <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6>
+  3705637424U, // <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7>
+  3666154056U, // <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0>
+  2722697800U, // <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0>
+  2722771537U, // <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0>
+  2562556006U, // <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS
+  4095316257U, // <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2>
+  2562557420U, // <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4>
+  3636299926U, // <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2>
+  2562559286U, // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS
+  2619952438U, // <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS
+  2723287696U, // <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0>
+  4027895094U, // <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS
+  2619952681U, // <0,6,4,u>: Cost 3 vext2 <0,4,0,6>, RHS
+  2718716594U, // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
+  3648250774U, // <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0>
+  3792458436U, // <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7>
+  3705638767U, // <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0>
+  3648252831U, // <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5>
+  3797619416U, // <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0>
+  3792458472U, // <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7>
+  4035202358U, // <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS
+  2718716594U, // <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
+  3786412796U, // <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0>
+  3792458504U, // <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3>
+  3728200126U, // <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6>
+  3798135575U, // <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0>
+  3786412836U, // <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4>
+  3792458543U, // <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6>
+  2718716728U, // <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6>
+  2718716738U, // <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7>
+  2718716747U, // <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7>
+  2718716750U, // <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1>
+  2724909910U, // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0>
+  3636323823U, // <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7>
+  2725057384U, // <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0>
+  2718716790U, // <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5>
+  2718716800U, // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6>
+  3792458629U, // <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2>
+  2725352332U, // <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0>
+  2718716822U, // <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1>
+  1500790886U, // <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS
+  2619954990U, // <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS
+  2562590192U, // <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u>
+  2725721017U, // <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0>
+  1500793762U, // <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u>
+  2619955354U, // <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS
+  2725942228U, // <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0>
+  2954186038U, // <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS
+  1500796718U, // <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS
+  2256401391U, // <0,7,0,0>: Cost 3 vrev <7,0,0,0>
+  2632564838U, // <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS
+  2256548865U, // <0,7,0,2>: Cost 3 vrev <7,0,2,0>
+  3700998396U, // <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0>
+  2718716952U, // <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5>
+  2718716962U, // <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6>
+  2621284845U, // <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7>
+  3904685542U, // <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7>
+  2632565405U, // <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS
+  2256409584U, // <0,7,1,0>: Cost 3 vrev <7,0,0,1>
+  3706307380U, // <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1>
+  2632565654U, // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0>
+  3769603168U, // <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5>
+  2256704532U, // <0,7,1,4>: Cost 3 vrev <7,0,4,1>
+  3769603184U, // <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3>
+  3700999366U, // <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7>
+  2886522476U, // <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7>
+  2256999480U, // <0,7,1,u>: Cost 3 vrev <7,0,u,1>
+  2586501222U, // <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS
+  1182749690U, // <0,7,2,1>: Cost 2 vrev <7,0,1,2>
+  3636356595U, // <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2>
+  2727711916U, // <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0>
+  2586504502U, // <0,7,2,4>: Cost 3 vext1 <6,0,7,2>, RHS
+  2632566606U, // <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7>
+  2586505559U, // <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2>
+  3020740204U, // <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7>
+  1183265849U, // <0,7,2,u>: Cost 2 vrev <7,0,u,2>
+  3701000342U, // <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2>
+  3706308849U, // <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3>
+  3330315268U, // <0,7,3,2>: Cost 4 vrev <7,0,2,3>
+  3706309020U, // <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3>
+  3706309122U, // <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6>
+  3712281127U, // <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7>
+  2639202936U, // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
+  3802412321U, // <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0>
+  2640530202U, // <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7>
+  3654287462U, // <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS
+  2256507900U, // <0,7,4,1>: Cost 3 vrev <7,0,1,4>
+  2256581637U, // <0,7,4,2>: Cost 3 vrev <7,0,2,4>
+  3660262008U, // <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7>
+  3786413405U, // <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6>
+  2632568118U, // <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS
+  3718917457U, // <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7>
+  3787003255U, // <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5>
+  2632568361U, // <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS
+  3706310268U, // <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0>
+  3792459156U, // <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7>
+  3330331654U, // <0,7,5,2>: Cost 4 vrev <7,0,2,5>
+  3722899255U, // <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7>
+  2256737304U, // <0,7,5,4>: Cost 3 vrev <7,0,4,5>
+  3724226521U, // <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7>
+  2718717377U, // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7>
+  2729997763U, // <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0>
+  2720044499U, // <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7>
+  3712946517U, // <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0>
+  2256524286U, // <0,7,6,1>: Cost 3 vrev <7,0,1,6>
+  3792459246U, // <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7>
+  3796440567U, // <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7>
+  3654307126U, // <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS
+  2656457394U, // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7>
+  3792459281U, // <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6>
+  2730661396U, // <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0>
+  2658448293U, // <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7>
+  3787003431U, // <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1>
+  3654312854U, // <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0>
+  3654313446U, // <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7>
+  3804771905U, // <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0>
+  3654315318U, // <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS
+  3654315651U, // <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7>
+  3660288348U, // <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7>
+  2718717548U, // <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7>
+  2664420990U, // <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7>
+  2256466935U, // <0,7,u,0>: Cost 3 vrev <7,0,0,u>
+  1182798848U, // <0,7,u,1>: Cost 2 vrev <7,0,1,u>
+  2256614409U, // <0,7,u,2>: Cost 3 vrev <7,0,2,u>
+  2731693714U, // <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0>
+  2256761883U, // <0,7,u,4>: Cost 3 vrev <7,0,4,u>
+  2632571034U, // <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS
+  2669066421U, // <0,7,u,6>: Cost 3 vext2 <u,6,0,7>, <u,6,0,7>
+  2731988662U, // <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0>
+  1183315007U, // <0,7,u,u>: Cost 2 vrev <7,0,u,u>
+  135053414U, // <0,u,0,0>: Cost 1 vdup0 LHS
+  1544896614U, // <0,u,0,1>: Cost 2 vext2 <0,2,0,u>, LHS
+  1678999654U, // <0,u,0,2>: Cost 2 vuzpl LHS, LHS
+  2691880677U, // <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, <u,0,3,2>
+  1476988214U, // <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS
+  2718791419U, // <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, <u,0,5,6>
+  3021248666U, // <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
+  2592535607U, // <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0>
+  135053414U, // <0,u,0,u>: Cost 1 vdup0 LHS
+  1476993097U, // <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1>
+  1812780846U, // <0,u,1,1>: Cost 2 vzipl LHS, LHS
+  1618138926U, // <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
+  2752742134U, // <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
+  1476996406U, // <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS
+  1812781210U, // <0,u,1,5>: Cost 2 vzipl LHS, RHS
+  2887006416U, // <0,u,1,6>: Cost 3 vzipl LHS, <u,6,3,7>
+  2966736200U, // <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
+  1812781413U, // <0,u,1,u>: Cost 2 vzipl LHS, LHS
+  1482973286U, // <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS
+  1482973987U, // <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2>
+  1946998574U, // <0,u,2,2>: Cost 2 vtrnl LHS, LHS
+  835584U, // <0,u,2,3>: Cost 0 copy LHS
+  1482976566U, // <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS
+  3020781631U, // <0,u,2,5>: Cost 3 vtrnl LHS, <u,4,5,6>
+  1946998938U, // <0,u,2,6>: Cost 2 vtrnl LHS, RHS
+  1518810169U, // <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2>
+  835584U, // <0,u,2,u>: Cost 0 copy LHS
+  2618640534U, // <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2>
+  2752743574U, // <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
+  2636556597U, // <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u>
+  2752743836U, // <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
+  2618640898U, // <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6>
+  2752743938U, // <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
+  2639202936U, // <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
+  2639874762U, // <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u>
+  2752743637U, // <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
+  2562703462U, // <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS
+  2888455982U, // <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
+  3021575982U, // <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
+  2568677591U, // <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4>
+  2562706742U, // <0,u,4,4>: Cost 3 vext1 <2,0,u,4>, RHS
+  1544899894U, // <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS
+  1679002934U, // <0,u,4,6>: Cost 2 vuzpl LHS, RHS
+  2718718033U, // <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, <u,4,7,6>
+  1679002952U, // <0,u,4,u>: Cost 2 vuzpl LHS, RHS
+  2568683622U, // <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS
+  2568684438U, // <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0>
+  3765622902U, // <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, <u,5,2,7>
+  2691881087U, // <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, <u,5,3,7>
+  2568686902U, // <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS
+  2650492890U, // <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u>
+  1618139290U, // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
+  2824834358U, // <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS
+  1618139308U, // <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
+  2592579686U, // <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS
+  2262496983U, // <0,u,6,1>: Cost 3 vrev <u,0,1,6>
+  2654474688U, // <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u>
+  2691881168U, // <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, <u,6,3,7>
+  2592582966U, // <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS
+  2656465587U, // <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u>
+  2657129220U, // <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u>
+  1584051029U, // <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u>
+  1584714662U, // <0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u>
+  2562728038U, // <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS
+  2562728854U, // <0,u,7,1>: Cost 3 vext1 <2,0,u,7>, <1,2,3,0>
+  2562729473U, // <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7>
+  2661111018U, // <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u>
+  2562731318U, // <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS
+  2718718258U, // <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, <u,7,5,6>
+  2586620261U, // <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7>
+  2657793644U, // <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7>
+  2562733870U, // <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS
+  135053414U, // <0,u,u,0>: Cost 1 vdup0 LHS
+  1544902446U, // <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS
+  1679005486U, // <0,u,u,2>: Cost 2 vuzpl LHS, LHS
+  835584U, // <0,u,u,3>: Cost 0 copy LHS
+  1483025718U, // <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS
+  1544902810U, // <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS
+  1679005850U, // <0,u,u,6>: Cost 2 vuzpl LHS, RHS
+  1518859327U, // <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u>
+  835584U, // <0,u,u,u>: Cost 0 copy LHS
+  2689744896U, // <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0>
+  1610694666U, // <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1>
+  2689744916U, // <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2>
+  2619310332U, // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0>
+  2684657701U, // <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1>
+  2620637598U, // <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0>
+  3708977654U, // <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7>
+  3666351168U, // <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0>
+  1611210825U, // <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1>
+  2556780646U, // <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS
+  2556781355U, // <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1>
+  1616003174U, // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+  3693052888U, // <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3>
+  2556783926U, // <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS
+  2580672143U, // <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1>
+  2724839566U, // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7>
+  3654415354U, // <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2>
+  1616003228U, // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS
+  2685690019U, // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1>
+  2685763756U, // <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1>
+  2698297524U, // <1,0,2,2>: Cost 3 vext3 <2,3,0,1>, <0,2,2,0>
+  2685911230U, // <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1>
+  2689745100U, // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6>
+  3764814038U, // <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7>
+  2724839640U, // <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0>
+  2592625658U, // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2>
+  2686279915U, // <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1>
+  3087843328U, // <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0>
+  3087843338U, // <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1>
+  67944550U, // <1,0,3,2>: Cost 1 vrev LHS
+  2568743135U, // <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3>
+  2562772278U, // <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS
+  4099850454U, // <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7>
+  3704998538U, // <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7>
+  2592633923U, // <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3>
+  68386972U, // <1,0,3,u>: Cost 1 vrev LHS
+  2620640146U, // <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1>
+  2689745234U, // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5>
+  2689745244U, // <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6>
+  3760980320U, // <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1>
+  3761054057U, // <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, <0,4,4,1>
+  2619313462U, // <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS
+  3761201531U, // <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1>
+  3666383940U, // <1,0,4,7>: Cost 4 vext1 <7,1,0,4>, <7,1,0,4>
+  2619313705U, // <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS
+  4029300736U, // <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0>
+  2895249510U, // <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS
+  3028287590U, // <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
+  3642501345U, // <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5>
+  2215592058U, // <1,0,5,4>: Cost 3 vrev <0,1,4,5>
+  3724242907U, // <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0>
+  3724906540U, // <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0>
+  3911118134U, // <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS
+  3028287644U, // <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS
+  3762086375U, // <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1>
+  2698297846U, // <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7>
+  3760022015U, // <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7>
+  3642509538U, // <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6>
+  3762381323U, // <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1>
+  3730215604U, // <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0>
+  3730879237U, // <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0>
+  2657801046U, // <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0>
+  2658464679U, // <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0>
+  2659128312U, // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0>
+  4047898278U, // <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1>
+  2215460970U, // <1,0,7,2>: Cost 3 vrev <0,1,2,7>
+  3734861035U, // <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0>
+  3731543398U, // <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6>
+  3736188301U, // <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0>
+  2663110110U, // <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0>
+  3731543660U, // <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7>
+  2664437376U, // <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0>
+  3087884288U, // <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0>
+  1616003730U, // <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1>
+  67985515U, // <1,0,u,2>: Cost 1 vrev LHS
+  2689893028U, // <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1>
+  2689745586U, // <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6>
+  2619316378U, // <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS
+  2669082807U, // <1,0,u,6>: Cost 3 vext2 <u,6,1,0>, <u,6,1,0>
+  2592674888U, // <1,0,u,7>: Cost 3 vext1 <7,1,0,u>, <7,1,0,u>
+  68427937U, // <1,0,u,u>: Cost 1 vrev LHS
+  1543585802U, // <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1>
+  1548894310U, // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS
+  2618654892U, // <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1>
+  2689745654U, // <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2>
+  2622636370U, // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5>
+  2620645791U, // <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1>
+  3696378367U, // <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7>
+  3666424905U, // <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0>
+  1548894866U, // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1>
+  1483112550U, // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+  202162278U, // <1,1,1,1>: Cost 1 vdup1 LHS
+  2622636950U, // <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0>
+  2622637016U, // <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3>
+  1483115830U, // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+  2622637200U, // <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7>
+  2622637263U, // <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7>
+  2592691274U, // <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
+  202162278U, // <1,1,1,u>: Cost 1 vdup1 LHS
+  2550890588U, // <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, <0,1,1,2>
+  2617329183U, // <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1>
+  2622637672U, // <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2>
+  2622637734U, // <1,1,2,3>: Cost 3 vext2 <0,u,1,1>, <2,3,0,1>
+  2550893878U, // <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS
+  3696379744U, // <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7>
+  2622638010U, // <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7>
+  3804554170U, // <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0>
+  2622638139U, // <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1>
+  2622638230U, // <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2>
+  3087844148U, // <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1>
+  4161585244U, // <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2>
+  2014101606U, // <1,1,3,3>: Cost 2 vtrnr LHS, LHS
+  2622638594U, // <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6>
+  2689745920U, // <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7>
+  3763487753U, // <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7>
+  2592707660U, // <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3>
+  2014101611U, // <1,1,3,u>: Cost 2 vtrnr LHS, LHS
+  2556878950U, // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS
+  2221335351U, // <1,1,4,1>: Cost 3 vrev <1,1,1,4>
+  3696380988U, // <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0>
+  3763487805U, // <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5>
+  2556882230U, // <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS
+  1548897590U, // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS
+  2758184246U, // <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS
+  3666457677U, // <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4>
+  1548897833U, // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS
+  2693653615U, // <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1>
+  2617331408U, // <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3>
+  4029302934U, // <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2>
+  2689746064U, // <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7>
+  2221564755U, // <1,1,5,4>: Cost 3 vrev <1,1,4,5>
+  2955559250U, // <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5>
+  2617331810U, // <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0>
+  2825293110U, // <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS
+  2689746109U, // <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7>
+  3696382241U, // <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2>
+  2689746127U, // <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7>
+  2617332218U, // <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3>
+  3763487969U, // <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, <1,6,3,7>
+  3696382605U, // <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6>
+  4029309266U, // <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5>
+  2617332536U, // <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6>
+  2724840702U, // <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0>
+  2725504263U, // <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0>
+  2617332720U, // <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1>
+  2659800138U, // <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
+  3691074717U, // <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3>
+  4167811174U, // <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS
+  2617333094U, // <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6>
+  3295396702U, // <1,1,7,5>: Cost 4 vrev <1,1,5,7>
+  3803891014U, // <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0>
+  2617333356U, // <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7>
+  2659800138U, // <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
+  1483112550U, // <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS
+  202162278U, // <1,1,u,1>: Cost 1 vdup1 LHS
+  2622642056U, // <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, <u,2,3,3>
+  2014142566U, // <1,1,u,3>: Cost 2 vtrnr LHS, LHS
+  1483115830U, // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS
+  1548900506U, // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS
+  2622642384U, // <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, <u,6,3,7>
+  2825293353U, // <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS
+  202162278U, // <1,1,u,u>: Cost 1 vdup1 LHS
+  2635251712U, // <1,2,0,0>: Cost 3 vext2 <3,0,1,2>, <0,0,0,0>
+  1561509990U, // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS
+  2618663085U, // <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2>
+  2696529358U, // <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1>
+  2635252050U, // <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5>
+  3769533926U, // <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7>
+  2621317617U, // <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2>
+  2659140170U, // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1>
+  1561510557U, // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS
+  2623308516U, // <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2>
+  2635252532U, // <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1>
+  2631271318U, // <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0>
+  2958180454U, // <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS
+  2550959414U, // <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS
+  2635252880U, // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7>
+  2635252952U, // <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7>
+  3732882731U, // <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0>
+  2958180459U, // <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS
+  2629281213U, // <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2>
+  2635253280U, // <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2>
+  2618664552U, // <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2>
+  2689746546U, // <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3>
+  3764815485U, // <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5>
+  3760023176U, // <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7>
+  2635253690U, // <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7>
+  2659141610U, // <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1>
+  2689746591U, // <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3>
+  403488870U, // <1,2,3,0>: Cost 1 vext1 LHS, LHS
+  1477231350U, // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  1477232232U, // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
+  1477233052U, // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3>
+  403492150U, // <1,2,3,4>: Cost 1 vext1 LHS, RHS
+  1525010128U, // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
+  1525010938U, // <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1525011450U, // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
+  403494702U, // <1,2,3,u>: Cost 1 vext1 LHS, LHS
+  2641226607U, // <1,2,4,0>: Cost 3 vext2 <4,0,1,2>, <4,0,1,2>
+  3624723446U, // <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6>
+  3301123609U, // <1,2,4,2>: Cost 4 vrev <2,1,2,4>
+  2598759198U, // <1,2,4,3>: Cost 3 vext1 <u,1,2,4>, <3,u,1,2>
+  2659142864U, // <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4>
+  1561513270U, // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS
+  2659143028U, // <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6>
+  2659143112U, // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0>
+  1561513513U, // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS
+  2550988902U, // <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS
+  2550989824U, // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7>
+  3624732264U, // <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2>
+  2955559014U, // <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
+  2550992182U, // <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS
+  2659143684U, // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5>
+  2659143778U, // <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0>
+  2659143848U, // <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7>
+  2550994734U, // <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS
+  2700289945U, // <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1>
+  2635256232U, // <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2>
+  2659144186U, // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3>
+  2689746874U, // <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7>
+  3763488705U, // <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5>
+  3763488716U, // <1,2,6,5>: Cost 4 vext3 <0,u,1,1>, <2,6,5,7>
+  2659144504U, // <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6>
+  2657817432U, // <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2>
+  2689746919U, // <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7>
+  1585402874U, // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2>
+  2659144770U, // <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2>
+  3708998858U, // <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3>
+  2635257059U, // <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1>
+  2659145062U, // <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6>
+  3732886916U, // <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0>
+  3732886998U, // <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1>
+  2659145255U, // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1>
+  1590711938U, // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2>
+  403529835U, // <1,2,u,0>: Cost 1 vext1 LHS, LHS
+  1477272310U, // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  1477273192U, // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2>
+  1477273750U, // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2>
+  403533110U, // <1,2,u,4>: Cost 1 vext1 LHS, RHS
+  1561516186U, // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS
+  1525051898U, // <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1525052410U, // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
+  403535662U, // <1,2,u,u>: Cost 1 vext1 LHS, LHS
+  2819407872U, // <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0>
+  1551564902U, // <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS
+  2819408630U, // <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2>
+  2619334911U, // <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3>
+  2625306962U, // <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5>
+  3832725879U, // <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6>
+  3699048959U, // <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7>
+  3776538827U, // <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1>
+  1551565469U, // <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS
+  2618671862U, // <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2>
+  2819408692U, // <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1>
+  2624643975U, // <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3>
+  1745666150U, // <1,3,1,3>: Cost 2 vuzpr LHS, LHS
+  2557005110U, // <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS
+  2625307792U, // <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, <1,5,3,7>
+  3698386127U, // <1,3,1,6>: Cost 4 vext2 <1,2,1,3>, <1,6,1,7>
+  2592838748U, // <1,3,1,7>: Cost 3 vext1 <7,1,3,1>, <7,1,3,1>
+  1745666155U, // <1,3,1,u>: Cost 2 vuzpr LHS, LHS
+  2819408790U, // <1,3,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
+  2625308193U, // <1,3,2,1>: Cost 3 vext2 <1,3,1,3>, <2,1,3,3>
+  2819408036U, // <1,3,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
+  2819851890U, // <1,3,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
+  2819408794U, // <1,3,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
+  3893149890U, // <1,3,2,5>: Cost 4 vuzpr LHS, <0,2,3,5>
+  2819408076U, // <1,3,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
+  3772041583U, // <1,3,2,7>: Cost 4 vext3 <2,3,0,1>, <3,2,7,3>
+  2819408042U, // <1,3,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
+  1483276390U, // <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS
+  1483277128U, // <1,3,3,1>: Cost 2 vext1 <1,1,3,3>, <1,1,3,3>
+  2557019752U, // <1,3,3,2>: Cost 3 vext1 <1,1,3,3>, <2,2,2,2>
+  2819408856U, // <1,3,3,3>: Cost 3 vuzpr LHS, <1,3,1,3>
+  1483279670U, // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS
+  2819409614U, // <1,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
+  2598826490U, // <1,3,3,6>: Cost 3 vext1 <u,1,3,3>, <6,2,7,3>
+  3087844352U, // <1,3,3,7>: Cost 3 vtrnr LHS, <1,3,5,7>
+  1483282222U, // <1,3,3,u>: Cost 2 vext1 <1,1,3,3>, LHS
+  2568970342U, // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS
+  2568971224U, // <1,3,4,1>: Cost 3 vext1 <3,1,3,4>, <1,3,1,3>
+  3832761290U, // <1,3,4,2>: Cost 4 vuzpl <1,2,3,4>, <4,1,2,3>
+  2233428219U, // <1,3,4,3>: Cost 3 vrev <3,1,3,4>
+  2568973622U, // <1,3,4,4>: Cost 3 vext1 <3,1,3,4>, RHS
+  1551568182U, // <1,3,4,5>: Cost 2 vext2 <1,3,1,3>, RHS
+  2819410434U, // <1,3,4,6>: Cost 3 vuzpr LHS, <3,4,5,6>
+  3666605151U, // <1,3,4,7>: Cost 4 vext1 <7,1,3,4>, <7,1,3,4>
+  1551568425U, // <1,3,4,u>: Cost 2 vext2 <1,3,1,3>, RHS
+  2563006566U, // <1,3,5,0>: Cost 3 vext1 <2,1,3,5>, LHS
+  2568979456U, // <1,3,5,1>: Cost 3 vext1 <3,1,3,5>, <1,3,5,7>
+  2563008035U, // <1,3,5,2>: Cost 3 vext1 <2,1,3,5>, <2,1,3,5>
+  2233436412U, // <1,3,5,3>: Cost 3 vrev <3,1,3,5>
+  2563009846U, // <1,3,5,4>: Cost 3 vext1 <2,1,3,5>, RHS
+  2867187716U, // <1,3,5,5>: Cost 3 vuzpr LHS, <5,5,5,5>
+  2655834214U, // <1,3,5,6>: Cost 3 vext2 <6,4,1,3>, <5,6,7,4>
+  1745669430U, // <1,3,5,7>: Cost 2 vuzpr LHS, RHS
+  1745669431U, // <1,3,5,u>: Cost 2 vuzpr LHS, RHS
+  2867187810U, // <1,3,6,0>: Cost 3 vuzpr LHS, <5,6,7,0>
+  3699052931U, // <1,3,6,1>: Cost 4 vext2 <1,3,1,3>, <6,1,3,1>
+  2654507460U, // <1,3,6,2>: Cost 3 vext2 <6,2,1,3>, <6,2,1,3>
+  3766291091U, // <1,3,6,3>: Cost 4 vext3 <1,3,3,1>, <3,6,3,7>
+  2655834726U, // <1,3,6,4>: Cost 3 vext2 <6,4,1,3>, <6,4,1,3>
+  3923384562U, // <1,3,6,5>: Cost 4 vuzpr <5,1,7,3>, <u,6,7,5>
+  2657161992U, // <1,3,6,6>: Cost 3 vext2 <6,6,1,3>, <6,6,1,3>
+  2819852218U, // <1,3,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
+  2819852219U, // <1,3,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
+  2706926275U, // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1>
+  2659816524U, // <1,3,7,1>: Cost 3 vext2 <7,1,1,3>, <7,1,1,3>
+  3636766245U, // <1,3,7,2>: Cost 4 vext1 <2,1,3,7>, <2,1,3,7>
+  2867187903U, // <1,3,7,3>: Cost 3 vuzpr LHS, <5,7,u,3>
+  2625312102U, // <1,3,7,4>: Cost 3 vext2 <1,3,1,3>, <7,4,5,6>
+  2867188598U, // <1,3,7,5>: Cost 3 vuzpr LHS, <6,7,4,5>
+  3728250344U, // <1,3,7,6>: Cost 4 vext2 <6,2,1,3>, <7,6,2,1>
+  2867187880U, // <1,3,7,7>: Cost 3 vuzpr LHS, <5,7,5,7>
+  2707516171U, // <1,3,7,u>: Cost 3 vext3 <3,7,u,1>, <3,7,u,1>
+  1483317350U, // <1,3,u,0>: Cost 2 vext1 <1,1,3,u>, LHS
+  1483318093U, // <1,3,u,1>: Cost 2 vext1 <1,1,3,u>, <1,1,3,u>
+  2819410718U, // <1,3,u,2>: Cost 3 vuzpr LHS, <3,u,1,2>
+  1745666717U, // <1,3,u,3>: Cost 2 vuzpr LHS, LHS
+  1483320630U, // <1,3,u,4>: Cost 2 vext1 <1,1,3,u>, RHS
+  1551571098U, // <1,3,u,5>: Cost 2 vext2 <1,3,1,3>, RHS
+  2819410758U, // <1,3,u,6>: Cost 3 vuzpr LHS, <3,u,5,6>
+  1745669673U, // <1,3,u,7>: Cost 2 vuzpr LHS, RHS
+  1745666722U, // <1,3,u,u>: Cost 2 vuzpr LHS, LHS
+  2617352205U, // <1,4,0,0>: Cost 3 vext2 <0,0,1,4>, <0,0,1,4>
+  2619342950U, // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS
+  3692421295U, // <1,4,0,2>: Cost 4 vext2 <0,2,1,4>, <0,2,1,4>
+  2619343104U, // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
+  2617352530U, // <1,4,0,4>: Cost 3 vext2 <0,0,1,4>, <0,4,1,5>
+  1634880402U, // <1,4,0,5>: Cost 2 vext3 <4,0,5,1>, <4,0,5,1>
+  2713930652U, // <1,4,0,6>: Cost 3 vext3 <4,u,5,1>, <4,0,6,2>
+  3732898396U, // <1,4,0,7>: Cost 4 vext2 <7,0,1,4>, <0,7,4,1>
+  1635101613U, // <1,4,0,u>: Cost 2 vext3 <4,0,u,1>, <4,0,u,1>
+  3693085430U, // <1,4,1,0>: Cost 4 vext2 <0,3,1,4>, <1,0,3,2>
+  2623988535U, // <1,4,1,1>: Cost 3 vext2 <1,1,1,4>, <1,1,1,4>
+  3693085590U, // <1,4,1,2>: Cost 4 vext2 <0,3,1,4>, <1,2,3,0>
+  3692422134U, // <1,4,1,3>: Cost 4 vext2 <0,2,1,4>, <1,3,4,6>
+  3693085726U, // <1,4,1,4>: Cost 4 vext2 <0,3,1,4>, <1,4,0,1>
+  2892401974U, // <1,4,1,5>: Cost 3 vzipl <1,1,1,1>, RHS
+  3026619702U, // <1,4,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
+  3800206324U, // <1,4,1,7>: Cost 4 vext3 <7,0,4,1>, <4,1,7,0>
+  2892402217U, // <1,4,1,u>: Cost 3 vzipl <1,1,1,1>, RHS
+  3966978927U, // <1,4,2,0>: Cost 4 vzipl <1,2,3,4>, <4,0,1,2>
+  3966979018U, // <1,4,2,1>: Cost 4 vzipl <1,2,3,4>, <4,1,2,3>
+  3693086312U, // <1,4,2,2>: Cost 4 vext2 <0,3,1,4>, <2,2,2,2>
+  2635269798U, // <1,4,2,3>: Cost 3 vext2 <3,0,1,4>, <2,3,0,1>
+  3966979280U, // <1,4,2,4>: Cost 4 vzipl <1,2,3,4>, <4,4,4,4>
+  2893204790U, // <1,4,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+  3693086650U, // <1,4,2,6>: Cost 4 vext2 <0,3,1,4>, <2,6,3,7>
+  3666662502U, // <1,4,2,7>: Cost 4 vext1 <7,1,4,2>, <7,1,4,2>
+  2893205033U, // <1,4,2,u>: Cost 3 vzipl <1,2,3,0>, RHS
+  2563063910U, // <1,4,3,0>: Cost 3 vext1 <2,1,4,3>, LHS
+  2563064730U, // <1,4,3,1>: Cost 3 vext1 <2,1,4,3>, <1,2,3,4>
+  2563065386U, // <1,4,3,2>: Cost 3 vext1 <2,1,4,3>, <2,1,4,3>
+  3693087132U, // <1,4,3,3>: Cost 4 vext2 <0,3,1,4>, <3,3,3,3>
+  2619345410U, // <1,4,3,4>: Cost 3 vext2 <0,3,1,4>, <3,4,5,6>
+  3087843666U, // <1,4,3,5>: Cost 3 vtrnr LHS, <0,4,1,5>
+  3087843676U, // <1,4,3,6>: Cost 3 vtrnr LHS, <0,4,2,6>
+  3666670695U, // <1,4,3,7>: Cost 4 vext1 <7,1,4,3>, <7,1,4,3>
+  3087843669U, // <1,4,3,u>: Cost 3 vtrnr LHS, <0,4,1,u>
+  2620672914U, // <1,4,4,0>: Cost 3 vext2 <0,5,1,4>, <4,0,5,1>
+  3630842706U, // <1,4,4,1>: Cost 4 vext1 <1,1,4,4>, <1,1,4,4>
+  3313069003U, // <1,4,4,2>: Cost 4 vrev <4,1,2,4>
+  3642788100U, // <1,4,4,3>: Cost 4 vext1 <3,1,4,4>, <3,1,4,4>
+  2713930960U, // <1,4,4,4>: Cost 3 vext3 <4,u,5,1>, <4,4,4,4>
+  2619346230U, // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS
+  2713930980U, // <1,4,4,6>: Cost 3 vext3 <4,u,5,1>, <4,4,6,6>
+  3736882642U, // <1,4,4,7>: Cost 4 vext2 <7,6,1,4>, <4,7,6,1>
+  2619346473U, // <1,4,4,u>: Cost 3 vext2 <0,3,1,4>, RHS
+  2557108326U, // <1,4,5,0>: Cost 3 vext1 <1,1,4,5>, LHS
+  2557109075U, // <1,4,5,1>: Cost 3 vext1 <1,1,4,5>, <1,1,4,5>
+  2598913774U, // <1,4,5,2>: Cost 3 vext1 <u,1,4,5>, <2,3,u,1>
+  3630852246U, // <1,4,5,3>: Cost 4 vext1 <1,1,4,5>, <3,0,1,2>
+  2557111606U, // <1,4,5,4>: Cost 3 vext1 <1,1,4,5>, RHS
+  2895252790U, // <1,4,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+  1616006454U, // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  3899059510U, // <1,4,5,7>: Cost 4 vuzpr <1,1,1,4>, RHS
+  1616006472U, // <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS
+  2557116518U, // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS
+  2557117236U, // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,1,1>
+  3630859880U, // <1,4,6,2>: Cost 4 vext1 <1,1,4,6>, <2,2,2,2>
+  2569062550U, // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,0,1,2>
+  2557119798U, // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS
+  3763490174U, // <1,4,6,5>: Cost 4 vext3 <0,u,1,1>, <4,6,5,7>
+  3763490183U, // <1,4,6,6>: Cost 4 vext3 <0,u,1,1>, <4,6,6,7>
+  2712751498U, // <1,4,6,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+  2557122350U, // <1,4,6,u>: Cost 3 vext1 <1,1,4,6>, LHS
+  2659161084U, // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4>
+  3732903040U, // <1,4,7,1>: Cost 4 vext2 <7,0,1,4>, <7,1,7,1>
+  3734230174U, // <1,4,7,2>: Cost 4 vext2 <7,2,1,4>, <7,2,1,4>
+  3734893807U, // <1,4,7,3>: Cost 4 vext2 <7,3,1,4>, <7,3,1,4>
+  3660729654U, // <1,4,7,4>: Cost 4 vext1 <6,1,4,7>, RHS
+  3786493384U, // <1,4,7,5>: Cost 4 vext3 <4,6,7,1>, <4,7,5,0>
+  2713341394U, // <1,4,7,6>: Cost 3 vext3 <4,7,6,1>, <4,7,6,1>
+  3660731386U, // <1,4,7,7>: Cost 4 vext1 <6,1,4,7>, <7,0,1,2>
+  2664470148U, // <1,4,7,u>: Cost 3 vext2 <7,u,1,4>, <7,u,1,4>
+  2557132902U, // <1,4,u,0>: Cost 3 vext1 <1,1,4,u>, LHS
+  2619348782U, // <1,4,u,1>: Cost 3 vext2 <0,3,1,4>, LHS
+  2563106351U, // <1,4,u,2>: Cost 3 vext1 <2,1,4,u>, <2,1,4,u>
+  2713783816U, // <1,4,u,3>: Cost 3 vext3 <4,u,3,1>, <4,u,3,1>
+  2622666815U, // <1,4,u,4>: Cost 3 vext2 <0,u,1,4>, <u,4,5,6>
+  1640189466U, // <1,4,u,5>: Cost 2 vext3 <4,u,5,1>, <4,u,5,1>
+  1616006697U, // <1,4,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  2712751498U, // <1,4,u,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+  1616006715U, // <1,4,u,u>: Cost 2 vext3 <0,u,1,1>, RHS
+  2620014592U, // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0>
+  1546272870U, // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS
+  2618687664U, // <1,5,0,2>: Cost 3 vext2 <0,2,1,5>, <0,2,1,5>
+  3693093120U, // <1,5,0,3>: Cost 4 vext2 <0,3,1,5>, <0,3,1,4>
+  1546273106U, // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+  2620678563U, // <1,5,0,5>: Cost 3 vext2 <0,5,1,5>, <0,5,1,5>
+  2714668660U, // <1,5,0,6>: Cost 3 vext3 <5,0,6,1>, <5,0,6,1>
+  3772042877U, // <1,5,0,7>: Cost 4 vext3 <2,3,0,1>, <5,0,7,1>
+  1546273437U, // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS
+  2620015350U, // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2>
+  2620015412U, // <1,5,1,1>: Cost 3 vext2 <0,4,1,5>, <1,1,1,1>
+  2620015510U, // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0>
+  2618688512U, // <1,5,1,3>: Cost 3 vext2 <0,2,1,5>, <1,3,5,7>
+  2620015677U, // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5>
+  2620015727U, // <1,5,1,5>: Cost 3 vext2 <0,4,1,5>, <1,5,0,1>
+  2620015859U, // <1,5,1,6>: Cost 3 vext2 <0,4,1,5>, <1,6,5,7>
+  3093728566U, // <1,5,1,7>: Cost 3 vtrnr <1,1,1,1>, RHS
+  2620015981U, // <1,5,1,u>: Cost 3 vext2 <0,4,1,5>, <1,u,1,3>
+  3692430816U, // <1,5,2,0>: Cost 4 vext2 <0,2,1,5>, <2,0,5,1>
+  2620016163U, // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5>
+  2620016232U, // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2>
+  2620016294U, // <1,5,2,3>: Cost 3 vext2 <0,4,1,5>, <2,3,0,1>
+  3693758221U, // <1,5,2,4>: Cost 4 vext2 <0,4,1,5>, <2,4,2,5>
+  3692431209U, // <1,5,2,5>: Cost 4 vext2 <0,2,1,5>, <2,5,3,7>
+  2620016570U, // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7>
+  4173598006U, // <1,5,2,7>: Cost 4 vtrnr <2,1,3,2>, RHS
+  2620016699U, // <1,5,2,u>: Cost 3 vext2 <0,4,1,5>, <2,u,0,1>
+  2620016790U, // <1,5,3,0>: Cost 3 vext2 <0,4,1,5>, <3,0,1,2>
+  2569110672U, // <1,5,3,1>: Cost 3 vext1 <3,1,5,3>, <1,5,3,7>
+  3693758785U, // <1,5,3,2>: Cost 4 vext2 <0,4,1,5>, <3,2,2,2>
+  2620017052U, // <1,5,3,3>: Cost 3 vext2 <0,4,1,5>, <3,3,3,3>
+  2620017154U, // <1,5,3,4>: Cost 3 vext2 <0,4,1,5>, <3,4,5,6>
+  3135623172U, // <1,5,3,5>: Cost 3 vtrnr LHS, <5,5,5,5>
+  4161587048U, // <1,5,3,6>: Cost 4 vtrnr LHS, <2,5,3,6>
+  2014104886U, // <1,5,3,7>: Cost 2 vtrnr LHS, RHS
+  2014104887U, // <1,5,3,u>: Cost 2 vtrnr LHS, RHS
+  2620017554U, // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1>
+  2620017634U, // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
+  3693759551U, // <1,5,4,2>: Cost 4 vext2 <0,4,1,5>, <4,2,6,3>
+  3642861837U, // <1,5,4,3>: Cost 4 vext1 <3,1,5,4>, <3,1,5,4>
+  2575092710U, // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4>
+  1546276150U, // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS
+  2759855414U, // <1,5,4,6>: Cost 3 vuzpl <1,3,5,7>, RHS
+  2713931718U, // <1,5,4,7>: Cost 3 vext3 <4,u,5,1>, <5,4,7,6>
+  1546276393U, // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS
+  2557182054U, // <1,5,5,0>: Cost 3 vext1 <1,1,5,5>, LHS
+  2557182812U, // <1,5,5,1>: Cost 3 vext1 <1,1,5,5>, <1,1,5,5>
+  3630925347U, // <1,5,5,2>: Cost 4 vext1 <1,1,5,5>, <2,1,3,5>
+  4029301675U, // <1,5,5,3>: Cost 4 vzipr <0,4,1,5>, <1,2,5,3>
+  2557185334U, // <1,5,5,4>: Cost 3 vext1 <1,1,5,5>, RHS
+  2713931780U, // <1,5,5,5>: Cost 3 vext3 <4,u,5,1>, <5,5,5,5>
+  2667794530U, // <1,5,5,6>: Cost 3 vext2 <u,4,1,5>, <5,6,7,0>
+  2713931800U, // <1,5,5,7>: Cost 3 vext3 <4,u,5,1>, <5,5,7,7>
+  2557187886U, // <1,5,5,u>: Cost 3 vext1 <1,1,5,5>, LHS
+  2718208036U, // <1,5,6,0>: Cost 3 vext3 <5,6,0,1>, <5,6,0,1>
+  2620019115U, // <1,5,6,1>: Cost 3 vext2 <0,4,1,5>, <6,1,7,5>
+  2667794938U, // <1,5,6,2>: Cost 3 vext2 <u,4,1,5>, <6,2,7,3>
+  3787673666U, // <1,5,6,3>: Cost 4 vext3 <4,u,5,1>, <5,6,3,4>
+  3693761165U, // <1,5,6,4>: Cost 4 vext2 <0,4,1,5>, <6,4,5,6>
+  3319279297U, // <1,5,6,5>: Cost 4 vrev <5,1,5,6>
+  2667795256U, // <1,5,6,6>: Cost 3 vext2 <u,4,1,5>, <6,6,6,6>
+  2713931874U, // <1,5,6,7>: Cost 3 vext3 <4,u,5,1>, <5,6,7,0>
+  2713931883U, // <1,5,6,u>: Cost 3 vext3 <4,u,5,1>, <5,6,u,0>
+  2557198438U, // <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS
+  2557199156U, // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,1,1>
+  2569143974U, // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1>
+  2569144592U, // <1,5,7,3>: Cost 3 vext1 <3,1,5,7>, <3,1,5,7>
+  2557201718U, // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS
+  2713931944U, // <1,5,7,5>: Cost 3 vext3 <4,u,5,1>, <5,7,5,7>
+  3787673770U, // <1,5,7,6>: Cost 4 vext3 <4,u,5,1>, <5,7,6,0>
+  2719387828U, // <1,5,7,7>: Cost 3 vext3 <5,7,7,1>, <5,7,7,1>
+  2557204270U, // <1,5,7,u>: Cost 3 vext1 <1,1,5,7>, LHS
+  2620020435U, // <1,5,u,0>: Cost 3 vext2 <0,4,1,5>, <u,0,1,2>
+  1546278702U, // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS
+  2620020616U, // <1,5,u,2>: Cost 3 vext2 <0,4,1,5>, <u,2,3,3>
+  2620020668U, // <1,5,u,3>: Cost 3 vext2 <0,4,1,5>, <u,3,0,1>
+  1594054682U, // <1,5,u,4>: Cost 2 vext2 <u,4,1,5>, <u,4,1,5>
+  1546279066U, // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS
+  2620020944U, // <1,5,u,6>: Cost 3 vext2 <0,4,1,5>, <u,6,3,7>
+  2014145846U, // <1,5,u,7>: Cost 2 vtrnr LHS, RHS
+  2014145847U, // <1,5,u,u>: Cost 2 vtrnr LHS, RHS
+  3692437504U, // <1,6,0,0>: Cost 4 vext2 <0,2,1,6>, <0,0,0,0>
+  2618695782U, // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS
+  2618695857U, // <1,6,0,2>: Cost 3 vext2 <0,2,1,6>, <0,2,1,6>
+  3794161970U, // <1,6,0,3>: Cost 4 vext3 <6,0,3,1>, <6,0,3,1>
+  2620023122U, // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,5>
+  2620686756U, // <1,6,0,5>: Cost 3 vext2 <0,5,1,6>, <0,5,1,6>
+  2621350389U, // <1,6,0,6>: Cost 3 vext2 <0,6,1,6>, <0,6,1,6>
+  4028599606U, // <1,6,0,7>: Cost 4 vzipr <0,3,1,0>, RHS
+  2618696349U, // <1,6,0,u>: Cost 3 vext2 <0,2,1,6>, LHS
+  3692438262U, // <1,6,1,0>: Cost 4 vext2 <0,2,1,6>, <1,0,3,2>
+  2625995572U, // <1,6,1,1>: Cost 3 vext2 <1,4,1,6>, <1,1,1,1>
+  3692438422U, // <1,6,1,2>: Cost 4 vext2 <0,2,1,6>, <1,2,3,0>
+  3692438488U, // <1,6,1,3>: Cost 4 vext2 <0,2,1,6>, <1,3,1,3>
+  2625995820U, // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6>
+  3692438672U, // <1,6,1,5>: Cost 4 vext2 <0,2,1,6>, <1,5,3,7>
+  3692438720U, // <1,6,1,6>: Cost 4 vext2 <0,2,1,6>, <1,6,0,1>
+  2958183734U, // <1,6,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+  2958183735U, // <1,6,1,u>: Cost 3 vzipr <0,u,1,1>, RHS
+  2721526201U, // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1>
+  3692439097U, // <1,6,2,1>: Cost 4 vext2 <0,2,1,6>, <2,1,6,0>
+  3692439144U, // <1,6,2,2>: Cost 4 vext2 <0,2,1,6>, <2,2,2,2>
+  3692439206U, // <1,6,2,3>: Cost 4 vext2 <0,2,1,6>, <2,3,0,1>
+  3636948278U, // <1,6,2,4>: Cost 4 vext1 <2,1,6,2>, RHS
+  3787674092U, // <1,6,2,5>: Cost 4 vext3 <4,u,5,1>, <6,2,5,7>
+  2618697658U, // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7>
+  2970799414U, // <1,6,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+  2970799415U, // <1,6,2,u>: Cost 3 vzipr <3,0,1,2>, RHS
+  2563211366U, // <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS
+  3699738854U, // <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1>
+  2563212860U, // <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3>
+  3692439964U, // <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3>
+  2563214646U, // <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS
+  4191820018U, // <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>, <u,6,7,5>
+  2587103648U, // <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3>
+  3087845306U, // <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+  3087845307U, // <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u>
+  3693767570U, // <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1>
+  3693767650U, // <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0>
+  3636962877U, // <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4>
+  3325088134U, // <1,6,4,3>: Cost 4 vrev <6,1,3,4>
+  3693767898U, // <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5>
+  2618699062U, // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS
+  3833670966U, // <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS
+  4028632374U, // <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS
+  2618699305U, // <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS
+  3693768264U, // <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2>
+  3630998373U, // <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5>
+  3636971070U, // <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5>
+  3642943767U, // <1,6,5,3>: Cost 4 vext1 <3,1,6,5>, <3,1,6,5>
+  3693768628U, // <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6>
+  3732918276U, // <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5>
+  2620690530U, // <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0>
+  2955562294U, // <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS
+  2955562295U, // <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS
+  2724180733U, // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1>
+  3631006566U, // <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6>
+  3631007674U, // <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7>
+  3692442184U, // <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0>
+  3631009078U, // <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS
+  3787674416U, // <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7>
+  2713932600U, // <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6>
+  2713932610U, // <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7>
+  2713932619U, // <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7>
+  1651102542U, // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1>
+  2724918103U, // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1>
+  2698302306U, // <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3>
+  3642960153U, // <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7>
+  2713932662U, // <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, <6,7,4,5>
+  2725213051U, // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1>
+  2724844426U, // <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7>
+  4035956022U, // <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS
+  1651692438U, // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1>
+  1651766175U, // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1>
+  2618701614U, // <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS
+  3135663508U, // <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2>
+  3692443580U, // <1,6,u,3>: Cost 4 vext2 <0,2,1,6>, <u,3,0,1>
+  2713932743U, // <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5>
+  2618701978U, // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS
+  2622683344U, // <1,6,u,6>: Cost 3 vext2 <0,u,1,6>, <u,6,3,7>
+  3087886266U, // <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+  1652356071U, // <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1>
+  2726171632U, // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1>
+  2626666598U, // <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS
+  3695100067U, // <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1>
+  3707044102U, // <1,7,0,3>: Cost 4 vext2 <2,6,1,7>, <0,3,2,1>
+  2726466580U, // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1>
+  3654921933U, // <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0>
+  2621358582U, // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7>
+  2622022215U, // <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7>
+  2626667165U, // <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS
+  2593128550U, // <1,7,1,0>: Cost 3 vext1 <7,1,7,1>, LHS
+  2626667316U, // <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1>
+  3700409238U, // <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0>
+  2257294428U, // <1,7,1,3>: Cost 3 vrev <7,1,3,1>
+  2593131830U, // <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS
+  2626667646U, // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7>
+  2627331279U, // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7>
+  2593133696U, // <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1>
+  2628658545U, // <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7>
+  2587164774U, // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS
+  3701073445U, // <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7>
+  3700409960U, // <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2>
+  2638612134U, // <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1>
+  2587168054U, // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS
+  3706382167U, // <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7>
+  2587169192U, // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2>
+  3660911610U, // <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2>
+  2587170606U, // <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS
+  1507459174U, // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS
+  2569257984U, // <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7>
+  2581202536U, // <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2>
+  2569259294U, // <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3>
+  1507462454U, // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS
+  1507462864U, // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3>
+  2581205498U, // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3>
+  2581206010U, // <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2>
+  1507465006U, // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS
+  2728826164U, // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1>
+  3654951732U, // <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1>
+  3330987094U, // <1,7,4,2>: Cost 4 vrev <7,1,2,4>
+  3331060831U, // <1,7,4,3>: Cost 4 vrev <7,1,3,4>
+  3787674971U, // <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4>
+  2626669878U, // <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS
+  3785979241U, // <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0>
+  3787085176U, // <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6>
+  2626670121U, // <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS
+  2569273446U, // <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS
+  2569274368U, // <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7>
+  3643016808U, // <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2>
+  2569275680U, // <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5>
+  2569276726U, // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS
+  4102034790U, // <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6>
+  2651222067U, // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7>
+  3899378998U, // <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS
+  2569279278U, // <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS
+  2730153430U, // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1>
+  2724845022U, // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0>
+  3643025338U, // <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7>
+  3643025697U, // <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6>
+  3643026742U, // <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS
+  3654971091U, // <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6>
+  3787675153U, // <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6>
+  2724845076U, // <1,7,6,7>: Cost 3 vext3 <6,7,0,1>, <7,6,7,0>
+  2725508637U, // <1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0>
+  2730817063U, // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1>
+  3631088436U, // <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1>
+  3660949158U, // <1,7,7,2>: Cost 4 vext1 <6,1,7,7>, <2,3,0,1>
+  3801904705U, // <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0>
+  3631090998U, // <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS
+  2662503828U, // <1,7,7,5>: Cost 3 vext2 <7,5,1,7>, <7,5,1,7>
+  3660951981U, // <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7>
+  2713933420U, // <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7>
+  2731406959U, // <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1>
+  1507500134U, // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS
+  2626672430U, // <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS
+  2581243496U, // <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2>
+  2569300259U, // <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u>
+  1507503414U, // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS
+  1507503829U, // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u>
+  2581246458U, // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3>
+  2581246970U, // <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2>
+  1507505966U, // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS
+  1543643153U, // <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u>
+  1546297446U, // <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS
+  2819448852U, // <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2>
+  2619375876U, // <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u>
+  1546297685U, // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u>
+  1658771190U, // <1,u,0,5>: Cost 2 vext3 <u,0,5,1>, <u,0,5,1>
+  2736789248U, // <1,u,0,6>: Cost 3 vext3 <u,7,0,1>, <u,0,6,2>
+  2659189376U, // <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1>
+  1546298013U, // <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS
+  1483112550U, // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+  202162278U, // <1,u,1,1>: Cost 1 vdup1 LHS
+  1616009006U, // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+  1745707110U, // <1,u,1,3>: Cost 2 vuzpr LHS, LHS
+  1483115830U, // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+  2620040336U, // <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7>
+  3026622618U, // <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
+  2958183752U, // <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+  202162278U, // <1,u,1,u>: Cost 1 vdup1 LHS
+  2819449750U, // <1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
+  2893207342U, // <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS
+  2819448996U, // <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
+  2819450482U, // <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
+  2819449754U, // <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
+  2893207706U, // <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+  2819449036U, // <1,u,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
+  2970799432U, // <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+  2819449002U, // <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
+  403931292U, // <1,u,3,0>: Cost 1 vext1 LHS, LHS
+  1477673718U, // <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  115726126U, // <1,u,3,2>: Cost 1 vrev LHS
+  2014102173U, // <1,u,3,3>: Cost 2 vtrnr LHS, LHS
+  403934518U, // <1,u,3,4>: Cost 1 vext1 LHS, RHS
+  1507536601U, // <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3>
+  1525453306U, // <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  2014105129U, // <1,u,3,7>: Cost 2 vtrnr LHS, RHS
+  403937070U, // <1,u,3,u>: Cost 1 vext1 LHS, LHS
+  2620042157U, // <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1>
+  2620042237U, // <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0>
+  2263217967U, // <1,u,4,2>: Cost 3 vrev <u,1,2,4>
+  2569341224U, // <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4>
+  2569342262U, // <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS
+  1546300726U, // <1,u,4,5>: Cost 2 vext2 <0,4,1,u>, RHS
+  2819449180U, // <1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6>
+  2724845649U, // <1,u,4,7>: Cost 3 vext3 <6,7,0,1>, <u,4,7,6>
+  1546300969U, // <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS
+  2551431270U, // <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS
+  2551432192U, // <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7>
+  3028293422U, // <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
+  2955559068U, // <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
+  2551434550U, // <1,u,5,4>: Cost 3 vext1 <0,1,u,5>, RHS
+  2895255706U, // <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+  1616009370U, // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  1745710390U, // <1,u,5,7>: Cost 2 vuzpr LHS, RHS
+  1745710391U, // <1,u,5,u>: Cost 2 vuzpr LHS, RHS
+  2653221159U, // <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u>
+  2725509303U, // <1,u,6,1>: Cost 3 vext3 <6,u,0,1>, <u,6,1,0>
+  2659193338U, // <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3>
+  2689751248U, // <1,u,6,3>: Cost 3 vext3 <0,u,1,1>, <u,6,3,7>
+  2867228774U, // <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4>
+  3764820194U, // <1,u,6,5>: Cost 4 vext3 <1,1,1,1>, <u,6,5,7>
+  2657202957U, // <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u>
+  2819450810U, // <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
+  2819450811U, // <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
+  1585452032U, // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u>
+  2557420340U, // <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1>
+  2569365158U, // <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1>
+  2569365803U, // <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7>
+  2557422902U, // <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS
+  2662512021U, // <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u>
+  2724845884U, // <1,u,7,6>: Cost 3 vext3 <6,7,0,1>, <u,7,6,7>
+  2659194476U, // <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7>
+  1590761096U, // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u>
+  403972257U, // <1,u,u,0>: Cost 1 vext1 LHS, LHS
+  202162278U, // <1,u,u,1>: Cost 1 vdup1 LHS
+  115767091U, // <1,u,u,2>: Cost 1 vrev LHS
+  1745707677U, // <1,u,u,3>: Cost 2 vuzpr LHS, LHS
+  403975478U, // <1,u,u,4>: Cost 1 vext1 LHS, RHS
+  1546303642U, // <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS
+  1616009613U, // <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  1745710633U, // <1,u,u,7>: Cost 2 vuzpr LHS, RHS
+  403978030U, // <1,u,u,u>: Cost 1 vext1 LHS, LHS
+  2551463936U, // <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0>
+  2685698058U, // <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1>
+  1610776596U, // <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2>
+  2619384069U, // <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0>
+  2551467318U, // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS
+  3899836596U, // <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5>
+  2621374968U, // <2,0,0,6>: Cost 3 vext2 <0,6,2,0>, <0,6,2,0>
+  4168271334U, // <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7>
+  1611219018U, // <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2>
+  2551472138U, // <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1>
+  2690564186U, // <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0>
+  1611956326U, // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS
+  2826092646U, // <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS
+  2551475510U, // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS
+  3692463248U, // <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7>
+  2587308473U, // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1>
+  3661050874U, // <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2>
+  1611956380U, // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS
+  1477738598U, // <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS
+  2551481078U, // <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2>
+  2551481796U, // <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0>
+  2551482518U, // <2,0,2,3>: Cost 3 vext1 <0,2,0,2>, <3,0,1,2>
+  1477741878U, // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS
+  2551484112U, // <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, <5,1,7,3>
+  2551484759U, // <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2>
+  2551485434U, // <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2>
+  1477744430U, // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS
+  2953625600U, // <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
+  2953627302U, // <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1>
+  2953625764U, // <2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2>
+  4027369695U, // <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3>
+  3625233718U, // <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS
+  3899836110U, // <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5>
+  4032012618U, // <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6>
+  3899835392U, // <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7>
+  2953625770U, // <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u>
+  2551496806U, // <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS
+  2685698386U, // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5>
+  2685698396U, // <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6>
+  3625240726U, // <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2>
+  2551500086U, // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS
+  2618723638U, // <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS
+  2765409590U, // <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+  3799990664U, // <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5>
+  2685698450U, // <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6>
+  3625246822U, // <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS
+  3289776304U, // <2,0,5,1>: Cost 4 vrev <0,2,1,5>
+  2690564526U, // <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7>
+  3289923778U, // <2,0,5,3>: Cost 4 vrev <0,2,3,5>
+  2216255691U, // <2,0,5,4>: Cost 3 vrev <0,2,4,5>
+  3726307332U, // <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5>
+  3726307426U, // <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0>
+  2826095926U, // <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+  2216550639U, // <2,0,5,u>: Cost 3 vrev <0,2,u,5>
+  4162420736U, // <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0>
+  2901885030U, // <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS
+  2685698559U, // <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7>
+  3643173171U, // <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6>
+  2216263884U, // <2,0,6,4>: Cost 3 vrev <0,2,4,6>
+  3730289341U, // <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0>
+  3726308152U, // <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6>
+  3899836346U, // <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7>
+  2216558832U, // <2,0,6,u>: Cost 3 vrev <0,2,u,6>
+  2659202049U, // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+  3726308437U, // <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3>
+  2726249034U, // <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1>
+  3734934772U, // <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0>
+  3726308710U, // <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6>
+  3726308814U, // <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2>
+  3736925671U, // <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0>
+  3726308972U, // <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7>
+  2659202049U, // <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+  1477787750U, // <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS
+  2953668262U, // <2,0,u,1>: Cost 3 vzipr LHS, <2,3,0,1>
+  1611956893U, // <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS
+  2551531670U, // <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2>
+  1477791030U, // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS
+  2618726554U, // <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS
+  2765412506U, // <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+  2826096169U, // <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+  1611956947U, // <2,0,u,u>: Cost 2 vext3 <0,2,0,2>, LHS
+  2569453670U, // <2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS
+  2619392102U, // <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS
+  3759440619U, // <2,1,0,2>: Cost 4 vext3 <0,2,0,2>, <1,0,2,0>
+  1616823030U, // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2>
+  2569456950U, // <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS
+  2690712328U, // <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2>
+  3661115841U, // <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, <6,2,1,0>
+  2622046794U, // <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1>
+  1617191715U, // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2>
+  2551545958U, // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS
+  2685698868U, // <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1>
+  2628682646U, // <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0>
+  2685698888U, // <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3>
+  2551549238U, // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS
+  3693134992U, // <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7>
+  3661124034U, // <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1>
+  3625292794U, // <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2>
+  2685698933U, // <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3>
+  2551554150U, // <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS
+  3893649571U, // <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1>
+  2551555688U, // <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2>
+  2685698966U, // <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0>
+  2551557430U, // <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS
+  3763422123U, // <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3>
+  3693135802U, // <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7>
+  2726249402U, // <2,1,2,7>: Cost 3 vext3 <7,0,1,2>, <1,2,7,0>
+  2685699011U, // <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0>
+  2551562342U, // <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS
+  2953625610U, // <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1>
+  2953627798U, // <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2>
+  2953626584U, // <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3>
+  2551565622U, // <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS
+  2953625938U, // <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5>
+  2587398596U, // <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+  4032013519U, // <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7>
+  2953625617U, // <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u>
+  2690565154U, // <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5>
+  3625313270U, // <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6>
+  3771532340U, // <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5>
+  1148404634U, // <2,1,4,3>: Cost 2 vrev <1,2,3,4>
+  3625315638U, // <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS
+  2619395382U, // <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS
+  3837242678U, // <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS
+  3799991394U, // <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6>
+  1148773319U, // <2,1,4,u>: Cost 2 vrev <1,2,u,4>
+  2551578726U, // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS
+  2551579648U, // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7>
+  3625321952U, // <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1>
+  2685699216U, // <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7>
+  2551582006U, // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS
+  3740913668U, // <2,1,5,5>: Cost 4 vext2 <u,3,2,1>, <5,5,5,5>
+  3661156806U, // <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5>
+  3893652790U, // <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS
+  2685699261U, // <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7>
+  2551586918U, // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS
+  3625329398U, // <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2>
+  2551588794U, // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7>
+  3088679014U, // <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS
+  2551590198U, // <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS
+  4029382994U, // <2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5>
+  3625333560U, // <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6>
+  3731624800U, // <2,1,6,7>: Cost 4 vext2 <6,7,2,1>, <6,7,2,1>
+  2551592750U, // <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS
+  2622051322U, // <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2>
+  3733615699U, // <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1>
+  3795125538U, // <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0>
+  2222171037U, // <2,1,7,3>: Cost 3 vrev <1,2,3,7>
+  3740915046U, // <2,1,7,4>: Cost 4 vext2 <u,3,2,1>, <7,4,5,6>
+  3296060335U, // <2,1,7,5>: Cost 4 vrev <1,2,5,7>
+  3736933864U, // <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1>
+  3805300055U, // <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u>
+  2669827714U, // <2,1,7,u>: Cost 3 vext2 <u,7,2,1>, <7,u,1,2>
+  2551603302U, // <2,1,u,0>: Cost 3 vext1 <0,2,1,u>, LHS
+  2953666570U, // <2,1,u,1>: Cost 3 vzipr LHS, <0,0,1,1>
+  2953668758U, // <2,1,u,2>: Cost 3 vzipr LHS, <3,0,1,2>
+  1148437406U, // <2,1,u,3>: Cost 2 vrev <1,2,3,u>
+  2551606582U, // <2,1,u,4>: Cost 3 vext1 <0,2,1,u>, RHS
+  2953666898U, // <2,1,u,5>: Cost 3 vzipr LHS, <0,4,1,5>
+  2587398596U, // <2,1,u,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+  2669828370U, // <2,1,u,7>: Cost 3 vext2 <u,7,2,1>, <u,7,2,1>
+  1148806091U, // <2,1,u,u>: Cost 2 vrev <1,2,u,u>
+  1543667732U, // <2,2,0,0>: Cost 2 vext2 <0,0,2,2>, <0,0,2,2>
+  1548976230U, // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS
+  2685699524U, // <2,2,0,2>: Cost 3 vext3 <0,2,0,2>, <2,0,2,0>
+  2685699535U, // <2,2,0,3>: Cost 3 vext3 <0,2,0,2>, <2,0,3,2>
+  2551614774U, // <2,2,0,4>: Cost 3 vext1 <0,2,2,0>, RHS
+  3704422830U, // <2,2,0,5>: Cost 4 vext2 <2,2,2,2>, <0,5,2,7>
+  3893657642U, // <2,2,0,6>: Cost 4 vuzpr <0,2,0,2>, <0,0,4,6>
+  3770574323U, // <2,2,0,7>: Cost 4 vext3 <2,0,7,2>, <2,0,7,2>
+  1548976796U, // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2>
+  2622718710U, // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2>
+  2622718772U, // <2,2,1,1>: Cost 3 vext2 <0,u,2,2>, <1,1,1,1>
+  2622718870U, // <2,2,1,2>: Cost 3 vext2 <0,u,2,2>, <1,2,3,0>
+  2819915878U, // <2,2,1,3>: Cost 3 vuzpr <0,2,0,2>, LHS
+  3625364790U, // <2,2,1,4>: Cost 4 vext1 <0,2,2,1>, RHS
+  2622719120U, // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7>
+  3760031292U, // <2,2,1,6>: Cost 4 vext3 <0,2,u,2>, <2,1,6,3>
+  3667170468U, // <2,2,1,7>: Cost 4 vext1 <7,2,2,1>, <7,2,2,1>
+  2819915883U, // <2,2,1,u>: Cost 3 vuzpr <0,2,0,2>, LHS
+  1489829990U, // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS
+  2563572470U, // <2,2,2,1>: Cost 3 vext1 <2,2,2,2>, <1,0,3,2>
+  269271142U, // <2,2,2,2>: Cost 1 vdup2 LHS
+  2685699698U, // <2,2,2,3>: Cost 3 vext3 <0,2,0,2>, <2,2,3,3>
+  1489833270U, // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS
+  2685699720U, // <2,2,2,5>: Cost 3 vext3 <0,2,0,2>, <2,2,5,7>
+  2622719930U, // <2,2,2,6>: Cost 3 vext2 <0,u,2,2>, <2,6,3,7>
+  2593436837U, // <2,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
+  269271142U, // <2,2,2,u>: Cost 1 vdup2 LHS
+  2685699750U, // <2,2,3,0>: Cost 3 vext3 <0,2,0,2>, <2,3,0,1>
+  2690565806U, // <2,2,3,1>: Cost 3 vext3 <1,0,3,2>, <2,3,1,0>
+  2953627240U, // <2,2,3,2>: Cost 3 vzipr LHS, <2,2,2,2>
+  1879883878U, // <2,2,3,3>: Cost 2 vzipr LHS, LHS
+  2685699790U, // <2,2,3,4>: Cost 3 vext3 <0,2,0,2>, <2,3,4,5>
+  3893659342U, // <2,2,3,5>: Cost 4 vuzpr <0,2,0,2>, <2,3,4,5>
+  2958270812U, // <2,2,3,6>: Cost 3 vzipr LHS, <0,4,2,6>
+  2593445030U, // <2,2,3,7>: Cost 3 vext1 <7,2,2,3>, <7,2,2,3>
+  1879883883U, // <2,2,3,u>: Cost 2 vzipr LHS, LHS
+  2551644262U, // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, LHS
+  3625386742U, // <2,2,4,1>: Cost 4 vext1 <0,2,2,4>, <1,0,3,2>
+  2551645902U, // <2,2,4,2>: Cost 3 vext1 <0,2,2,4>, <2,3,4,5>
+  3759441686U, // <2,2,4,3>: Cost 4 vext3 <0,2,0,2>, <2,4,3,5>
+  2551647542U, // <2,2,4,4>: Cost 3 vext1 <0,2,2,4>, RHS
+  1548979510U, // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS
+  2764901686U, // <2,2,4,6>: Cost 3 vuzpl <2,2,2,2>, RHS
+  3667195047U, // <2,2,4,7>: Cost 4 vext1 <7,2,2,4>, <7,2,2,4>
+  1548979753U, // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS
+  3696463432U, // <2,2,5,0>: Cost 4 vext2 <0,u,2,2>, <5,0,1,2>
+  2617413328U, // <2,2,5,1>: Cost 3 vext2 <0,0,2,2>, <5,1,7,3>
+  2685699936U, // <2,2,5,2>: Cost 3 vext3 <0,2,0,2>, <2,5,2,7>
+  4027383910U, // <2,2,5,3>: Cost 4 vzipr <0,1,2,5>, LHS
+  2228201085U, // <2,2,5,4>: Cost 3 vrev <2,2,4,5>
+  2617413636U, // <2,2,5,5>: Cost 3 vext2 <0,0,2,2>, <5,5,5,5>
+  2617413730U, // <2,2,5,6>: Cost 3 vext2 <0,0,2,2>, <5,6,7,0>
+  2819919158U, // <2,2,5,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+  2819919159U, // <2,2,5,u>: Cost 3 vuzpr <0,2,0,2>, RHS
+  3625402554U, // <2,2,6,0>: Cost 4 vext1 <0,2,2,6>, <0,2,2,6>
+  3760031652U, // <2,2,6,1>: Cost 4 vext3 <0,2,u,2>, <2,6,1,3>
+  2617414138U, // <2,2,6,2>: Cost 3 vext2 <0,0,2,2>, <6,2,7,3>
+  2685700026U, // <2,2,6,3>: Cost 3 vext3 <0,2,0,2>, <2,6,3,7>
+  3625405750U, // <2,2,6,4>: Cost 4 vext1 <0,2,2,6>, RHS
+  3760031692U, // <2,2,6,5>: Cost 4 vext3 <0,2,u,2>, <2,6,5,7>
+  3088679116U, // <2,2,6,6>: Cost 3 vtrnr <0,2,4,6>, <0,2,4,6>
+  2657891169U, // <2,2,6,7>: Cost 3 vext2 <6,7,2,2>, <6,7,2,2>
+  2685700071U, // <2,2,6,u>: Cost 3 vext3 <0,2,0,2>, <2,6,u,7>
+  2726250474U, // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1>
+  3704427616U, // <2,2,7,1>: Cost 4 vext2 <2,2,2,2>, <7,1,3,5>
+  2660545701U, // <2,2,7,2>: Cost 3 vext2 <7,2,2,2>, <7,2,2,2>
+  4030718054U, // <2,2,7,3>: Cost 4 vzipr <0,6,2,7>, LHS
+  2617415014U, // <2,2,7,4>: Cost 3 vext2 <0,0,2,2>, <7,4,5,6>
+  3302033032U, // <2,2,7,5>: Cost 4 vrev <2,2,5,7>
+  3661246929U, // <2,2,7,6>: Cost 4 vext1 <6,2,2,7>, <6,2,2,7>
+  2617415276U, // <2,2,7,7>: Cost 3 vext2 <0,0,2,2>, <7,7,7,7>
+  2731558962U, // <2,2,7,u>: Cost 3 vext3 <7,u,1,2>, <2,7,u,1>
+  1489829990U, // <2,2,u,0>: Cost 2 vext1 <2,2,2,2>, LHS
+  1548982062U, // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS
+  269271142U, // <2,2,u,2>: Cost 1 vdup2 LHS
+  1879924838U, // <2,2,u,3>: Cost 2 vzipr LHS, LHS
+  1489833270U, // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS
+  1548982426U, // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS
+  2953666908U, // <2,2,u,6>: Cost 3 vzipr LHS, <0,4,2,6>
+  2819919401U, // <2,2,u,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+  269271142U, // <2,2,u,u>: Cost 1 vdup2 LHS
+  1544339456U, // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+  470597734U, // <2,3,0,1>: Cost 1 vext2 LHS, LHS
+  1548984484U, // <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  2619408648U, // <2,3,0,3>: Cost 3 vext2 <0,3,2,3>, <0,3,2,3>
+  1548984658U, // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  2665857454U, // <2,3,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+  2622726655U, // <2,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
+  2593494188U, // <2,3,0,7>: Cost 3 vext1 <7,2,3,0>, <7,2,3,0>
+  470598301U, // <2,3,0,u>: Cost 1 vext2 LHS, LHS
+  1544340214U, // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  1544340276U, // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+  1544340374U, // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+  1548985304U, // <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  2551696694U, // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS
+  1548985488U, // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  2622727375U, // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7>
+  2665858347U, // <2,3,1,7>: Cost 3 vext2 LHS, <1,7,3,0>
+  1548985709U, // <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+  2622727613U, // <2,3,2,0>: Cost 3 vext2 LHS, <2,0,1,2>
+  2622727711U, // <2,3,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+  1544341096U, // <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+  1544341158U, // <2,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+  2622727958U, // <2,3,2,4>: Cost 3 vext2 LHS, <2,4,3,5>
+  2622728032U, // <2,3,2,5>: Cost 3 vext2 LHS, <2,5,2,7>
+  1548986298U, // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  2665859050U, // <2,3,2,7>: Cost 3 vext2 LHS, <2,7,0,1>
+  1548986427U, // <2,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
+  1548986518U, // <2,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+  2622728415U, // <2,3,3,1>: Cost 3 vext2 LHS, <3,1,0,3>
+  1489913458U, // <2,3,3,2>: Cost 2 vext1 <2,2,3,3>, <2,2,3,3>
+  1544341916U, // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3>
+  1548986882U, // <2,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+  2665859632U, // <2,3,3,5>: Cost 3 vext2 LHS, <3,5,1,7>
+  2234304870U, // <2,3,3,6>: Cost 3 vrev <3,2,6,3>
+  2958271632U, // <2,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
+  1548987166U, // <2,3,3,u>: Cost 2 vext2 LHS, <3,u,1,2>
+  1483948134U, // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS
+  1483948954U, // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4>
+  2622729276U, // <2,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
+  2557692054U, // <2,3,4,3>: Cost 3 vext1 <1,2,3,4>, <3,0,1,2>
+  1483951414U, // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS
+  470601014U, // <2,3,4,5>: Cost 1 vext2 LHS, RHS
+  1592118644U, // <2,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+  2593526960U, // <2,3,4,7>: Cost 3 vext1 <7,2,3,4>, <7,2,3,4>
+  470601257U, // <2,3,4,u>: Cost 1 vext2 LHS, RHS
+  2551726182U, // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, LHS
+  1592118992U, // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+  2665860862U, // <2,3,5,2>: Cost 3 vext2 LHS, <5,2,3,4>
+  2551728642U, // <2,3,5,3>: Cost 3 vext1 <0,2,3,5>, <3,4,5,6>
+  1592119238U, // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+  1592119300U, // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+  1592119394U, // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+  1592119464U, // <2,3,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+  1592119545U, // <2,3,5,u>: Cost 2 vext2 LHS, <5,u,5,7>
+  2622730529U, // <2,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
+  2557707164U, // <2,3,6,1>: Cost 3 vext1 <1,2,3,6>, <1,2,3,6>
+  1592119802U, // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+  2665861682U, // <2,3,6,3>: Cost 3 vext2 LHS, <6,3,4,5>
+  2622730893U, // <2,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
+  2665861810U, // <2,3,6,5>: Cost 3 vext2 LHS, <6,5,0,7>
+  1592120120U, // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+  1592120142U, // <2,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+  1592120223U, // <2,3,6,u>: Cost 2 vext2 LHS, <6,u,0,1>
+  1592120314U, // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+  2659890261U, // <2,3,7,1>: Cost 3 vext2 <7,1,2,3>, <7,1,2,3>
+  2660553894U, // <2,3,7,2>: Cost 3 vext2 <7,2,2,3>, <7,2,2,3>
+  2665862371U, // <2,3,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+  1592120678U, // <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+  2665862534U, // <2,3,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+  2665862614U, // <2,3,7,6>: Cost 3 vext2 LHS, <7,6,0,1>
+  1592120940U, // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+  1592120962U, // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+  1548990163U, // <2,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
+  470603566U, // <2,3,u,1>: Cost 1 vext2 LHS, LHS
+  1548990341U, // <2,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
+  1548990396U, // <2,3,u,3>: Cost 2 vext2 LHS, <u,3,0,1>
+  1548990527U, // <2,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
+  470603930U, // <2,3,u,5>: Cost 1 vext2 LHS, RHS
+  1548990672U, // <2,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
+  1592121600U, // <2,3,u,7>: Cost 2 vext2 LHS, <u,7,0,1>
+  470604133U, // <2,3,u,u>: Cost 1 vext2 LHS, LHS
+  2617425942U, // <2,4,0,0>: Cost 3 vext2 <0,0,2,4>, <0,0,2,4>
+  2618753126U, // <2,4,0,1>: Cost 3 vext2 <0,2,2,4>, LHS
+  2618753208U, // <2,4,0,2>: Cost 3 vext2 <0,2,2,4>, <0,2,2,4>
+  2619416841U, // <2,4,0,3>: Cost 3 vext2 <0,3,2,4>, <0,3,2,4>
+  2587593628U, // <2,4,0,4>: Cost 3 vext1 <6,2,4,0>, <4,0,6,2>
+  2712832914U, // <2,4,0,5>: Cost 3 vext3 <4,6,u,2>, <4,0,5,1>
+  1634962332U, // <2,4,0,6>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+  3799993252U, // <2,4,0,7>: Cost 4 vext3 <7,0,1,2>, <4,0,7,1>
+  1634962332U, // <2,4,0,u>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+  2619417334U, // <2,4,1,0>: Cost 3 vext2 <0,3,2,4>, <1,0,3,2>
+  3692495668U, // <2,4,1,1>: Cost 4 vext2 <0,2,2,4>, <1,1,1,1>
+  2625389466U, // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4>
+  2826125414U, // <2,4,1,3>: Cost 3 vuzpr <1,2,3,4>, LHS
+  3699794995U, // <2,4,1,4>: Cost 4 vext2 <1,4,2,4>, <1,4,2,4>
+  3692496016U, // <2,4,1,5>: Cost 4 vext2 <0,2,2,4>, <1,5,3,7>
+  3763424238U, // <2,4,1,6>: Cost 4 vext3 <0,u,0,2>, <4,1,6,3>
+  3667317942U, // <2,4,1,7>: Cost 4 vext1 <7,2,4,1>, <7,2,4,1>
+  2826125419U, // <2,4,1,u>: Cost 3 vuzpr <1,2,3,4>, LHS
+  2629371336U, // <2,4,2,0>: Cost 3 vext2 <2,0,2,4>, <2,0,2,4>
+  3699131946U, // <2,4,2,1>: Cost 4 vext2 <1,3,2,4>, <2,1,4,3>
+  2630698602U, // <2,4,2,2>: Cost 3 vext2 <2,2,2,4>, <2,2,2,4>
+  2618754766U, // <2,4,2,3>: Cost 3 vext2 <0,2,2,4>, <2,3,4,5>
+  2826126234U, // <2,4,2,4>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,4>
+  2899119414U, // <2,4,2,5>: Cost 3 vzipl <2,2,2,2>, RHS
+  3033337142U, // <2,4,2,6>: Cost 3 vtrnl <2,2,2,2>, RHS
+  3800214597U, // <2,4,2,7>: Cost 4 vext3 <7,0,4,2>, <4,2,7,0>
+  2899119657U, // <2,4,2,u>: Cost 3 vzipl <2,2,2,2>, RHS
+  2635344033U, // <2,4,3,0>: Cost 3 vext2 <3,0,2,4>, <3,0,2,4>
+  4032012325U, // <2,4,3,1>: Cost 4 vzipr LHS, <0,0,4,1>
+  3692497228U, // <2,4,3,2>: Cost 4 vext2 <0,2,2,4>, <3,2,3,4>
+  3692497308U, // <2,4,3,3>: Cost 4 vext2 <0,2,2,4>, <3,3,3,3>
+  3001404624U, // <2,4,3,4>: Cost 3 vzipr LHS, <4,4,4,4>
+  2953627342U, // <2,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
+  2953625804U, // <2,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
+  3899868160U, // <2,4,3,7>: Cost 4 vuzpr <1,2,3,4>, <1,3,5,7>
+  2953625806U, // <2,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
+  2710916266U, // <2,4,4,0>: Cost 3 vext3 <4,4,0,2>, <4,4,0,2>
+  3899869648U, // <2,4,4,1>: Cost 4 vuzpr <1,2,3,4>, <3,4,0,1>
+  3899869658U, // <2,4,4,2>: Cost 4 vuzpr <1,2,3,4>, <3,4,1,2>
+  3899868930U, // <2,4,4,3>: Cost 4 vuzpr <1,2,3,4>, <2,4,1,3>
+  2712833232U, // <2,4,4,4>: Cost 3 vext3 <4,6,u,2>, <4,4,4,4>
+  2618756406U, // <2,4,4,5>: Cost 3 vext2 <0,2,2,4>, RHS
+  2765737270U, // <2,4,4,6>: Cost 3 vuzpl <2,3,4,5>, RHS
+  4168304426U, // <2,4,4,7>: Cost 4 vtrnr <1,2,3,4>, <2,4,5,7>
+  2618756649U, // <2,4,4,u>: Cost 3 vext2 <0,2,2,4>, RHS
+  2551800011U, // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5>
+  2569716470U, // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2>
+  2563745405U, // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5>
+  2569718102U, // <2,4,5,3>: Cost 3 vext1 <3,2,4,5>, <3,2,4,5>
+  2551803190U, // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS
+  3625545732U, // <2,4,5,5>: Cost 4 vext1 <0,2,4,5>, <5,5,5,5>
+  1611959606U, // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  2826128694U, // <2,4,5,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+  1611959624U, // <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+  1478066278U, // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, LHS
+  2551808758U, // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2>
+  2551809516U, // <2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4>
+  2551810198U, // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2>
+  1478069558U, // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS
+  2901888310U, // <2,4,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+  2551812920U, // <2,4,6,6>: Cost 3 vext1 <0,2,4,6>, <6,6,6,6>
+  2726251914U, // <2,4,6,7>: Cost 3 vext3 <7,0,1,2>, <4,6,7,1>
+  1478072110U, // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS
+  2659234821U, // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+  3786722726U, // <2,4,7,1>: Cost 4 vext3 <4,7,1,2>, <4,7,1,2>
+  3734303911U, // <2,4,7,2>: Cost 4 vext2 <7,2,2,4>, <7,2,2,4>
+  3734967544U, // <2,4,7,3>: Cost 4 vext2 <7,3,2,4>, <7,3,2,4>
+  3727005030U, // <2,4,7,4>: Cost 4 vext2 <6,0,2,4>, <7,4,5,6>
+  2726251976U, // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0>
+  2726251986U, // <2,4,7,6>: Cost 3 vext3 <7,0,1,2>, <4,7,6,1>
+  3727005292U, // <2,4,7,7>: Cost 4 vext2 <6,0,2,4>, <7,7,7,7>
+  2659234821U, // <2,4,7,u>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+  1478082662U, // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, LHS
+  2618758958U, // <2,4,u,1>: Cost 3 vext2 <0,2,2,4>, LHS
+  2551826024U, // <2,4,u,2>: Cost 3 vext1 <0,2,4,u>, <2,2,2,2>
+  2551826582U, // <2,4,u,3>: Cost 3 vext1 <0,2,4,u>, <3,0,1,2>
+  1478085942U, // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS
+  2953668302U, // <2,4,u,5>: Cost 3 vzipr LHS, <2,3,4,5>
+  1611959849U, // <2,4,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  2826128937U, // <2,4,u,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+  1611959867U, // <2,4,u,u>: Cost 2 vext3 <0,2,0,2>, RHS
+  3691839488U, // <2,5,0,0>: Cost 4 vext2 <0,1,2,5>, <0,0,0,0>
+  2618097766U, // <2,5,0,1>: Cost 3 vext2 <0,1,2,5>, LHS
+  2620088484U, // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2>
+  2619425034U, // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5>
+  2620088667U, // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5>
+  2620752300U, // <2,5,0,5>: Cost 3 vext2 <0,5,2,5>, <0,5,2,5>
+  3693830655U, // <2,5,0,6>: Cost 4 vext2 <0,4,2,5>, <0,6,2,7>
+  3094531382U, // <2,5,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+  2618098333U, // <2,5,0,u>: Cost 3 vext2 <0,1,2,5>, LHS
+  3691840246U, // <2,5,1,0>: Cost 4 vext2 <0,1,2,5>, <1,0,3,2>
+  3691840308U, // <2,5,1,1>: Cost 4 vext2 <0,1,2,5>, <1,1,1,1>
+  2626061206U, // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0>
+  2618098688U, // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7>
+  2626061364U, // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5>
+  3691840656U, // <2,5,1,5>: Cost 4 vext2 <0,1,2,5>, <1,5,3,7>
+  3789082310U, // <2,5,1,6>: Cost 4 vext3 <5,1,6,2>, <5,1,6,2>
+  2712833744U, // <2,5,1,7>: Cost 3 vext3 <4,6,u,2>, <5,1,7,3>
+  2628715896U, // <2,5,1,u>: Cost 3 vext2 <1,u,2,5>, <1,u,2,5>
+  3693831613U, // <2,5,2,0>: Cost 4 vext2 <0,4,2,5>, <2,0,1,2>
+  4026698642U, // <2,5,2,1>: Cost 4 vzipr <0,0,2,2>, <4,0,5,1>
+  2632033896U, // <2,5,2,2>: Cost 3 vext2 <2,4,2,5>, <2,2,2,2>
+  3691841190U, // <2,5,2,3>: Cost 4 vext2 <0,1,2,5>, <2,3,0,1>
+  2632034061U, // <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5>
+  3691841352U, // <2,5,2,5>: Cost 4 vext2 <0,1,2,5>, <2,5,0,1>
+  3691841466U, // <2,5,2,6>: Cost 4 vext2 <0,1,2,5>, <2,6,3,7>
+  3088354614U, // <2,5,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+  3088354615U, // <2,5,2,u>: Cost 3 vtrnr <0,2,0,2>, RHS
+  2557829222U, // <2,5,3,0>: Cost 3 vext1 <1,2,5,3>, LHS
+  2557830059U, // <2,5,3,1>: Cost 3 vext1 <1,2,5,3>, <1,2,5,3>
+  2575746766U, // <2,5,3,2>: Cost 3 vext1 <4,2,5,3>, <2,3,4,5>
+  3691841948U, // <2,5,3,3>: Cost 4 vext2 <0,1,2,5>, <3,3,3,3>
+  2619427330U, // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6>
+  2581720847U, // <2,5,3,5>: Cost 3 vext1 <5,2,5,3>, <5,2,5,3>
+  2953628162U, // <2,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  2953626624U, // <2,5,3,7>: Cost 3 vzipr LHS, <1,3,5,7>
+  2953626625U, // <2,5,3,u>: Cost 3 vzipr LHS, <1,3,5,u>
+  2569781350U, // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS
+  3631580076U, // <2,5,4,1>: Cost 4 vext1 <1,2,5,4>, <1,2,5,4>
+  2569782990U, // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5>
+  2569783646U, // <2,5,4,3>: Cost 3 vext1 <3,2,5,4>, <3,2,5,4>
+  2569784630U, // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS
+  2618101046U, // <2,5,4,5>: Cost 3 vext2 <0,1,2,5>, RHS
+  3893905922U, // <2,5,4,6>: Cost 4 vuzpr <0,2,3,5>, <3,4,5,6>
+  3094564150U, // <2,5,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+  2618101289U, // <2,5,4,u>: Cost 3 vext2 <0,1,2,5>, RHS
+  2551873638U, // <2,5,5,0>: Cost 3 vext1 <0,2,5,5>, LHS
+  3637560320U, // <2,5,5,1>: Cost 4 vext1 <2,2,5,5>, <1,3,5,7>
+  3637560966U, // <2,5,5,2>: Cost 4 vext1 <2,2,5,5>, <2,2,5,5>
+  3723030343U, // <2,5,5,3>: Cost 4 vext2 <5,3,2,5>, <5,3,2,5>
+  2551876918U, // <2,5,5,4>: Cost 3 vext1 <0,2,5,5>, RHS
+  2712834052U, // <2,5,5,5>: Cost 3 vext3 <4,6,u,2>, <5,5,5,5>
+  4028713474U, // <2,5,5,6>: Cost 4 vzipr <0,3,2,5>, <3,4,5,6>
+  2712834072U, // <2,5,5,7>: Cost 3 vext3 <4,6,u,2>, <5,5,7,7>
+  2712834081U, // <2,5,5,u>: Cost 3 vext3 <4,6,u,2>, <5,5,u,7>
+  2575769702U, // <2,5,6,0>: Cost 3 vext1 <4,2,5,6>, LHS
+  3631596462U, // <2,5,6,1>: Cost 4 vext1 <1,2,5,6>, <1,2,5,6>
+  2655924730U, // <2,5,6,2>: Cost 3 vext2 <6,4,2,5>, <6,2,7,3>
+  3643541856U, // <2,5,6,3>: Cost 4 vext1 <3,2,5,6>, <3,2,5,6>
+  2655924849U, // <2,5,6,4>: Cost 3 vext2 <6,4,2,5>, <6,4,2,5>
+  3787755607U, // <2,5,6,5>: Cost 4 vext3 <4,u,6,2>, <5,6,5,7>
+  4029385218U, // <2,5,6,6>: Cost 4 vzipr <0,4,2,6>, <3,4,5,6>
+  3088682294U, // <2,5,6,7>: Cost 3 vtrnr <0,2,4,6>, RHS
+  3088682295U, // <2,5,6,u>: Cost 3 vtrnr <0,2,4,6>, RHS
+  2563833958U, // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS
+  2551890678U, // <2,5,7,1>: Cost 3 vext1 <0,2,5,7>, <1,0,3,2>
+  2563835528U, // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7>
+  3637577878U, // <2,5,7,3>: Cost 4 vext1 <2,2,5,7>, <3,0,1,2>
+  2563837238U, // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS
+  2712834216U, // <2,5,7,5>: Cost 3 vext3 <4,6,u,2>, <5,7,5,7>
+  2712834220U, // <2,5,7,6>: Cost 3 vext3 <4,6,u,2>, <5,7,6,2>
+  4174449974U, // <2,5,7,7>: Cost 4 vtrnr <2,2,5,7>, RHS
+  2563839790U, // <2,5,7,u>: Cost 3 vext1 <2,2,5,7>, LHS
+  2563842150U, // <2,5,u,0>: Cost 3 vext1 <2,2,5,u>, LHS
+  2618103598U, // <2,5,u,1>: Cost 3 vext2 <0,1,2,5>, LHS
+  2563843721U, // <2,5,u,2>: Cost 3 vext1 <2,2,5,u>, <2,2,5,u>
+  2569816418U, // <2,5,u,3>: Cost 3 vext1 <3,2,5,u>, <3,2,5,u>
+  2622748735U, // <2,5,u,4>: Cost 3 vext2 <0,u,2,5>, <u,4,5,6>
+  2618103962U, // <2,5,u,5>: Cost 3 vext2 <0,1,2,5>, RHS
+  2953669122U, // <2,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  2953667584U, // <2,5,u,7>: Cost 3 vzipr LHS, <1,3,5,7>
+  2618104165U, // <2,5,u,u>: Cost 3 vext2 <0,1,2,5>, LHS
+  2620096512U, // <2,6,0,0>: Cost 3 vext2 <0,4,2,6>, <0,0,0,0>
+  1546354790U, // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS
+  2620096676U, // <2,6,0,2>: Cost 3 vext2 <0,4,2,6>, <0,2,0,2>
+  3693838588U, // <2,6,0,3>: Cost 4 vext2 <0,4,2,6>, <0,3,1,0>
+  1546355036U, // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6>
+  3694502317U, // <2,6,0,5>: Cost 4 vext2 <0,5,2,6>, <0,5,2,6>
+  2551911246U, // <2,6,0,6>: Cost 3 vext1 <0,2,6,0>, <6,7,0,1>
+  2720723287U, // <2,6,0,7>: Cost 3 vext3 <6,0,7,2>, <6,0,7,2>
+  1546355357U, // <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS
+  2620097270U, // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2>
+  2620097332U, // <2,6,1,1>: Cost 3 vext2 <0,4,2,6>, <1,1,1,1>
+  2620097430U, // <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0>
+  2820243558U, // <2,6,1,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+  2620097598U, // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6>
+  2620097680U, // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7>
+  3693839585U, // <2,6,1,6>: Cost 4 vext2 <0,4,2,6>, <1,6,3,7>
+  2721386920U, // <2,6,1,7>: Cost 3 vext3 <6,1,7,2>, <6,1,7,2>
+  2820243563U, // <2,6,1,u>: Cost 3 vuzpr <0,2,4,6>, LHS
+  2714014137U, // <2,6,2,0>: Cost 3 vext3 <4,u,6,2>, <6,2,0,1>
+  2712834500U, // <2,6,2,1>: Cost 3 vext3 <4,6,u,2>, <6,2,1,3>
+  2620098152U, // <2,6,2,2>: Cost 3 vext2 <0,4,2,6>, <2,2,2,2>
+  2620098214U, // <2,6,2,3>: Cost 3 vext2 <0,4,2,6>, <2,3,0,1>
+  2632042254U, // <2,6,2,4>: Cost 3 vext2 <2,4,2,6>, <2,4,2,6>
+  2712834540U, // <2,6,2,5>: Cost 3 vext3 <4,6,u,2>, <6,2,5,7>
+  2820243660U, // <2,6,2,6>: Cost 3 vuzpr <0,2,4,6>, <0,2,4,6>
+  2958265654U, // <2,6,2,7>: Cost 3 vzipr <0,u,2,2>, RHS
+  2620098619U, // <2,6,2,u>: Cost 3 vext2 <0,4,2,6>, <2,u,0,1>
+  2620098710U, // <2,6,3,0>: Cost 3 vext2 <0,4,2,6>, <3,0,1,2>
+  3893986982U, // <2,6,3,1>: Cost 4 vuzpr <0,2,4,6>, <2,3,0,1>
+  2569848762U, // <2,6,3,2>: Cost 3 vext1 <3,2,6,3>, <2,6,3,7>
+  2620098972U, // <2,6,3,3>: Cost 3 vext2 <0,4,2,6>, <3,3,3,3>
+  2620099074U, // <2,6,3,4>: Cost 3 vext2 <0,4,2,6>, <3,4,5,6>
+  3893987022U, // <2,6,3,5>: Cost 4 vuzpr <0,2,4,6>, <2,3,4,5>
+  3001404644U, // <2,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
+  1879887158U, // <2,6,3,7>: Cost 2 vzipr LHS, RHS
+  1879887159U, // <2,6,3,u>: Cost 2 vzipr LHS, RHS
+  2620099484U, // <2,6,4,0>: Cost 3 vext2 <0,4,2,6>, <4,0,6,2>
+  2620099566U, // <2,6,4,1>: Cost 3 vext2 <0,4,2,6>, <4,1,6,3>
+  2620099644U, // <2,6,4,2>: Cost 3 vext2 <0,4,2,6>, <4,2,6,0>
+  3643599207U, // <2,6,4,3>: Cost 4 vext1 <3,2,6,4>, <3,2,6,4>
+  2575830080U, // <2,6,4,4>: Cost 3 vext1 <4,2,6,4>, <4,2,6,4>
+  1546358070U, // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS
+  2667875700U, // <2,6,4,6>: Cost 3 vext2 <u,4,2,6>, <4,6,4,6>
+  4028042550U, // <2,6,4,7>: Cost 4 vzipr <0,2,2,4>, RHS
+  1546358313U, // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS
+  3693841992U, // <2,6,5,0>: Cost 4 vext2 <0,4,2,6>, <5,0,1,2>
+  2667876048U, // <2,6,5,1>: Cost 3 vext2 <u,4,2,6>, <5,1,7,3>
+  2712834756U, // <2,6,5,2>: Cost 3 vext3 <4,6,u,2>, <6,5,2,7>
+  3643607400U, // <2,6,5,3>: Cost 4 vext1 <3,2,6,5>, <3,2,6,5>
+  2252091873U, // <2,6,5,4>: Cost 3 vrev <6,2,4,5>
+  2667876356U, // <2,6,5,5>: Cost 3 vext2 <u,4,2,6>, <5,5,5,5>
+  2667876450U, // <2,6,5,6>: Cost 3 vext2 <u,4,2,6>, <5,6,7,0>
+  2820246838U, // <2,6,5,7>: Cost 3 vuzpr <0,2,4,6>, RHS
+  2820246839U, // <2,6,5,u>: Cost 3 vuzpr <0,2,4,6>, RHS
+  2563899494U, // <2,6,6,0>: Cost 3 vext1 <2,2,6,6>, LHS
+  3893988683U, // <2,6,6,1>: Cost 4 vuzpr <0,2,4,6>, <4,6,0,1>
+  2563901072U, // <2,6,6,2>: Cost 3 vext1 <2,2,6,6>, <2,2,6,6>
+  3893987236U, // <2,6,6,3>: Cost 4 vuzpr <0,2,4,6>, <2,6,1,3>
+  2563902774U, // <2,6,6,4>: Cost 3 vext1 <2,2,6,6>, RHS
+  3893988723U, // <2,6,6,5>: Cost 4 vuzpr <0,2,4,6>, <4,6,4,5>
+  2712834872U, // <2,6,6,6>: Cost 3 vext3 <4,6,u,2>, <6,6,6,6>
+  2955644214U, // <2,6,6,7>: Cost 3 vzipr <0,4,2,6>, RHS
+  2955644215U, // <2,6,6,u>: Cost 3 vzipr <0,4,2,6>, RHS
+  2712834894U, // <2,6,7,0>: Cost 3 vext3 <4,6,u,2>, <6,7,0,1>
+  2724926296U, // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2>
+  2725000033U, // <2,6,7,2>: Cost 3 vext3 <6,7,2,2>, <6,7,2,2>
+  2702365544U, // <2,6,7,3>: Cost 3 vext3 <3,0,1,2>, <6,7,3,0>
+  2712834934U, // <2,6,7,4>: Cost 3 vext3 <4,6,u,2>, <6,7,4,5>
+  3776107393U, // <2,6,7,5>: Cost 4 vext3 <3,0,1,2>, <6,7,5,7>
+  2725294981U, // <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2>
+  2726253452U, // <2,6,7,7>: Cost 3 vext3 <7,0,1,2>, <6,7,7,0>
+  2712834966U, // <2,6,7,u>: Cost 3 vext3 <4,6,u,2>, <6,7,u,1>
+  2620102355U, // <2,6,u,0>: Cost 3 vext2 <0,4,2,6>, <u,0,1,2>
+  1546360622U, // <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS
+  2620102536U, // <2,6,u,2>: Cost 3 vext2 <0,4,2,6>, <u,2,3,3>
+  2820244125U, // <2,6,u,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+  1594136612U, // <2,6,u,4>: Cost 2 vext2 <u,4,2,6>, <u,4,2,6>
+  1546360986U, // <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS
+  2620102864U, // <2,6,u,6>: Cost 3 vext2 <0,4,2,6>, <u,6,3,7>
+  1879928118U, // <2,6,u,7>: Cost 2 vzipr LHS, RHS
+  1879928119U, // <2,6,u,u>: Cost 2 vzipr LHS, RHS
+  2726179825U, // <2,7,0,0>: Cost 3 vext3 <7,0,0,2>, <7,0,0,2>
+  1652511738U, // <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2>
+  2621431972U, // <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2>
+  2257949868U, // <2,7,0,3>: Cost 3 vrev <7,2,3,0>
+  2726474773U, // <2,7,0,4>: Cost 3 vext3 <7,0,4,2>, <7,0,4,2>
+  2620768686U, // <2,7,0,5>: Cost 3 vext2 <0,5,2,7>, <0,5,2,7>
+  2621432319U, // <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7>
+  2599760953U, // <2,7,0,7>: Cost 3 vext1 <u,2,7,0>, <7,0,u,2>
+  1653027897U, // <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2>
+  2639348470U, // <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+  3695174452U, // <2,7,1,1>: Cost 4 vext2 <0,6,2,7>, <1,1,1,1>
+  3695174550U, // <2,7,1,2>: Cost 4 vext2 <0,6,2,7>, <1,2,3,0>
+  3694511104U, // <2,7,1,3>: Cost 4 vext2 <0,5,2,7>, <1,3,5,7>
+  3713090594U, // <2,7,1,4>: Cost 4 vext2 <3,6,2,7>, <1,4,0,5>
+  3693184144U, // <2,7,1,5>: Cost 4 vext2 <0,3,2,7>, <1,5,3,7>
+  2627405016U, // <2,7,1,6>: Cost 3 vext2 <1,6,2,7>, <1,6,2,7>
+  3799995519U, // <2,7,1,7>: Cost 4 vext3 <7,0,1,2>, <7,1,7,0>
+  2639348470U, // <2,7,1,u>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+  3695175101U, // <2,7,2,0>: Cost 4 vext2 <0,6,2,7>, <2,0,1,2>
+  3643655168U, // <2,7,2,1>: Cost 4 vext1 <3,2,7,2>, <1,3,5,7>
+  2257892517U, // <2,7,2,2>: Cost 3 vrev <7,2,2,2>
+  3695175334U, // <2,7,2,3>: Cost 4 vext2 <0,6,2,7>, <2,3,0,1>
+  3695175465U, // <2,7,2,4>: Cost 4 vext2 <0,6,2,7>, <2,4,5,6>
+  2632714080U, // <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7>
+  2633377713U, // <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7>
+  3695175658U, // <2,7,2,7>: Cost 4 vext2 <0,6,2,7>, <2,7,0,1>
+  2634704979U, // <2,7,2,u>: Cost 3 vext2 <2,u,2,7>, <2,u,2,7>
+  1514094694U, // <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS
+  2569921680U, // <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7>
+  2587838056U, // <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2>
+  2569922927U, // <2,7,3,3>: Cost 3 vext1 <3,2,7,3>, <3,2,7,3>
+  1514097974U, // <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS
+  2581868321U, // <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3>
+  1514099194U, // <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3>
+  2587841530U, // <2,7,3,7>: Cost 3 vext1 <6,2,7,3>, <7,0,1,2>
+  1514100526U, // <2,7,3,u>: Cost 2 vext1 <6,2,7,3>, LHS
+  2708706617U, // <2,7,4,0>: Cost 3 vext3 <4,0,6,2>, <7,4,0,6>
+  3649643418U, // <2,7,4,1>: Cost 4 vext1 <4,2,7,4>, <1,2,3,4>
+  3649644330U, // <2,7,4,2>: Cost 4 vext1 <4,2,7,4>, <2,4,5,7>
+  2257982640U, // <2,7,4,3>: Cost 3 vrev <7,2,3,4>
+  3649645641U, // <2,7,4,4>: Cost 4 vext1 <4,2,7,4>, <4,2,7,4>
+  2621435190U, // <2,7,4,5>: Cost 3 vext2 <0,6,2,7>, RHS
+  2712835441U, // <2,7,4,6>: Cost 3 vext3 <4,6,u,2>, <7,4,6,u>
+  3799995762U, // <2,7,4,7>: Cost 4 vext3 <7,0,1,2>, <7,4,7,0>
+  2621435433U, // <2,7,4,u>: Cost 3 vext2 <0,6,2,7>, RHS
+  2729497990U, // <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2>
+  3643679744U, // <2,7,5,1>: Cost 4 vext1 <3,2,7,5>, <1,3,5,7>
+  3637708424U, // <2,7,5,2>: Cost 4 vext1 <2,2,7,5>, <2,2,5,7>
+  3643681137U, // <2,7,5,3>: Cost 4 vext1 <3,2,7,5>, <3,2,7,5>
+  2599800118U, // <2,7,5,4>: Cost 3 vext1 <u,2,7,5>, RHS
+  3786577334U, // <2,7,5,5>: Cost 4 vext3 <4,6,u,2>, <7,5,5,5>
+  3786577345U, // <2,7,5,6>: Cost 4 vext3 <4,6,u,2>, <7,5,6,7>
+  2599802214U, // <2,7,5,7>: Cost 3 vext1 <u,2,7,5>, <7,4,5,6>
+  2599802670U, // <2,7,5,u>: Cost 3 vext1 <u,2,7,5>, LHS
+  2581889126U, // <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS
+  3643687936U, // <2,7,6,1>: Cost 4 vext1 <3,2,7,6>, <1,3,5,7>
+  2663240186U, // <2,7,6,2>: Cost 3 vext2 <7,6,2,7>, <6,2,7,3>
+  3643689330U, // <2,7,6,3>: Cost 4 vext1 <3,2,7,6>, <3,2,7,6>
+  2581892406U, // <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS
+  2581892900U, // <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6>
+  2587865597U, // <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6>
+  3786577428U, // <2,7,6,7>: Cost 4 vext3 <4,6,u,2>, <7,6,7,0>
+  2581894958U, // <2,7,6,u>: Cost 3 vext1 <5,2,7,6>, LHS
+  2726254119U, // <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1>
+  3804640817U, // <2,7,7,1>: Cost 4 vext3 <7,7,1,2>, <7,7,1,2>
+  3637724826U, // <2,7,7,2>: Cost 4 vext1 <2,2,7,7>, <2,2,7,7>
+  3734992123U, // <2,7,7,3>: Cost 4 vext2 <7,3,2,7>, <7,3,2,7>
+  2552040758U, // <2,7,7,4>: Cost 3 vext1 <0,2,7,7>, RHS
+  3799995992U, // <2,7,7,5>: Cost 4 vext3 <7,0,1,2>, <7,7,5,5>
+  2663241198U, // <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7>
+  2712835692U, // <2,7,7,7>: Cost 3 vext3 <4,6,u,2>, <7,7,7,7>
+  2731562607U, // <2,7,7,u>: Cost 3 vext3 <7,u,1,2>, <7,7,u,1>
+  1514135654U, // <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS
+  1657820802U, // <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2>
+  2587879016U, // <2,7,u,2>: Cost 3 vext1 <6,2,7,u>, <2,2,2,2>
+  2569963892U, // <2,7,u,3>: Cost 3 vext1 <3,2,7,u>, <3,2,7,u>
+  1514138934U, // <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS
+  2621438106U, // <2,7,u,5>: Cost 3 vext2 <0,6,2,7>, RHS
+  1514140159U, // <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u>
+  2587882490U, // <2,7,u,7>: Cost 3 vext1 <6,2,7,u>, <7,0,1,2>
+  1514141486U, // <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS
+  1544380416U, // <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+  470638699U, // <2,u,0,1>: Cost 1 vext2 LHS, LHS
+  1544380580U, // <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  1658631909U, // <2,u,0,3>: Cost 2 vext3 <u,0,3,2>, <u,0,3,2>
+  1544380754U, // <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  2665898414U, // <2,u,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+  1658853120U, // <2,u,0,6>: Cost 2 vext3 <u,0,6,2>, <u,0,6,2>
+  3094531625U, // <2,u,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+  470639261U, // <2,u,0,u>: Cost 1 vext2 LHS, LHS
+  1544381174U, // <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  1544381236U, // <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+  1544381334U, // <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+  1544381400U, // <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  2618123325U, // <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+  1544381584U, // <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  2618123489U, // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+  2726254427U, // <2,u,1,7>: Cost 3 vext3 <7,0,1,2>, <u,1,7,3>
+  1544381823U, // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3>
+  1478328422U, // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, LHS
+  2618123807U, // <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+  269271142U, // <2,u,2,2>: Cost 1 vdup2 LHS
+  1544382118U, // <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+  1478331702U, // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS
+  2618124136U, // <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+  1544382394U, // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  3088354857U, // <2,u,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+  269271142U, // <2,u,2,u>: Cost 1 vdup2 LHS
+  1544382614U, // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+  2953627374U, // <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1>
+  1490282143U, // <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3>
+  1879883932U, // <2,u,3,3>: Cost 2 vzipr LHS, LHS
+  1544382978U, // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+  2953627378U, // <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5>
+  1514172931U, // <2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3>
+  1879887176U, // <2,u,3,7>: Cost 2 vzipr LHS, RHS
+  1879883937U, // <2,u,3,u>: Cost 2 vzipr LHS, LHS
+  1484316774U, // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS
+  1484317639U, // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4>
+  2552088270U, // <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5>
+  1190213513U, // <2,u,4,3>: Cost 2 vrev <u,2,3,4>
+  1484320054U, // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS
+  470641974U, // <2,u,4,5>: Cost 1 vext2 LHS, RHS
+  1592159604U, // <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+  3094564393U, // <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+  470642217U, // <2,u,4,u>: Cost 1 vext2 LHS, RHS
+  2552094959U, // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5>
+  1592159952U, // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+  2564040353U, // <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5>
+  2690275455U, // <2,u,5,3>: Cost 3 vext3 <0,u,u,2>, <u,5,3,7>
+  1592160198U, // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+  1592160260U, // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+  1611962522U, // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  1592160424U, // <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+  1611962540U, // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+  1478361190U, // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS
+  2552103670U, // <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2>
+  1592160762U, // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+  2685704400U, // <2,u,6,3>: Cost 3 vext3 <0,2,0,2>, <u,6,3,7>
+  1478364470U, // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS
+  2901891226U, // <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+  1592161080U, // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+  1592161102U, // <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+  1478367022U, // <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS
+  1592161274U, // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+  2659931226U, // <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u>
+  2564056739U, // <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7>
+  2665903331U, // <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+  1592161638U, // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+  2665903494U, // <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+  2587947527U, // <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7>
+  1592161900U, // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+  1592161922U, // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+  1478377574U, // <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS
+  470644526U, // <2,u,u,1>: Cost 1 vext2 LHS, LHS
+  269271142U, // <2,u,u,2>: Cost 1 vdup2 LHS
+  1879924892U, // <2,u,u,3>: Cost 2 vzipr LHS, LHS
+  1478380854U, // <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS
+  470644890U, // <2,u,u,5>: Cost 1 vext2 LHS, RHS
+  1611962765U, // <2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  1879928136U, // <2,u,u,7>: Cost 2 vzipr LHS, RHS
+  470645093U, // <2,u,u,u>: Cost 1 vext2 LHS, LHS
+  1611448320U, // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+  1611890698U, // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+  1611890708U, // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+  3763576860U, // <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1>
+  2689835045U, // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1>
+  3698508206U, // <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7>
+  3763576887U, // <3,0,0,6>: Cost 4 vext3 LHS, <0,0,6,1>
+  3667678434U, // <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0>
+  1616093258U, // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2>
+  1490337894U, // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS
+  2685632602U, // <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0>
+  537706598U, // <3,0,1,2>: Cost 1 vext3 LHS, LHS
+  2624766936U, // <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3>
+  1490341174U, // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS
+  2624767120U, // <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7>
+  2732966030U, // <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7>
+  2593944803U, // <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1>
+  537706652U, // <3,0,1,u>: Cost 1 vext3 LHS, LHS
+  1611890852U, // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+  2685632684U, // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
+  2685632692U, // <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0>
+  2685632702U, // <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
+  1611890892U, // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+  2732966102U, // <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7>
+  2624767930U, // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7>
+  2685632744U, // <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7>
+  1611890924U, // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
+  2624768150U, // <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2>
+  2685632764U, // <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
+  2685632774U, // <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1>
+  2624768412U, // <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3>
+  2624768514U, // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6>
+  3702491714U, // <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7>
+  2624768632U, // <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7>
+  3702491843U, // <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1>
+  2686959934U, // <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3>
+  2689835336U, // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4>
+  1611891026U, // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+  1611891036U, // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+  3763577184U, // <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1>
+  2689835374U, // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6>
+  1551027510U, // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS
+  2666573172U, // <3,0,4,6>: Cost 3 vext2 <u,2,3,0>, <4,6,4,6>
+  3667711206U, // <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4>
+  1616093586U, // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+  2685190556U, // <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7>
+  2666573520U, // <3,0,5,1>: Cost 3 vext2 <u,2,3,0>, <5,1,7,3>
+  3040886886U, // <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS
+  3625912834U, // <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, <3,4,5,6>
+  2666573766U, // <3,0,5,4>: Cost 3 vext2 <u,2,3,0>, <5,4,7,6>
+  2666573828U, // <3,0,5,5>: Cost 3 vext2 <u,2,3,0>, <5,5,5,5>
+  2732966354U, // <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7>
+  2666573992U, // <3,0,5,7>: Cost 3 vext2 <u,2,3,0>, <5,7,5,7>
+  3040886940U, // <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS
+  2685190637U, // <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7>
+  2732966390U, // <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7>
+  2689835519U, // <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7>
+  3667724438U, // <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2>
+  3763577355U, // <3,0,6,4>: Cost 4 vext3 LHS, <0,6,4,1>
+  3806708243U, // <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0>
+  2666574648U, // <3,0,6,6>: Cost 3 vext2 <u,2,3,0>, <6,6,6,6>
+  2657948520U, // <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0>
+  2689835573U, // <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7>
+  2666574842U, // <3,0,7,0>: Cost 3 vext2 <u,2,3,0>, <7,0,1,2>
+  2685633095U, // <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7>
+  2660603052U, // <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0>
+  3643844997U, // <3,0,7,3>: Cost 4 vext1 <3,3,0,7>, <3,3,0,7>
+  2666575206U, // <3,0,7,4>: Cost 3 vext2 <u,2,3,0>, <7,4,5,6>
+  3655790391U, // <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7>
+  3731690968U, // <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3>
+  2666575468U, // <3,0,7,7>: Cost 3 vext2 <u,2,3,0>, <7,7,7,7>
+  2664584850U, // <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0>
+  1616093834U, // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2>
+  1611891346U, // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
+  537707165U, // <3,0,u,2>: Cost 1 vext3 LHS, LHS
+  2689835684U, // <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
+  1616093874U, // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+  1551030426U, // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS
+  2624772304U, // <3,0,u,6>: Cost 3 vext2 <1,2,3,0>, <u,6,3,7>
+  2594002154U, // <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u>
+  537707219U, // <3,0,u,u>: Cost 1 vext3 LHS, LHS
+  2552201318U, // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS
+  2618802278U, // <3,1,0,1>: Cost 3 vext2 <0,2,3,1>, LHS
+  2618802366U, // <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1>
+  1611449078U, // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
+  2552204598U, // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS
+  2732966663U, // <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1>
+  3906258396U, // <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6>
+  3667752171U, // <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0>
+  1611891491U, // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
+  2689835819U, // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1>
+  1611449140U, // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1>
+  2624775063U, // <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1>
+  1611891528U, // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
+  2689835859U, // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5>
+  2689835868U, // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
+  3763577701U, // <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5>
+  3765273452U, // <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3>
+  1611891573U, // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3>
+  2629420494U, // <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1>
+  2689835911U, // <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
+  2564163248U, // <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2>
+  1611449238U, // <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0>
+  2564164918U, // <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS
+  2689835947U, // <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
+  3692545978U, // <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7>
+  2732966842U, // <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0>
+  1611891651U, // <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0>
+  1484456038U, // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS
+  1611891672U, // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
+  2685633502U, // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
+  2685633512U, // <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1>
+  1484459318U, // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS
+  1611891712U, // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
+  2689836041U, // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
+  2733409294U, // <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3>
+  1611891735U, // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
+  2552234086U, // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS
+  2732966955U, // <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5>
+  2732966964U, // <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5>
+  2685633597U, // <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5>
+  2552237366U, // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS
+  2618805558U, // <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS
+  2769472822U, // <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS
+  3667784943U, // <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4>
+  2685633642U, // <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5>
+  2689836143U, // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1>
+  2564187280U, // <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7>
+  2564187827U, // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5>
+  1611891856U, // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
+  2689836183U, // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5>
+  3759375522U, // <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7>
+  3720417378U, // <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0>
+  2832518454U, // <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS
+  1611891901U, // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
+  3763578048U, // <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1>
+  2689836239U, // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
+  2732967128U, // <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7>
+  2685633761U, // <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
+  3763578088U, // <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5>
+  2689836275U, // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
+  3763578108U, // <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7>
+  2732967166U, // <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0>
+  2685633806U, // <3,1,6,u>: Cost 3 vext3 LHS, <1,6,u,7>
+  3631972454U, // <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS
+  2659947612U, // <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1>
+  4036102294U, // <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2>
+  3095396454U, // <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
+  3631975734U, // <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS
+  2222982144U, // <3,1,7,5>: Cost 3 vrev <1,3,5,7>
+  3296797705U, // <3,1,7,6>: Cost 4 vrev <1,3,6,7>
+  3720418924U, // <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7>
+  3095396459U, // <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS
+  1484496998U, // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS
+  1611892077U, // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3>
+  2685633907U, // <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
+  1611892092U, // <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0>
+  1484500278U, // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS
+  1611892117U, // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
+  2685633950U, // <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
+  2832518697U, // <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS
+  1611892140U, // <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3>
+  2623455232U, // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0>
+  1549713510U, // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS
+  2689836484U, // <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0>
+  2685633997U, // <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
+  2623455570U, // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5>
+  2732967398U, // <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7>
+  2689836524U, // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
+  2229044964U, // <3,2,0,7>: Cost 3 vrev <2,3,7,0>
+  1549714077U, // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS
+  1549714166U, // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2>
+  2623456052U, // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1>
+  2623456150U, // <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0>
+  2685634079U, // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
+  2552286518U, // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS
+  2623456400U, // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7>
+  2689836604U, // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
+  3667834101U, // <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1>
+  1155385070U, // <3,2,1,u>: Cost 2 vrev <2,3,u,1>
+  2689836629U, // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1>
+  2689836640U, // <3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3>
+  1611449960U, // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2>
+  1611892338U, // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
+  2689836669U, // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5>
+  2689836680U, // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
+  2689836688U, // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6>
+  3763578518U, // <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3>
+  1611892383U, // <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3>
+  1611450022U, // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1>
+  2685191854U, // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0>
+  2685191865U, // <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2>
+  2685191875U, // <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3>
+  1611450062U, // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5>
+  2732967635U, // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1>
+  2732967645U, // <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2>
+  2732967652U, // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0>
+  1611450094U, // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1>
+  2558279782U, // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS
+  2558280602U, // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4>
+  2732967692U, // <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4>
+  2685634326U, // <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
+  2558283062U, // <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS
+  1549716790U, // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS
+  2689836844U, // <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0>
+  2229077736U, // <3,2,4,7>: Cost 3 vrev <2,3,7,4>
+  1549717033U, // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS
+  2552316006U, // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS
+  2228643507U, // <3,2,5,1>: Cost 3 vrev <2,3,1,5>
+  2689836896U, // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
+  2685634408U, // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
+  1155122894U, // <3,2,5,4>: Cost 2 vrev <2,3,4,5>
+  2665263108U, // <3,2,5,5>: Cost 3 vext2 <u,0,3,2>, <5,5,5,5>
+  2689836932U, // <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
+  2665263272U, // <3,2,5,7>: Cost 3 vext2 <u,0,3,2>, <5,7,5,7>
+  1155417842U, // <3,2,5,u>: Cost 2 vrev <2,3,u,5>
+  2689836953U, // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1>
+  2689836964U, // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3>
+  2689836976U, // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6>
+  1611892666U, // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
+  2689836993U, // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5>
+  2689837004U, // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
+  2689837013U, // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
+  2665263950U, // <3,2,6,7>: Cost 3 vext2 <u,0,3,2>, <6,7,0,1>
+  1611892711U, // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
+  2665264122U, // <3,2,7,0>: Cost 3 vext2 <u,0,3,2>, <7,0,1,2>
+  2623460419U, // <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3>
+  4169138340U, // <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2>
+  2962358374U, // <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS
+  2665264486U, // <3,2,7,4>: Cost 3 vext2 <u,0,3,2>, <7,4,5,6>
+  2228954841U, // <3,2,7,5>: Cost 3 vrev <2,3,5,7>
+  2229028578U, // <3,2,7,6>: Cost 3 vrev <2,3,6,7>
+  2665264748U, // <3,2,7,7>: Cost 3 vext2 <u,0,3,2>, <7,7,7,7>
+  2962358379U, // <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS
+  1611892795U, // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1>
+  1549719342U, // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS
+  1611449960U, // <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2>
+  1611892824U, // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
+  1611892835U, // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5>
+  1549719706U, // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS
+  2689837168U, // <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0>
+  2665265408U, // <3,2,u,7>: Cost 3 vext2 <u,0,3,2>, <u,7,0,1>
+  1611892867U, // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1>
+  2685192331U, // <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0>
+  1611450518U, // <3,3,0,1>: Cost 2 vext3 LHS, <3,0,1,2>
+  2685634717U, // <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0>
+  2564294806U, // <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2>
+  2685634736U, // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1>
+  2732968122U, // <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2>
+  3763579075U, // <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2>
+  4034053264U, // <3,3,0,7>: Cost 4 vzipr <1,2,3,0>, <1,5,3,7>
+  1611450581U, // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2>
+  2685192415U, // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3>
+  1550385992U, // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3>
+  2685192433U, // <3,3,1,2>: Cost 3 vext3 LHS, <3,1,2,3>
+  2685634808U, // <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1>
+  2558332214U, // <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS
+  2685634828U, // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3>
+  3759376661U, // <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3>
+  2703477022U, // <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3>
+  1555031423U, // <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3>
+  2564309094U, // <3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS
+  2630100513U, // <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3>
+  1557022322U, // <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3>
+  2685192520U, // <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0>
+  2564312374U, // <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS
+  2732968286U, // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4>
+  2685634918U, // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3>
+  2704140655U, // <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3>
+  1561004120U, // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3>
+  1496547430U, // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS
+  2624129256U, // <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3>
+  2630764866U, // <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3>
+  336380006U, // <3,3,3,3>: Cost 1 vdup3 LHS
+  1496550710U, // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS
+  2732968368U, // <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5>
+  2624129683U, // <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7>
+  2594182400U, // <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3>
+  336380006U, // <3,3,3,u>: Cost 1 vdup3 LHS
+  2558353510U, // <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS
+  2558354411U, // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4>
+  2564327108U, // <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4>
+  2564327938U, // <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6>
+  2960343962U, // <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4>
+  1611893250U, // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6>
+  2771619126U, // <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS
+  4034086032U, // <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7>
+  1611893277U, // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6>
+  2558361702U, // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS
+  2558362604U, // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5>
+  2558363342U, // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5>
+  2732968512U, // <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5>
+  2558364982U, // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS
+  3101279950U, // <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5>
+  2665934946U, // <3,3,5,6>: Cost 3 vext2 <u,1,3,3>, <5,6,7,0>
+  2826636598U, // <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS
+  2826636599U, // <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS
+  2732968568U, // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7>
+  3763579521U, // <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7>
+  2732968586U, // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7>
+  2732968595U, // <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7>
+  2732968604U, // <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7>
+  3763579557U, // <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7>
+  2732968621U, // <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6>
+  2657973099U, // <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, <6,7,3,3>
+  2658636732U, // <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3>
+  2558378086U, // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS
+  2558378990U, // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7>
+  2564351687U, // <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7>
+  2661291264U, // <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3>
+  2558381366U, // <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS
+  2732968694U, // <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7>
+  3781126907U, // <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3>
+  3095397376U, // <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7>
+  2558383918U, // <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS
+  1496547430U, // <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS
+  1611893534U, // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2>
+  1592858504U, // <3,3,u,2>: Cost 2 vext2 <u,2,3,3>, <u,2,3,3>
+  336380006U, // <3,3,u,3>: Cost 1 vdup3 LHS
+  1496550710U, // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS
+  1611893574U, // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6>
+  2690280268U, // <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3>
+  2826636841U, // <3,3,u,7>: Cost 3 vuzpr <1,3,1,3>, RHS
+  336380006U, // <3,3,u,u>: Cost 1 vdup3 LHS
+  2624798720U, // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0>
+  1551056998U, // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS
+  2624798884U, // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2>
+  3693232384U, // <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4>
+  2624799058U, // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5>
+  1659227026U, // <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1>
+  1659227036U, // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2>
+  3667973382U, // <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0>
+  1551057565U, // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS
+  2624799478U, // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2>
+  2624799540U, // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1>
+  1551057818U, // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4>
+  2624799704U, // <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3>
+  2564377910U, // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS
+  2689838050U, // <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0>
+  2689838062U, // <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
+  2628117807U, // <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4>
+  1555039616U, // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4>
+  3626180710U, // <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS
+  2624800298U, // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3>
+  2624800360U, // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2>
+  2624800422U, // <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1>
+  2624800514U, // <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3>
+  2709965878U, // <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3>
+  2689838140U, // <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0>
+  2634090504U, // <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4>
+  2689838158U, // <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0>
+  2624800918U, // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2>
+  2636081403U, // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4>
+  2636745036U, // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4>
+  2624801180U, // <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3>
+  2624801232U, // <3,4,3,4>: Cost 3 vext2 <1,2,3,4>, <3,4,0,1>
+  2905836854U, // <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS
+  3040054582U, // <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS
+  3702524611U, // <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1>
+  2624801566U, // <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2>
+  2564399206U, // <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS
+  2564400026U, // <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4>
+  2564400845U, // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4>
+  2570373542U, // <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
+  1659227344U, // <3,4,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+  1551060278U, // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS
+  1659227364U, // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6>
+  3668006154U, // <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4>
+  1551060521U, // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS
+  1490665574U, // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS
+  2689838341U, // <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
+  1490667214U, // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5>
+  2564409494U, // <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2>
+  1490668854U, // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS
+  2689838381U, // <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7>
+  537709878U, // <3,4,5,6>: Cost 1 vext3 LHS, RHS
+  2594272523U, // <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5>
+  537709896U, // <3,4,5,u>: Cost 1 vext3 LHS, RHS
+  2689838411U, // <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1>
+  2558444534U, // <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6>
+  2666607098U, // <3,4,6,2>: Cost 3 vext2 <u,2,3,4>, <6,2,7,3>
+  2558446082U, // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6>
+  1659227508U, // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+  2689838462U, // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
+  2689838471U, // <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7>
+  2657981292U, // <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4>
+  1659227540U, // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2>
+  2666607610U, // <3,4,7,0>: Cost 3 vext2 <u,2,3,4>, <7,0,1,2>
+  3702527072U, // <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5>
+  2660635824U, // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4>
+  3644139945U, // <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7>
+  2666607974U, // <3,4,7,4>: Cost 3 vext2 <u,2,3,4>, <7,4,5,6>
+  2732969416U, // <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0>
+  2732969425U, // <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0>
+  2666608236U, // <3,4,7,7>: Cost 3 vext2 <u,2,3,4>, <7,7,7,7>
+  2664617622U, // <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4>
+  1490690150U, // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS
+  1551062830U, // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS
+  1490691793U, // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u>
+  2624804796U, // <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, <u,3,0,1>
+  1490693430U, // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS
+  1551063194U, // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS
+  537710121U, // <3,4,u,6>: Cost 1 vext3 LHS, RHS
+  2594297102U, // <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u>
+  537710139U, // <3,4,u,u>: Cost 1 vext3 LHS, RHS
+  3692576768U, // <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0>
+  2618835046U, // <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS
+  2618835138U, // <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5>
+  3692577024U, // <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4>
+  2689838690U, // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1>
+  2732969579U, // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
+  2732969588U, // <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1>
+  2246963055U, // <3,5,0,7>: Cost 3 vrev <5,3,7,0>
+  2618835613U, // <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS
+  2594308198U, // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS
+  3692577588U, // <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1>
+  2624807835U, // <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5>
+  2625471468U, // <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5>
+  2626135101U, // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5>
+  2594311888U, // <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3>
+  3699877107U, // <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7>
+  1641680592U, // <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3>
+  1641754329U, // <3,5,1,u>: Cost 2 vext3 <5,1,u,3>, <5,1,u,3>
+  3692578274U, // <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, <2,0,5,3>
+  2630116899U, // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5>
+  3692578408U, // <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2>
+  2625472206U, // <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5>
+  2632107798U, // <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5>
+  2715938575U, // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3>
+  3692578746U, // <3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7>
+  2716086049U, // <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3>
+  2634762330U, // <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5>
+  3692578966U, // <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2>
+  2636089596U, // <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5>
+  3699214668U, // <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4>
+  2638080412U, // <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3>
+  2618837506U, // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6>
+  2832844494U, // <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5>
+  4033415682U, // <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6>
+  3095072054U, // <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS
+  3095072055U, // <3,5,3,u>: Cost 3 vtrnr <1,3,1,3>, RHS
+  2600304742U, // <3,5,4,0>: Cost 3 vext1 <u,3,5,4>, LHS
+  3763580815U, // <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5>
+  2564474582U, // <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4>
+  3699879044U, // <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0>
+  2600308022U, // <3,5,4,4>: Cost 3 vext1 <u,3,5,4>, RHS
+  2618838326U, // <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS
+  2772454710U, // <3,5,4,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+  1659228102U, // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6>
+  1659228111U, // <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6>
+  2570453094U, // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS
+  2624810704U, // <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3>
+  2570454734U, // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5>
+  2570455472U, // <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5>
+  2570456374U, // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS
+  1659228164U, // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+  2732969998U, // <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6>
+  1659228184U, // <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7>
+  1659228193U, // <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7>
+  2732970020U, // <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1>
+  2732970035U, // <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7>
+  2564490968U, // <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6>
+  2732970050U, // <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4>
+  2732970060U, // <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5>
+  2732970071U, // <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7>
+  2732970080U, // <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7>
+  1659228258U, // <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0>
+  1659228267U, // <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0>
+  1484783718U, // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS
+  1484784640U, // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7>
+  2558527080U, // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2>
+  2558527638U, // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2>
+  1484786998U, // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS
+  1659228328U, // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+  2732970154U, // <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0>
+  2558531180U, // <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7>
+  1484789550U, // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS
+  1484791910U, // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS
+  1484792833U, // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u>
+  2558535272U, // <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2>
+  2558535830U, // <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2>
+  1484795190U, // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS
+  1659228409U, // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7>
+  2772457626U, // <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+  1646326023U, // <3,5,u,7>: Cost 2 vext3 <5,u,7,3>, <5,u,7,3>
+  1484797742U, // <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS
+  2558541926U, // <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS
+  2689839393U, // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2>
+  2689839404U, // <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4>
+  3706519808U, // <3,6,0,3>: Cost 4 vext2 <2,5,3,6>, <0,3,1,4>
+  2689839420U, // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2>
+  2732970314U, // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7>
+  2732970316U, // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0>
+  2960313654U, // <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
+  2689839456U, // <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2>
+  3763581290U, // <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3>
+  3763581297U, // <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1>
+  2624816028U, // <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6>
+  3763581315U, // <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1>
+  2626143294U, // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6>
+  3763581335U, // <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3>
+  2721321376U, // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3>
+  2721395113U, // <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3>
+  2628797826U, // <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6>
+  2594390118U, // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS
+  2721616324U, // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3>
+  2630788725U, // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6>
+  3763581395U, // <3,6,2,3>: Cost 4 vext3 LHS, <6,2,3,0>
+  2632115991U, // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6>
+  2632779624U, // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6>
+  2594394618U, // <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3>
+  1648316922U, // <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3>
+  1648390659U, // <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3>
+  3693914262U, // <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2>
+  3638281176U, // <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3>
+  3696568678U, // <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3>
+  2638088604U, // <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3>
+  2632780290U, // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6>
+  3712494145U, // <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6>
+  3698559612U, // <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2>
+  2959674678U, // <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
+  2959674679U, // <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS
+  3763581536U, // <3,6,4,0>: Cost 4 vext3 LHS, <6,4,0,6>
+  2722943590U, // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3>
+  2732970609U, // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5>
+  3698560147U, // <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6>
+  2732970628U, // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6>
+  2689839757U, // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6>
+  2732970640U, // <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0>
+  2960346422U, // <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS
+  2689839784U, // <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6>
+  2576498790U, // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS
+  3650241270U, // <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2>
+  2732970692U, // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7>
+  2576501250U, // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
+  2576501906U, // <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5>
+  3650244622U, // <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6>
+  4114633528U, // <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6>
+  2732970735U, // <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5>
+  2576504622U, // <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS
+  2732970749U, // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1>
+  2724270856U, // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3>
+  2624819706U, // <3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3>
+  3656223234U, // <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, <3,4,5,6>
+  2732970788U, // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4>
+  2732970800U, // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7>
+  1659228984U, // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+  1659228994U, // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7>
+  1659229003U, // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7>
+  1659229006U, // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1>
+  2558600201U, // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7>
+  2558601146U, // <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7>
+  2725081963U, // <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3>
+  1659229046U, // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5>
+  2715423611U, // <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1>
+  2722059141U, // <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2>
+  2962361654U, // <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS
+  1659229078U, // <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1>
+  1659229087U, // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1>
+  2689840041U, // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2>
+  2558609339U, // <3,6,u,2>: Cost 3 vext1 <1,3,6,u>, <2,6,3,u>
+  2576525853U, // <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6>
+  1659229127U, // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5>
+  2689840081U, // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6>
+  1659228984U, // <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6>
+  1652298720U, // <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3>
+  1659229159U, // <3,6,u,u>: Cost 2 vext3 LHS, <6,u,u,1>
+  2626813952U, // <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0>
+  1553072230U, // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS
+  2626814116U, // <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2>
+  3700556028U, // <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0>
+  2626814290U, // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5>
+  2582507375U, // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0>
+  2588480072U, // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0>
+  2732971055U, // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1>
+  1553072797U, // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS
+  2626814710U, // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2>
+  2626814772U, // <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1>
+  2626814870U, // <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0>
+  2625487854U, // <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7>
+  2582514998U, // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS
+  1553073296U, // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7>
+  2627478753U, // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7>
+  2727367810U, // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3>
+  1555064195U, // <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7>
+  2588491878U, // <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS
+  3700557318U, // <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3>
+  2626815592U, // <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2>
+  2626815654U, // <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1>
+  2588495158U, // <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS
+  2632787817U, // <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7>
+  1559709626U, // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7>
+  2728031443U, // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3>
+  1561036892U, // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7>
+  2626816150U, // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2>
+  2626816268U, // <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3>
+  2633451878U, // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3>
+  2626816412U, // <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3>
+  2626816514U, // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6>
+  2638760514U, // <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7>
+  2639424147U, // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7>
+  2826961920U, // <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7>
+  2626816798U, // <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2>
+  2582536294U, // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS
+  2582537360U, // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7>
+  2588510138U, // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7>
+  3700558996U, // <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7>
+  2582539574U, // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS
+  1553075510U, // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS
+  2588512844U, // <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4>
+  2564625766U, // <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6>
+  1553075753U, // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS
+  2732971398U, // <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2>
+  2626817744U, // <3,7,5,1>: Cost 3 vext2 <1,5,3,7>, <5,1,7,3>
+  3700559649U, // <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3>
+  2626817903U, // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0>
+  2258728203U, // <3,7,5,4>: Cost 3 vrev <7,3,4,5>
+  2732971446U, // <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5>
+  2732971457U, // <3,7,5,6>: Cost 3 vext3 LHS, <7,5,6,7>
+  2826964278U, // <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS
+  2826964279U, // <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS
+  2732971478U, // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1>
+  2732971486U, // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0>
+  2633454074U, // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3>
+  2633454152U, // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0>
+  2732971518U, // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5>
+  2732971526U, // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4>
+  2732971537U, // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6>
+  2732971540U, // <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0>
+  2726041124U, // <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7>
+  2570616934U, // <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS
+  2570617856U, // <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7>
+  2564646635U, // <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7>
+  2570619332U, // <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7>
+  2570620214U, // <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS
+  2582564726U, // <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7>
+  2588537423U, // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7>
+  1659229804U, // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
+  1659229804U, // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7>
+  2626819795U, // <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, <u,0,1,2>
+  1553078062U, // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS
+  2626819973U, // <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, <u,2,3,0>
+  2826961565U, // <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS
+  2626820159U, // <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, <u,4,5,6>
+  1553078426U, // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS
+  1595545808U, // <3,7,u,6>: Cost 2 vext2 <u,6,3,7>, <u,6,3,7>
+  1659229804U, // <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7>
+  1553078629U, // <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS
+  1611448320U, // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+  1611896531U, // <3,u,0,1>: Cost 2 vext3 LHS, <u,0,1,2>
+  1659672284U, // <3,u,0,2>: Cost 2 vext3 LHS, <u,0,2,2>
+  1616099045U, // <3,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
+  2685638381U, // <3,u,0,4>: Cost 3 vext3 LHS, <u,0,4,1>
+  1663874806U, // <3,u,0,5>: Cost 2 vext3 LHS, <u,0,5,1>
+  1663874816U, // <3,u,0,6>: Cost 2 vext3 LHS, <u,0,6,2>
+  2960313672U, // <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
+  1611896594U, // <3,u,0,u>: Cost 2 vext3 LHS, <u,0,u,2>
+  1549763324U, // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u>
+  1550426957U, // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u>
+  537712430U, // <3,u,1,2>: Cost 1 vext3 LHS, LHS
+  1616541495U, // <3,u,1,3>: Cost 2 vext3 LHS, <u,1,3,3>
+  1490930998U, // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS
+  1553081489U, // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u>
+  2627486946U, // <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u>
+  1659230043U, // <3,u,1,7>: Cost 2 vext3 LHS, <u,1,7,3>
+  537712484U, // <3,u,1,u>: Cost 1 vext3 LHS, LHS
+  1611890852U, // <3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+  2624833102U, // <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3>
+  1557063287U, // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u>
+  1616099205U, // <3,u,2,3>: Cost 2 vext3 LHS, <u,2,3,0>
+  1611890892U, // <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+  2689841054U, // <3,u,2,5>: Cost 3 vext3 LHS, <u,2,5,7>
+  1559717819U, // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u>
+  1659230124U, // <3,u,2,7>: Cost 2 vext3 LHS, <u,2,7,3>
+  1616541618U, // <3,u,2,u>: Cost 2 vext3 LHS, <u,2,u,0>
+  1611896764U, // <3,u,3,0>: Cost 2 vext3 LHS, <u,3,0,1>
+  1484973079U, // <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3>
+  2685638607U, // <3,u,3,2>: Cost 3 vext3 LHS, <u,3,2,2>
+  336380006U, // <3,u,3,3>: Cost 1 vdup3 LHS
+  1611896804U, // <3,u,3,4>: Cost 2 vext3 LHS, <u,3,4,5>
+  1616541679U, // <3,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
+  2690283512U, // <3,u,3,6>: Cost 3 vext3 LHS, <u,3,6,7>
+  2959674696U, // <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
+  336380006U, // <3,u,3,u>: Cost 1 vdup3 LHS
+  2558722150U, // <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS
+  1659672602U, // <3,u,4,1>: Cost 2 vext3 LHS, <u,4,1,5>
+  1659672612U, // <3,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
+  2689841196U, // <3,u,4,3>: Cost 3 vext3 LHS, <u,4,3,5>
+  1659227344U, // <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+  1611896895U, // <3,u,4,5>: Cost 2 vext3 LHS, <u,4,5,6>
+  1663875144U, // <3,u,4,6>: Cost 2 vext3 LHS, <u,4,6,6>
+  1659230289U, // <3,u,4,7>: Cost 2 vext3 LHS, <u,4,7,6>
+  1611896922U, // <3,u,4,u>: Cost 2 vext3 LHS, <u,4,u,6>
+  1490960486U, // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS
+  2689841261U, // <3,u,5,1>: Cost 3 vext3 LHS, <u,5,1,7>
+  1490962162U, // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5>
+  1616541823U, // <3,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
+  1490963766U, // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS
+  1659228164U, // <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+  537712794U, // <3,u,5,6>: Cost 1 vext3 LHS, RHS
+  1659230371U, // <3,u,5,7>: Cost 2 vext3 LHS, <u,5,7,7>
+  537712812U, // <3,u,5,u>: Cost 1 vext3 LHS, RHS
+  2689841327U, // <3,u,6,0>: Cost 3 vext3 LHS, <u,6,0,1>
+  2558739482U, // <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6>
+  2689841351U, // <3,u,6,2>: Cost 3 vext3 LHS, <u,6,2,7>
+  1616099536U, // <3,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
+  1659227508U, // <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+  2690283746U, // <3,u,6,5>: Cost 3 vext3 LHS, <u,6,5,7>
+  1659228984U, // <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+  1659230445U, // <3,u,6,7>: Cost 2 vext3 LHS, <u,6,7,0>
+  1616099581U, // <3,u,6,u>: Cost 2 vext3 LHS, <u,6,u,7>
+  1485004902U, // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS
+  1485005851U, // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7>
+  2558748264U, // <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2>
+  3095397021U, // <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
+  1485008182U, // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS
+  1659228328U, // <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+  2722060599U, // <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, <u,7,6,2>
+  1659229804U, // <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
+  1485010734U, // <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS
+  1616099665U, // <3,u,u,0>: Cost 2 vext3 LHS, <u,u,0,1>
+  1611897179U, // <3,u,u,1>: Cost 2 vext3 LHS, <u,u,1,2>
+  537712997U, // <3,u,u,2>: Cost 1 vext3 LHS, LHS
+  336380006U, // <3,u,u,3>: Cost 1 vdup3 LHS
+  1616099705U, // <3,u,u,4>: Cost 2 vext3 LHS, <u,u,4,5>
+  1611897219U, // <3,u,u,5>: Cost 2 vext3 LHS, <u,u,5,6>
+  537713037U, // <3,u,u,6>: Cost 1 vext3 LHS, RHS
+  1659230607U, // <3,u,u,7>: Cost 2 vext3 LHS, <u,u,7,0>
+  537713051U, // <3,u,u,u>: Cost 1 vext3 LHS, LHS
+  2691907584U, // <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0>
+  2691907594U, // <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1>
+  2691907604U, // <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2>
+  3709862144U, // <4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4>
+  2684682280U, // <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4>
+  3694600633U, // <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0>
+  3291431290U, // <4,0,0,6>: Cost 4 vrev <0,4,6,0>
+  3668342067U, // <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0>
+  2691907657U, // <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1>
+  2570715238U, // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS
+  2570716058U, // <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4>
+  1618165862U, // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  2570717648U, // <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1>
+  2570718518U, // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS
+  2594607206U, // <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4>
+  3662377563U, // <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1>
+  2594608436U, // <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1>
+  1618165916U, // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
+  2685714598U, // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4>
+  3759530159U, // <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4>
+  2685862072U, // <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4>
+  2631476937U, // <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0>
+  2685714636U, // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6>
+  3765649622U, // <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7>
+  2686157020U, // <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
+  3668358453U, // <4,0,2,7>: Cost 4 vext1 <7,4,0,2>, <7,4,0,2>
+  2686304494U, // <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4>
+  3632529510U, // <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS
+  2686451968U, // <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
+  2686525705U, // <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4>
+  3760341266U, // <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4>
+  3632532790U, // <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS
+  3913254606U, // <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5>
+  3705219740U, // <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7>
+  3713845990U, // <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0>
+  2686451968U, // <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
+  2552823910U, // <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS
+  2691907922U, // <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5>
+  2691907932U, // <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6>
+  3626567830U, // <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2>
+  2552827190U, // <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS
+  2631478582U, // <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS
+  3626570017U, // <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2>
+  3668374839U, // <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4>
+  2552829742U, // <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS
+  2558804070U, // <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS
+  1839644774U, // <4,0,5,1>: Cost 2 vzipl RHS, LHS
+  2913386660U, // <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2>
+  2570750420U, // <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5>
+  2558807350U, // <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS
+  3987128750U, // <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7>
+  3987128822U, // <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7>
+  2594641208U, // <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5>
+  1839645341U, // <4,0,5,u>: Cost 2 vzipl RHS, LHS
+  2552840294U, // <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS
+  3047604234U, // <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1>
+  1973862502U, // <4,0,6,2>: Cost 2 vtrnl RHS, LHS
+  2570758613U, // <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6>
+  2552843574U, // <4,0,6,4>: Cost 3 vext1 <0,4,0,6>, RHS
+  2217664887U, // <4,0,6,5>: Cost 3 vrev <0,4,5,6>
+  3662418528U, // <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6>
+  2658022257U, // <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0>
+  1973862556U, // <4,0,6,u>: Cost 2 vtrnl RHS, LHS
+  3731764218U, // <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2>
+  3988324454U, // <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS
+  4122034278U, // <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS
+  3735082246U, // <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0>
+  3731764536U, // <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5>
+  3937145718U, // <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5>
+  3737073145U, // <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0>
+  3731764844U, // <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7>
+  4122034332U, // <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS
+  2552856678U, // <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS
+  1841635430U, // <4,0,u,1>: Cost 2 vzipl RHS, LHS
+  1618166429U, // <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  2570774999U, // <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u>
+  2552859958U, // <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS
+  2631481498U, // <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS
+  2686157020U, // <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
+  2594665787U, // <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u>
+  1618166483U, // <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS
+  2617548837U, // <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1>
+  2622857318U, // <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS
+  3693281484U, // <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6>
+  2691908342U, // <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2>
+  2622857554U, // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5>
+  3764470538U, // <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4>
+  3695272459U, // <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1>
+  3733094980U, // <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4>
+  2622857885U, // <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS
+  3696599798U, // <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2>
+  2691097399U, // <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4>
+  2631484314U, // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4>
+  2691908424U, // <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3>
+  3696600125U, // <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5>
+  3696600175U, // <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1>
+  3696600307U, // <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7>
+  3668423997U, // <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1>
+  2691908469U, // <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3>
+  2570797158U, // <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS
+  2570797978U, // <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4>
+  3696600680U, // <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2>
+  1618166682U, // <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4>
+  2570800438U, // <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS
+  3765650347U, // <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3>
+  3696601018U, // <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7>
+  3668432190U, // <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2>
+  1618535367U, // <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4>
+  2564833382U, // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS
+  2691908568U, // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3>
+  2691908578U, // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4>
+  2692572139U, // <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4>
+  2564836662U, // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS
+  2691908608U, // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7>
+  2588725862U, // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+  3662468090U, // <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2>
+  2691908631U, // <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3>
+  3760194590U, // <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1>
+  3693947874U, // <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0>
+  3765650484U, // <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5>
+  3113877606U, // <4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS
+  3760194630U, // <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5>
+  2622860598U, // <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS
+  3297436759U, // <4,1,4,6>: Cost 4 vrev <1,4,6,4>
+  3800007772U, // <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0>
+  2622860841U, // <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS
+  1479164006U, // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
+  2552906486U, // <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2>
+  2552907299U, // <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5>
+  2552907926U, // <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2>
+  1479167286U, // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
+  2913387664U, // <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7>
+  2600686074U, // <4,1,5,6>: Cost 3 vext1 <u,4,1,5>, <6,2,7,3>
+  2600686586U, // <4,1,5,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
+  1479169838U, // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS
+  2552914022U, // <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS
+  2558886708U, // <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1>
+  4028205206U, // <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2>
+  3089858662U, // <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS
+  2552917302U, // <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS
+  2223637584U, // <4,1,6,5>: Cost 3 vrev <1,4,5,6>
+  4121347081U, // <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7>
+  3721155406U, // <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1>
+  2552919854U, // <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS
+  2659357716U, // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
+  3733763173U, // <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1>
+  3734426806U, // <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1>
+  2695226671U, // <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4>
+  3721155942U, // <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6>
+  3721155976U, // <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4>
+  3662500458U, // <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7>
+  3721156204U, // <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7>
+  2659357716U, // <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
+  1479188582U, // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS
+  2552931062U, // <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2>
+  2552931944U, // <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2>
+  1622148480U, // <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4>
+  1479191862U, // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS
+  2622863514U, // <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS
+  2588725862U, // <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+  2600686586U, // <4,1,u,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
+  1479194414U, // <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS
+  2617557030U, // <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2>
+  2622865510U, // <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS
+  2622865612U, // <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6>
+  3693289753U, // <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2>
+  2635473244U, // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6>
+  3765650918U, // <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7>
+  2696775148U, // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4>
+  3695944285U, // <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2>
+  2622866077U, // <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS
+  3696607990U, // <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2>
+  3696608052U, // <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1>
+  3696608150U, // <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0>
+  3895574630U, // <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS
+  2691909162U, // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
+  3696608400U, // <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7>
+  3760784956U, // <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3>
+  3773908549U, // <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3>
+  2691909162U, // <4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
+  3696608748U, // <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4>
+  3696608828U, // <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3>
+  2691909224U, // <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2>
+  2691909234U, // <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3>
+  3759605368U, // <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0>
+  3696609156U, // <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7>
+  3760785040U, // <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6>
+  3668505927U, // <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2>
+  2691909279U, // <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3>
+  2691909286U, // <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1>
+  3764840111U, // <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1>
+  3765651129U, // <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2>
+  2698544836U, // <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4>
+  2685863630U, // <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5>
+  2698692310U, // <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4>
+  3772507871U, // <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4>
+  2698839784U, // <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4>
+  2691909358U, // <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1>
+  2564915302U, // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS
+  2564916122U, // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4>
+  2564917004U, // <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4>
+  2699208469U, // <4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4>
+  2564918582U, // <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS
+  2622868790U, // <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS
+  2229667632U, // <4,2,4,6>: Cost 3 vrev <2,4,6,4>
+  3800082229U, // <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0>
+  2622869033U, // <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS
+  2552979558U, // <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS
+  2558952342U, // <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0>
+  2564925032U, // <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2>
+  2967060582U, // <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS
+  2552982838U, // <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS
+  3987130190U, // <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7>
+  2913388474U, // <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7>
+  3895577910U, // <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS
+  2552985390U, // <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS
+  1479245926U, // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS
+  2552988406U, // <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2>
+  2552989288U, // <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2>
+  2954461286U, // <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS
+  1479249206U, // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS
+  2229610281U, // <4,2,6,5>: Cost 3 vrev <2,4,5,6>
+  2600767994U, // <4,2,6,6>: Cost 3 vext1 <u,4,2,6>, <6,2,7,3>
+  2600768506U, // <4,2,6,7>: Cost 3 vext1 <u,4,2,6>, <7,0,1,2>
+  1479251758U, // <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS
+  2659365909U, // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
+  3733771366U, // <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2>
+  3734434999U, // <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2>
+  2701199368U, // <4,2,7,3>: Cost 3 vext3 <2,7,3,4>, <2,7,3,4>
+  4175774618U, // <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4>
+  3303360298U, // <4,2,7,5>: Cost 4 vrev <2,4,5,7>
+  3727136217U, // <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4>
+  3727136364U, // <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7>
+  2659365909U, // <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
+  1479262310U, // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS
+  2553004790U, // <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2>
+  2553005672U, // <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2>
+  2954477670U, // <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS
+  1479265590U, // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS
+  2622871706U, // <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS
+  2229700404U, // <4,2,u,6>: Cost 3 vrev <2,4,6,u>
+  2600784890U, // <4,2,u,7>: Cost 3 vext1 <u,4,2,u>, <7,0,1,2>
+  1479268142U, // <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS
+  3765651595U, // <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0>
+  2691909782U, // <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2>
+  2702452897U, // <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4>
+  3693297946U, // <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3>
+  3760711856U, // <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1>
+  2235533820U, // <4,3,0,5>: Cost 3 vrev <3,4,5,0>
+  3309349381U, // <4,3,0,6>: Cost 4 vrev <3,4,6,0>
+  3668563278U, // <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0>
+  2691909845U, // <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2>
+  2235173328U, // <4,3,1,0>: Cost 3 vrev <3,4,0,1>
+  3764840678U, // <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1>
+  2630173594U, // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4>
+  2703190267U, // <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4>
+  3760195840U, // <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0>
+  3765651724U, // <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3>
+  3309357574U, // <4,3,1,6>: Cost 4 vrev <3,4,6,1>
+  3769633054U, // <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3>
+  2703558952U, // <4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4>
+  3626770534U, // <4,3,2,0>: Cost 4 vext1 <0,4,3,2>, LHS
+  2630174250U, // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3>
+  3765651777U, // <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2>
+  2703853900U, // <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4>
+  3626773814U, // <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS
+  2704001374U, // <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4>
+  3765651814U, // <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3>
+  3769633135U, // <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3>
+  2634819681U, // <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3>
+  3765651839U, // <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1>
+  3765651848U, // <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1>
+  3710552404U, // <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3>
+  2691910044U, // <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
+  2704591270U, // <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4>
+  3769633202U, // <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7>
+  3703917212U, // <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7>
+  3769633220U, // <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7>
+  2691910044U, // <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
+  2691910096U, // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1>
+  2691910106U, // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2>
+  2564990741U, // <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4>
+  3765651946U, // <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0>
+  2691910136U, // <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5>
+  2686454274U, // <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6>
+  2235640329U, // <4,3,4,6>: Cost 3 vrev <3,4,6,4>
+  3801483792U, // <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, <3,4,7,2>
+  2691910168U, // <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1>
+  2559025254U, // <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS
+  2559026237U, // <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5>
+  2564998862U, // <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5>
+  2570971548U, // <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3>
+  2559028534U, // <4,3,5,4>: Cost 3 vext1 <1,4,3,5>, RHS
+  4163519477U, // <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5>
+  3309390346U, // <4,3,5,6>: Cost 4 vrev <3,4,6,5>
+  2706139747U, // <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4>
+  2559031086U, // <4,3,5,u>: Cost 3 vext1 <1,4,3,5>, LHS
+  2559033446U, // <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS
+  2559034430U, // <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6>
+  2565007127U, // <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, <2,4,3,6>
+  2570979740U, // <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3>
+  2559036726U, // <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS
+  1161841154U, // <4,3,6,5>: Cost 2 vrev <3,4,5,6>
+  4028203932U, // <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6>
+  2706803380U, // <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4>
+  1162062365U, // <4,3,6,u>: Cost 2 vrev <3,4,u,6>
+  3769633475U, // <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1>
+  3769633488U, // <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5>
+  3638757144U, // <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7>
+  3769633508U, // <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7>
+  3769633515U, // <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5>
+  3769633526U, // <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7>
+  3662647932U, // <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7>
+  3781208837U, // <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4>
+  3769633547U, // <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1>
+  2559049830U, // <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS
+  2691910430U, // <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2>
+  2565023513U, // <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u>
+  2707835698U, // <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4>
+  2559053110U, // <4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS
+  1161857540U, // <4,3,u,5>: Cost 2 vrev <3,4,5,u>
+  2235673101U, // <4,3,u,6>: Cost 3 vrev <3,4,6,u>
+  2708130646U, // <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4>
+  1162078751U, // <4,3,u,u>: Cost 2 vrev <3,4,u,u>
+  2617573416U, // <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4>
+  1570373734U, // <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS
+  2779676774U, // <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS
+  3760196480U, // <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1>
+  2576977100U, // <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0>
+  2718747538U, // <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1>
+  2718747548U, // <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2>
+  3668637015U, // <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0>
+  1570374301U, // <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS
+  2644116214U, // <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2>
+  2644116276U, // <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1>
+  2691910602U, // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3>
+  2644116440U, // <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3>
+  2711227356U, // <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3>
+  2709310438U, // <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4>
+  3765652462U, // <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3>
+  3768970231U, // <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3>
+  2695891968U, // <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3>
+  3703260634U, // <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4>
+  3765652499U, // <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4>
+  2644117096U, // <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2>
+  2631509709U, // <4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4>
+  2644117269U, // <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4>
+  3705251698U, // <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7>
+  2710047808U, // <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4>
+  3783863369U, // <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4>
+  2634827874U, // <4,4,2,u>: Cost 3 vext2 <2,u,4,4>, <2,u,4,4>
+  2644117654U, // <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2>
+  3638797210U, // <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4>
+  3638798082U, // <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3>
+  2637482406U, // <4,4,3,3>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
+  2638146039U, // <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4>
+  3913287374U, // <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5>
+  3765652625U, // <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, <4,3,6,4>
+  3713878762U, // <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4>
+  2637482406U, // <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
+  1503264870U, // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS
+  2577007514U, // <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4>
+  2577008232U, // <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2>
+  2571037175U, // <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4>
+  161926454U, // <4,4,4,4>: Cost 1 vdup0 RHS
+  1570377014U, // <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS
+  2779680054U, // <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS
+  2594927963U, // <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
+  161926454U, // <4,4,4,u>: Cost 1 vdup0 RHS
+  2571042918U, // <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS
+  2571043738U, // <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4>
+  3638814495U, // <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5>
+  2571045368U, // <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5>
+  2571046198U, // <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS
+  1839648054U, // <4,4,5,5>: Cost 2 vzipl RHS, RHS
+  1618169142U, // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  2594936156U, // <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5>
+  1618169160U, // <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
+  2553135206U, // <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS
+  3626877686U, // <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2>
+  2565080782U, // <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5>
+  2571053561U, // <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6>
+  2553138486U, // <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS
+  2241555675U, // <4,4,6,5>: Cost 3 vrev <4,4,5,6>
+  1973865782U, // <4,4,6,6>: Cost 2 vtrnl RHS, RHS
+  2658055029U, // <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4>
+  1973865800U, // <4,4,6,u>: Cost 2 vtrnl RHS, RHS
+  2644120570U, // <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2>
+  3638829978U, // <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4>
+  3638830881U, // <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7>
+  3735115018U, // <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4>
+  2662036827U, // <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
+  2713292236U, // <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4>
+  2713365973U, // <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4>
+  2644121196U, // <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7>
+  2662036827U, // <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
+  1503297638U, // <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS
+  1570379566U, // <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS
+  2779682606U, // <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS
+  2571069947U, // <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u>
+  161926454U, // <4,4,u,4>: Cost 1 vdup0 RHS
+  1841638710U, // <4,4,u,5>: Cost 2 vzipl RHS, RHS
+  1618169385U, // <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  2594960735U, // <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u>
+  161926454U, // <4,4,u,u>: Cost 1 vdup0 RHS
+  2631516160U, // <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0>
+  1557774438U, // <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS
+  2618908875U, // <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5>
+  2571078140U, // <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0>
+  2626871634U, // <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5>
+  3705258414U, // <4,5,0,5>: Cost 4 vext2 <2,3,4,5>, <0,5,2,7>
+  2594968438U, // <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5>
+  2594968928U, // <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0>
+  1557775005U, // <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS
+  2631516918U, // <4,5,1,0>: Cost 3 vext2 <2,3,4,5>, <1,0,3,2>
+  2624217939U, // <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5>
+  2631517078U, // <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0>
+  2821341286U, // <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS
+  3895086054U, // <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4>
+  2626872471U, // <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5>
+  3895083131U, // <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6>
+  2718748368U, // <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3>
+  2821341291U, // <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS
+  2571092070U, // <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS
+  3699287585U, // <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3>
+  2630854269U, // <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5>
+  1557776078U, // <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5>
+  2631517974U, // <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5>
+  3692652384U, // <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7>
+  2631518138U, // <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7>
+  4164013366U, // <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS
+  1561094243U, // <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5>
+  2631518358U, // <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2>
+  3895084710U, // <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1>
+  2631518540U, // <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4>
+  2631518620U, // <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3>
+  2631518716U, // <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, <3,4,5,0>
+  2631518784U, // <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5>
+  2658060980U, // <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4>
+  2640145131U, // <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5>
+  2631519006U, // <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2>
+  2571108454U, // <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS
+  3632907342U, // <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4>
+  2571110094U, // <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5>
+  2571110912U, // <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4>
+  2571111734U, // <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS
+  1557777718U, // <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS
+  2645454195U, // <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5>
+  2718748614U, // <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6>
+  1557777961U, // <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS
+  1503346790U, // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS
+  2913398480U, // <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3>
+  2631519998U, // <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4>
+  2577090710U, // <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2>
+  1503349978U, // <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5>
+  2631520260U, // <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5>
+  2913390690U, // <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0>
+  2821344566U, // <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS
+  1503352622U, // <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS
+  1497383014U, // <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS
+  2559181904U, // <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6>
+  2565154601U, // <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6>
+  1497385474U, // <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6>
+  1497386294U, // <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS
+  3047608324U, // <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5>
+  2571129656U, // <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6>
+  27705344U, // <4,5,6,7>: Cost 0 copy RHS
+  27705344U, // <4,5,6,u>: Cost 0 copy RHS
+  2565161062U, // <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS
+  2565161882U, // <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4>
+  2565162794U, // <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7>
+  2661381387U, // <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5>
+  2565164342U, // <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS
+  2718748840U, // <4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7>
+  2718748846U, // <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4>
+  2719412407U, // <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4>
+  2565166894U, // <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS
+  1497399398U, // <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS
+  1557780270U, // <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS
+  2631522181U, // <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, <u,2,3,0>
+  1497401860U, // <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u>
+  1497402678U, // <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS
+  1557780634U, // <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS
+  2631522512U, // <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, <u,6,3,7>
+  27705344U, // <4,5,u,7>: Cost 0 copy RHS
+  27705344U, // <4,5,u,u>: Cost 0 copy RHS
+  2618916864U, // <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0>
+  1545175142U, // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS
+  1545175244U, // <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6>
+  3692658940U, // <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0>
+  2618917202U, // <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5>
+  3852910806U, // <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7>
+  2253525648U, // <4,6,0,6>: Cost 3 vrev <6,4,6,0>
+  4040764726U, // <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS
+  1545175709U, // <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS
+  2618917622U, // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2>
+  2618917684U, // <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1>
+  2618917782U, // <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0>
+  2618917848U, // <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3>
+  3692659773U, // <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5>
+  2618918032U, // <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7>
+  3692659937U, // <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7>
+  4032146742U, // <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS
+  2618918253U, // <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3>
+  2618918380U, // <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4>
+  2618918460U, // <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3>
+  2618918504U, // <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2>
+  2618918566U, // <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1>
+  2618918679U, // <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6>
+  2618918788U, // <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7>
+  2618918842U, // <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7>
+  2718749178U, // <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3>
+  2618918971U, // <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1>
+  2618919062U, // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2>
+  2636171526U, // <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6>
+  3692661057U, // <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2>
+  2618919324U, // <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3>
+  2618919426U, // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6>
+  2638826058U, // <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6>
+  3913303030U, // <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6>
+  2722730572U, // <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4>
+  2618919710U, // <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2>
+  2565210214U, // <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS
+  2718749286U, // <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3>
+  2565211952U, // <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4>
+  2571184649U, // <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4>
+  2565213494U, // <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS
+  1545178422U, // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS
+  1705430326U, // <4,6,4,6>: Cost 2 vuzpl RHS, RHS
+  2595075437U, // <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4>
+  1545178665U, // <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS
+  2565218406U, // <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS
+  2645462736U, // <4,6,5,1>: Cost 3 vext2 <4,6,4,6>, <5,1,7,3>
+  2913399290U, // <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3>
+  3913305394U, // <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3>
+  2645462982U, // <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6>
+  2779172868U, // <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5>
+  2913391416U, // <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6>
+  2821426486U, // <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS
+  2821426487U, // <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS
+  1503428710U, // <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS
+  2577171190U, // <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2>
+  2645463546U, // <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3>
+  2577172630U, // <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2>
+  1503431908U, // <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6>
+  2253501069U, // <4,6,6,5>: Cost 3 vrev <6,4,5,6>
+  2618921784U, // <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6>
+  2954464566U, // <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS
+  1503434542U, // <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS
+  2645464058U, // <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2>
+  2779173882U, // <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2>
+  3638978355U, // <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7>
+  2725090156U, // <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4>
+  2645464422U, // <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6>
+  2779174246U, // <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
+  3852915914U, // <4,6,7,6>: Cost 4 vuzpl RHS, <7,2,6,3>
+  2779174508U, // <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
+  2779173945U, // <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2>
+  1503445094U, // <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS
+  1545180974U, // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS
+  1705432878U, // <4,6,u,2>: Cost 2 vuzpl RHS, LHS
+  2618922940U, // <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, <u,3,0,1>
+  1503448294U, // <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u>
+  1545181338U, // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS
+  1705433242U, // <4,6,u,6>: Cost 2 vuzpl RHS, RHS
+  2954480950U, // <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS
+  1545181541U, // <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS
+  3706601472U, // <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0>
+  2632859750U, // <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS
+  2726343685U, // <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4>
+  3701293312U, // <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4>
+  3706601810U, // <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5>
+  2259424608U, // <4,7,0,5>: Cost 3 vrev <7,4,5,0>
+  3695321617U, // <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7>
+  3800454194U, // <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4>
+  2632860317U, // <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS
+  2259064116U, // <4,7,1,0>: Cost 3 vrev <7,4,0,1>
+  3700630324U, // <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1>
+  2632860570U, // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4>
+  3769635936U, // <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5>
+  3656920374U, // <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS
+  3700630681U, // <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7>
+  3701294314U, // <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7>
+  3793818754U, // <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3>
+  2259654012U, // <4,7,1,u>: Cost 3 vrev <7,4,u,1>
+  3656925286U, // <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS
+  3706603050U, // <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3>
+  3706603112U, // <4,7,2,2>: Cost 4 vext2 <2,5,4,7>, <2,2,2,2>
+  2727744688U, // <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4>
+  3705939745U, // <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7>
+  2632861554U, // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7>
+  3706603450U, // <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7>
+  3792491731U, // <4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3>
+  2634852453U, // <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7>
+  3706603670U, // <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2>
+  3662906266U, // <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4>
+  3725183326U, // <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4>
+  3706603932U, // <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3>
+  3701295618U, // <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6>
+  2638834251U, // <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7>
+  2639497884U, // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7>
+  3802445093U, // <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4>
+  2640825150U, // <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7>
+  2718750004U, // <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1>
+  3706604490U, // <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3>
+  3656943474U, // <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7>
+  3779884371U, // <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5>
+  2259383643U, // <4,7,4,4>: Cost 3 vrev <7,4,4,4>
+  2632863030U, // <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS
+  2259531117U, // <4,7,4,6>: Cost 3 vrev <7,4,6,4>
+  3907340074U, // <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7>
+  2632863273U, // <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS
+  2913391610U, // <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2>
+  3645006848U, // <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7>
+  2589181646U, // <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, <2,3,4,5>
+  3645008403U, // <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5>
+  2913391974U, // <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6>
+  2583211973U, // <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5>
+  2589184670U, // <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5>
+  2913392236U, // <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7>
+  2913392258U, // <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2>
+  1509474406U, // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS
+  3047609338U, // <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2>
+  2583217768U, // <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2>
+  2583218326U, // <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2>
+  1509477686U, // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS
+  1509478342U, // <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6>
+  2583220730U, // <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3>
+  3047609964U, // <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7>
+  1509480238U, // <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS
+  3650994278U, // <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS
+  3650995098U, // <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4>
+  3650996010U, // <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7>
+  3804804677U, // <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4>
+  3650997486U, // <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7>
+  2662725039U, // <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7>
+  3662942880U, // <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7>
+  2718750316U, // <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7>
+  2664715938U, // <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7>
+  1509490790U, // <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS
+  2632865582U, // <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS
+  2583234152U, // <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2>
+  2583234710U, // <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2>
+  1509494070U, // <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS
+  1509494728U, // <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u>
+  2583237114U, // <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, <6,2,7,3>
+  3047757420U, // <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7>
+  1509496622U, // <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS
+  2618933248U, // <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0>
+  1545191526U, // <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS
+  1545191630U, // <4,u,0,2>: Cost 2 vext2 <0,2,4,u>, <0,2,4,u>
+  2691913445U, // <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, <u,0,3,2>
+  2618933586U, // <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5>
+  2265397305U, // <4,u,0,5>: Cost 3 vrev <u,4,5,0>
+  2595189625U, // <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u>
+  2595190139U, // <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0>
+  1545192093U, // <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS
+  2618934006U, // <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2>
+  2618934068U, // <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1>
+  1618171694U, // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  2618934232U, // <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3>
+  2695894848U, // <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, <u,1,4,3>
+  2618934416U, // <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7>
+  3692676321U, // <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7>
+  2718750555U, // <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, <u,1,7,3>
+  1618171748U, // <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
+  2553397350U, // <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS
+  2630215215U, // <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u>
+  2618934888U, // <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2>
+  1557800657U, // <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u>
+  2618935065U, // <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u>
+  2733864859U, // <4,u,2,5>: Cost 3 vext3 <u,2,5,4>, <u,2,5,4>
+  2618935226U, // <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7>
+  2718750636U, // <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, <u,2,7,3>
+  1561118822U, // <4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u>
+  2618935446U, // <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2>
+  2779318422U, // <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2>
+  2636851545U, // <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u>
+  2618935708U, // <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3>
+  2618935810U, // <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6>
+  2691913711U, // <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, <u,3,5,7>
+  2588725862U, // <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+  2640169710U, // <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u>
+  2618936094U, // <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2>
+  1503559782U, // <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS
+  2692282391U, // <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, <u,4,1,2>
+  2565359426U, // <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4>
+  2571332123U, // <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4>
+  161926454U, // <4,u,4,4>: Cost 1 vdup0 RHS
+  1545194806U, // <4,u,4,5>: Cost 2 vext2 <0,2,4,u>, RHS
+  1705577782U, // <4,u,4,6>: Cost 2 vuzpl RHS, RHS
+  2718750801U, // <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, <u,4,7,6>
+  161926454U, // <4,u,4,u>: Cost 1 vdup0 RHS
+  1479164006U, // <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
+  1839650606U, // <4,u,5,1>: Cost 2 vzipl RHS, LHS
+  2565367502U, // <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5>
+  3089777309U, // <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS
+  1479167286U, // <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
+  1839650970U, // <4,u,5,5>: Cost 2 vzipl RHS, RHS
+  1618172058U, // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  3089780265U, // <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS
+  1618172076U, // <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
+  1479688294U, // <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS
+  2553430774U, // <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2>
+  1973868334U, // <4,u,6,2>: Cost 2 vtrnl RHS, LHS
+  1497606685U, // <4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6>
+  1479691574U, // <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS
+  1509552079U, // <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6>
+  1973868698U, // <4,u,6,6>: Cost 2 vtrnl RHS, RHS
+  27705344U, // <4,u,6,7>: Cost 0 copy RHS
+  27705344U, // <4,u,6,u>: Cost 0 copy RHS
+  2565382246U, // <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS
+  2565383066U, // <4,u,7,1>: Cost 3 vext1 <2,4,u,7>, <1,2,3,4>
+  2565384005U, // <4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7>
+  2661405966U, // <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u>
+  2565385526U, // <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS
+  2779321702U, // <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
+  2589274793U, // <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7>
+  2779321964U, // <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
+  2565388078U, // <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS
+  1479704678U, // <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS
+  1545197358U, // <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS
+  1618172261U, // <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  1497623071U, // <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u>
+  161926454U, // <4,u,u,4>: Cost 1 vdup0 RHS
+  1545197722U, // <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS
+  1618172301U, // <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  27705344U, // <4,u,u,7>: Cost 0 copy RHS
+  27705344U, // <4,u,u,u>: Cost 0 copy RHS
+  2687123456U, // <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0>
+  2687123466U, // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1>
+  2687123476U, // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2>
+  3710599434U, // <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5>
+  2642166098U, // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5>
+  3657060306U, // <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0>
+  3292094923U, // <5,0,0,6>: Cost 4 vrev <0,5,6,0>
+  3669005700U, // <5,0,0,7>: Cost 4 vext1 <7,5,0,0>, <7,5,0,0>
+  2687123530U, // <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2>
+  2559434854U, // <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS
+  2559435887U, // <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1>
+  1613381734U, // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  3698656256U, // <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7>
+  2559438134U, // <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS
+  2583326675U, // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1>
+  3715908851U, // <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7>
+  3657069562U, // <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2>
+  1613381788U, // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
+  2686017700U, // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2>
+  2685796528U, // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
+  2698625208U, // <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4>
+  2685944002U, // <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5>
+  2686017739U, // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5>
+  2686091476U, // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5>
+  2725167324U, // <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4>
+  2595280230U, // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
+  2686312687U, // <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5>
+  3760128248U, // <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5>
+  3759685888U, // <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4>
+  2686533898U, // <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5>
+  3760349459U, // <5,0,3,3>: Cost 4 vext3 <0,3,3,5>, <0,3,3,5>
+  2638187004U, // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0>
+  3776348452U, // <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4>
+  3713256094U, // <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0>
+  3914064896U, // <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7>
+  2686976320U, // <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5>
+  2559459430U, // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS
+  1613381970U, // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
+  2687123804U, // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6>
+  3761013092U, // <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5>
+  2559462710U, // <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS
+  2638187830U, // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS
+  3761234303U, // <5,0,4,6>: Cost 4 vext3 <0,4,6,5>, <0,4,6,5>
+  2646150600U, // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
+  1613381970U, // <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
+  3766763926U, // <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1>
+  2919268454U, // <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS
+  3053486182U, // <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS
+  3723210589U, // <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0>
+  3766763966U, // <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5>
+  2650796031U, // <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0>
+  3719893090U, // <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0>
+  3914067254U, // <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS
+  2919269021U, // <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS
+  4047519744U, // <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0>
+  2920038502U, // <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
+  3759759871U, // <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7>
+  3645164070U, // <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6>
+  3762414095U, // <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5>
+  3993780690U, // <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7>
+  3719893816U, // <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6>
+  2662077302U, // <5,0,6,7>: Cost 3 vext2 <7,4,5,0>, <6,7,4,5>
+  2920039069U, // <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS
+  2565455974U, // <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS
+  2565456790U, // <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0>
+  2565457742U, // <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7>
+  3639199894U, // <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2>
+  2565459254U, // <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS
+  2589347938U, // <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0>
+  2589348530U, // <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7>
+  4188456422U, // <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7>
+  2565461806U, // <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS
+  2687124106U, // <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2>
+  1616036502U, // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5>
+  1613382301U, // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  2689925800U, // <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5>
+  2687124146U, // <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6>
+  2638190746U, // <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS
+  2589356723U, // <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u>
+  2595280230U, // <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
+  1613382355U, // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS
+  2646818816U, // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0>
+  1573077094U, // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS
+  2646818980U, // <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2>
+  2687124214U, // <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2>
+  2641510738U, // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5>
+  2641510814U, // <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0>
+  3720561142U, // <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7>
+  3298141357U, // <5,1,0,7>: Cost 4 vrev <1,5,7,0>
+  1573077661U, // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS
+  2223891567U, // <5,1,1,0>: Cost 3 vrev <1,5,0,1>
+  2687124276U, // <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1>
+  2646819734U, // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0>
+  2687124296U, // <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3>
+  2691326803U, // <5,1,1,4>: Cost 3 vext3 <1,1,4,5>, <1,1,4,5>
+  2691400540U, // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5>
+  3765216101U, // <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5>
+  3765289838U, // <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5>
+  2687124341U, // <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3>
+  3297641584U, // <5,1,2,0>: Cost 4 vrev <1,5,0,2>
+  3763520391U, // <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3>
+  2646820456U, // <5,1,2,2>: Cost 3 vext2 <4,u,5,1>, <2,2,2,2>
+  2687124374U, // <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0>
+  2691990436U, // <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5>
+  2687124395U, // <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3>
+  2646820794U, // <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7>
+  3808199610U, // <5,1,2,7>: Cost 4 vext3 <u,3,4,5>, <1,2,7,0>
+  2687124419U, // <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0>
+  2577440870U, // <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS
+  2687124440U, // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3>
+  3759686627U, // <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5>
+  2692580332U, // <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5>
+  2687124469U, // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5>
+  2685207552U, // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7>
+  3760866313U, // <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7>
+  2692875280U, // <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5>
+  2687124503U, // <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3>
+  1567771538U, // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1>
+  2693096491U, // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5>
+  2693170228U, // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5>
+  2687124541U, // <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5>
+  2646822096U, // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4>
+  1573080374U, // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS
+  2646822260U, // <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6>
+  3298174129U, // <5,1,4,7>: Cost 4 vrev <1,5,7,4>
+  1573080602U, // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1>
+  2687124591U, // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1>
+  2646822543U, // <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1>
+  3760866433U, // <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1>
+  2687124624U, // <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7>
+  2687124631U, // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5>
+  2646822916U, // <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5>
+  2646823010U, // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0>
+  2646823080U, // <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7>
+  2687124663U, // <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1>
+  2553577574U, // <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS
+  3763520719U, // <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7>
+  2646823418U, // <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3>
+  3760866529U, // <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7>
+  2553580854U, // <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS
+  2687124723U, // <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7>
+  2646823736U, // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6>
+  2646823758U, // <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1>
+  2646823839U, // <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1>
+  2559557734U, // <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS
+  2559558452U, // <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1>
+  2571503270U, // <5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1>
+  2040971366U, // <5,1,7,3>: Cost 2 vtrnr RHS, LHS
+  2559561014U, // <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS
+  2595393232U, // <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3>
+  4188455035U, // <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6>
+  2646824556U, // <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7>
+  2040971371U, // <5,1,7,u>: Cost 2 vtrnr RHS, LHS
+  1591662326U, // <5,1,u,0>: Cost 2 vext2 <u,0,5,1>, <u,0,5,1>
+  1573082926U, // <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS
+  2695824760U, // <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5>
+  2040979558U, // <5,1,u,3>: Cost 2 vtrnr RHS, LHS
+  2687124874U, // <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5>
+  1573083290U, // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS
+  2646825168U, // <5,1,u,6>: Cost 3 vext2 <4,u,5,1>, <u,6,3,7>
+  2646825216U, // <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, <u,7,0,1>
+  2040979563U, // <5,1,u,u>: Cost 2 vtrnr RHS, LHS
+  3702652928U, // <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0>
+  2628911206U, // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS
+  2641518756U, // <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2>
+  3759760847U, // <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2>
+  3760866775U, // <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1>
+  3759539680U, // <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1>
+  3760866796U, // <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4>
+  3304114054U, // <5,2,0,7>: Cost 4 vrev <2,5,7,0>
+  2628911773U, // <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS
+  2623603464U, // <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2>
+  3698008921U, // <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2>
+  3633325603U, // <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5>
+  2687125027U, // <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5>
+  3633327414U, // <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS
+  3759539760U, // <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0>
+  3760866876U, // <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3>
+  3304122247U, // <5,2,1,7>: Cost 4 vrev <2,5,7,1>
+  2687125072U, // <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5>
+  3633332326U, // <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS
+  3759760992U, // <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3>
+  2687125096U, // <5,2,2,2>: Cost 3 vext3 <0,4,1,5>, <2,2,2,2>
+  2687125106U, // <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3>
+  2697963133U, // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5>
+  3759466120U, // <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7>
+  3760866960U, // <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6>
+  3771926168U, // <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5>
+  2687125151U, // <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3>
+  2687125158U, // <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1>
+  2698405555U, // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5>
+  2577516238U, // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5>
+  3759687365U, // <5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5>
+  1624884942U, // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5>
+  2698700503U, // <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5>
+  3772368608U, // <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5>
+  3702655716U, // <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7>
+  1625179890U, // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5>
+  2641521555U, // <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2>
+  3772368642U, // <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3>
+  2699142925U, // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5>
+  2698626838U, // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5>
+  2698626848U, // <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6>
+  2628914486U, // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS
+  2645503353U, // <5,2,4,6>: Cost 3 vext2 <4,6,5,2>, <4,6,5,2>
+  3304146826U, // <5,2,4,7>: Cost 4 vrev <2,5,7,4>
+  2628914729U, // <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS
+  2553643110U, // <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS
+  3758950227U, // <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3>
+  3759761248U, // <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7>
+  2982396006U, // <5,2,5,3>: Cost 3 vzipr <4,u,5,5>, LHS
+  2553646390U, // <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS
+  2553647108U, // <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5>
+  3760867204U, // <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7>
+  3702657141U, // <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1>
+  2982396011U, // <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS
+  3627393126U, // <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS
+  3760867236U, // <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, <2,6,1,3>
+  2645504506U, // <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3>
+  2687125434U, // <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7>
+  2700617665U, // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5>
+  3760867276U, // <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7>
+  3763521493U, // <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7>
+  3719246670U, // <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1>
+  2687125479U, // <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, <2,6,u,7>
+  2565603430U, // <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS
+  2553660150U, // <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2>
+  2565605216U, // <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7>
+  2961178726U, // <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS
+  2565606710U, // <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS
+  4034920552U, // <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5>
+  3114713292U, // <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6>
+  3702658668U, // <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7>
+  2961178731U, // <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS
+  2687125563U, // <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1>
+  2628917038U, // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS
+  2565613409U, // <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u>
+  2687125592U, // <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3>
+  1628203107U, // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5>
+  2628917402U, // <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS
+  2702092405U, // <5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5>
+  3304179598U, // <5,2,u,7>: Cost 4 vrev <2,5,7,u>
+  1628498055U, // <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5>
+  3760867467U, // <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0>
+  2687125654U, // <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2>
+  3759761565U, // <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0>
+  3633391766U, // <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2>
+  2687125680U, // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1>
+  3760277690U, // <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2>
+  3310013014U, // <5,3,0,6>: Cost 4 vrev <3,5,6,0>
+  2236344927U, // <5,3,0,7>: Cost 3 vrev <3,5,7,0>
+  2687125717U, // <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2>
+  3760867551U, // <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3>
+  3760867558U, // <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1>
+  2624938923U, // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3>
+  2703198460U, // <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5>
+  3760867587U, // <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3>
+  2636219536U, // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7>
+  3698681075U, // <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7>
+  2703493408U, // <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5>
+  2628920721U, // <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3>
+  3766765870U, // <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1>
+  3698681379U, // <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5>
+  3760867649U, // <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2>
+  2698627404U, // <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4>
+  2703935830U, // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5>
+  2698627422U, // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4>
+  3760867686U, // <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3>
+  3769788783U, // <5,3,2,7>: Cost 4 vext3 <1,u,5,5>, <3,2,7,3>
+  2701945209U, // <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4>
+  3760867711U, // <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1>
+  2636220684U, // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3>
+  3772369298U, // <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2>
+  2687125916U, // <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3>
+  2704599463U, // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5>
+  2704673200U, // <5,3,3,5>: Cost 3 vext3 <3,3,5,5>, <3,3,5,5>
+  3709962935U, // <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7>
+  3772369346U, // <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5>
+  2704894411U, // <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5>
+  2704968148U, // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5>
+  3698682850U, // <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0>
+  2642857014U, // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3>
+  2705189359U, // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5>
+  2705263096U, // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5>
+  2685946370U, // <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6>
+  3779152394U, // <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5>
+  2236377699U, // <5,3,4,7>: Cost 3 vrev <3,5,7,4>
+  2687126045U, // <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6>
+  2571632742U, // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS
+  2559689870U, // <5,3,5,1>: Cost 3 vext1 <1,5,3,5>, <1,5,3,5>
+  2571634382U, // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5>
+  2571635264U, // <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5>
+  2571636022U, // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS
+  2559692804U, // <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5>
+  3720581218U, // <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0>
+  2236385892U, // <5,3,5,7>: Cost 3 vrev <3,5,7,5>
+  2571638574U, // <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS
+  2565668966U, // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS
+  3633439887U, // <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6>
+  2565670760U, // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6>
+  2565671426U, // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6>
+  2565672246U, // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS
+  3639414630U, // <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0>
+  4047521640U, // <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6>
+  2725169844U, // <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4>
+  2565674798U, // <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS
+  1485963366U, // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS
+  1485964432U, // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7>
+  2559706728U, // <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2>
+  2559707286U, // <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2>
+  1485966646U, // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS
+  2559708880U, // <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3>
+  2601513466U, // <5,3,7,6>: Cost 3 vext1 <u,5,3,7>, <6,2,7,3>
+  3114714112U, // <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7>
+  1485969198U, // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS
+  1485971558U, // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS
+  1485972625U, // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u>
+  2559714920U, // <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2>
+  2559715478U, // <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2>
+  1485974838U, // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS
+  2687126342U, // <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, <3,u,5,6>
+  2601521658U, // <5,3,u,6>: Cost 3 vext1 <u,5,3,u>, <6,2,7,3>
+  2236410471U, // <5,3,u,7>: Cost 3 vrev <3,5,7,u>
+  1485977390U, // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS
+  3627491430U, // <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS
+  2636890214U, // <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS
+  3703333028U, // <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, <0,2,0,2>
+  3782249348U, // <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5>
+  2642198866U, // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5>
+  2687126418U, // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1>
+  2242243887U, // <5,4,0,6>: Cost 3 vrev <4,5,6,0>
+  3316059448U, // <5,4,0,7>: Cost 4 vrev <4,5,7,0>
+  2636890781U, // <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS
+  2241809658U, // <5,4,1,0>: Cost 3 vrev <4,5,0,1>
+  3698025307U, // <5,4,1,1>: Cost 4 vext2 <1,1,5,4>, <1,1,5,4>
+  3698688940U, // <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4>
+  3698689024U, // <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7>
+  3700016206U, // <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4>
+  2687126498U, // <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0>
+  3760868336U, // <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5>
+  3316067641U, // <5,4,1,7>: Cost 4 vrev <4,5,7,1>
+  2242399554U, // <5,4,1,u>: Cost 3 vrev <4,5,u,1>
+  3703334371U, // <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4>
+  3703998004U, // <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4>
+  3704661637U, // <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4>
+  2636891854U, // <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5>
+  3705988903U, // <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4>
+  2698628150U, // <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3>
+  3760868415U, // <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3>
+  3783871562U, // <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5>
+  2666752099U, // <5,4,2,u>: Cost 3 vext2 <u,2,5,4>, <2,u,4,5>
+  3639459942U, // <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS
+  3709970701U, // <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4>
+  2636892510U, // <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4>
+  3710634396U, // <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3>
+  2638219776U, // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4>
+  3766987908U, // <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0>
+  2710719634U, // <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5>
+  3914097664U, // <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7>
+  2640874308U, // <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4>
+  2583642214U, // <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS
+  2642201574U, // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4>
+  3710635062U, // <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3>
+  3717270664U, // <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4>
+  2713963728U, // <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4>
+  1637567706U, // <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5>
+  2242276659U, // <5,4,4,6>: Cost 3 vrev <4,5,6,4>
+  2646183372U, // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4>
+  1637788917U, // <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5>
+  2559762534U, // <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS
+  2559763607U, // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5>
+  2698628366U, // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3>
+  3633506454U, // <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2>
+  2559765814U, // <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS
+  2583654395U, // <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5>
+  1613385014U, // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
+  3901639990U, // <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS
+  1613385032U, // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS
+  2559770726U, // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS
+  2559771648U, // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,3,5,7>
+  3633514088U, // <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2>
+  2571717122U, // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6>
+  2559774006U, // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS
+  2712636796U, // <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5>
+  3760868743U, // <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7>
+  2712784270U, // <5,4,6,7>: Cost 3 vext3 <4,6,7,5>, <4,6,7,5>
+  2559776558U, // <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS
+  2565750886U, // <5,4,7,0>: Cost 3 vext1 <2,5,4,7>, LHS
+  2565751706U, // <5,4,7,1>: Cost 3 vext1 <2,5,4,7>, <1,2,3,4>
+  2565752690U, // <5,4,7,2>: Cost 3 vext1 <2,5,4,7>, <2,5,4,7>
+  2571725387U, // <5,4,7,3>: Cost 3 vext1 <3,5,4,7>, <3,5,4,7>
+  2565754166U, // <5,4,7,4>: Cost 3 vext1 <2,5,4,7>, RHS
+  3114713426U, // <5,4,7,5>: Cost 3 vtrnr RHS, <0,4,1,5>
+  94817590U, // <5,4,7,6>: Cost 1 vrev RHS
+  2595616175U, // <5,4,7,7>: Cost 3 vext1 <7,5,4,7>, <7,5,4,7>
+  94965064U, // <5,4,7,u>: Cost 1 vrev RHS
+  2559787110U, // <5,4,u,0>: Cost 3 vext1 <1,5,4,u>, LHS
+  2559788186U, // <5,4,u,1>: Cost 3 vext1 <1,5,4,u>, <1,5,4,u>
+  2242014483U, // <5,4,u,2>: Cost 3 vrev <4,5,2,u>
+  2667419628U, // <5,4,u,3>: Cost 3 vext2 <u,3,5,4>, <u,3,5,4>
+  2559790390U, // <5,4,u,4>: Cost 3 vext1 <1,5,4,u>, RHS
+  1640222238U, // <5,4,u,5>: Cost 2 vext3 <4,u,5,5>, <4,u,5,5>
+  94825783U, // <5,4,u,6>: Cost 1 vrev RHS
+  2714111536U, // <5,4,u,7>: Cost 3 vext3 <4,u,7,5>, <4,u,7,5>
+  94973257U, // <5,4,u,u>: Cost 1 vrev RHS
+  2646851584U, // <5,5,0,0>: Cost 3 vext2 <4,u,5,5>, <0,0,0,0>
+  1573109862U, // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS
+  2646851748U, // <5,5,0,2>: Cost 3 vext2 <4,u,5,5>, <0,2,0,2>
+  3760279130U, // <5,5,0,3>: Cost 4 vext3 <0,3,2,5>, <5,0,3,2>
+  2687127138U, // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1>
+  2248142847U, // <5,5,0,5>: Cost 3 vrev <5,5,5,0>
+  3720593910U, // <5,5,0,6>: Cost 4 vext2 <4,u,5,5>, <0,6,1,7>
+  4182502710U, // <5,5,0,7>: Cost 4 vtrnr <3,5,7,0>, RHS
+  1573110429U, // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS
+  2646852342U, // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2>
+  2624291676U, // <5,5,1,1>: Cost 3 vext2 <1,1,5,5>, <1,1,5,5>
+  2646852502U, // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0>
+  2646852568U, // <5,5,1,3>: Cost 3 vext2 <4,u,5,5>, <1,3,1,3>
+  2715217591U, // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5>
+  2628936848U, // <5,5,1,5>: Cost 3 vext2 <1,u,5,5>, <1,5,3,7>
+  3698033907U, // <5,5,1,6>: Cost 4 vext2 <1,1,5,5>, <1,6,5,7>
+  2713964240U, // <5,5,1,7>: Cost 3 vext3 <4,u,5,5>, <5,1,7,3>
+  2628937107U, // <5,5,1,u>: Cost 3 vext2 <1,u,5,5>, <1,u,5,5>
+  3645497446U, // <5,5,2,0>: Cost 4 vext1 <3,5,5,2>, LHS
+  3760869099U, // <5,5,2,1>: Cost 4 vext3 <0,4,1,5>, <5,2,1,3>
+  2646853224U, // <5,5,2,2>: Cost 3 vext2 <4,u,5,5>, <2,2,2,2>
+  2698628862U, // <5,5,2,3>: Cost 3 vext3 <2,3,4,5>, <5,2,3,4>
+  3772370694U, // <5,5,2,4>: Cost 4 vext3 <2,3,4,5>, <5,2,4,3>
+  2713964303U, // <5,5,2,5>: Cost 3 vext3 <4,u,5,5>, <5,2,5,3>
+  2646853562U, // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7>
+  4038198272U, // <5,5,2,7>: Cost 4 vzipr <1,u,5,2>, <1,3,5,7>
+  2701946667U, // <5,5,2,u>: Cost 3 vext3 <2,u,4,5>, <5,2,u,4>
+  2646853782U, // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2>
+  3698034922U, // <5,5,3,1>: Cost 4 vext2 <1,1,5,5>, <3,1,1,5>
+  3702679919U, // <5,5,3,2>: Cost 4 vext2 <1,u,5,5>, <3,2,7,3>
+  2637564336U, // <5,5,3,3>: Cost 3 vext2 <3,3,5,5>, <3,3,5,5>
+  2646854146U, // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6>
+  2638891602U, // <5,5,3,5>: Cost 3 vext2 <3,5,5,5>, <3,5,5,5>
+  3702680247U, // <5,5,3,6>: Cost 4 vext2 <1,u,5,5>, <3,6,7,7>
+  3702680259U, // <5,5,3,7>: Cost 4 vext2 <1,u,5,5>, <3,7,0,1>
+  2646854430U, // <5,5,3,u>: Cost 3 vext2 <4,u,5,5>, <3,u,1,2>
+  2646854546U, // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1>
+  2642209767U, // <5,5,4,1>: Cost 3 vext2 <4,1,5,5>, <4,1,5,5>
+  3711306806U, // <5,5,4,2>: Cost 4 vext2 <3,3,5,5>, <4,2,5,3>
+  3645516369U, // <5,5,4,3>: Cost 4 vext1 <3,5,5,4>, <3,5,5,4>
+  1570458842U, // <5,5,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
+  1573113142U, // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS
+  2645527932U, // <5,5,4,6>: Cost 3 vext2 <4,6,5,5>, <4,6,5,5>
+  2713964486U, // <5,5,4,7>: Cost 3 vext3 <4,u,5,5>, <5,4,7,6>
+  1573113374U, // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5>
+  1509982310U, // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
+  2646855376U, // <5,5,5,1>: Cost 3 vext2 <4,u,5,5>, <5,1,7,3>
+  2583725672U, // <5,5,5,2>: Cost 3 vext1 <5,5,5,5>, <2,2,2,2>
+  2583726230U, // <5,5,5,3>: Cost 3 vext1 <5,5,5,5>, <3,0,1,2>
+  1509985590U, // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
+  229035318U, // <5,5,5,5>: Cost 1 vdup1 RHS
+  2646855778U, // <5,5,5,6>: Cost 3 vext2 <4,u,5,5>, <5,6,7,0>
+  2646855848U, // <5,5,5,7>: Cost 3 vext2 <4,u,5,5>, <5,7,5,7>
+  229035318U, // <5,5,5,u>: Cost 1 vdup1 RHS
+  2577760358U, // <5,5,6,0>: Cost 3 vext1 <4,5,5,6>, LHS
+  3633587361U, // <5,5,6,1>: Cost 4 vext1 <1,5,5,6>, <1,5,5,6>
+  2646856186U, // <5,5,6,2>: Cost 3 vext2 <4,u,5,5>, <6,2,7,3>
+  3633588738U, // <5,5,6,3>: Cost 4 vext1 <1,5,5,6>, <3,4,5,6>
+  2718535756U, // <5,5,6,4>: Cost 3 vext3 <5,6,4,5>, <5,6,4,5>
+  2644202223U, // <5,5,6,5>: Cost 3 vext2 <4,4,5,5>, <6,5,7,5>
+  2973780482U, // <5,5,6,6>: Cost 3 vzipr <3,4,5,6>, <3,4,5,6>
+  2646856526U, // <5,5,6,7>: Cost 3 vext2 <4,u,5,5>, <6,7,0,1>
+  2646856607U, // <5,5,6,u>: Cost 3 vext2 <4,u,5,5>, <6,u,0,1>
+  2571796582U, // <5,5,7,0>: Cost 3 vext1 <3,5,5,7>, LHS
+  3633595392U, // <5,5,7,1>: Cost 4 vext1 <1,5,5,7>, <1,3,5,7>
+  2571798222U, // <5,5,7,2>: Cost 3 vext1 <3,5,5,7>, <2,3,4,5>
+  2571799124U, // <5,5,7,3>: Cost 3 vext1 <3,5,5,7>, <3,5,5,7>
+  2571799862U, // <5,5,7,4>: Cost 3 vext1 <3,5,5,7>, RHS
+  3114717188U, // <5,5,7,5>: Cost 3 vtrnr RHS, <5,5,5,5>
+  4034923010U, // <5,5,7,6>: Cost 4 vzipr <1,3,5,7>, <3,4,5,6>
+  2040974646U, // <5,5,7,7>: Cost 2 vtrnr RHS, RHS
+  2040974647U, // <5,5,7,u>: Cost 2 vtrnr RHS, RHS
+  1509982310U, // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS
+  1573115694U, // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS
+  2571806414U, // <5,5,u,2>: Cost 3 vext1 <3,5,5,u>, <2,3,4,5>
+  2571807317U, // <5,5,u,3>: Cost 3 vext1 <3,5,5,u>, <3,5,5,u>
+  1509985590U, // <5,5,u,4>: Cost 2 vext1 <5,5,5,5>, RHS
+  229035318U, // <5,5,u,5>: Cost 1 vdup1 RHS
+  2646857936U, // <5,5,u,6>: Cost 3 vext2 <4,u,5,5>, <u,6,3,7>
+  2040982838U, // <5,5,u,7>: Cost 2 vtrnr RHS, RHS
+  229035318U, // <5,5,u,u>: Cost 1 vdup1 RHS
+  2638233600U, // <5,6,0,0>: Cost 3 vext2 <3,4,5,6>, <0,0,0,0>
+  1564491878U, // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS
+  2632261796U, // <5,6,0,2>: Cost 3 vext2 <2,4,5,6>, <0,2,0,2>
+  2638233856U, // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4>
+  2638233938U, // <5,6,0,4>: Cost 3 vext2 <3,4,5,6>, <0,4,1,5>
+  3706003885U, // <5,6,0,5>: Cost 4 vext2 <2,4,5,6>, <0,5,2,6>
+  3706003967U, // <5,6,0,6>: Cost 4 vext2 <2,4,5,6>, <0,6,2,7>
+  4047473974U, // <5,6,0,7>: Cost 4 vzipr <3,4,5,0>, RHS
+  1564492445U, // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS
+  2638234358U, // <5,6,1,0>: Cost 3 vext2 <3,4,5,6>, <1,0,3,2>
+  2638234420U, // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1>
+  2638234518U, // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0>
+  2638234584U, // <5,6,1,3>: Cost 3 vext2 <3,4,5,6>, <1,3,1,3>
+  2626290768U, // <5,6,1,4>: Cost 3 vext2 <1,4,5,6>, <1,4,5,6>
+  2638234768U, // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7>
+  3700032719U, // <5,6,1,6>: Cost 4 vext2 <1,4,5,6>, <1,6,1,7>
+  2982366518U, // <5,6,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
+  2628945300U, // <5,6,1,u>: Cost 3 vext2 <1,u,5,6>, <1,u,5,6>
+  3706004925U, // <5,6,2,0>: Cost 4 vext2 <2,4,5,6>, <2,0,1,2>
+  3711976966U, // <5,6,2,1>: Cost 4 vext2 <3,4,5,6>, <2,1,0,3>
+  2638235240U, // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2>
+  2638235302U, // <5,6,2,3>: Cost 3 vext2 <3,4,5,6>, <2,3,0,1>
+  2632263465U, // <5,6,2,4>: Cost 3 vext2 <2,4,5,6>, <2,4,5,6>
+  2638235496U, // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6>
+  2638235578U, // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7>
+  2713965050U, // <5,6,2,7>: Cost 3 vext3 <4,u,5,5>, <6,2,7,3>
+  2634917997U, // <5,6,2,u>: Cost 3 vext2 <2,u,5,6>, <2,u,5,6>
+  2638235798U, // <5,6,3,0>: Cost 3 vext2 <3,4,5,6>, <3,0,1,2>
+  3711977695U, // <5,6,3,1>: Cost 4 vext2 <3,4,5,6>, <3,1,0,3>
+  3710650720U, // <5,6,3,2>: Cost 4 vext2 <3,2,5,6>, <3,2,5,6>
+  2638236060U, // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3>
+  1564494338U, // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6>
+  2638236234U, // <5,6,3,5>: Cost 3 vext2 <3,4,5,6>, <3,5,4,6>
+  3711978104U, // <5,6,3,6>: Cost 4 vext2 <3,4,5,6>, <3,6,0,7>
+  4034227510U, // <5,6,3,7>: Cost 4 vzipr <1,2,5,3>, RHS
+  1567148870U, // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6>
+  2577817702U, // <5,6,4,0>: Cost 3 vext1 <4,5,6,4>, LHS
+  3700034544U, // <5,6,4,1>: Cost 4 vext2 <1,4,5,6>, <4,1,6,5>
+  2723033713U, // <5,6,4,2>: Cost 3 vext3 <6,4,2,5>, <6,4,2,5>
+  2638236818U, // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5>
+  2644208859U, // <5,6,4,4>: Cost 3 vext2 <4,4,5,6>, <4,4,5,6>
+  1564495158U, // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS
+  2645536125U, // <5,6,4,6>: Cost 3 vext2 <4,6,5,6>, <4,6,5,6>
+  2723402398U, // <5,6,4,7>: Cost 3 vext3 <6,4,7,5>, <6,4,7,5>
+  1564495401U, // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS
+  2577825894U, // <5,6,5,0>: Cost 3 vext1 <4,5,6,5>, LHS
+  2662125264U, // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3>
+  3775836867U, // <5,6,5,2>: Cost 4 vext3 <2,u,6,5>, <6,5,2,6>
+  3711979343U, // <5,6,5,3>: Cost 4 vext2 <3,4,5,6>, <5,3,3,4>
+  2650181556U, // <5,6,5,4>: Cost 3 vext2 <5,4,5,6>, <5,4,5,6>
+  2662125572U, // <5,6,5,5>: Cost 3 vext2 <7,4,5,6>, <5,5,5,5>
+  2638237732U, // <5,6,5,6>: Cost 3 vext2 <3,4,5,6>, <5,6,0,1>
+  2982399286U, // <5,6,5,7>: Cost 3 vzipr <4,u,5,5>, RHS
+  2982399287U, // <5,6,5,u>: Cost 3 vzipr <4,u,5,5>, RHS
+  2583806054U, // <5,6,6,0>: Cost 3 vext1 <5,5,6,6>, LHS
+  3711979910U, // <5,6,6,1>: Cost 4 vext2 <3,4,5,6>, <6,1,3,4>
+  2662126074U, // <5,6,6,2>: Cost 3 vext2 <7,4,5,6>, <6,2,7,3>
+  2583808514U, // <5,6,6,3>: Cost 3 vext1 <5,5,6,6>, <3,4,5,6>
+  2583809334U, // <5,6,6,4>: Cost 3 vext1 <5,5,6,6>, RHS
+  2583810062U, // <5,6,6,5>: Cost 3 vext1 <5,5,6,6>, <5,5,6,6>
+  2638238520U, // <5,6,6,6>: Cost 3 vext2 <3,4,5,6>, <6,6,6,6>
+  2973781302U, // <5,6,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
+  2973781303U, // <5,6,6,u>: Cost 3 vzipr <3,4,5,6>, RHS
+  430358630U, // <5,6,7,0>: Cost 1 vext1 RHS, LHS
+  1504101110U, // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
+  1504101992U, // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1504102550U, // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
+  430361910U, // <5,6,7,4>: Cost 1 vext1 RHS, RHS
+  1504104390U, // <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6>
+  1504105272U, // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6>
+  1504106092U, // <5,6,7,7>: Cost 2 vext1 RHS, <7,7,7,7>
+  430364462U, // <5,6,7,u>: Cost 1 vext1 RHS, LHS
+  430366822U, // <5,6,u,0>: Cost 1 vext1 RHS, LHS
+  1564497710U, // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS
+  1504110184U, // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1504110742U, // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
+  430370103U, // <5,6,u,4>: Cost 1 vext1 RHS, RHS
+  1564498074U, // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS
+  1504113146U, // <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3>
+  1504113658U, // <5,6,u,7>: Cost 2 vext1 RHS, <7,0,1,2>
+  430372654U, // <5,6,u,u>: Cost 1 vext1 RHS, LHS
+  2625634304U, // <5,7,0,0>: Cost 3 vext2 <1,3,5,7>, <0,0,0,0>
+  1551892582U, // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625634468U, // <5,7,0,2>: Cost 3 vext2 <1,3,5,7>, <0,2,0,2>
+  2571889247U, // <5,7,0,3>: Cost 3 vext1 <3,5,7,0>, <3,5,7,0>
+  2625634642U, // <5,7,0,4>: Cost 3 vext2 <1,3,5,7>, <0,4,1,5>
+  2595778728U, // <5,7,0,5>: Cost 3 vext1 <7,5,7,0>, <5,7,5,7>
+  3699376639U, // <5,7,0,6>: Cost 4 vext2 <1,3,5,7>, <0,6,2,7>
+  2260235715U, // <5,7,0,7>: Cost 3 vrev <7,5,7,0>
+  1551893149U, // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625635062U, // <5,7,1,0>: Cost 3 vext2 <1,3,5,7>, <1,0,3,2>
+  2624308020U, // <5,7,1,1>: Cost 3 vext2 <1,1,5,7>, <1,1,1,1>
+  2625635222U, // <5,7,1,2>: Cost 3 vext2 <1,3,5,7>, <1,2,3,0>
+  1551893504U, // <5,7,1,3>: Cost 2 vext2 <1,3,5,7>, <1,3,5,7>
+  2571898166U, // <5,7,1,4>: Cost 3 vext1 <3,5,7,1>, RHS
+  2625635472U, // <5,7,1,5>: Cost 3 vext2 <1,3,5,7>, <1,5,3,7>
+  2627626227U, // <5,7,1,6>: Cost 3 vext2 <1,6,5,7>, <1,6,5,7>
+  3702031684U, // <5,7,1,7>: Cost 4 vext2 <1,7,5,7>, <1,7,5,7>
+  1555211669U, // <5,7,1,u>: Cost 2 vext2 <1,u,5,7>, <1,u,5,7>
+  2629617126U, // <5,7,2,0>: Cost 3 vext2 <2,0,5,7>, <2,0,5,7>
+  3699377670U, // <5,7,2,1>: Cost 4 vext2 <1,3,5,7>, <2,1,0,3>
+  2625635944U, // <5,7,2,2>: Cost 3 vext2 <1,3,5,7>, <2,2,2,2>
+  2625636006U, // <5,7,2,3>: Cost 3 vext2 <1,3,5,7>, <2,3,0,1>
+  2632271658U, // <5,7,2,4>: Cost 3 vext2 <2,4,5,7>, <2,4,5,7>
+  2625636201U, // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7>
+  2625636282U, // <5,7,2,6>: Cost 3 vext2 <1,3,5,7>, <2,6,3,7>
+  3708004381U, // <5,7,2,7>: Cost 4 vext2 <2,7,5,7>, <2,7,5,7>
+  2625636411U, // <5,7,2,u>: Cost 3 vext2 <1,3,5,7>, <2,u,0,1>
+  2625636502U, // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2>
+  2625636604U, // <5,7,3,1>: Cost 3 vext2 <1,3,5,7>, <3,1,3,5>
+  3699378478U, // <5,7,3,2>: Cost 4 vext2 <1,3,5,7>, <3,2,0,1>
+  2625636764U, // <5,7,3,3>: Cost 3 vext2 <1,3,5,7>, <3,3,3,3>
+  2625636866U, // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6>
+  2625636959U, // <5,7,3,5>: Cost 3 vext2 <1,3,5,7>, <3,5,7,0>
+  3699378808U, // <5,7,3,6>: Cost 4 vext2 <1,3,5,7>, <3,6,0,7>
+  2640235254U, // <5,7,3,7>: Cost 3 vext2 <3,7,5,7>, <3,7,5,7>
+  2625637150U, // <5,7,3,u>: Cost 3 vext2 <1,3,5,7>, <3,u,1,2>
+  2571919462U, // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS
+  2571920384U, // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7>
+  3699379260U, // <5,7,4,2>: Cost 4 vext2 <1,3,5,7>, <4,2,6,0>
+  2571922019U, // <5,7,4,3>: Cost 3 vext1 <3,5,7,4>, <3,5,7,4>
+  2571922742U, // <5,7,4,4>: Cost 3 vext1 <3,5,7,4>, RHS
+  1551895862U, // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS
+  2846277980U, // <5,7,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
+  2646207951U, // <5,7,4,7>: Cost 3 vext2 <4,7,5,7>, <4,7,5,7>
+  1551896105U, // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS
+  2583871590U, // <5,7,5,0>: Cost 3 vext1 <5,5,7,5>, LHS
+  2652180176U, // <5,7,5,1>: Cost 3 vext2 <5,7,5,7>, <5,1,7,3>
+  2625638177U, // <5,7,5,2>: Cost 3 vext2 <1,3,5,7>, <5,2,7,3>
+  2625638262U, // <5,7,5,3>: Cost 3 vext2 <1,3,5,7>, <5,3,7,7>
+  2583874870U, // <5,7,5,4>: Cost 3 vext1 <5,5,7,5>, RHS
+  2846281732U, // <5,7,5,5>: Cost 3 vuzpr RHS, <5,5,5,5>
+  2651517015U, // <5,7,5,6>: Cost 3 vext2 <5,6,5,7>, <5,6,5,7>
+  1772539190U, // <5,7,5,7>: Cost 2 vuzpr RHS, RHS
+  1772539191U, // <5,7,5,u>: Cost 2 vuzpr RHS, RHS
+  2846281826U, // <5,7,6,0>: Cost 3 vuzpr RHS, <5,6,7,0>
+  3699380615U, // <5,7,6,1>: Cost 4 vext2 <1,3,5,7>, <6,1,3,5>
+  2846281108U, // <5,7,6,2>: Cost 3 vuzpr RHS, <4,6,u,2>
+  2589854210U, // <5,7,6,3>: Cost 3 vext1 <6,5,7,6>, <3,4,5,6>
+  2846281830U, // <5,7,6,4>: Cost 3 vuzpr RHS, <5,6,7,4>
+  2725467658U, // <5,7,6,5>: Cost 3 vext3 <6,7,u,5>, <7,6,5,u>
+  2846281076U, // <5,7,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
+  2846279610U, // <5,7,6,7>: Cost 3 vuzpr RHS, <2,6,3,7>
+  2846279611U, // <5,7,6,u>: Cost 3 vuzpr RHS, <2,6,3,u>
+  1510146150U, // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS
+  2846282574U, // <5,7,7,1>: Cost 3 vuzpr RHS, <6,7,0,1>
+  2583889512U, // <5,7,7,2>: Cost 3 vext1 <5,5,7,7>, <2,2,2,2>
+  2846281919U, // <5,7,7,3>: Cost 3 vuzpr RHS, <5,7,u,3>
+  1510149430U, // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS
+  1510150168U, // <5,7,7,5>: Cost 2 vext1 <5,5,7,7>, <5,5,7,7>
+  2583892474U, // <5,7,7,6>: Cost 3 vext1 <5,5,7,7>, <6,2,7,3>
+  2625640044U, // <5,7,7,7>: Cost 3 vext2 <1,3,5,7>, <7,7,7,7>
+  1510151982U, // <5,7,7,u>: Cost 2 vext1 <5,5,7,7>, LHS
+  1510154342U, // <5,7,u,0>: Cost 2 vext1 <5,5,7,u>, LHS
+  1551898414U, // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625640325U, // <5,7,u,2>: Cost 3 vext2 <1,3,5,7>, <u,2,3,0>
+  1772536477U, // <5,7,u,3>: Cost 2 vuzpr RHS, LHS
+  1510157622U, // <5,7,u,4>: Cost 2 vext1 <5,5,7,u>, RHS
+  1551898778U, // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS
+  2625640656U, // <5,7,u,6>: Cost 3 vext2 <1,3,5,7>, <u,6,3,7>
+  1772539433U, // <5,7,u,7>: Cost 2 vuzpr RHS, RHS
+  1551898981U, // <5,7,u,u>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625642496U, // <5,u,0,0>: Cost 3 vext2 <1,3,5,u>, <0,0,0,0>
+  1551900774U, // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS
+  2625642660U, // <5,u,0,2>: Cost 3 vext2 <1,3,5,u>, <0,2,0,2>
+  2698630885U, // <5,u,0,3>: Cost 3 vext3 <2,3,4,5>, <u,0,3,2>
+  2687129325U, // <5,u,0,4>: Cost 3 vext3 <0,4,1,5>, <u,0,4,1>
+  2689783542U, // <5,u,0,5>: Cost 3 vext3 <0,u,1,5>, <u,0,5,1>
+  2266134675U, // <5,u,0,6>: Cost 3 vrev <u,5,6,0>
+  2595853772U, // <5,u,0,7>: Cost 3 vext1 <7,5,u,0>, <7,5,u,0>
+  1551901341U, // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS
+  2625643254U, // <5,u,1,0>: Cost 3 vext2 <1,3,5,u>, <1,0,3,2>
+  2625643316U, // <5,u,1,1>: Cost 3 vext2 <1,3,5,u>, <1,1,1,1>
+  1613387566U, // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  1551901697U, // <5,u,1,3>: Cost 2 vext2 <1,3,5,u>, <1,3,5,u>
+  2626307154U, // <5,u,1,4>: Cost 3 vext2 <1,4,5,u>, <1,4,5,u>
+  2689783622U, // <5,u,1,5>: Cost 3 vext3 <0,u,1,5>, <u,1,5,0>
+  2627634420U, // <5,u,1,6>: Cost 3 vext2 <1,6,5,u>, <1,6,5,u>
+  2982366536U, // <5,u,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
+  1613387620U, // <5,u,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
+  2846286742U, // <5,u,2,0>: Cost 3 vuzpr RHS, <1,2,3,0>
+  2685796528U, // <5,u,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
+  2625644136U, // <5,u,2,2>: Cost 3 vext2 <1,3,5,u>, <2,2,2,2>
+  2687129480U, // <5,u,2,3>: Cost 3 vext3 <0,4,1,5>, <u,2,3,3>
+  2632279851U, // <5,u,2,4>: Cost 3 vext2 <2,4,5,u>, <2,4,5,u>
+  2625644394U, // <5,u,2,5>: Cost 3 vext2 <1,3,5,u>, <2,5,3,u>
+  2625644474U, // <5,u,2,6>: Cost 3 vext2 <1,3,5,u>, <2,6,3,7>
+  2713966508U, // <5,u,2,7>: Cost 3 vext3 <4,u,5,5>, <u,2,7,3>
+  2625644603U, // <5,u,2,u>: Cost 3 vext2 <1,3,5,u>, <2,u,0,1>
+  2687129532U, // <5,u,3,0>: Cost 3 vext3 <0,4,1,5>, <u,3,0,1>
+  2636261649U, // <5,u,3,1>: Cost 3 vext2 <3,1,5,u>, <3,1,5,u>
+  2636925282U, // <5,u,3,2>: Cost 3 vext2 <3,2,5,u>, <3,2,5,u>
+  2625644956U, // <5,u,3,3>: Cost 3 vext2 <1,3,5,u>, <3,3,3,3>
+  1564510724U, // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u>
+  2625645160U, // <5,u,3,5>: Cost 3 vext2 <1,3,5,u>, <3,5,u,0>
+  2734610422U, // <5,u,3,6>: Cost 3 vext3 <u,3,6,5>, <u,3,6,5>
+  2640243447U, // <5,u,3,7>: Cost 3 vext2 <3,7,5,u>, <3,7,5,u>
+  1567165256U, // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u>
+  1567828889U, // <5,u,4,0>: Cost 2 vext2 <4,0,5,u>, <4,0,5,u>
+  1661163546U, // <5,u,4,1>: Cost 2 vext3 <u,4,1,5>, <u,4,1,5>
+  2734463012U, // <5,u,4,2>: Cost 3 vext3 <u,3,4,5>, <u,4,2,6>
+  2698631212U, // <5,u,4,3>: Cost 3 vext3 <2,3,4,5>, <u,4,3,5>
+  1570458842U, // <5,u,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
+  1551904054U, // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS
+  2846286172U, // <5,u,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
+  2646216144U, // <5,u,4,7>: Cost 3 vext2 <4,7,5,u>, <4,7,5,u>
+  1551904297U, // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS
+  1509982310U, // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
+  2560058555U, // <5,u,5,1>: Cost 3 vext1 <1,5,u,5>, <1,5,u,5>
+  2698926194U, // <5,u,5,2>: Cost 3 vext3 <2,3,u,5>, <u,5,2,3>
+  2698631295U, // <5,u,5,3>: Cost 3 vext3 <2,3,4,5>, <u,5,3,7>
+  1509985590U, // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
+  229035318U, // <5,u,5,5>: Cost 1 vdup1 RHS
+  1613387930U, // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
+  1772547382U, // <5,u,5,7>: Cost 2 vuzpr RHS, RHS
+  229035318U, // <5,u,5,u>: Cost 1 vdup1 RHS
+  2566037606U, // <5,u,6,0>: Cost 3 vext1 <2,5,u,6>, LHS
+  2920044334U, // <5,u,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
+  2566039445U, // <5,u,6,2>: Cost 3 vext1 <2,5,u,6>, <2,5,u,6>
+  2687129808U, // <5,u,6,3>: Cost 3 vext3 <0,4,1,5>, <u,6,3,7>
+  2566040886U, // <5,u,6,4>: Cost 3 vext1 <2,5,u,6>, RHS
+  2920044698U, // <5,u,6,5>: Cost 3 vzipl <5,6,7,0>, RHS
+  2846289268U, // <5,u,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
+  2973781320U, // <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
+  2687129853U, // <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, <u,6,u,7>
+  430506086U, // <5,u,7,0>: Cost 1 vext1 RHS, LHS
+  1486333117U, // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7>
+  1504249448U, // <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  2040971933U, // <5,u,7,3>: Cost 2 vtrnr RHS, LHS
+  430509384U, // <5,u,7,4>: Cost 1 vext1 RHS, RHS
+  1504251600U, // <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+  118708378U, // <5,u,7,6>: Cost 1 vrev RHS
+  2040974889U, // <5,u,7,7>: Cost 2 vtrnr RHS, RHS
+  430511918U, // <5,u,7,u>: Cost 1 vext1 RHS, LHS
+  430514278U, // <5,u,u,0>: Cost 1 vext1 RHS, LHS
+  1551906606U, // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS
+  1613388133U, // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  1772544669U, // <5,u,u,3>: Cost 2 vuzpr RHS, LHS
+  430517577U, // <5,u,u,4>: Cost 1 vext1 RHS, RHS
+  229035318U, // <5,u,u,5>: Cost 1 vdup1 RHS
+  118716571U, // <5,u,u,6>: Cost 1 vrev RHS
+  1772547625U, // <5,u,u,7>: Cost 2 vuzpr RHS, RHS
+  430520110U, // <5,u,u,u>: Cost 1 vext1 RHS, LHS
+  2686025728U, // <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0>
+  2686025738U, // <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1>
+  2686025748U, // <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2>
+  3779084320U, // <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5>
+  2642903388U, // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6>
+  3657723939U, // <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0>
+  3926676514U, // <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6>
+  3926675786U, // <6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7>
+  2686025802U, // <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2>
+  2566070374U, // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS
+  3759767642U, // <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0>
+  1612284006U, // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  2583988738U, // <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6>
+  2566073654U, // <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS
+  2583990308U, // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1>
+  2589963005U, // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1>
+  2595935702U, // <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1>
+  1612284060U, // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
+  2686025892U, // <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2>
+  2685804721U, // <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6>
+  3759620282U, // <6,0,2,2>: Cost 4 vext3 <0,2,2,6>, <0,2,2,6>
+  2705342658U, // <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5>
+  1612284108U, // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6>
+  3706029956U, // <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7>
+  2686173406U, // <6,0,2,6>: Cost 3 vext3 <0,2,6,6>, <0,2,6,6>
+  3651769338U, // <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2>
+  1612579056U, // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6>
+  3706030230U, // <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2>
+  2705342720U, // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4>
+  2705342730U, // <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5>
+  3706030492U, // <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3>
+  2644896258U, // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6>
+  3718638154U, // <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6>
+  3729918619U, // <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6>
+  3926672384U, // <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7>
+  2705342784U, // <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5>
+  2687058250U, // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6>
+  2686026066U, // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5>
+  1613463900U, // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6>
+  3761021285U, // <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6>
+  2687353198U, // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6>
+  2632289590U, // <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS
+  2645560704U, // <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0>
+  2646224337U, // <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0>
+  1613906322U, // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6>
+  3651788902U, // <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS
+  2687795620U, // <6,0,5,1>: Cost 3 vext3 <0,5,1,6>, <0,5,1,6>
+  3761611181U, // <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6>
+  3723284326U, // <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0>
+  2646224838U, // <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6>
+  3718639630U, // <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6>
+  2652196962U, // <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0>
+  2852932918U, // <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS
+  2852932919U, // <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS
+  2852933730U, // <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0>
+  2925985894U, // <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS
+  3060203622U, // <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS
+  3718640178U, // <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5>
+  2656178832U, // <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0>
+  3725939378U, // <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7>
+  2657506098U, // <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0>
+  2619020110U, // <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1>
+  2925986461U, // <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS
+  2572091494U, // <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS
+  2572092310U, // <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, <1,2,3,0>
+  2980495524U, // <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2>
+  2572094072U, // <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7>
+  2572094774U, // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS
+  4054238242U, // <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5>
+  3645837653U, // <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0>
+  4054239054U, // <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7>
+  2572097326U, // <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS
+  2686026378U, // <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2>
+  2686026386U, // <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1>
+  1612284573U, // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  2705343144U, // <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5>
+  1616265906U, // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6>
+  2632292506U, // <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS
+  2590020356U, // <6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u>
+  2852933161U, // <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS
+  1612284627U, // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS
+  2595995750U, // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS
+  2646229094U, // <6,1,0,1>: Cost 3 vext2 <4,7,6,1>, LHS
+  3694092492U, // <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6>
+  2686026486U, // <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2>
+  2595999030U, // <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS
+  3767730952U, // <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2>
+  2596000590U, // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1>
+  2596001246U, // <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0>
+  2686026531U, // <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2>
+  3763602219U, // <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1>
+  2686026548U, // <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1>
+  3764929346U, // <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6>
+  2686026568U, // <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3>
+  2691334996U, // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6>
+  3760874332U, // <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5>
+  3765224294U, // <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6>
+  3669751263U, // <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1>
+  2686026613U, // <6,1,1,u>: Cost 3 vext3 <0,2,4,6>, <1,1,u,3>
+  2554208358U, // <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS
+  3763602311U, // <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3>
+  3639895971U, // <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2>
+  2686026646U, // <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0>
+  2554211638U, // <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS
+  3760874411U, // <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, <1,2,5,3>
+  2554212858U, // <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3>
+  3802973114U, // <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0>
+  2686026691U, // <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0>
+  2566160486U, // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS
+  2686026712U, // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3>
+  2686026724U, // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6>
+  3759768552U, // <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1>
+  2692662262U, // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6>
+  2686026752U, // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7>
+  2590053128U, // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3>
+  3663795194U, // <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2>
+  2686026775U, // <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3>
+  2641587099U, // <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1>
+  2693104684U, // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6>
+  3639912357U, // <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4>
+  2687206462U, // <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6>
+  3633941814U, // <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS
+  2693399632U, // <6,1,4,5>: Cost 3 vext3 <1,4,5,6>, <1,4,5,6>
+  3765077075U, // <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0>
+  2646232530U, // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1>
+  2687206507U, // <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6>
+  2647559796U, // <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1>
+  3765077118U, // <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7>
+  3767583878U, // <6,1,5,2>: Cost 4 vext3 <1,5,2,6>, <1,5,2,6>
+  2686026896U, // <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7>
+  2693989528U, // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6>
+  3767805089U, // <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6>
+  2652868706U, // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0>
+  3908250934U, // <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS
+  2686026941U, // <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7>
+  2554241126U, // <6,1,6,0>: Cost 3 vext1 <0,6,1,6>, LHS
+  3763602639U, // <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7>
+  3759547607U, // <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6>
+  3115221094U, // <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS
+  2554244406U, // <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS
+  3760874739U, // <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7>
+  2554245944U, // <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6>
+  3719975758U, // <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1>
+  3115221099U, // <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS
+  2560221286U, // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS
+  2560222415U, // <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7>
+  2980497558U, // <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2>
+  3103211622U, // <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS
+  2560224566U, // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS
+  2980495698U, // <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5>
+  3633967526U, // <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0>
+  4054237686U, // <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7>
+  2560227118U, // <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS
+  2560229478U, // <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS
+  2686027117U, // <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3>
+  2686027129U, // <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6>
+  2686027132U, // <6,1,u,3>: Cost 3 vext3 <0,2,4,6>, <1,u,3,0>
+  2687206795U, // <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6>
+  2686027157U, // <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7>
+  2590094093U, // <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u>
+  2596066790U, // <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u>
+  2686027177U, // <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0>
+  2646900736U, // <6,2,0,0>: Cost 3 vext2 <4,u,6,2>, <0,0,0,0>
+  1573159014U, // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS
+  2646900900U, // <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2>
+  3759769037U, // <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0>
+  2641592668U, // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6>
+  3779085794U, // <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3>
+  2686027244U, // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4>
+  3669816807U, // <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0>
+  1573159581U, // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS
+  2230527897U, // <6,2,1,0>: Cost 3 vrev <2,6,0,1>
+  2646901556U, // <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1>
+  2646901654U, // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0>
+  2847047782U, // <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS
+  3771049517U, // <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6>
+  2646901904U, // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7>
+  2686027324U, // <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3>
+  3669825000U, // <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1>
+  2231117793U, // <6,2,1,u>: Cost 3 vrev <2,6,u,1>
+  3763603029U, // <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1>
+  3759769184U, // <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3>
+  2686027368U, // <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2>
+  2686027378U, // <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3>
+  2697971326U, // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6>
+  3759769224U, // <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7>
+  2698118800U, // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6>
+  3920794092U, // <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7>
+  2686027423U, // <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3>
+  2686027430U, // <6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1>
+  3759769262U, // <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0>
+  2698487485U, // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6>
+  2705344196U, // <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4>
+  2686027470U, // <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, <2,3,4,5>
+  2698708696U, // <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6>
+  2724660961U, // <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6>
+  2729232104U, // <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4>
+  2686027502U, // <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, <2,3,u,1>
+  1567853468U, // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
+  3759769351U, // <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u>
+  2699151118U, // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6>
+  2686027543U, // <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6>
+  2699298592U, // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6>
+  1573162294U, // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS
+  2686027564U, // <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0>
+  3719982547U, // <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2>
+  1573162532U, // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2>
+  3779086154U, // <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3>
+  2646904528U, // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3>
+  3759769440U, // <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7>
+  2699888488U, // <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6>
+  2230855617U, // <6,2,5,4>: Cost 3 vrev <2,6,4,5>
+  2646904836U, // <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5>
+  2646904930U, // <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, <5,6,7,0>
+  2847051062U, // <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS
+  2700257173U, // <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6>
+  2687207321U, // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1>
+  2686027684U, // <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3>
+  2566260656U, // <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6>
+  2685806522U, // <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7>
+  2687207361U, // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5>
+  2686027724U, // <6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7>
+  2646905656U, // <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6>
+  2646905678U, // <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1>
+  2686027751U, // <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7>
+  2554323046U, // <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS
+  2572239606U, // <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2>
+  2566268849U, // <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7>
+  1906753638U, // <6,2,7,3>: Cost 2 vzipr RHS, LHS
+  2554326326U, // <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS
+  3304687564U, // <6,2,7,5>: Cost 4 vrev <2,6,5,7>
+  2980495708U, // <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
+  2646906476U, // <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7>
+  1906753643U, // <6,2,7,u>: Cost 2 vzipr RHS, LHS
+  1591744256U, // <6,2,u,0>: Cost 2 vext2 <u,0,6,2>, <u,0,6,2>
+  1573164846U, // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS
+  2701805650U, // <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6>
+  1906761830U, // <6,2,u,3>: Cost 2 vzipr RHS, LHS
+  2686027875U, // <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, <2,u,4,5>
+  1573165210U, // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS
+  2686322800U, // <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0>
+  2847051305U, // <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS
+  1906761835U, // <6,2,u,u>: Cost 2 vzipr RHS, LHS
+  3759769739U, // <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0>
+  2686027926U, // <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2>
+  2686027937U, // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4>
+  3640027286U, // <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2>
+  2687207601U, // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2>
+  2705344698U, // <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2>
+  3663917847U, // <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0>
+  2237008560U, // <6,3,0,7>: Cost 3 vrev <3,6,7,0>
+  2686027989U, // <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2>
+  3759769823U, // <6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3>
+  3759769830U, // <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1>
+  3759769841U, // <6,3,1,2>: Cost 4 vext3 <0,2,4,6>, <3,1,2,3>
+  3759769848U, // <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1>
+  2703280390U, // <6,3,1,4>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
+  3759769868U, // <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3>
+  3704063194U, // <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0>
+  3767732510U, // <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3>
+  2703280390U, // <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
+  3704063468U, // <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4>
+  2630321724U, // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
+  3759769921U, // <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2>
+  3759769928U, // <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0>
+  3704063767U, // <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6>
+  3704063876U, // <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7>
+  2636957626U, // <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7>
+  3777907058U, // <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6>
+  2630321724U, // <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
+  3759769983U, // <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1>
+  3710036245U, // <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3>
+  2636958054U, // <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3>
+  2686028188U, // <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3>
+  2704607656U, // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6>
+  3773041072U, // <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5>
+  3711363731U, // <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7>
+  3767732676U, // <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7>
+  2707999179U, // <6,3,3,u>: Cost 3 vext3 <3,u,5,6>, <3,3,u,5>
+  2584232038U, // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS
+  2642267118U, // <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3>
+  2642930751U, // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3>
+  2705197552U, // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6>
+  2584235318U, // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS
+  1631603202U, // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6>
+  2654211444U, // <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6>
+  2237041332U, // <6,3,4,7>: Cost 3 vrev <3,6,7,4>
+  1631824413U, // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6>
+  3640066150U, // <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS
+  3772746288U, // <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7>
+  3640067790U, // <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5>
+  3773041216U, // <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5>
+  2705934922U, // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6>
+  3773041236U, // <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7>
+  3779086940U, // <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6>
+  3767732831U, // <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0>
+  2706229870U, // <6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6>
+  2602164326U, // <6,3,6,0>: Cost 3 vext1 <u,6,3,6>, LHS
+  2654212512U, // <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3>
+  2566334393U, // <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6>
+  3704066588U, // <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1>
+  2602167524U, // <6,3,6,4>: Cost 3 vext1 <u,6,3,6>, <4,4,6,6>
+  3710702321U, // <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7>
+  2724661933U, // <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6>
+  3710702465U, // <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7>
+  2602170158U, // <6,3,6,u>: Cost 3 vext1 <u,6,3,6>, LHS
+  1492598886U, // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS
+  2560369889U, // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7>
+  1492600762U, // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7>
+  2566342806U, // <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2>
+  1492602166U, // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS
+  2602176208U, // <6,3,7,5>: Cost 3 vext1 <u,6,3,7>, <5,1,7,3>
+  2566345210U, // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3>
+  2980496528U, // <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7>
+  1492604718U, // <6,3,7,u>: Cost 2 vext1 <2,6,3,7>, LHS
+  1492607078U, // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS
+  2686028574U, // <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2>
+  1492608955U, // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u>
+  2566350998U, // <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2>
+  1492610358U, // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS
+  1634257734U, // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6>
+  2566353489U, // <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0>
+  2980504720U, // <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7>
+  1492612910U, // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS
+  3703406592U, // <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0>
+  2629664870U, // <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS
+  2629664972U, // <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6>
+  3779087232U, // <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1>
+  2642936156U, // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6>
+  2712570770U, // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1>
+  2687208348U, // <6,4,0,6>: Cost 3 vext3 <0,4,2,6>, <4,0,6,2>
+  3316723081U, // <6,4,0,7>: Cost 4 vrev <4,6,7,0>
+  2629665437U, // <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS
+  2242473291U, // <6,4,1,0>: Cost 3 vrev <4,6,0,1>
+  3700089652U, // <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1>
+  3703407510U, // <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0>
+  2852962406U, // <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS
+  3628166454U, // <6,4,1,4>: Cost 4 vext1 <0,6,4,1>, RHS
+  3760876514U, // <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0>
+  2687208430U, // <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3>
+  3316731274U, // <6,4,1,7>: Cost 4 vrev <4,6,7,1>
+  2243063187U, // <6,4,1,u>: Cost 3 vrev <4,6,u,1>
+  2629666284U, // <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4>
+  3703408188U, // <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3>
+  3703408232U, // <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2>
+  3703408294U, // <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1>
+  2632320816U, // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4>
+  2923384118U, // <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS
+  2687208508U, // <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0>
+  3760950341U, // <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0>
+  2634975348U, // <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4>
+  3703408790U, // <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2>
+  3316305238U, // <6,4,3,1>: Cost 4 vrev <4,6,1,3>
+  3703408947U, // <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6>
+  3703409052U, // <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3>
+  2644929026U, // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6>
+  3718670922U, // <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6>
+  2705345682U, // <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5>
+  3926705152U, // <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7>
+  2668817222U, // <6,4,3,u>: Cost 3 vext2 <u,5,6,4>, <3,u,5,6>
+  2590277734U, // <6,4,4,0>: Cost 3 vext1 <6,6,4,4>, LHS
+  3716017135U, // <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4>
+  2642938944U, // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4>
+  3717344401U, // <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4>
+  2712571088U, // <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4>
+  2629668150U, // <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS
+  1637649636U, // <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
+  2646257109U, // <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4>
+  1637649636U, // <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
+  2566398054U, // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS
+  3760876805U, // <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3>
+  2566399937U, // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5>
+  2584316418U, // <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6>
+  2566401334U, // <6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS
+  2584318028U, // <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5>
+  1612287286U, // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+  2852965686U, // <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS
+  1612287304U, // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+  1504608358U, // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS
+  2578350838U, // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2>
+  2578351720U, // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2>
+  2578352278U, // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2>
+  1504611638U, // <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS
+  2578353872U, // <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3>
+  2578354682U, // <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3>
+  2578355194U, // <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2>
+  1504614190U, // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS
+  2572386406U, // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS
+  2572387226U, // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4>
+  3640157902U, // <6,4,7,2>: Cost 4 vext1 <2,6,4,7>, <2,3,4,5>
+  2572389020U, // <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7>
+  2572389686U, // <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS
+  2980497102U, // <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5>
+  2980495564U, // <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6>
+  4054239090U, // <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7>
+  2572392238U, // <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS
+  1504608358U, // <6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS
+  2629670702U, // <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS
+  2566424516U, // <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u>
+  2584340994U, // <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6>
+  1640156694U, // <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6>
+  2629671066U, // <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS
+  1612287529U, // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS
+  2852965929U, // <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS
+  1612287547U, // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS
+  3708723200U, // <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0>
+  2634981478U, // <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS
+  3694125260U, // <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6>
+  3779087962U, // <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2>
+  3760877154U, // <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1>
+  4195110916U, // <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5>
+  3696779775U, // <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7>
+  1175212130U, // <6,5,0,7>: Cost 2 vrev <5,6,7,0>
+  1175285867U, // <6,5,0,u>: Cost 2 vrev <5,6,u,0>
+  2248445988U, // <6,5,1,0>: Cost 3 vrev <5,6,0,1>
+  3698107237U, // <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5>
+  3708724118U, // <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0>
+  3908575334U, // <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS
+  3716023376U, // <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, <1,4,5,6>
+  3708724368U, // <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, <1,5,3,7>
+  3767733960U, // <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4>
+  2712571600U, // <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3>
+  2712571609U, // <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3>
+  2578391142U, // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS
+  3704079934U, // <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5>
+  3708724840U, // <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2>
+  3705407182U, // <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5>
+  2578394422U, // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS
+  3717351272U, // <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6>
+  2634983354U, // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7>
+  3115486518U, // <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS
+  2634983541U, // <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5>
+  3708725398U, // <6,5,3,0>: Cost 4 vext2 <2,u,6,5>, <3,0,1,2>
+  3710052631U, // <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5>
+  3708725606U, // <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3>
+  3708725660U, // <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3>
+  2643610114U, // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6>
+  3717352010U, // <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6>
+  3773632358U, // <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0>
+  2248978533U, // <6,5,3,7>: Cost 3 vrev <5,6,7,3>
+  2249052270U, // <6,5,3,u>: Cost 3 vrev <5,6,u,3>
+  2596323430U, // <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS
+  3716025328U, // <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5>
+  3716688961U, // <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5>
+  2643610770U, // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5>
+  2596326710U, // <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS
+  2634984758U, // <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS
+  3767734199U, // <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0>
+  1643696070U, // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6>
+  1643769807U, // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6>
+  2578415718U, // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS
+  3652158198U, // <6,5,5,1>: Cost 4 vext1 <4,6,5,5>, <1,0,3,2>
+  3652159080U, // <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2>
+  3652159638U, // <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2>
+  2578418998U, // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS
+  2712571908U, // <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5>
+  2718027790U, // <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6>
+  2712571928U, // <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7>
+  2712571937U, // <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7>
+  2705346596U, // <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1>
+  3767144496U, // <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4>
+  3773116473U, // <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4>
+  2705346626U, // <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4>
+  2705346636U, // <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5>
+  3908577217U, // <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5>
+  2578428728U, // <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6>
+  2712572002U, // <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0>
+  2705346668U, // <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1>
+  2560516198U, // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS
+  2560517363U, // <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7>
+  2566490060U, // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7>
+  3634260118U, // <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2>
+  2560519478U, // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS
+  2980498650U, // <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5>
+  2980497922U, // <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6>
+  3103214902U, // <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS
+  2560522030U, // <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS
+  2560524390U, // <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS
+  2560525556U, // <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u>
+  2566498253U, // <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u>
+  2646931439U, // <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, <u,3,5,7>
+  2560527670U, // <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS
+  2634987674U, // <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS
+  2980506114U, // <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6>
+  1175277674U, // <6,5,u,7>: Cost 2 vrev <5,6,7,u>
+  1175351411U, // <6,5,u,u>: Cost 2 vrev <5,6,u,u>
+  2578448486U, // <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS
+  1573191782U, // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS
+  2686030124U, // <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4>
+  3779088690U, // <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1>
+  2687209788U, // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2>
+  3652194000U, // <6,6,0,5>: Cost 4 vext1 <4,6,6,0>, <5,1,7,3>
+  2254852914U, // <6,6,0,6>: Cost 3 vrev <6,6,6,0>
+  4041575734U, // <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS
+  1573192349U, // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS
+  2646934262U, // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2>
+  2646934324U, // <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1>
+  2646934422U, // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0>
+  2846785638U, // <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS
+  3760951694U, // <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3>
+  2646934672U, // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7>
+  2712572320U, // <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3>
+  3775549865U, // <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3>
+  2846785643U, // <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS
+  3759772094U, // <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6>
+  3704751676U, // <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3>
+  2631009936U, // <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6>
+  2646935206U, // <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1>
+  3759772127U, // <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3>
+  3704752004U, // <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7>
+  2646935482U, // <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7>
+  2712572410U, // <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3>
+  2712572419U, // <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3>
+  2646935702U, // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2>
+  3777024534U, // <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4>
+  3704752453U, // <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6>
+  2646935964U, // <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3>
+  2705347122U, // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5>
+  3779678778U, // <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4>
+  2657553069U, // <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6>
+  4039609654U, // <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS
+  2708001366U, // <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5>
+  2578481254U, // <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS
+  3652223734U, // <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2>
+  3760951922U, // <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6>
+  3779089019U, // <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6>
+  1570540772U, // <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
+  1573195062U, // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS
+  2712572560U, // <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0>
+  2723410591U, // <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6>
+  1573195304U, // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6>
+  3640287334U, // <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS
+  2646937296U, // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3>
+  3640289235U, // <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5>
+  3720679279U, // <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0>
+  2646937542U, // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6>
+  2646937604U, // <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5>
+  2646937698U, // <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0>
+  2846788918U, // <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS
+  2846788919U, // <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS
+  1516699750U, // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS
+  2590442230U, // <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2>
+  2646938106U, // <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3>
+  2590443670U, // <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2>
+  1516703030U, // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS
+  2590445264U, // <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3>
+  296144182U, // <6,6,6,6>: Cost 1 vdup2 RHS
+  2712572738U, // <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7>
+  296144182U, // <6,6,6,u>: Cost 1 vdup2 RHS
+  2566561894U, // <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, LHS
+  3634332924U, // <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7>
+  2566563797U, // <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7>
+  2584480258U, // <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6>
+  2566565174U, // <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS
+  2717438846U, // <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4>
+  2980500280U, // <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6>
+  1906756918U, // <6,6,7,7>: Cost 2 vzipr RHS, RHS
+  1906756919U, // <6,6,7,u>: Cost 2 vzipr RHS, RHS
+  1516699750U, // <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS
+  1573197614U, // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS
+  2566571990U, // <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u>
+  2846786205U, // <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS
+  1516703030U, // <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS
+  1573197978U, // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS
+  296144182U, // <6,6,u,6>: Cost 1 vdup2 RHS
+  1906765110U, // <6,6,u,7>: Cost 2 vzipr RHS, RHS
+  296144182U, // <6,6,u,u>: Cost 1 vdup2 RHS
+  1571209216U, // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+  497467494U, // <6,7,0,1>: Cost 1 vext2 RHS, LHS
+  1571209380U, // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+  2644951292U, // <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0>
+  1571209554U, // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+  1510756450U, // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0>
+  2644951542U, // <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
+  2584499194U, // <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2>
+  497468061U, // <6,7,0,u>: Cost 1 vext2 RHS, LHS
+  1571209974U, // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+  1571210036U, // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+  1571210134U, // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
+  1571210200U, // <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
+  2644952098U, // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5>
+  1571210384U, // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+  2644952271U, // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
+  2578535418U, // <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2>
+  1571210605U, // <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3>
+  2644952509U, // <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2>
+  2644952582U, // <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
+  1571210856U, // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+  1571210918U, // <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+  2644952828U, // <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6>
+  2633009028U, // <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7>
+  1571211194U, // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+  2668840938U, // <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1>
+  1571211323U, // <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
+  1571211414U, // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+  2644953311U, // <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
+  2644953390U, // <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1>
+  1571211676U, // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+  1571211778U, // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+  2644953648U, // <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7>
+  2644953720U, // <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7>
+  2644953795U, // <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
+  1571212062U, // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+  1573202834U, // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+  2644954058U, // <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
+  2644954166U, // <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
+  2644954258U, // <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5>
+  1571212496U, // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+  497470774U, // <6,7,4,5>: Cost 1 vext2 RHS, RHS
+  1573203316U, // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  2646281688U, // <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7>
+  497471017U, // <6,7,4,u>: Cost 1 vext2 RHS, RHS
+  2644954696U, // <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
+  1573203664U, // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  2644954878U, // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
+  2644954991U, // <6,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
+  1571213254U, // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+  1571213316U, // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+  1571213410U, // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
+  1573204136U, // <6,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+  1573204217U, // <6,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
+  2644955425U, // <6,7,6,0>: Cost 3 vext2 RHS, <6,0,1,2>
+  2644955561U, // <6,7,6,1>: Cost 3 vext2 RHS, <6,1,7,3>
+  1573204474U, // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  2644955698U, // <6,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
+  2644955789U, // <6,7,6,4>: Cost 3 vext2 RHS, <6,4,5,6>
+  2644955889U, // <6,7,6,5>: Cost 3 vext2 RHS, <6,5,7,7>
+  1571214136U, // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
+  1571214158U, // <6,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+  1573204895U, // <6,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
+  1573204986U, // <6,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
+  2572608656U, // <6,7,7,1>: Cost 3 vext1 <3,6,7,7>, <1,5,3,7>
+  2644956362U, // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3>
+  2572610231U, // <6,7,7,3>: Cost 3 vext1 <3,6,7,7>, <3,6,7,7>
+  1573205350U, // <6,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
+  2646947220U, // <6,7,7,5>: Cost 3 vext2 RHS, <7,5,1,7>
+  1516786498U, // <6,7,7,6>: Cost 2 vext1 <6,6,7,7>, <6,6,7,7>
+  1571214956U, // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7>
+  1573205634U, // <6,7,7,u>: Cost 2 vext2 RHS, <7,u,1,2>
+  1571215059U, // <6,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
+  497473326U, // <6,7,u,1>: Cost 1 vext2 RHS, LHS
+  1571215237U, // <6,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
+  1571215292U, // <6,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+  1571215423U, // <6,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
+  497473690U, // <6,7,u,5>: Cost 1 vext2 RHS, RHS
+  1571215568U, // <6,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
+  1573206272U, // <6,7,u,7>: Cost 2 vext2 RHS, <u,7,0,1>
+  497473893U, // <6,7,u,u>: Cost 1 vext2 RHS, LHS
+  1571217408U, // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+  497475686U, // <6,u,0,1>: Cost 1 vext2 RHS, LHS
+  1571217572U, // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+  2689865445U, // <6,u,0,3>: Cost 3 vext3 <0,u,2,6>, <u,0,3,2>
+  1571217746U, // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+  1510830187U, // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0>
+  2644959734U, // <6,u,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
+  1193130221U, // <6,u,0,7>: Cost 2 vrev <u,6,7,0>
+  497476253U, // <6,u,0,u>: Cost 1 vext2 RHS, LHS
+  1571218166U, // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+  1571218228U, // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+  1612289838U, // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  1571218392U, // <6,u,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
+  2566663478U, // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS
+  1571218576U, // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+  2644960463U, // <6,u,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
+  2717439835U, // <6,u,1,7>: Cost 3 vext3 <5,4,7,6>, <u,1,7,3>
+  1612289892U, // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
+  1504870502U, // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS
+  2644960774U, // <6,u,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
+  1571219048U, // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+  1571219110U, // <6,u,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+  1504873782U, // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, RHS
+  2633017221U, // <6,u,2,5>: Cost 3 vext2 <2,5,6,u>, <2,5,6,u>
+  1571219386U, // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+  2712573868U, // <6,u,2,7>: Cost 3 vext3 <4,6,4,6>, <u,2,7,3>
+  1571219515U, // <6,u,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
+  1571219606U, // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+  2644961503U, // <6,u,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
+  2566678499U, // <6,u,3,2>: Cost 3 vext1 <2,6,u,3>, <2,6,u,3>
+  1571219868U, // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+  1571219970U, // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+  2689865711U, // <6,u,3,5>: Cost 3 vext3 <0,u,2,6>, <u,3,5,7>
+  2708002806U, // <6,u,3,6>: Cost 3 vext3 <3,u,5,6>, <u,3,6,5>
+  2644961987U, // <6,u,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
+  1571220254U, // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+  1571220370U, // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+  2644962250U, // <6,u,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
+  1661245476U, // <6,u,4,2>: Cost 2 vext3 <u,4,2,6>, <u,4,2,6>
+  2686031917U, // <6,u,4,3>: Cost 3 vext3 <0,2,4,6>, <u,4,3,6>
+  1571220688U, // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+  497478967U, // <6,u,4,5>: Cost 1 vext2 RHS, RHS
+  1571220852U, // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  1661614161U, // <6,u,4,7>: Cost 2 vext3 <u,4,7,6>, <u,4,7,6>
+  497479209U, // <6,u,4,u>: Cost 1 vext2 RHS, RHS
+  2566692966U, // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS
+  1571221200U, // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  2566694885U, // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5>
+  2689865855U, // <6,u,5,3>: Cost 3 vext3 <0,u,2,6>, <u,5,3,7>
+  1571221446U, // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+  1571221508U, // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+  1612290202U, // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+  1571221672U, // <6,u,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+  1612290220U, // <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+  1504903270U, // <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS
+  2644963752U, // <6,u,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
+  1571222010U, // <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  2686032080U, // <6,u,6,3>: Cost 3 vext3 <0,2,4,6>, <u,6,3,7>
+  1504906550U, // <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, RHS
+  2644964079U, // <6,u,6,5>: Cost 3 vext2 RHS, <6,5,7,5>
+  296144182U, // <6,u,6,6>: Cost 1 vdup2 RHS
+  1571222350U, // <6,u,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+  296144182U, // <6,u,6,u>: Cost 1 vdup2 RHS
+  1492967526U, // <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS
+  2560738574U, // <6,u,7,1>: Cost 3 vext1 <1,6,u,7>, <1,6,u,7>
+  1492969447U, // <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7>
+  1906753692U, // <6,u,7,3>: Cost 2 vzipr RHS, LHS
+  1492970806U, // <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS
+  2980495761U, // <6,u,7,5>: Cost 3 vzipr RHS, <0,4,u,5>
+  1516860235U, // <6,u,7,6>: Cost 2 vext1 <6,6,u,7>, <6,6,u,7>
+  1906756936U, // <6,u,7,7>: Cost 2 vzipr RHS, RHS
+  1492973358U, // <6,u,7,u>: Cost 2 vext1 <2,6,u,7>, LHS
+  1492975718U, // <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS
+  497481518U, // <6,u,u,1>: Cost 1 vext2 RHS, LHS
+  1612290405U, // <6,u,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  1571223484U, // <6,u,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+  1492978998U, // <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS
+  497481882U, // <6,u,u,5>: Cost 1 vext2 RHS, RHS
+  296144182U, // <6,u,u,6>: Cost 1 vdup2 RHS
+  1906765128U, // <6,u,u,7>: Cost 2 vzipr RHS, RHS
+  497482085U, // <6,u,u,u>: Cost 1 vext2 RHS, LHS
+  1638318080U, // <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+  1638318090U, // <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1>
+  1638318100U, // <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2>
+  3646442178U, // <7,0,0,3>: Cost 4 vext1 <3,7,0,0>, <3,7,0,0>
+  2712059941U, // <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1>
+  2651603364U, // <7,0,0,5>: Cost 3 vext2 <5,6,7,0>, <0,5,1,6>
+  2590618445U, // <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0>
+  3785801798U, // <7,0,0,7>: Cost 4 vext3 RHS, <0,0,7,7>
+  1638318153U, // <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1>
+  1516879974U, // <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS
+  2693922911U, // <7,0,1,1>: Cost 3 vext3 <1,5,3,7>, <0,1,1,5>
+  564576358U, // <7,0,1,2>: Cost 1 vext3 RHS, LHS
+  2638996480U, // <7,0,1,3>: Cost 3 vext2 <3,5,7,0>, <1,3,5,7>
+  1516883254U, // <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS
+  2649613456U, // <7,0,1,5>: Cost 3 vext2 <5,3,7,0>, <1,5,3,7>
+  1516884814U, // <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1>
+  2590626808U, // <7,0,1,7>: Cost 3 vext1 <6,7,0,1>, <7,0,1,0>
+  564576412U, // <7,0,1,u>: Cost 1 vext3 RHS, LHS
+  1638318244U, // <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
+  2692743344U, // <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5>
+  2712060084U, // <7,0,2,2>: Cost 3 vext3 RHS, <0,2,2,0>
+  2712060094U, // <7,0,2,3>: Cost 3 vext3 RHS, <0,2,3,1>
+  1638318284U, // <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
+  2712060118U, // <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
+  2651604922U, // <7,0,2,6>: Cost 3 vext2 <5,6,7,0>, <2,6,3,7>
+  2686255336U, // <7,0,2,7>: Cost 3 vext3 <0,2,7,7>, <0,2,7,7>
+  1638318316U, // <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2>
+  2651605142U, // <7,0,3,0>: Cost 3 vext2 <5,6,7,0>, <3,0,1,2>
+  2712060156U, // <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0>
+  2712060165U, // <7,0,3,2>: Cost 3 vext3 RHS, <0,3,2,0>
+  2651605404U, // <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3>
+  2651605506U, // <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6>
+  2638998111U, // <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0>
+  2639661744U, // <7,0,3,6>: Cost 3 vext2 <3,6,7,0>, <3,6,7,0>
+  3712740068U, // <7,0,3,7>: Cost 4 vext2 <3,5,7,0>, <3,7,3,7>
+  2640989010U, // <7,0,3,u>: Cost 3 vext2 <3,u,7,0>, <3,u,7,0>
+  2712060232U, // <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,4>
+  1638318418U, // <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5>
+  1638318428U, // <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6>
+  3646474950U, // <7,0,4,3>: Cost 4 vext1 <3,7,0,4>, <3,7,0,4>
+  2712060270U, // <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,6>
+  1577864502U, // <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS
+  2651606388U, // <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,4,6>
+  3787792776U, // <7,0,4,7>: Cost 4 vext3 RHS, <0,4,7,5>
+  1638318481U, // <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5>
+  2590654566U, // <7,0,5,0>: Cost 3 vext1 <6,7,0,5>, LHS
+  2651606736U, // <7,0,5,1>: Cost 3 vext2 <5,6,7,0>, <5,1,7,3>
+  2712060334U, // <7,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
+  2649616239U, // <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0>
+  2651606982U, // <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6>
+  2651607044U, // <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5>
+  1577865314U, // <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0>
+  2651607208U, // <7,0,5,7>: Cost 3 vext2 <5,6,7,0>, <5,7,5,7>
+  1579192580U, // <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0>
+  2688393709U, // <7,0,6,0>: Cost 3 vext3 <0,6,0,7>, <0,6,0,7>
+  2712060406U, // <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
+  2688541183U, // <7,0,6,2>: Cost 3 vext3 <0,6,2,7>, <0,6,2,7>
+  2655588936U, // <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0>
+  3762430481U, // <7,0,6,4>: Cost 4 vext3 <0,6,4,7>, <0,6,4,7>
+  2651607730U, // <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7>
+  2651607864U, // <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6>
+  2651607886U, // <7,0,6,7>: Cost 3 vext2 <5,6,7,0>, <6,7,0,1>
+  2688983605U, // <7,0,6,u>: Cost 3 vext3 <0,6,u,7>, <0,6,u,7>
+  2651608058U, // <7,0,7,0>: Cost 3 vext2 <5,6,7,0>, <7,0,1,2>
+  2932703334U, // <7,0,7,1>: Cost 3 vzipl <7,7,7,7>, LHS
+  3066921062U, // <7,0,7,2>: Cost 3 vtrnl <7,7,7,7>, LHS
+  3712742678U, // <7,0,7,3>: Cost 4 vext2 <3,5,7,0>, <7,3,5,7>
+  2651608422U, // <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6>
+  2651608513U, // <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7>
+  2663552532U, // <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0>
+  2651608684U, // <7,0,7,7>: Cost 3 vext2 <5,6,7,0>, <7,7,7,7>
+  2651608706U, // <7,0,7,u>: Cost 3 vext2 <5,6,7,0>, <7,u,1,2>
+  1638318730U, // <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2>
+  1638318738U, // <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1>
+  564576925U, // <7,0,u,2>: Cost 1 vext3 RHS, LHS
+  2572765898U, // <7,0,u,3>: Cost 3 vext1 <3,7,0,u>, <3,7,0,u>
+  1638318770U, // <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6>
+  1577867418U, // <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS
+  1516942165U, // <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u>
+  2651609344U, // <7,0,u,7>: Cost 3 vext2 <5,6,7,0>, <u,7,0,1>
+  564576979U, // <7,0,u,u>: Cost 1 vext3 RHS, LHS
+  2590687334U, // <7,1,0,0>: Cost 3 vext1 <6,7,1,0>, LHS
+  2639003750U, // <7,1,0,1>: Cost 3 vext2 <3,5,7,1>, LHS
+  2793357414U, // <7,1,0,2>: Cost 3 vuzpl <7,0,1,2>, LHS
+  1638318838U, // <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2>
+  2590690614U, // <7,1,0,4>: Cost 3 vext1 <6,7,1,0>, RHS
+  2712060679U, // <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
+  2590692182U, // <7,1,0,6>: Cost 3 vext1 <6,7,1,0>, <6,7,1,0>
+  3785802521U, // <7,1,0,7>: Cost 4 vext3 RHS, <1,0,7,1>
+  1638318883U, // <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2>
+  2712060715U, // <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,1>
+  1638318900U, // <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
+  3774300994U, // <7,1,1,2>: Cost 4 vext3 <2,6,3,7>, <1,1,2,6>
+  1638318920U, // <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3>
+  2712060755U, // <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5>
+  2691416926U, // <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7>
+  2590700375U, // <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1>
+  3765158766U, // <7,1,1,7>: Cost 4 vext3 <1,1,5,7>, <1,1,7,5>
+  1638318965U, // <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3>
+  2712060796U, // <7,1,2,0>: Cost 3 vext3 RHS, <1,2,0,1>
+  2712060807U, // <7,1,2,1>: Cost 3 vext3 RHS, <1,2,1,3>
+  3712747112U, // <7,1,2,2>: Cost 4 vext2 <3,5,7,1>, <2,2,2,2>
+  1638318998U, // <7,1,2,3>: Cost 2 vext3 RHS, <1,2,3,0>
+  2712060836U, // <7,1,2,4>: Cost 3 vext3 RHS, <1,2,4,5>
+  2712060843U, // <7,1,2,5>: Cost 3 vext3 RHS, <1,2,5,3>
+  2590708568U, // <7,1,2,6>: Cost 3 vext1 <6,7,1,2>, <6,7,1,2>
+  2735948730U, // <7,1,2,7>: Cost 3 vext3 RHS, <1,2,7,0>
+  1638319043U, // <7,1,2,u>: Cost 2 vext3 RHS, <1,2,u,0>
+  2712060876U, // <7,1,3,0>: Cost 3 vext3 RHS, <1,3,0,0>
+  1638319064U, // <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
+  2712060894U, // <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0>
+  2692596718U, // <7,1,3,3>: Cost 3 vext3 <1,3,3,7>, <1,3,3,7>
+  2712060917U, // <7,1,3,4>: Cost 3 vext3 RHS, <1,3,4,5>
+  1619002368U, // <7,1,3,5>: Cost 2 vext3 <1,3,5,7>, <1,3,5,7>
+  2692817929U, // <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7>
+  2735948814U, // <7,1,3,7>: Cost 3 vext3 RHS, <1,3,7,3>
+  1619223579U, // <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7>
+  2712060962U, // <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5>
+  2712060971U, // <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5>
+  2712060980U, // <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5>
+  2712060989U, // <7,1,4,3>: Cost 3 vext3 RHS, <1,4,3,5>
+  3785802822U, // <7,1,4,4>: Cost 4 vext3 RHS, <1,4,4,5>
+  2639007030U, // <7,1,4,5>: Cost 3 vext2 <3,5,7,1>, RHS
+  2645642634U, // <7,1,4,6>: Cost 3 vext2 <4,6,7,1>, <4,6,7,1>
+  3719384520U, // <7,1,4,7>: Cost 4 vext2 <4,6,7,1>, <4,7,5,0>
+  2639007273U, // <7,1,4,u>: Cost 3 vext2 <3,5,7,1>, RHS
+  2572812390U, // <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS
+  2693776510U, // <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7>
+  3774301318U, // <7,1,5,2>: Cost 4 vext3 <2,6,3,7>, <1,5,2,6>
+  1620182160U, // <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7>
+  2572815670U, // <7,1,5,4>: Cost 3 vext1 <3,7,1,5>, RHS
+  3766486178U, // <7,1,5,5>: Cost 4 vext3 <1,3,5,7>, <1,5,5,7>
+  2651615331U, // <7,1,5,6>: Cost 3 vext2 <5,6,7,1>, <5,6,7,1>
+  2652278964U, // <7,1,5,7>: Cost 3 vext2 <5,7,7,1>, <5,7,7,1>
+  1620550845U, // <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7>
+  3768108230U, // <7,1,6,0>: Cost 4 vext3 <1,6,0,7>, <1,6,0,7>
+  2694440143U, // <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7>
+  2712061144U, // <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
+  2694587617U, // <7,1,6,3>: Cost 3 vext3 <1,6,3,7>, <1,6,3,7>
+  3768403178U, // <7,1,6,4>: Cost 4 vext3 <1,6,4,7>, <1,6,4,7>
+  2694735091U, // <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7>
+  3768550652U, // <7,1,6,6>: Cost 4 vext3 <1,6,6,7>, <1,6,6,7>
+  2652279630U, // <7,1,6,7>: Cost 3 vext2 <5,7,7,1>, <6,7,0,1>
+  2694956302U, // <7,1,6,u>: Cost 3 vext3 <1,6,u,7>, <1,6,u,7>
+  2645644282U, // <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2>
+  2859062094U, // <7,1,7,1>: Cost 3 vuzpr <6,7,0,1>, <6,7,0,1>
+  3779462437U, // <7,1,7,2>: Cost 4 vext3 <3,5,1,7>, <1,7,2,3>
+  3121938534U, // <7,1,7,3>: Cost 3 vtrnr <5,7,5,7>, LHS
+  2554916150U, // <7,1,7,4>: Cost 3 vext1 <0,7,1,7>, RHS
+  3769140548U, // <7,1,7,5>: Cost 4 vext3 <1,7,5,7>, <1,7,5,7>
+  3726022164U, // <7,1,7,6>: Cost 4 vext2 <5,7,7,1>, <7,6,7,0>
+  2554918508U, // <7,1,7,7>: Cost 3 vext1 <0,7,1,7>, <7,7,7,7>
+  3121938539U, // <7,1,7,u>: Cost 3 vtrnr <5,7,5,7>, LHS
+  2572836966U, // <7,1,u,0>: Cost 3 vext1 <3,7,1,u>, LHS
+  1638319469U, // <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3>
+  2712061299U, // <7,1,u,2>: Cost 3 vext3 RHS, <1,u,2,0>
+  1622173059U, // <7,1,u,3>: Cost 2 vext3 <1,u,3,7>, <1,u,3,7>
+  2572840246U, // <7,1,u,4>: Cost 3 vext1 <3,7,1,u>, RHS
+  1622320533U, // <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7>
+  2696136094U, // <7,1,u,6>: Cost 3 vext3 <1,u,6,7>, <1,u,6,7>
+  2859060777U, // <7,1,u,7>: Cost 3 vuzpr <6,7,0,1>, RHS
+  1622541744U, // <7,1,u,u>: Cost 2 vext3 <1,u,u,7>, <1,u,u,7>
+  2712061364U, // <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2>
+  2712061373U, // <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2>
+  2712061380U, // <7,2,0,2>: Cost 3 vext3 RHS, <2,0,2,0>
+  2712061389U, // <7,2,0,3>: Cost 3 vext3 RHS, <2,0,3,0>
+  2712061404U, // <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,6>
+  2696725990U, // <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7>
+  2712061417U, // <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1>
+  3785803251U, // <7,2,0,7>: Cost 4 vext3 RHS, <2,0,7,2>
+  2696947201U, // <7,2,0,u>: Cost 3 vext3 <2,0,u,7>, <2,0,u,7>
+  2712061446U, // <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3>
+  3785803276U, // <7,2,1,1>: Cost 4 vext3 RHS, <2,1,1,0>
+  3785803285U, // <7,2,1,2>: Cost 4 vext3 RHS, <2,1,2,0>
+  2712061471U, // <7,2,1,3>: Cost 3 vext3 RHS, <2,1,3,1>
+  2712061482U, // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3>
+  3766486576U, // <7,2,1,5>: Cost 4 vext3 <1,3,5,7>, <2,1,5,0>
+  2712061500U, // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3>
+  2602718850U, // <7,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
+  2712061516U, // <7,2,1,u>: Cost 3 vext3 RHS, <2,1,u,1>
+  2712061525U, // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,1>
+  2712061536U, // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3>
+  1638319720U, // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
+  1638319730U, // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3>
+  2712061565U, // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,5>
+  2698053256U, // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7>
+  2712061584U, // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,6>
+  3771795096U, // <7,2,2,7>: Cost 4 vext3 <2,2,5,7>, <2,2,7,5>
+  1638319775U, // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3>
+  1638319782U, // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1>
+  2693924531U, // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5>
+  2700560061U, // <7,2,3,2>: Cost 3 vext3 <2,6,3,7>, <2,3,2,6>
+  2693924551U, // <7,2,3,3>: Cost 3 vext3 <1,5,3,7>, <2,3,3,7>
+  1638319822U, // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5>
+  2698716889U, // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7>
+  2712061665U, // <7,2,3,6>: Cost 3 vext3 RHS, <2,3,6,6>
+  2735949540U, // <7,2,3,7>: Cost 3 vext3 RHS, <2,3,7,0>
+  1638319854U, // <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1>
+  2712061692U, // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,6>
+  2712061698U, // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3>
+  2712061708U, // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4>
+  2712061718U, // <7,2,4,3>: Cost 3 vext3 RHS, <2,4,3,5>
+  2712061728U, // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6>
+  2699380522U, // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7>
+  2712061740U, // <7,2,4,6>: Cost 3 vext3 RHS, <2,4,6,0>
+  3809691445U, // <7,2,4,7>: Cost 4 vext3 RHS, <2,4,7,0>
+  2699601733U, // <7,2,4,u>: Cost 3 vext3 <2,4,u,7>, <2,4,u,7>
+  2699675470U, // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7>
+  3766486867U, // <7,2,5,1>: Cost 4 vext3 <1,3,5,7>, <2,5,1,3>
+  2699822944U, // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7>
+  2692745065U, // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7>
+  2699970418U, // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7>
+  3766486907U, // <7,2,5,5>: Cost 4 vext3 <1,3,5,7>, <2,5,5,7>
+  2700117892U, // <7,2,5,6>: Cost 3 vext3 <2,5,6,7>, <2,5,6,7>
+  3771795334U, // <7,2,5,7>: Cost 4 vext3 <2,2,5,7>, <2,5,7,0>
+  2692745110U, // <7,2,5,u>: Cost 3 vext3 <1,3,5,7>, <2,5,u,7>
+  2572894310U, // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS
+  2712061860U, // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3>
+  2700486577U, // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7>
+  1626818490U, // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7>
+  2572897590U, // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS
+  2700707788U, // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7>
+  2700781525U, // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7>
+  3774597086U, // <7,2,6,7>: Cost 4 vext3 <2,6,7,7>, <2,6,7,7>
+  1627187175U, // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7>
+  2735949802U, // <7,2,7,0>: Cost 3 vext3 RHS, <2,7,0,1>
+  3780200434U, // <7,2,7,1>: Cost 4 vext3 <3,6,2,7>, <2,7,1,0>
+  3773564928U, // <7,2,7,2>: Cost 4 vext3 <2,5,2,7>, <2,7,2,5>
+  2986541158U, // <7,2,7,3>: Cost 3 vzipr <5,5,7,7>, LHS
+  2554989878U, // <7,2,7,4>: Cost 3 vext1 <0,7,2,7>, RHS
+  3775113245U, // <7,2,7,5>: Cost 4 vext3 <2,7,5,7>, <2,7,5,7>
+  4060283228U, // <7,2,7,6>: Cost 4 vzipr <5,5,7,7>, <0,4,2,6>
+  2554992236U, // <7,2,7,7>: Cost 3 vext1 <0,7,2,7>, <7,7,7,7>
+  2986541163U, // <7,2,7,u>: Cost 3 vzipr <5,5,7,7>, LHS
+  1638320187U, // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1>
+  2693924936U, // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5>
+  1638319720U, // <7,2,u,2>: Cost 2 vext3 RHS, <2,2,2,2>
+  1628145756U, // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7>
+  1638320227U, // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5>
+  2702035054U, // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7>
+  2702108791U, // <7,2,u,6>: Cost 3 vext3 <2,u,6,7>, <2,u,6,7>
+  2735949945U, // <7,2,u,7>: Cost 3 vext3 RHS, <2,u,7,0>
+  1628514441U, // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7>
+  2712062091U, // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0>
+  1638320278U, // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2>
+  2712062109U, // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0>
+  2590836886U, // <7,3,0,3>: Cost 3 vext1 <6,7,3,0>, <3,0,1,2>
+  2712062128U, // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1>
+  2712062138U, // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2>
+  2590839656U, // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0>
+  3311414017U, // <7,3,0,7>: Cost 4 vrev <3,7,7,0>
+  1638320341U, // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2>
+  2237164227U, // <7,3,1,0>: Cost 3 vrev <3,7,0,1>
+  2712062182U, // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1>
+  2712062193U, // <7,3,1,2>: Cost 3 vext3 RHS, <3,1,2,3>
+  2692745468U, // <7,3,1,3>: Cost 3 vext3 <1,3,5,7>, <3,1,3,5>
+  2712062214U, // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6>
+  2693925132U, // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3>
+  3768183059U, // <7,3,1,6>: Cost 4 vext3 <1,6,1,7>, <3,1,6,1>
+  2692745504U, // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5>
+  2696063273U, // <7,3,1,u>: Cost 3 vext3 <1,u,5,7>, <3,1,u,5>
+  2712062254U, // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1>
+  2712062262U, // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0>
+  2712062273U, // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2>
+  2712062280U, // <7,3,2,3>: Cost 3 vext3 RHS, <3,2,3,0>
+  2712062294U, // <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,5>
+  2712062302U, // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4>
+  2700560742U, // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3>
+  2712062319U, // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3>
+  2712062325U, // <7,3,2,u>: Cost 3 vext3 RHS, <3,2,u,0>
+  2712062335U, // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,1>
+  2636368158U, // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3>
+  2637031791U, // <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3>
+  1638320540U, // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
+  2712062374U, // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4>
+  2704689586U, // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7>
+  2590864235U, // <7,3,3,6>: Cost 3 vext1 <6,7,3,3>, <6,7,3,3>
+  2704837060U, // <7,3,3,7>: Cost 3 vext3 <3,3,7,7>, <3,3,7,7>
+  1638320540U, // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3>
+  2712062416U, // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1>
+  2712062426U, // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2>
+  2566981640U, // <7,3,4,2>: Cost 3 vext1 <2,7,3,4>, <2,7,3,4>
+  2712062447U, // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5>
+  2712062456U, // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,5>
+  1638320642U, // <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6>
+  2648313204U, // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,4,6>
+  3311446789U, // <7,3,4,7>: Cost 4 vrev <3,7,7,4>
+  1638320669U, // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6>
+  2602819686U, // <7,3,5,0>: Cost 3 vext1 <u,7,3,5>, LHS
+  1574571728U, // <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3>
+  2648977185U, // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3>
+  2705869378U, // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7>
+  2237491947U, // <7,3,5,4>: Cost 3 vrev <3,7,4,5>
+  2706016852U, // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7>
+  2648313954U, // <7,3,5,6>: Cost 3 vext2 <5,1,7,3>, <5,6,7,0>
+  2692745823U, // <7,3,5,7>: Cost 3 vext3 <1,3,5,7>, <3,5,7,0>
+  1579217159U, // <7,3,5,u>: Cost 2 vext2 <5,u,7,3>, <5,u,7,3>
+  2706311800U, // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7>
+  2654286249U, // <7,3,6,1>: Cost 3 vext2 <6,1,7,3>, <6,1,7,3>
+  1581208058U, // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3>
+  2706533011U, // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7>
+  2706606748U, // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7>
+  3780422309U, // <7,3,6,5>: Cost 4 vext3 <3,6,5,7>, <3,6,5,7>
+  2712062637U, // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6>
+  2706827959U, // <7,3,6,7>: Cost 3 vext3 <3,6,7,7>, <3,6,7,7>
+  1585189856U, // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3>
+  2693925571U, // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1>
+  2693925584U, // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5>
+  2700561114U, // <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6>
+  2572978916U, // <7,3,7,3>: Cost 3 vext1 <3,7,3,7>, <3,7,3,7>
+  2693925611U, // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5>
+  2707344118U, // <7,3,7,5>: Cost 3 vext3 <3,7,5,7>, <3,7,5,7>
+  2654950894U, // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7>
+  2648315500U, // <7,3,7,7>: Cost 3 vext2 <5,1,7,3>, <7,7,7,7>
+  2693925643U, // <7,3,7,u>: Cost 3 vext3 <1,5,3,7>, <3,7,u,1>
+  2237221578U, // <7,3,u,0>: Cost 3 vrev <3,7,0,u>
+  1638320926U, // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2>
+  1593153452U, // <7,3,u,2>: Cost 2 vext2 <u,2,7,3>, <u,2,7,3>
+  1638320540U, // <7,3,u,3>: Cost 2 vext3 RHS, <3,3,3,3>
+  2237516526U, // <7,3,u,4>: Cost 3 vrev <3,7,4,u>
+  1638320966U, // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6>
+  2712062796U, // <7,3,u,6>: Cost 3 vext3 RHS, <3,u,6,3>
+  2692967250U, // <7,3,u,7>: Cost 3 vext3 <1,3,u,7>, <3,u,7,0>
+  1638320989U, // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2>
+  2651635712U, // <7,4,0,0>: Cost 3 vext2 <5,6,7,4>, <0,0,0,0>
+  1577893990U, // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS
+  2651635876U, // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2>
+  3785804672U, // <7,4,0,3>: Cost 4 vext3 RHS, <4,0,3,1>
+  2651636050U, // <7,4,0,4>: Cost 3 vext2 <5,6,7,4>, <0,4,1,5>
+  1638468498U, // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
+  1638468508U, // <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
+  3787795364U, // <7,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
+  1640459181U, // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,1>
+  2651636470U, // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2>
+  2651636532U, // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1>
+  2712062922U, // <7,4,1,2>: Cost 3 vext3 RHS, <4,1,2,3>
+  2639029248U, // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7>
+  2712062940U, // <7,4,1,4>: Cost 3 vext3 RHS, <4,1,4,3>
+  2712062946U, // <7,4,1,5>: Cost 3 vext3 RHS, <4,1,5,0>
+  2712062958U, // <7,4,1,6>: Cost 3 vext3 RHS, <4,1,6,3>
+  3785804791U, // <7,4,1,7>: Cost 4 vext3 RHS, <4,1,7,3>
+  2712062973U, // <7,4,1,u>: Cost 3 vext3 RHS, <4,1,u,0>
+  3785804807U, // <7,4,2,0>: Cost 4 vext3 RHS, <4,2,0,1>
+  3785804818U, // <7,4,2,1>: Cost 4 vext3 RHS, <4,2,1,3>
+  2651637352U, // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2>
+  2651637414U, // <7,4,2,3>: Cost 3 vext2 <5,6,7,4>, <2,3,0,1>
+  3716753194U, // <7,4,2,4>: Cost 4 vext2 <4,2,7,4>, <2,4,5,7>
+  2712063030U, // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
+  2712063036U, // <7,4,2,6>: Cost 3 vext3 RHS, <4,2,6,0>
+  3773123658U, // <7,4,2,7>: Cost 4 vext3 <2,4,5,7>, <4,2,7,5>
+  2712063054U, // <7,4,2,u>: Cost 3 vext3 RHS, <4,2,u,0>
+  2651637910U, // <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2>
+  3712772348U, // <7,4,3,1>: Cost 4 vext2 <3,5,7,4>, <3,1,3,5>
+  3785804906U, // <7,4,3,2>: Cost 4 vext3 RHS, <4,3,2,1>
+  2651638172U, // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3>
+  2651638274U, // <7,4,3,4>: Cost 3 vext2 <5,6,7,4>, <3,4,5,6>
+  2639030883U, // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4>
+  2712063122U, // <7,4,3,6>: Cost 3 vext3 RHS, <4,3,6,5>
+  3712772836U, // <7,4,3,7>: Cost 4 vext2 <3,5,7,4>, <3,7,3,7>
+  2641021782U, // <7,4,3,u>: Cost 3 vext2 <3,u,7,4>, <3,u,7,4>
+  2714053802U, // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,2>
+  3785804978U, // <7,4,4,1>: Cost 4 vext3 RHS, <4,4,1,1>
+  3716754505U, // <7,4,4,2>: Cost 4 vext2 <4,2,7,4>, <4,2,7,4>
+  3785804998U, // <7,4,4,3>: Cost 4 vext3 RHS, <4,4,3,3>
+  1638321360U, // <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
+  1638468826U, // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5>
+  1638468836U, // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
+  3785215214U, // <7,4,4,7>: Cost 4 vext3 <4,4,7,7>, <4,4,7,7>
+  1640459509U, // <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,5>
+  1517207654U, // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS
+  2573034640U, // <7,4,5,1>: Cost 3 vext1 <3,7,4,5>, <1,5,3,7>
+  2712063246U, // <7,4,5,2>: Cost 3 vext3 RHS, <4,5,2,3>
+  2573036267U, // <7,4,5,3>: Cost 3 vext1 <3,7,4,5>, <3,7,4,5>
+  1517210934U, // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS
+  2711989549U, // <7,4,5,5>: Cost 3 vext3 <4,5,5,7>, <4,5,5,7>
+  564579638U, // <7,4,5,6>: Cost 1 vext3 RHS, RHS
+  2651639976U, // <7,4,5,7>: Cost 3 vext2 <5,6,7,4>, <5,7,5,7>
+  564579656U, // <7,4,5,u>: Cost 1 vext3 RHS, RHS
+  2712063307U, // <7,4,6,0>: Cost 3 vext3 RHS, <4,6,0,1>
+  3767668056U, // <7,4,6,1>: Cost 4 vext3 <1,5,3,7>, <4,6,1,5>
+  2651640314U, // <7,4,6,2>: Cost 3 vext2 <5,6,7,4>, <6,2,7,3>
+  2655621708U, // <7,4,6,3>: Cost 3 vext2 <6,3,7,4>, <6,3,7,4>
+  1638468980U, // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
+  2712063358U, // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7>
+  2712063367U, // <7,4,6,6>: Cost 3 vext3 RHS, <4,6,6,7>
+  2712210826U, // <7,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
+  1638469012U, // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2>
+  2651640826U, // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2>
+  3773713830U, // <7,4,7,1>: Cost 4 vext3 <2,5,4,7>, <4,7,1,2>
+  3773713842U, // <7,4,7,2>: Cost 4 vext3 <2,5,4,7>, <4,7,2,5>
+  3780349372U, // <7,4,7,3>: Cost 4 vext3 <3,6,4,7>, <4,7,3,6>
+  2651641140U, // <7,4,7,4>: Cost 3 vext2 <5,6,7,4>, <7,4,0,1>
+  2712210888U, // <7,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
+  2712210898U, // <7,4,7,6>: Cost 3 vext3 RHS, <4,7,6,1>
+  2651641452U, // <7,4,7,7>: Cost 3 vext2 <5,6,7,4>, <7,7,7,7>
+  2713538026U, // <7,4,7,u>: Cost 3 vext3 <4,7,u,7>, <4,7,u,7>
+  1517232230U, // <7,4,u,0>: Cost 2 vext1 <6,7,4,u>, LHS
+  1577899822U, // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS
+  2712063489U, // <7,4,u,2>: Cost 3 vext3 RHS, <4,u,2,3>
+  2573060846U, // <7,4,u,3>: Cost 3 vext1 <3,7,4,u>, <3,7,4,u>
+  1640312342U, // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6>
+  1638469146U, // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1>
+  564579881U, // <7,4,u,6>: Cost 1 vext3 RHS, RHS
+  2714054192U, // <7,4,u,7>: Cost 3 vext3 RHS, <4,u,7,5>
+  564579899U, // <7,4,u,u>: Cost 1 vext3 RHS, RHS
+  2579038310U, // <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS
+  2636382310U, // <7,5,0,1>: Cost 3 vext2 <3,1,7,5>, LHS
+  2796339302U, // <7,5,0,2>: Cost 3 vuzpl <7,4,5,6>, LHS
+  3646810719U, // <7,5,0,3>: Cost 4 vext1 <3,7,5,0>, <3,5,7,0>
+  2712063586U, // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1>
+  2735951467U, // <7,5,0,5>: Cost 3 vext3 RHS, <5,0,5,1>
+  2735951476U, // <7,5,0,6>: Cost 3 vext3 RHS, <5,0,6,1>
+  2579043322U, // <7,5,0,7>: Cost 3 vext1 <4,7,5,0>, <7,0,1,2>
+  2636382877U, // <7,5,0,u>: Cost 3 vext2 <3,1,7,5>, LHS
+  2712211087U, // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1>
+  3698180916U, // <7,5,1,1>: Cost 4 vext2 <1,1,7,5>, <1,1,1,1>
+  3710124950U, // <7,5,1,2>: Cost 4 vext2 <3,1,7,5>, <1,2,3,0>
+  2636383232U, // <7,5,1,3>: Cost 3 vext2 <3,1,7,5>, <1,3,5,7>
+  2712211127U, // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5>
+  2590994128U, // <7,5,1,5>: Cost 3 vext1 <6,7,5,1>, <5,1,7,3>
+  2590995323U, // <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1>
+  1638469328U, // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
+  1638469337U, // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
+  3785805536U, // <7,5,2,0>: Cost 4 vext3 RHS, <5,2,0,1>
+  3785805544U, // <7,5,2,1>: Cost 4 vext3 RHS, <5,2,1,0>
+  3704817288U, // <7,5,2,2>: Cost 4 vext2 <2,2,7,5>, <2,2,5,7>
+  2712063742U, // <7,5,2,3>: Cost 3 vext3 RHS, <5,2,3,4>
+  3716761386U, // <7,5,2,4>: Cost 4 vext2 <4,2,7,5>, <2,4,5,7>
+  2714054415U, // <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
+  3774304024U, // <7,5,2,6>: Cost 4 vext3 <2,6,3,7>, <5,2,6,3>
+  2712063777U, // <7,5,2,7>: Cost 3 vext3 RHS, <5,2,7,3>
+  2712063787U, // <7,5,2,u>: Cost 3 vext3 RHS, <5,2,u,4>
+  3634888806U, // <7,5,3,0>: Cost 4 vext1 <1,7,5,3>, LHS
+  2636384544U, // <7,5,3,1>: Cost 3 vext2 <3,1,7,5>, <3,1,7,5>
+  3710790001U, // <7,5,3,2>: Cost 4 vext2 <3,2,7,5>, <3,2,7,5>
+  3710126492U, // <7,5,3,3>: Cost 4 vext2 <3,1,7,5>, <3,3,3,3>
+  3634892086U, // <7,5,3,4>: Cost 4 vext1 <1,7,5,3>, RHS
+  2639039076U, // <7,5,3,5>: Cost 3 vext2 <3,5,7,5>, <3,5,7,5>
+  3713444533U, // <7,5,3,6>: Cost 4 vext2 <3,6,7,5>, <3,6,7,5>
+  2693926767U, // <7,5,3,7>: Cost 3 vext3 <1,5,3,7>, <5,3,7,0>
+  2712063864U, // <7,5,3,u>: Cost 3 vext3 RHS, <5,3,u,0>
+  2579071078U, // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS
+  3646841856U, // <7,5,4,1>: Cost 4 vext1 <3,7,5,4>, <1,3,5,7>
+  3716762698U, // <7,5,4,2>: Cost 4 vext2 <4,2,7,5>, <4,2,7,5>
+  3646843491U, // <7,5,4,3>: Cost 4 vext1 <3,7,5,4>, <3,5,7,4>
+  2579074358U, // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, RHS
+  2636385590U, // <7,5,4,5>: Cost 3 vext2 <3,1,7,5>, RHS
+  2645675406U, // <7,5,4,6>: Cost 3 vext2 <4,6,7,5>, <4,6,7,5>
+  1638322118U, // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
+  1638469583U, // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6>
+  2714054611U, // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1>
+  2652974800U, // <7,5,5,1>: Cost 3 vext2 <5,u,7,5>, <5,1,7,3>
+  3710127905U, // <7,5,5,2>: Cost 4 vext2 <3,1,7,5>, <5,2,7,3>
+  3785805808U, // <7,5,5,3>: Cost 4 vext3 RHS, <5,5,3,3>
+  2712211450U, // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,4>
+  1638322180U, // <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5>
+  2712064014U, // <7,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
+  1638469656U, // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
+  1638469665U, // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7>
+  2712064036U, // <7,5,6,0>: Cost 3 vext3 RHS, <5,6,0,1>
+  2714054707U, // <7,5,6,1>: Cost 3 vext3 RHS, <5,6,1,7>
+  3785805879U, // <7,5,6,2>: Cost 4 vext3 RHS, <5,6,2,2>
+  2712064066U, // <7,5,6,3>: Cost 3 vext3 RHS, <5,6,3,4>
+  2712064076U, // <7,5,6,4>: Cost 3 vext3 RHS, <5,6,4,5>
+  2714054743U, // <7,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
+  2712064096U, // <7,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
+  1638322274U, // <7,5,6,7>: Cost 2 vext3 RHS, <5,6,7,0>
+  1638469739U, // <7,5,6,u>: Cost 2 vext3 RHS, <5,6,u,0>
+  1511325798U, // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS
+  2692747392U, // <7,5,7,1>: Cost 3 vext3 <1,3,5,7>, <5,7,1,3>
+  2585069160U, // <7,5,7,2>: Cost 3 vext1 <5,7,5,7>, <2,2,2,2>
+  2573126390U, // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7>
+  1511329078U, // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS
+  1638469800U, // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
+  2712211626U, // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
+  2712211636U, // <7,5,7,7>: Cost 3 vext3 RHS, <5,7,7,1>
+  1638469823U, // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3>
+  1511333990U, // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS
+  2636388142U, // <7,5,u,1>: Cost 3 vext2 <3,1,7,5>, LHS
+  2712211671U, // <7,5,u,2>: Cost 3 vext3 RHS, <5,u,2,0>
+  2573134583U, // <7,5,u,3>: Cost 3 vext1 <3,7,5,u>, <3,7,5,u>
+  1511337270U, // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS
+  1638469881U, // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7>
+  2712064258U, // <7,5,u,6>: Cost 3 vext3 RHS, <5,u,6,7>
+  1638469892U, // <7,5,u,7>: Cost 2 vext3 RHS, <5,u,7,0>
+  1638469904U, // <7,5,u,u>: Cost 2 vext3 RHS, <5,u,u,3>
+  2650324992U, // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0>
+  1576583270U, // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS
+  2712064300U, // <7,6,0,2>: Cost 3 vext3 RHS, <6,0,2,4>
+  2255295336U, // <7,6,0,3>: Cost 3 vrev <6,7,3,0>
+  2712064316U, // <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2>
+  2585088098U, // <7,6,0,5>: Cost 3 vext1 <5,7,6,0>, <5,6,7,0>
+  2735952204U, // <7,6,0,6>: Cost 3 vext3 RHS, <6,0,6,0>
+  2712211799U, // <7,6,0,7>: Cost 3 vext3 RHS, <6,0,7,2>
+  1576583837U, // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS
+  1181340494U, // <7,6,1,0>: Cost 2 vrev <6,7,0,1>
+  2650325812U, // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1>
+  2650325910U, // <7,6,1,2>: Cost 3 vext2 <5,4,7,6>, <1,2,3,0>
+  2650325976U, // <7,6,1,3>: Cost 3 vext2 <5,4,7,6>, <1,3,1,3>
+  2579123510U, // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, RHS
+  2650326160U, // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7>
+  2714055072U, // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
+  2712064425U, // <7,6,1,7>: Cost 3 vext3 RHS, <6,1,7,3>
+  1181930390U, // <7,6,1,u>: Cost 2 vrev <6,7,u,1>
+  2712211897U, // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1>
+  2714055108U, // <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3>
+  2650326632U, // <7,6,2,2>: Cost 3 vext2 <5,4,7,6>, <2,2,2,2>
+  2650326694U, // <7,6,2,3>: Cost 3 vext2 <5,4,7,6>, <2,3,0,1>
+  2714055137U, // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5>
+  2714055148U, // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7>
+  2650326970U, // <7,6,2,6>: Cost 3 vext2 <5,4,7,6>, <2,6,3,7>
+  1638470138U, // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
+  1638470147U, // <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
+  2650327190U, // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2>
+  2255172441U, // <7,6,3,1>: Cost 3 vrev <6,7,1,3>
+  2255246178U, // <7,6,3,2>: Cost 3 vrev <6,7,2,3>
+  2650327452U, // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3>
+  2712064562U, // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5>
+  2650327627U, // <7,6,3,5>: Cost 3 vext2 <5,4,7,6>, <3,5,4,7>
+  3713452726U, // <7,6,3,6>: Cost 4 vext2 <3,6,7,6>, <3,6,7,6>
+  2700563016U, // <7,6,3,7>: Cost 3 vext3 <2,6,3,7>, <6,3,7,0>
+  2712064593U, // <7,6,3,u>: Cost 3 vext3 RHS, <6,3,u,0>
+  2650327954U, // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1>
+  2735952486U, // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3>
+  2735952497U, // <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,5>
+  2255328108U, // <7,6,4,3>: Cost 3 vrev <6,7,3,4>
+  2712212100U, // <7,6,4,4>: Cost 3 vext3 RHS, <6,4,4,6>
+  1576586550U, // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS
+  2714055312U, // <7,6,4,6>: Cost 3 vext3 RHS, <6,4,6,0>
+  2712212126U, // <7,6,4,7>: Cost 3 vext3 RHS, <6,4,7,5>
+  1576586793U, // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS
+  2579152998U, // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS
+  2650328784U, // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3>
+  2714055364U, // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7>
+  3785806538U, // <7,6,5,3>: Cost 4 vext3 RHS, <6,5,3,4>
+  1576587206U, // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6>
+  2650329092U, // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5>
+  2650329186U, // <7,6,5,6>: Cost 3 vext2 <5,4,7,6>, <5,6,7,0>
+  2712064753U, // <7,6,5,7>: Cost 3 vext3 RHS, <6,5,7,7>
+  1181963162U, // <7,6,5,u>: Cost 2 vrev <6,7,u,5>
+  2714055421U, // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1>
+  2714055432U, // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3>
+  2650329594U, // <7,6,6,2>: Cost 3 vext2 <5,4,7,6>, <6,2,7,3>
+  3785806619U, // <7,6,6,3>: Cost 4 vext3 RHS, <6,6,3,4>
+  2712212260U, // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,4>
+  2714055472U, // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7>
+  1638323000U, // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6>
+  1638470466U, // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
+  1638470475U, // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7>
+  1638323022U, // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1>
+  2712064854U, // <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0>
+  2712064865U, // <7,6,7,2>: Cost 3 vext3 RHS, <6,7,2,2>
+  2712064872U, // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0>
+  1638323062U, // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5>
+  2712064894U, // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4>
+  2712064905U, // <7,6,7,6>: Cost 3 vext3 RHS, <6,7,6,6>
+  2712064915U, // <7,6,7,7>: Cost 3 vext3 RHS, <6,7,7,7>
+  1638323094U, // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1>
+  1638470559U, // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1>
+  1576589102U, // <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS
+  2712212402U, // <7,6,u,2>: Cost 3 vext3 RHS, <6,u,2,2>
+  2712212409U, // <7,6,u,3>: Cost 3 vext3 RHS, <6,u,3,0>
+  1638470599U, // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5>
+  1576589466U, // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS
+  1638323000U, // <7,6,u,6>: Cost 2 vext3 RHS, <6,6,6,6>
+  1638470624U, // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3>
+  1638470631U, // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1>
+  2712065007U, // <7,7,0,0>: Cost 3 vext3 RHS, <7,0,0,0>
+  1638323194U, // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2>
+  2712065025U, // <7,7,0,2>: Cost 3 vext3 RHS, <7,0,2,0>
+  3646958337U, // <7,7,0,3>: Cost 4 vext1 <3,7,7,0>, <3,7,7,0>
+  2712065044U, // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1>
+  2585161907U, // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0>
+  2591134604U, // <7,7,0,6>: Cost 3 vext1 <6,7,7,0>, <6,7,7,0>
+  2591134714U, // <7,7,0,7>: Cost 3 vext1 <6,7,7,0>, <7,0,1,2>
+  1638323257U, // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2>
+  2712065091U, // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3>
+  2712065098U, // <7,7,1,1>: Cost 3 vext3 RHS, <7,1,1,1>
+  2712065109U, // <7,7,1,2>: Cost 3 vext3 RHS, <7,1,2,3>
+  2692748384U, // <7,7,1,3>: Cost 3 vext3 <1,3,5,7>, <7,1,3,5>
+  2585169206U, // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS
+  2693928048U, // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3>
+  2585170766U, // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1>
+  2735953024U, // <7,7,1,7>: Cost 3 vext3 RHS, <7,1,7,1>
+  2695918731U, // <7,7,1,u>: Cost 3 vext3 <1,u,3,7>, <7,1,u,3>
+  3770471574U, // <7,7,2,0>: Cost 4 vext3 <2,0,5,7>, <7,2,0,5>
+  3785807002U, // <7,7,2,1>: Cost 4 vext3 RHS, <7,2,1,0>
+  2712065189U, // <7,7,2,2>: Cost 3 vext3 RHS, <7,2,2,2>
+  2712065196U, // <7,7,2,3>: Cost 3 vext3 RHS, <7,2,3,0>
+  3773125818U, // <7,7,2,4>: Cost 4 vext3 <2,4,5,7>, <7,2,4,5>
+  3766490305U, // <7,7,2,5>: Cost 4 vext3 <1,3,5,7>, <7,2,5,3>
+  2700563658U, // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3>
+  2735953107U, // <7,7,2,7>: Cost 3 vext3 RHS, <7,2,7,3>
+  2701890780U, // <7,7,2,u>: Cost 3 vext3 <2,u,3,7>, <7,2,u,3>
+  2712065251U, // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1>
+  3766490350U, // <7,7,3,1>: Cost 4 vext3 <1,3,5,7>, <7,3,1,3>
+  3774305530U, // <7,7,3,2>: Cost 4 vext3 <2,6,3,7>, <7,3,2,6>
+  2637728196U, // <7,7,3,3>: Cost 3 vext2 <3,3,7,7>, <3,3,7,7>
+  2712065291U, // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5>
+  2585186486U, // <7,7,3,5>: Cost 3 vext1 <5,7,7,3>, <5,7,7,3>
+  2639719095U, // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7>
+  2640382728U, // <7,7,3,7>: Cost 3 vext2 <3,7,7,7>, <3,7,7,7>
+  2641046361U, // <7,7,3,u>: Cost 3 vext2 <3,u,7,7>, <3,u,7,7>
+  2712212792U, // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5>
+  3646989312U, // <7,7,4,1>: Cost 4 vext1 <3,7,7,4>, <1,3,5,7>
+  3785807176U, // <7,7,4,2>: Cost 4 vext3 RHS, <7,4,2,3>
+  3646991109U, // <7,7,4,3>: Cost 4 vext1 <3,7,7,4>, <3,7,7,4>
+  2712065371U, // <7,7,4,4>: Cost 3 vext3 RHS, <7,4,4,4>
+  1638323558U, // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6>
+  2712212845U, // <7,7,4,6>: Cost 3 vext3 RHS, <7,4,6,4>
+  2591167846U, // <7,7,4,7>: Cost 3 vext1 <6,7,7,4>, <7,4,5,6>
+  1638323585U, // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6>
+  2585198694U, // <7,7,5,0>: Cost 3 vext1 <5,7,7,5>, LHS
+  2712212884U, // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7>
+  3711471393U, // <7,7,5,2>: Cost 4 vext2 <3,3,7,7>, <5,2,7,3>
+  2649673590U, // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7>
+  2712065455U, // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7>
+  1577259032U, // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7>
+  2712065473U, // <7,7,5,6>: Cost 3 vext3 RHS, <7,5,6,7>
+  2712212936U, // <7,7,5,7>: Cost 3 vext3 RHS, <7,5,7,5>
+  1579249931U, // <7,7,5,u>: Cost 2 vext2 <5,u,7,7>, <5,u,7,7>
+  2591178854U, // <7,7,6,0>: Cost 3 vext1 <6,7,7,6>, LHS
+  2735953374U, // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0>
+  2712212974U, // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7>
+  2655646287U, // <7,7,6,3>: Cost 3 vext2 <6,3,7,7>, <6,3,7,7>
+  2591182134U, // <7,7,6,4>: Cost 3 vext1 <6,7,7,6>, RHS
+  2656973553U, // <7,7,6,5>: Cost 3 vext2 <6,5,7,7>, <6,5,7,7>
+  1583895362U, // <7,7,6,6>: Cost 2 vext2 <6,6,7,7>, <6,6,7,7>
+  2712065556U, // <7,7,6,7>: Cost 3 vext3 RHS, <7,6,7,0>
+  1585222628U, // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7>
+  1523417190U, // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS
+  2597159670U, // <7,7,7,1>: Cost 3 vext1 <7,7,7,7>, <1,0,3,2>
+  2597160552U, // <7,7,7,2>: Cost 3 vext1 <7,7,7,7>, <2,2,2,2>
+  2597161110U, // <7,7,7,3>: Cost 3 vext1 <7,7,7,7>, <3,0,1,2>
+  1523420470U, // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS
+  2651002296U, // <7,7,7,5>: Cost 3 vext2 <5,5,7,7>, <7,5,5,7>
+  2657637906U, // <7,7,7,6>: Cost 3 vext2 <6,6,7,7>, <7,6,6,7>
+  363253046U, // <7,7,7,7>: Cost 1 vdup3 RHS
+  363253046U, // <7,7,7,u>: Cost 1 vdup3 RHS
+  1523417190U, // <7,7,u,0>: Cost 2 vext1 <7,7,7,7>, LHS
+  1638471298U, // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2>
+  2712213132U, // <7,7,u,2>: Cost 3 vext3 RHS, <7,u,2,3>
+  2712213138U, // <7,7,u,3>: Cost 3 vext3 RHS, <7,u,3,0>
+  1523420470U, // <7,7,u,4>: Cost 2 vext1 <7,7,7,7>, RHS
+  1638471338U, // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6>
+  1595840756U, // <7,7,u,6>: Cost 2 vext2 <u,6,7,7>, <u,6,7,7>
+  363253046U, // <7,7,u,7>: Cost 1 vdup3 RHS
+  363253046U, // <7,7,u,u>: Cost 1 vdup3 RHS
+  1638318080U, // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+  1638323923U, // <7,u,0,1>: Cost 2 vext3 RHS, <u,0,1,2>
+  1662211804U, // <7,u,0,2>: Cost 2 vext3 RHS, <u,0,2,2>
+  1638323941U, // <7,u,0,3>: Cost 2 vext3 RHS, <u,0,3,2>
+  2712065773U, // <7,u,0,4>: Cost 3 vext3 RHS, <u,0,4,1>
+  1662359286U, // <7,u,0,5>: Cost 2 vext3 RHS, <u,0,5,1>
+  1662359296U, // <7,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
+  2987150664U, // <7,u,0,7>: Cost 3 vzipr <5,6,7,0>, RHS
+  1638323986U, // <7,u,0,u>: Cost 2 vext3 RHS, <u,0,u,2>
+  1517469798U, // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS
+  1638318900U, // <7,u,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
+  564582190U, // <7,u,1,2>: Cost 1 vext3 RHS, LHS
+  1638324023U, // <7,u,1,3>: Cost 2 vext3 RHS, <u,1,3,3>
+  1517473078U, // <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS
+  2693928777U, // <7,u,1,5>: Cost 3 vext3 <1,5,3,7>, <u,1,5,3>
+  1517474710U, // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1>
+  1640462171U, // <7,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
+  564582244U, // <7,u,1,u>: Cost 1 vext3 RHS, LHS
+  1638318244U, // <7,u,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
+  2712065907U, // <7,u,2,1>: Cost 3 vext3 RHS, <u,2,1,0>
+  1638319720U, // <7,u,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
+  1638324101U, // <7,u,2,3>: Cost 2 vext3 RHS, <u,2,3,0>
+  1638318284U, // <7,u,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
+  2712065947U, // <7,u,2,5>: Cost 3 vext3 RHS, <u,2,5,4>
+  2700564387U, // <7,u,2,6>: Cost 3 vext3 <2,6,3,7>, <u,2,6,3>
+  1640314796U, // <7,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
+  1638324146U, // <7,u,2,u>: Cost 2 vext3 RHS, <u,2,u,0>
+  1638324156U, // <7,u,3,0>: Cost 2 vext3 RHS, <u,3,0,1>
+  1638319064U, // <7,u,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
+  2700564435U, // <7,u,3,2>: Cost 3 vext3 <2,6,3,7>, <u,3,2,6>
+  1638320540U, // <7,u,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
+  1638324196U, // <7,u,3,4>: Cost 2 vext3 RHS, <u,3,4,5>
+  1638324207U, // <7,u,3,5>: Cost 2 vext3 RHS, <u,3,5,7>
+  2700564472U, // <7,u,3,6>: Cost 3 vext3 <2,6,3,7>, <u,3,6,7>
+  2695919610U, // <7,u,3,7>: Cost 3 vext3 <1,u,3,7>, <u,3,7,0>
+  1638324228U, // <7,u,3,u>: Cost 2 vext3 RHS, <u,3,u,1>
+  2712066061U, // <7,u,4,0>: Cost 3 vext3 RHS, <u,4,0,1>
+  1662212122U, // <7,u,4,1>: Cost 2 vext3 RHS, <u,4,1,5>
+  1662212132U, // <7,u,4,2>: Cost 2 vext3 RHS, <u,4,2,6>
+  2712066092U, // <7,u,4,3>: Cost 3 vext3 RHS, <u,4,3,5>
+  1638321360U, // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
+  1638324287U, // <7,u,4,5>: Cost 2 vext3 RHS, <u,4,5,6>
+  1662359624U, // <7,u,4,6>: Cost 2 vext3 RHS, <u,4,6,6>
+  1640314961U, // <7,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
+  1638324314U, // <7,u,4,u>: Cost 2 vext3 RHS, <u,4,u,6>
+  1517502566U, // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS
+  1574612693U, // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u>
+  2712066162U, // <7,u,5,2>: Cost 3 vext3 RHS, <u,5,2,3>
+  1638324351U, // <7,u,5,3>: Cost 2 vext3 RHS, <u,5,3,7>
+  1576603592U, // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u>
+  1577267225U, // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u>
+  564582554U, // <7,u,5,6>: Cost 1 vext3 RHS, RHS
+  1640462499U, // <7,u,5,7>: Cost 2 vext3 RHS, <u,5,7,7>
+  564582572U, // <7,u,5,u>: Cost 1 vext3 RHS, RHS
+  2712066223U, // <7,u,6,0>: Cost 3 vext3 RHS, <u,6,0,1>
+  2712066238U, // <7,u,6,1>: Cost 3 vext3 RHS, <u,6,1,7>
+  1581249023U, // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u>
+  1638324432U, // <7,u,6,3>: Cost 2 vext3 RHS, <u,6,3,7>
+  1638468980U, // <7,u,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
+  2712066274U, // <7,u,6,5>: Cost 3 vext3 RHS, <u,6,5,7>
+  1583903555U, // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u>
+  1640315117U, // <7,u,6,7>: Cost 2 vext3 RHS, <u,6,7,0>
+  1638324477U, // <7,u,6,u>: Cost 2 vext3 RHS, <u,6,u,7>
+  1638471936U, // <7,u,7,0>: Cost 2 vext3 RHS, <u,7,0,1>
+  2692970763U, // <7,u,7,1>: Cost 3 vext3 <1,3,u,7>, <u,7,1,3>
+  2700933399U, // <7,u,7,2>: Cost 3 vext3 <2,6,u,7>, <u,7,2,6>
+  2573347601U, // <7,u,7,3>: Cost 3 vext1 <3,7,u,7>, <3,7,u,7>
+  1638471976U, // <7,u,7,4>: Cost 2 vext3 RHS, <u,7,4,5>
+  1511551171U, // <7,u,7,5>: Cost 2 vext1 <5,7,u,7>, <5,7,u,7>
+  2712213815U, // <7,u,7,6>: Cost 3 vext3 RHS, <u,7,6,2>
+  363253046U, // <7,u,7,7>: Cost 1 vdup3 RHS
+  363253046U, // <7,u,7,u>: Cost 1 vdup3 RHS
+  1638324561U, // <7,u,u,0>: Cost 2 vext3 RHS, <u,u,0,1>
+  1638324571U, // <7,u,u,1>: Cost 2 vext3 RHS, <u,u,1,2>
+  564582757U, // <7,u,u,2>: Cost 1 vext3 RHS, LHS
+  1638324587U, // <7,u,u,3>: Cost 2 vext3 RHS, <u,u,3,0>
+  1638324601U, // <7,u,u,4>: Cost 2 vext3 RHS, <u,u,4,5>
+  1638324611U, // <7,u,u,5>: Cost 2 vext3 RHS, <u,u,5,6>
+  564582797U, // <7,u,u,6>: Cost 1 vext3 RHS, RHS
+  363253046U, // <7,u,u,7>: Cost 1 vdup3 RHS
+  564582811U, // <7,u,u,u>: Cost 1 vext3 RHS, LHS
+  135053414U, // <u,0,0,0>: Cost 1 vdup0 LHS
+  1611489290U, // <u,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+  1611489300U, // <u,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+  2568054923U, // <u,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
+  1481706806U, // <u,0,0,4>: Cost 2 vext1 <0,u,0,0>, RHS
+  2555449040U, // <u,0,0,5>: Cost 3 vext1 <0,u,0,0>, <5,1,7,3>
+  2591282078U, // <u,0,0,6>: Cost 3 vext1 <6,u,0,0>, <6,u,0,0>
+  2591945711U, // <u,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
+  135053414U, // <u,0,0,u>: Cost 1 vdup0 LHS
+  1493655654U, // <u,0,1,0>: Cost 2 vext1 <2,u,0,1>, LHS
+  1860550758U, // <u,0,1,1>: Cost 2 vzipl LHS, LHS
+  537747563U, // <u,0,1,2>: Cost 1 vext3 LHS, LHS
+  2625135576U, // <u,0,1,3>: Cost 3 vext2 <1,2,u,0>, <1,3,1,3>
+  1493658934U, // <u,0,1,4>: Cost 2 vext1 <2,u,0,1>, RHS
+  2625135760U, // <u,0,1,5>: Cost 3 vext2 <1,2,u,0>, <1,5,3,7>
+  1517548447U, // <u,0,1,6>: Cost 2 vext1 <6,u,0,1>, <6,u,0,1>
+  2591290362U, // <u,0,1,7>: Cost 3 vext1 <6,u,0,1>, <7,0,1,2>
+  537747612U, // <u,0,1,u>: Cost 1 vext3 LHS, LHS
+  1611489444U, // <u,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+  2685231276U, // <u,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
+  1994768486U, // <u,0,2,2>: Cost 2 vtrnl LHS, LHS
+  2685231294U, // <u,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
+  1611489484U, // <u,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+  2712068310U, // <u,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
+  2625136570U, // <u,0,2,6>: Cost 3 vext2 <1,2,u,0>, <2,6,3,7>
+  2591962097U, // <u,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
+  1611489516U, // <u,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
+  2954067968U, // <u,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
+  2685231356U, // <u,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
+  72589981U, // <u,0,3,2>: Cost 1 vrev LHS
+  2625137052U, // <u,0,3,3>: Cost 3 vext2 <1,2,u,0>, <3,3,3,3>
+  2625137154U, // <u,0,3,4>: Cost 3 vext2 <1,2,u,0>, <3,4,5,6>
+  2639071848U, // <u,0,3,5>: Cost 3 vext2 <3,5,u,0>, <3,5,u,0>
+  2639735481U, // <u,0,3,6>: Cost 3 vext2 <3,6,u,0>, <3,6,u,0>
+  2597279354U, // <u,0,3,7>: Cost 3 vext1 <7,u,0,3>, <7,u,0,3>
+  73032403U, // <u,0,3,u>: Cost 1 vrev LHS
+  2687074636U, // <u,0,4,0>: Cost 3 vext3 <0,4,0,u>, <0,4,0,u>
+  1611489618U, // <u,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+  1611489628U, // <u,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+  3629222038U, // <u,0,4,3>: Cost 4 vext1 <0,u,0,4>, <3,0,1,2>
+  2555481398U, // <u,0,4,4>: Cost 3 vext1 <0,u,0,4>, RHS
+  1551396150U, // <u,0,4,5>: Cost 2 vext2 <1,2,u,0>, RHS
+  2651680116U, // <u,0,4,6>: Cost 3 vext2 <5,6,u,0>, <4,6,4,6>
+  2646150600U, // <u,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
+  1611932050U, // <u,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+  2561458278U, // <u,0,5,0>: Cost 3 vext1 <1,u,0,5>, LHS
+  1863532646U, // <u,0,5,1>: Cost 2 vzipl RHS, LHS
+  2712068526U, // <u,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
+  2649689976U, // <u,0,5,3>: Cost 3 vext2 <5,3,u,0>, <5,3,u,0>
+  2220237489U, // <u,0,5,4>: Cost 3 vrev <0,u,4,5>
+  2651680772U, // <u,0,5,5>: Cost 3 vext2 <5,6,u,0>, <5,5,5,5>
+  1577939051U, // <u,0,5,6>: Cost 2 vext2 <5,6,u,0>, <5,6,u,0>
+  2830077238U, // <u,0,5,7>: Cost 3 vuzpr <1,u,3,0>, RHS
+  1579266317U, // <u,0,5,u>: Cost 2 vext2 <5,u,u,0>, <5,u,u,0>
+  2555494502U, // <u,0,6,0>: Cost 3 vext1 <0,u,0,6>, LHS
+  2712068598U, // <u,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
+  1997750374U, // <u,0,6,2>: Cost 2 vtrnl RHS, LHS
+  2655662673U, // <u,0,6,3>: Cost 3 vext2 <6,3,u,0>, <6,3,u,0>
+  2555497782U, // <u,0,6,4>: Cost 3 vext1 <0,u,0,6>, RHS
+  2651681459U, // <u,0,6,5>: Cost 3 vext2 <5,6,u,0>, <6,5,0,u>
+  2651681592U, // <u,0,6,6>: Cost 3 vext2 <5,6,u,0>, <6,6,6,6>
+  2651681614U, // <u,0,6,7>: Cost 3 vext2 <5,6,u,0>, <6,7,0,1>
+  1997750428U, // <u,0,6,u>: Cost 2 vtrnl RHS, LHS
+  2567446630U, // <u,0,7,0>: Cost 3 vext1 <2,u,0,7>, LHS
+  2567447446U, // <u,0,7,1>: Cost 3 vext1 <2,u,0,7>, <1,2,3,0>
+  2567448641U, // <u,0,7,2>: Cost 3 vext1 <2,u,0,7>, <2,u,0,7>
+  2573421338U, // <u,0,7,3>: Cost 3 vext1 <3,u,0,7>, <3,u,0,7>
+  2567449910U, // <u,0,7,4>: Cost 3 vext1 <2,u,0,7>, RHS
+  2651682242U, // <u,0,7,5>: Cost 3 vext2 <5,6,u,0>, <7,5,6,u>
+  2591339429U, // <u,0,7,6>: Cost 3 vext1 <6,u,0,7>, <6,u,0,7>
+  2651682412U, // <u,0,7,7>: Cost 3 vext2 <5,6,u,0>, <7,7,7,7>
+  2567452462U, // <u,0,7,u>: Cost 3 vext1 <2,u,0,7>, LHS
+  135053414U, // <u,0,u,0>: Cost 1 vdup0 LHS
+  1611489938U, // <u,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
+  537748125U, // <u,0,u,2>: Cost 1 vext3 LHS, LHS
+  2685674148U, // <u,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
+  1611932338U, // <u,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+  1551399066U, // <u,0,u,5>: Cost 2 vext2 <1,2,u,0>, RHS
+  1517605798U, // <u,0,u,6>: Cost 2 vext1 <6,u,0,u>, <6,u,0,u>
+  2830077481U, // <u,0,u,7>: Cost 3 vuzpr <1,u,3,0>, RHS
+  537748179U, // <u,0,u,u>: Cost 1 vext3 LHS, LHS
+  1544101961U, // <u,1,0,0>: Cost 2 vext2 <0,0,u,1>, <0,0,u,1>
+  1558036582U, // <u,1,0,1>: Cost 2 vext2 <2,3,u,1>, LHS
+  2619171051U, // <u,1,0,2>: Cost 3 vext2 <0,2,u,1>, <0,2,u,1>
+  1611490038U, // <u,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
+  2555522358U, // <u,1,0,4>: Cost 3 vext1 <0,u,1,0>, RHS
+  2712068871U, // <u,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
+  2591355815U, // <u,1,0,6>: Cost 3 vext1 <6,u,1,0>, <6,u,1,0>
+  2597328512U, // <u,1,0,7>: Cost 3 vext1 <7,u,1,0>, <7,u,1,0>
+  1611490083U, // <u,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
+  1481785446U, // <u,1,1,0>: Cost 2 vext1 <0,u,1,1>, LHS
+  202162278U, // <u,1,1,1>: Cost 1 vdup1 LHS
+  2555528808U, // <u,1,1,2>: Cost 3 vext1 <0,u,1,1>, <2,2,2,2>
+  1611490120U, // <u,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
+  1481788726U, // <u,1,1,4>: Cost 2 vext1 <0,u,1,1>, RHS
+  2689876828U, // <u,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
+  2591364008U, // <u,1,1,6>: Cost 3 vext1 <6,u,1,1>, <6,u,1,1>
+  2592691274U, // <u,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
+  202162278U, // <u,1,1,u>: Cost 1 vdup1 LHS
+  1499709542U, // <u,1,2,0>: Cost 2 vext1 <3,u,1,2>, LHS
+  2689876871U, // <u,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
+  2631116445U, // <u,1,2,2>: Cost 3 vext2 <2,2,u,1>, <2,2,u,1>
+  835584U, // <u,1,2,3>: Cost 0 copy LHS
+  1499712822U, // <u,1,2,4>: Cost 2 vext1 <3,u,1,2>, RHS
+  2689876907U, // <u,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
+  2631780282U, // <u,1,2,6>: Cost 3 vext2 <2,3,u,1>, <2,6,3,7>
+  1523603074U, // <u,1,2,7>: Cost 2 vext1 <7,u,1,2>, <7,u,1,2>
+  835584U, // <u,1,2,u>: Cost 0 copy LHS
+  1487773798U, // <u,1,3,0>: Cost 2 vext1 <1,u,1,3>, LHS
+  1611490264U, // <u,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
+  2685232094U, // <u,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
+  2018746470U, // <u,1,3,3>: Cost 2 vtrnr LHS, LHS
+  1487777078U, // <u,1,3,4>: Cost 2 vext1 <1,u,1,3>, RHS
+  1611490304U, // <u,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
+  2685674505U, // <u,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
+  2640407307U, // <u,1,3,7>: Cost 3 vext2 <3,7,u,1>, <3,7,u,1>
+  1611490327U, // <u,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
+  1567992749U, // <u,1,4,0>: Cost 2 vext2 <4,0,u,1>, <4,0,u,1>
+  2693121070U, // <u,1,4,1>: Cost 3 vext3 <1,4,1,u>, <1,4,1,u>
+  2693194807U, // <u,1,4,2>: Cost 3 vext3 <1,4,2,u>, <1,4,2,u>
+  1152386432U, // <u,1,4,3>: Cost 2 vrev <1,u,3,4>
+  2555555126U, // <u,1,4,4>: Cost 3 vext1 <0,u,1,4>, RHS
+  1558039862U, // <u,1,4,5>: Cost 2 vext2 <2,3,u,1>, RHS
+  2645716371U, // <u,1,4,6>: Cost 3 vext2 <4,6,u,1>, <4,6,u,1>
+  2597361284U, // <u,1,4,7>: Cost 3 vext1 <7,u,1,4>, <7,u,1,4>
+  1152755117U, // <u,1,4,u>: Cost 2 vrev <1,u,u,4>
+  1481818214U, // <u,1,5,0>: Cost 2 vext1 <0,u,1,5>, LHS
+  2555560694U, // <u,1,5,1>: Cost 3 vext1 <0,u,1,5>, <1,0,3,2>
+  2555561576U, // <u,1,5,2>: Cost 3 vext1 <0,u,1,5>, <2,2,2,2>
+  1611490448U, // <u,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
+  1481821494U, // <u,1,5,4>: Cost 2 vext1 <0,u,1,5>, RHS
+  2651025435U, // <u,1,5,5>: Cost 3 vext2 <5,5,u,1>, <5,5,u,1>
+  2651689068U, // <u,1,5,6>: Cost 3 vext2 <5,6,u,1>, <5,6,u,1>
+  2823966006U, // <u,1,5,7>: Cost 3 vuzpr <0,u,1,1>, RHS
+  1611932861U, // <u,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
+  2555568230U, // <u,1,6,0>: Cost 3 vext1 <0,u,1,6>, LHS
+  2689877199U, // <u,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
+  2712069336U, // <u,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
+  2685232353U, // <u,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
+  2555571510U, // <u,1,6,4>: Cost 3 vext1 <0,u,1,6>, RHS
+  2689877235U, // <u,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
+  2657661765U, // <u,1,6,6>: Cost 3 vext2 <6,6,u,1>, <6,6,u,1>
+  1584583574U, // <u,1,6,7>: Cost 2 vext2 <6,7,u,1>, <6,7,u,1>
+  1585247207U, // <u,1,6,u>: Cost 2 vext2 <6,u,u,1>, <6,u,u,1>
+  2561548390U, // <u,1,7,0>: Cost 3 vext1 <1,u,1,7>, LHS
+  2561549681U, // <u,1,7,1>: Cost 3 vext1 <1,u,1,7>, <1,u,1,7>
+  2573493926U, // <u,1,7,2>: Cost 3 vext1 <3,u,1,7>, <2,3,0,1>
+  2042962022U, // <u,1,7,3>: Cost 2 vtrnr RHS, LHS
+  2561551670U, // <u,1,7,4>: Cost 3 vext1 <1,u,1,7>, RHS
+  2226300309U, // <u,1,7,5>: Cost 3 vrev <1,u,5,7>
+  2658325990U, // <u,1,7,6>: Cost 3 vext2 <6,7,u,1>, <7,6,1,u>
+  2658326124U, // <u,1,7,7>: Cost 3 vext2 <6,7,u,1>, <7,7,7,7>
+  2042962027U, // <u,1,7,u>: Cost 2 vtrnr RHS, LHS
+  1481842790U, // <u,1,u,0>: Cost 2 vext1 <0,u,1,u>, LHS
+  202162278U, // <u,1,u,1>: Cost 1 vdup1 LHS
+  2685674867U, // <u,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
+  835584U, // <u,1,u,3>: Cost 0 copy LHS
+  1481846070U, // <u,1,u,4>: Cost 2 vext1 <0,u,1,u>, RHS
+  1611933077U, // <u,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
+  2685674910U, // <u,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
+  1523652232U, // <u,1,u,7>: Cost 2 vext1 <7,u,1,u>, <7,u,1,u>
+  835584U, // <u,1,u,u>: Cost 0 copy LHS
+  1544110154U, // <u,2,0,0>: Cost 2 vext2 <0,0,u,2>, <0,0,u,2>
+  1545437286U, // <u,2,0,1>: Cost 2 vext2 <0,2,u,2>, LHS
+  1545437420U, // <u,2,0,2>: Cost 2 vext2 <0,2,u,2>, <0,2,u,2>
+  2685232589U, // <u,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
+  2619179346U, // <u,2,0,4>: Cost 3 vext2 <0,2,u,2>, <0,4,1,5>
+  2712069606U, // <u,2,0,5>: Cost 3 vext3 RHS, <2,0,5,7>
+  2689877484U, // <u,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
+  2659656273U, // <u,2,0,7>: Cost 3 vext2 <7,0,u,2>, <0,7,2,u>
+  1545437853U, // <u,2,0,u>: Cost 2 vext2 <0,2,u,2>, LHS
+  1550082851U, // <u,2,1,0>: Cost 2 vext2 <1,0,u,2>, <1,0,u,2>
+  2619179828U, // <u,2,1,1>: Cost 3 vext2 <0,2,u,2>, <1,1,1,1>
+  2619179926U, // <u,2,1,2>: Cost 3 vext2 <0,2,u,2>, <1,2,3,0>
+  2685232671U, // <u,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
+  2555604278U, // <u,2,1,4>: Cost 3 vext1 <0,u,2,1>, RHS
+  2619180176U, // <u,2,1,5>: Cost 3 vext2 <0,2,u,2>, <1,5,3,7>
+  2689877564U, // <u,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
+  2602718850U, // <u,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
+  1158703235U, // <u,2,1,u>: Cost 2 vrev <2,u,u,1>
+  1481867366U, // <u,2,2,0>: Cost 2 vext1 <0,u,2,2>, LHS
+  2555609846U, // <u,2,2,1>: Cost 3 vext1 <0,u,2,2>, <1,0,3,2>
+  269271142U, // <u,2,2,2>: Cost 1 vdup2 LHS
+  1611490930U, // <u,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
+  1481870646U, // <u,2,2,4>: Cost 2 vext1 <0,u,2,2>, RHS
+  2689877640U, // <u,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
+  2619180986U, // <u,2,2,6>: Cost 3 vext2 <0,2,u,2>, <2,6,3,7>
+  2593436837U, // <u,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
+  269271142U, // <u,2,2,u>: Cost 1 vdup2 LHS
+  408134301U, // <u,2,3,0>: Cost 1 vext1 LHS, LHS
+  1481876214U, // <u,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  1481877096U, // <u,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
+  1880326246U, // <u,2,3,3>: Cost 2 vzipr LHS, LHS
+  408137014U, // <u,2,3,4>: Cost 1 vext1 LHS, RHS
+  1529654992U, // <u,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
+  1529655802U, // <u,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1529656314U, // <u,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
+  408139566U, // <u,2,3,u>: Cost 1 vext1 LHS, LHS
+  1567853468U, // <u,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
+  2561598362U, // <u,2,4,1>: Cost 3 vext1 <1,u,2,4>, <1,2,3,4>
+  2555627214U, // <u,2,4,2>: Cost 3 vext1 <0,u,2,4>, <2,3,4,5>
+  2685232918U, // <u,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
+  2555628854U, // <u,2,4,4>: Cost 3 vext1 <0,u,2,4>, RHS
+  1545440566U, // <u,2,4,5>: Cost 2 vext2 <0,2,u,2>, RHS
+  1571982740U, // <u,2,4,6>: Cost 2 vext2 <4,6,u,2>, <4,6,u,2>
+  2592125957U, // <u,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
+  1545440809U, // <u,2,4,u>: Cost 2 vext2 <0,2,u,2>, RHS
+  2555633766U, // <u,2,5,0>: Cost 3 vext1 <0,u,2,5>, LHS
+  2561606550U, // <u,2,5,1>: Cost 3 vext1 <1,u,2,5>, <1,2,3,0>
+  2689877856U, // <u,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
+  2685233000U, // <u,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
+  1158441059U, // <u,2,5,4>: Cost 2 vrev <2,u,4,5>
+  2645725188U, // <u,2,5,5>: Cost 3 vext2 <4,6,u,2>, <5,5,5,5>
+  2689877892U, // <u,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
+  2823900470U, // <u,2,5,7>: Cost 3 vuzpr <0,u,0,2>, RHS
+  1158736007U, // <u,2,5,u>: Cost 2 vrev <2,u,u,5>
+  1481900134U, // <u,2,6,0>: Cost 2 vext1 <0,u,2,6>, LHS
+  2555642614U, // <u,2,6,1>: Cost 3 vext1 <0,u,2,6>, <1,0,3,2>
+  2555643496U, // <u,2,6,2>: Cost 3 vext1 <0,u,2,6>, <2,2,2,2>
+  1611491258U, // <u,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
+  1481903414U, // <u,2,6,4>: Cost 2 vext1 <0,u,2,6>, RHS
+  2689877964U, // <u,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
+  2689877973U, // <u,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
+  2645726030U, // <u,2,6,7>: Cost 3 vext2 <4,6,u,2>, <6,7,0,1>
+  1611933671U, // <u,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
+  1585919033U, // <u,2,7,0>: Cost 2 vext2 <7,0,u,2>, <7,0,u,2>
+  2573566710U, // <u,2,7,1>: Cost 3 vext1 <3,u,2,7>, <1,0,3,2>
+  2567596115U, // <u,2,7,2>: Cost 3 vext1 <2,u,2,7>, <2,u,2,7>
+  1906901094U, // <u,2,7,3>: Cost 2 vzipr RHS, LHS
+  2555653430U, // <u,2,7,4>: Cost 3 vext1 <0,u,2,7>, RHS
+  2800080230U, // <u,2,7,5>: Cost 3 vuzpl LHS, <7,4,5,6>
+  2980643164U, // <u,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
+  2645726828U, // <u,2,7,7>: Cost 3 vext2 <4,6,u,2>, <7,7,7,7>
+  1906901099U, // <u,2,7,u>: Cost 2 vzipr RHS, LHS
+  408175266U, // <u,2,u,0>: Cost 1 vext1 LHS, LHS
+  1545443118U, // <u,2,u,1>: Cost 2 vext2 <0,2,u,2>, LHS
+  269271142U, // <u,2,u,2>: Cost 1 vdup2 LHS
+  1611491416U, // <u,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
+  408177974U, // <u,2,u,4>: Cost 1 vext1 LHS, RHS
+  1545443482U, // <u,2,u,5>: Cost 2 vext2 <0,2,u,2>, RHS
+  1726339226U, // <u,2,u,6>: Cost 2 vuzpl LHS, RHS
+  1529697274U, // <u,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
+  408180526U, // <u,2,u,u>: Cost 1 vext1 LHS, LHS
+  1544781824U, // <u,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+  471040156U, // <u,3,0,1>: Cost 1 vext2 LHS, LHS
+  1544781988U, // <u,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  2618523900U, // <u,3,0,3>: Cost 3 vext2 LHS, <0,3,1,0>
+  1544782162U, // <u,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  2238188352U, // <u,3,0,5>: Cost 3 vrev <3,u,5,0>
+  2623169023U, // <u,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
+  2238335826U, // <u,3,0,7>: Cost 3 vrev <3,u,7,0>
+  471040669U, // <u,3,0,u>: Cost 1 vext2 LHS, LHS
+  1544782582U, // <u,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  1544782644U, // <u,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+  1544782742U, // <u,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+  1544782808U, // <u,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  2618524733U, // <u,3,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+  1544782992U, // <u,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  2618524897U, // <u,3,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+  2703517987U, // <u,3,1,7>: Cost 3 vext3 <3,1,7,u>, <3,1,7,u>
+  1544783213U, // <u,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+  1529716838U, // <u,3,2,0>: Cost 2 vext1 <u,u,3,2>, LHS
+  1164167966U, // <u,3,2,1>: Cost 2 vrev <3,u,1,2>
+  1544783464U, // <u,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+  1544783526U, // <u,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+  1529720118U, // <u,3,2,4>: Cost 2 vext1 <u,u,3,2>, RHS
+  2618525544U, // <u,3,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+  1544783802U, // <u,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  2704181620U, // <u,3,2,7>: Cost 3 vext3 <3,2,7,u>, <3,2,7,u>
+  1544783931U, // <u,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
+  1544784022U, // <u,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+  1487922559U, // <u,3,3,1>: Cost 2 vext1 <1,u,3,3>, <1,u,3,3>
+  1493895256U, // <u,3,3,2>: Cost 2 vext1 <2,u,3,3>, <2,u,3,3>
+  336380006U, // <u,3,3,3>: Cost 1 vdup3 LHS
+  1544784386U, // <u,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+  2824054478U, // <u,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
+  2238286668U, // <u,3,3,6>: Cost 3 vrev <3,u,6,3>
+  2954069136U, // <u,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
+  336380006U, // <u,3,3,u>: Cost 1 vdup3 LHS
+  1487929446U, // <u,3,4,0>: Cost 2 vext1 <1,u,3,4>, LHS
+  1487930752U, // <u,3,4,1>: Cost 2 vext1 <1,u,3,4>, <1,u,3,4>
+  2623171644U, // <u,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
+  2561673366U, // <u,3,4,3>: Cost 3 vext1 <1,u,3,4>, <3,0,1,2>
+  1487932726U, // <u,3,4,4>: Cost 2 vext1 <1,u,3,4>, RHS
+  471043382U, // <u,3,4,5>: Cost 1 vext2 LHS, RHS
+  1592561012U, // <u,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+  2238368598U, // <u,3,4,7>: Cost 3 vrev <3,u,7,4>
+  471043625U, // <u,3,4,u>: Cost 1 vext2 LHS, RHS
+  2555707494U, // <u,3,5,0>: Cost 3 vext1 <0,u,3,5>, LHS
+  1574645465U, // <u,3,5,1>: Cost 2 vext2 <5,1,u,3>, <5,1,u,3>
+  2567653106U, // <u,3,5,2>: Cost 3 vext1 <2,u,3,5>, <2,3,u,5>
+  2555709954U, // <u,3,5,3>: Cost 3 vext1 <0,u,3,5>, <3,4,5,6>
+  1592561606U, // <u,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+  1592561668U, // <u,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+  1592561762U, // <u,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+  1750314294U, // <u,3,5,7>: Cost 2 vuzpr LHS, RHS
+  1750314295U, // <u,3,5,u>: Cost 2 vuzpr LHS, RHS
+  2623172897U, // <u,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
+  2561688962U, // <u,3,6,1>: Cost 3 vext1 <1,u,3,6>, <1,u,3,6>
+  1581281795U, // <u,3,6,2>: Cost 2 vext2 <6,2,u,3>, <6,2,u,3>
+  2706541204U, // <u,3,6,3>: Cost 3 vext3 <3,6,3,u>, <3,6,3,u>
+  2623173261U, // <u,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
+  1164495686U, // <u,3,6,5>: Cost 2 vrev <3,u,5,6>
+  1592562488U, // <u,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+  1592562510U, // <u,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+  1164716897U, // <u,3,6,u>: Cost 2 vrev <3,u,u,6>
+  1487954022U, // <u,3,7,0>: Cost 2 vext1 <1,u,3,7>, LHS
+  1487955331U, // <u,3,7,1>: Cost 2 vext1 <1,u,3,7>, <1,u,3,7>
+  1493928028U, // <u,3,7,2>: Cost 2 vext1 <2,u,3,7>, <2,u,3,7>
+  2561697942U, // <u,3,7,3>: Cost 3 vext1 <1,u,3,7>, <3,0,1,2>
+  1487957302U, // <u,3,7,4>: Cost 2 vext1 <1,u,3,7>, RHS
+  2707352311U, // <u,3,7,5>: Cost 3 vext3 <3,7,5,u>, <3,7,5,u>
+  2655024623U, // <u,3,7,6>: Cost 3 vext2 <6,2,u,3>, <7,6,2,u>
+  1592563308U, // <u,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+  1487959854U, // <u,3,7,u>: Cost 2 vext1 <1,u,3,7>, LHS
+  1544787667U, // <u,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
+  471045934U, // <u,3,u,1>: Cost 1 vext2 LHS, LHS
+  1549432709U, // <u,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
+  336380006U, // <u,3,u,3>: Cost 1 vdup3 LHS
+  1544788031U, // <u,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
+  471046298U, // <u,3,u,5>: Cost 1 vext2 LHS, RHS
+  1549433040U, // <u,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
+  1750314537U, // <u,3,u,7>: Cost 2 vuzpr LHS, RHS
+  471046501U, // <u,3,u,u>: Cost 1 vext2 LHS, LHS
+  2625167360U, // <u,4,0,0>: Cost 3 vext2 <1,2,u,4>, <0,0,0,0>
+  1551425638U, // <u,4,0,1>: Cost 2 vext2 <1,2,u,4>, LHS
+  2619195630U, // <u,4,0,2>: Cost 3 vext2 <0,2,u,4>, <0,2,u,4>
+  2619343104U, // <u,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
+  2625167698U, // <u,4,0,4>: Cost 3 vext2 <1,2,u,4>, <0,4,1,5>
+  1638329234U, // <u,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
+  1638329244U, // <u,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
+  3787803556U, // <u,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
+  1551426205U, // <u,4,0,u>: Cost 2 vext2 <1,2,u,4>, LHS
+  2555748454U, // <u,4,1,0>: Cost 3 vext1 <0,u,4,1>, LHS
+  2625168180U, // <u,4,1,1>: Cost 3 vext2 <1,2,u,4>, <1,1,1,1>
+  1551426503U, // <u,4,1,2>: Cost 2 vext2 <1,2,u,4>, <1,2,u,4>
+  2625168344U, // <u,4,1,3>: Cost 3 vext2 <1,2,u,4>, <1,3,1,3>
+  2555751734U, // <u,4,1,4>: Cost 3 vext1 <0,u,4,1>, RHS
+  1860554038U, // <u,4,1,5>: Cost 2 vzipl LHS, RHS
+  2689879022U, // <u,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
+  2592248852U, // <u,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
+  1555408301U, // <u,4,1,u>: Cost 2 vext2 <1,u,u,4>, <1,u,u,4>
+  2555756646U, // <u,4,2,0>: Cost 3 vext1 <0,u,4,2>, LHS
+  2625168943U, // <u,4,2,1>: Cost 3 vext2 <1,2,u,4>, <2,1,4,u>
+  2625169000U, // <u,4,2,2>: Cost 3 vext2 <1,2,u,4>, <2,2,2,2>
+  2619197134U, // <u,4,2,3>: Cost 3 vext2 <0,2,u,4>, <2,3,4,5>
+  2555759926U, // <u,4,2,4>: Cost 3 vext1 <0,u,4,2>, RHS
+  2712071222U, // <u,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
+  1994771766U, // <u,4,2,6>: Cost 2 vtrnl LHS, RHS
+  2592257045U, // <u,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
+  1994771784U, // <u,4,2,u>: Cost 2 vtrnl LHS, RHS
+  2625169558U, // <u,4,3,0>: Cost 3 vext2 <1,2,u,4>, <3,0,1,2>
+  2567709594U, // <u,4,3,1>: Cost 3 vext1 <2,u,4,3>, <1,2,3,4>
+  2567710817U, // <u,4,3,2>: Cost 3 vext1 <2,u,4,3>, <2,u,4,3>
+  2625169820U, // <u,4,3,3>: Cost 3 vext2 <1,2,u,4>, <3,3,3,3>
+  2625169922U, // <u,4,3,4>: Cost 3 vext2 <1,2,u,4>, <3,4,5,6>
+  2954069710U, // <u,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
+  2954068172U, // <u,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
+  3903849472U, // <u,4,3,7>: Cost 4 vuzpr <1,u,3,4>, <1,3,5,7>
+  2954068174U, // <u,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
+  1505919078U, // <u,4,4,0>: Cost 2 vext1 <4,u,4,4>, LHS
+  2567717831U, // <u,4,4,1>: Cost 3 vext1 <2,u,4,4>, <1,2,u,4>
+  2567719010U, // <u,4,4,2>: Cost 3 vext1 <2,u,4,4>, <2,u,4,4>
+  2570373542U, // <u,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
+  161926454U, // <u,4,4,4>: Cost 1 vdup0 RHS
+  1551428918U, // <u,4,4,5>: Cost 2 vext2 <1,2,u,4>, RHS
+  1638329572U, // <u,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
+  2594927963U, // <u,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
+  161926454U, // <u,4,4,u>: Cost 1 vdup0 RHS
+  1493983334U, // <u,4,5,0>: Cost 2 vext1 <2,u,4,5>, LHS
+  2689879301U, // <u,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
+  1493985379U, // <u,4,5,2>: Cost 2 vext1 <2,u,4,5>, <2,u,4,5>
+  2567727254U, // <u,4,5,3>: Cost 3 vext1 <2,u,4,5>, <3,0,1,2>
+  1493986614U, // <u,4,5,4>: Cost 2 vext1 <2,u,4,5>, RHS
+  1863535926U, // <u,4,5,5>: Cost 2 vzipl RHS, RHS
+  537750838U, // <u,4,5,6>: Cost 1 vext3 LHS, RHS
+  2830110006U, // <u,4,5,7>: Cost 3 vuzpr <1,u,3,4>, RHS
+  537750856U, // <u,4,5,u>: Cost 1 vext3 LHS, RHS
+  1482047590U, // <u,4,6,0>: Cost 2 vext1 <0,u,4,6>, LHS
+  2555790070U, // <u,4,6,1>: Cost 3 vext1 <0,u,4,6>, <1,0,3,2>
+  2555790952U, // <u,4,6,2>: Cost 3 vext1 <0,u,4,6>, <2,2,2,2>
+  2555791510U, // <u,4,6,3>: Cost 3 vext1 <0,u,4,6>, <3,0,1,2>
+  1482050870U, // <u,4,6,4>: Cost 2 vext1 <0,u,4,6>, RHS
+  2689879422U, // <u,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
+  1997753654U, // <u,4,6,6>: Cost 2 vtrnl RHS, RHS
+  2712071562U, // <u,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
+  1482053422U, // <u,4,6,u>: Cost 2 vext1 <0,u,4,6>, LHS
+  2567741542U, // <u,4,7,0>: Cost 3 vext1 <2,u,4,7>, LHS
+  2567742362U, // <u,4,7,1>: Cost 3 vext1 <2,u,4,7>, <1,2,3,4>
+  2567743589U, // <u,4,7,2>: Cost 3 vext1 <2,u,4,7>, <2,u,4,7>
+  2573716286U, // <u,4,7,3>: Cost 3 vext1 <3,u,4,7>, <3,u,4,7>
+  2567744822U, // <u,4,7,4>: Cost 3 vext1 <2,u,4,7>, RHS
+  2712071624U, // <u,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
+  96808489U, // <u,4,7,6>: Cost 1 vrev RHS
+  2651715180U, // <u,4,7,7>: Cost 3 vext2 <5,6,u,4>, <7,7,7,7>
+  96955963U, // <u,4,7,u>: Cost 1 vrev RHS
+  1482063974U, // <u,4,u,0>: Cost 2 vext1 <0,u,4,u>, LHS
+  1551431470U, // <u,4,u,1>: Cost 2 vext2 <1,2,u,4>, LHS
+  1494009958U, // <u,4,u,2>: Cost 2 vext1 <2,u,4,u>, <2,u,4,u>
+  2555807894U, // <u,4,u,3>: Cost 3 vext1 <0,u,4,u>, <3,0,1,2>
+  161926454U, // <u,4,u,4>: Cost 1 vdup0 RHS
+  1551431834U, // <u,4,u,5>: Cost 2 vext2 <1,2,u,4>, RHS
+  537751081U, // <u,4,u,6>: Cost 1 vext3 LHS, RHS
+  2830110249U, // <u,4,u,7>: Cost 3 vuzpr <1,u,3,4>, RHS
+  537751099U, // <u,4,u,u>: Cost 1 vext3 LHS, RHS
+  2631811072U, // <u,5,0,0>: Cost 3 vext2 <2,3,u,5>, <0,0,0,0>
+  1558069350U, // <u,5,0,1>: Cost 2 vext2 <2,3,u,5>, LHS
+  2619203823U, // <u,5,0,2>: Cost 3 vext2 <0,2,u,5>, <0,2,u,5>
+  2619867456U, // <u,5,0,3>: Cost 3 vext2 <0,3,u,5>, <0,3,u,5>
+  1546273106U, // <u,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+  2733010539U, // <u,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
+  2597622682U, // <u,5,0,6>: Cost 3 vext1 <7,u,5,0>, <6,7,u,5>
+  1176539396U, // <u,5,0,7>: Cost 2 vrev <5,u,7,0>
+  1558069917U, // <u,5,0,u>: Cost 2 vext2 <2,3,u,5>, LHS
+  1505968230U, // <u,5,1,0>: Cost 2 vext1 <4,u,5,1>, LHS
+  2624512887U, // <u,5,1,1>: Cost 3 vext2 <1,1,u,5>, <1,1,u,5>
+  2631811990U, // <u,5,1,2>: Cost 3 vext2 <2,3,u,5>, <1,2,3,0>
+  2618541056U, // <u,5,1,3>: Cost 3 vext2 <0,1,u,5>, <1,3,5,7>
+  1505971510U, // <u,5,1,4>: Cost 2 vext1 <4,u,5,1>, RHS
+  2627167419U, // <u,5,1,5>: Cost 3 vext2 <1,5,u,5>, <1,5,u,5>
+  2579714554U, // <u,5,1,6>: Cost 3 vext1 <4,u,5,1>, <6,2,7,3>
+  1638330064U, // <u,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
+  1638477529U, // <u,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
+  2561802342U, // <u,5,2,0>: Cost 3 vext1 <1,u,5,2>, LHS
+  2561803264U, // <u,5,2,1>: Cost 3 vext1 <1,u,5,2>, <1,3,5,7>
+  2631149217U, // <u,5,2,2>: Cost 3 vext2 <2,2,u,5>, <2,2,u,5>
+  1558071026U, // <u,5,2,3>: Cost 2 vext2 <2,3,u,5>, <2,3,u,5>
+  2561805622U, // <u,5,2,4>: Cost 3 vext1 <1,u,5,2>, RHS
+  2714062607U, // <u,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
+  2631813050U, // <u,5,2,6>: Cost 3 vext2 <2,3,u,5>, <2,6,3,7>
+  3092335926U, // <u,5,2,7>: Cost 3 vtrnr <0,u,0,2>, RHS
+  1561389191U, // <u,5,2,u>: Cost 2 vext2 <2,u,u,5>, <2,u,u,5>
+  2561810534U, // <u,5,3,0>: Cost 3 vext1 <1,u,5,3>, LHS
+  2561811857U, // <u,5,3,1>: Cost 3 vext1 <1,u,5,3>, <1,u,5,3>
+  2631813474U, // <u,5,3,2>: Cost 3 vext2 <2,3,u,5>, <3,2,5,u>
+  2631813532U, // <u,5,3,3>: Cost 3 vext2 <2,3,u,5>, <3,3,3,3>
+  2619869698U, // <u,5,3,4>: Cost 3 vext2 <0,3,u,5>, <3,4,5,6>
+  3001847002U, // <u,5,3,5>: Cost 3 vzipr LHS, <4,4,5,5>
+  2954070530U, // <u,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  2018749750U, // <u,5,3,7>: Cost 2 vtrnr LHS, RHS
+  2018749751U, // <u,5,3,u>: Cost 2 vtrnr LHS, RHS
+  2573762662U, // <u,5,4,0>: Cost 3 vext1 <3,u,5,4>, LHS
+  2620017634U, // <u,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
+  2573764338U, // <u,5,4,2>: Cost 3 vext1 <3,u,5,4>, <2,3,u,5>
+  2573765444U, // <u,5,4,3>: Cost 3 vext1 <3,u,5,4>, <3,u,5,4>
+  1570680053U, // <u,5,4,4>: Cost 2 vext2 <4,4,u,5>, <4,4,u,5>
+  1558072630U, // <u,5,4,5>: Cost 2 vext2 <2,3,u,5>, RHS
+  2645749143U, // <u,5,4,6>: Cost 3 vext2 <4,6,u,5>, <4,6,u,5>
+  1638330310U, // <u,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
+  1558072873U, // <u,5,4,u>: Cost 2 vext2 <2,3,u,5>, RHS
+  1506000998U, // <u,5,5,0>: Cost 2 vext1 <4,u,5,5>, LHS
+  2561827984U, // <u,5,5,1>: Cost 3 vext1 <1,u,5,5>, <1,5,3,7>
+  2579744360U, // <u,5,5,2>: Cost 3 vext1 <4,u,5,5>, <2,2,2,2>
+  2579744918U, // <u,5,5,3>: Cost 3 vext1 <4,u,5,5>, <3,0,1,2>
+  1506004278U, // <u,5,5,4>: Cost 2 vext1 <4,u,5,5>, RHS
+  229035318U, // <u,5,5,5>: Cost 1 vdup1 RHS
+  2712072206U, // <u,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
+  1638330392U, // <u,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
+  229035318U, // <u,5,5,u>: Cost 1 vdup1 RHS
+  1500037222U, // <u,5,6,0>: Cost 2 vext1 <3,u,5,6>, LHS
+  2561836436U, // <u,5,6,1>: Cost 3 vext1 <1,u,5,6>, <1,u,5,6>
+  2567809133U, // <u,5,6,2>: Cost 3 vext1 <2,u,5,6>, <2,u,5,6>
+  1500040006U, // <u,5,6,3>: Cost 2 vext1 <3,u,5,6>, <3,u,5,6>
+  1500040502U, // <u,5,6,4>: Cost 2 vext1 <3,u,5,6>, RHS
+  2714062935U, // <u,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
+  2712072288U, // <u,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
+  27705344U, // <u,5,6,7>: Cost 0 copy RHS
+  27705344U, // <u,5,6,u>: Cost 0 copy RHS
+  1488101478U, // <u,5,7,0>: Cost 2 vext1 <1,u,5,7>, LHS
+  1488102805U, // <u,5,7,1>: Cost 2 vext1 <1,u,5,7>, <1,u,5,7>
+  2561844840U, // <u,5,7,2>: Cost 3 vext1 <1,u,5,7>, <2,2,2,2>
+  2561845398U, // <u,5,7,3>: Cost 3 vext1 <1,u,5,7>, <3,0,1,2>
+  1488104758U, // <u,5,7,4>: Cost 2 vext1 <1,u,5,7>, RHS
+  1638330536U, // <u,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
+  2712072362U, // <u,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
+  2042965302U, // <u,5,7,7>: Cost 2 vtrnr RHS, RHS
+  1488107310U, // <u,5,7,u>: Cost 2 vext1 <1,u,5,7>, LHS
+  1488109670U, // <u,5,u,0>: Cost 2 vext1 <1,u,5,u>, LHS
+  1488110998U, // <u,5,u,1>: Cost 2 vext1 <1,u,5,u>, <1,u,5,u>
+  2561853032U, // <u,5,u,2>: Cost 3 vext1 <1,u,5,u>, <2,2,2,2>
+  1500056392U, // <u,5,u,3>: Cost 2 vext1 <3,u,5,u>, <3,u,5,u>
+  1488112950U, // <u,5,u,4>: Cost 2 vext1 <1,u,5,u>, RHS
+  229035318U, // <u,5,u,5>: Cost 1 vdup1 RHS
+  2954111490U, // <u,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  27705344U, // <u,5,u,7>: Cost 0 copy RHS
+  27705344U, // <u,5,u,u>: Cost 0 copy RHS
+  2619211776U, // <u,6,0,0>: Cost 3 vext2 <0,2,u,6>, <0,0,0,0>
+  1545470054U, // <u,6,0,1>: Cost 2 vext2 <0,2,u,6>, LHS
+  1545470192U, // <u,6,0,2>: Cost 2 vext2 <0,2,u,6>, <0,2,u,6>
+  2255958969U, // <u,6,0,3>: Cost 3 vrev <6,u,3,0>
+  1546797458U, // <u,6,0,4>: Cost 2 vext2 <0,4,u,6>, <0,4,u,6>
+  2720624971U, // <u,6,0,5>: Cost 3 vext3 <6,0,5,u>, <6,0,5,u>
+  2256180180U, // <u,6,0,6>: Cost 3 vrev <6,u,6,0>
+  2960682294U, // <u,6,0,7>: Cost 3 vzipr <1,2,u,0>, RHS
+  1545470621U, // <u,6,0,u>: Cost 2 vext2 <0,2,u,6>, LHS
+  1182004127U, // <u,6,1,0>: Cost 2 vrev <6,u,0,1>
+  2619212596U, // <u,6,1,1>: Cost 3 vext2 <0,2,u,6>, <1,1,1,1>
+  2619212694U, // <u,6,1,2>: Cost 3 vext2 <0,2,u,6>, <1,2,3,0>
+  2619212760U, // <u,6,1,3>: Cost 3 vext2 <0,2,u,6>, <1,3,1,3>
+  2626511979U, // <u,6,1,4>: Cost 3 vext2 <1,4,u,6>, <1,4,u,6>
+  2619212944U, // <u,6,1,5>: Cost 3 vext2 <0,2,u,6>, <1,5,3,7>
+  2714063264U, // <u,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
+  2967326006U, // <u,6,1,7>: Cost 3 vzipr <2,3,u,1>, RHS
+  1182594023U, // <u,6,1,u>: Cost 2 vrev <6,u,u,1>
+  1506050150U, // <u,6,2,0>: Cost 2 vext1 <4,u,6,2>, LHS
+  2579792630U, // <u,6,2,1>: Cost 3 vext1 <4,u,6,2>, <1,0,3,2>
+  2619213416U, // <u,6,2,2>: Cost 3 vext2 <0,2,u,6>, <2,2,2,2>
+  2619213478U, // <u,6,2,3>: Cost 3 vext2 <0,2,u,6>, <2,3,0,1>
+  1506053430U, // <u,6,2,4>: Cost 2 vext1 <4,u,6,2>, RHS
+  2633148309U, // <u,6,2,5>: Cost 3 vext2 <2,5,u,6>, <2,5,u,6>
+  2619213754U, // <u,6,2,6>: Cost 3 vext2 <0,2,u,6>, <2,6,3,7>
+  1638330874U, // <u,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
+  1638478339U, // <u,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
+  2619213974U, // <u,6,3,0>: Cost 3 vext2 <0,2,u,6>, <3,0,1,2>
+  2255836074U, // <u,6,3,1>: Cost 3 vrev <6,u,1,3>
+  2255909811U, // <u,6,3,2>: Cost 3 vrev <6,u,2,3>
+  2619214236U, // <u,6,3,3>: Cost 3 vext2 <0,2,u,6>, <3,3,3,3>
+  1564715549U, // <u,6,3,4>: Cost 2 vext2 <3,4,u,6>, <3,4,u,6>
+  2639121006U, // <u,6,3,5>: Cost 3 vext2 <3,5,u,6>, <3,5,u,6>
+  3001847012U, // <u,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
+  1880329526U, // <u,6,3,7>: Cost 2 vzipr LHS, RHS
+  1880329527U, // <u,6,3,u>: Cost 2 vzipr LHS, RHS
+  2567864422U, // <u,6,4,0>: Cost 3 vext1 <2,u,6,4>, LHS
+  2733011558U, // <u,6,4,1>: Cost 3 vext3 LHS, <6,4,1,3>
+  2567866484U, // <u,6,4,2>: Cost 3 vext1 <2,u,6,4>, <2,u,6,4>
+  2638458005U, // <u,6,4,3>: Cost 3 vext2 <3,4,u,6>, <4,3,6,u>
+  1570540772U, // <u,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
+  1545473334U, // <u,6,4,5>: Cost 2 vext2 <0,2,u,6>, RHS
+  1572015512U, // <u,6,4,6>: Cost 2 vext2 <4,6,u,6>, <4,6,u,6>
+  2960715062U, // <u,6,4,7>: Cost 3 vzipr <1,2,u,4>, RHS
+  1545473577U, // <u,6,4,u>: Cost 2 vext2 <0,2,u,6>, RHS
+  2567872614U, // <u,6,5,0>: Cost 3 vext1 <2,u,6,5>, LHS
+  2645757648U, // <u,6,5,1>: Cost 3 vext2 <4,6,u,6>, <5,1,7,3>
+  2567874490U, // <u,6,5,2>: Cost 3 vext1 <2,u,6,5>, <2,6,3,7>
+  2576501250U, // <u,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
+  1576660943U, // <u,6,5,4>: Cost 2 vext2 <5,4,u,6>, <5,4,u,6>
+  2645757956U, // <u,6,5,5>: Cost 3 vext2 <4,6,u,6>, <5,5,5,5>
+  2645758050U, // <u,6,5,6>: Cost 3 vext2 <4,6,u,6>, <5,6,7,0>
+  2824080694U, // <u,6,5,7>: Cost 3 vuzpr <0,u,2,6>, RHS
+  1182626795U, // <u,6,5,u>: Cost 2 vrev <6,u,u,5>
+  1506082918U, // <u,6,6,0>: Cost 2 vext1 <4,u,6,6>, LHS
+  2579825398U, // <u,6,6,1>: Cost 3 vext1 <4,u,6,6>, <1,0,3,2>
+  2645758458U, // <u,6,6,2>: Cost 3 vext2 <4,6,u,6>, <6,2,7,3>
+  2579826838U, // <u,6,6,3>: Cost 3 vext1 <4,u,6,6>, <3,0,1,2>
+  1506086198U, // <u,6,6,4>: Cost 2 vext1 <4,u,6,6>, RHS
+  2579828432U, // <u,6,6,5>: Cost 3 vext1 <4,u,6,6>, <5,1,7,3>
+  296144182U, // <u,6,6,6>: Cost 1 vdup2 RHS
+  1638331202U, // <u,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
+  296144182U, // <u,6,6,u>: Cost 1 vdup2 RHS
+  432349286U, // <u,6,7,0>: Cost 1 vext1 RHS, LHS
+  1506091766U, // <u,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
+  1506092648U, // <u,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1506093206U, // <u,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
+  432352809U, // <u,6,7,4>: Cost 1 vext1 RHS, RHS
+  1506094800U, // <u,6,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+  1506095610U, // <u,6,7,6>: Cost 2 vext1 RHS, <6,2,7,3>
+  1906904374U, // <u,6,7,7>: Cost 2 vzipr RHS, RHS
+  432355118U, // <u,6,7,u>: Cost 1 vext1 RHS, LHS
+  432357478U, // <u,6,u,0>: Cost 1 vext1 RHS, LHS
+  1545475886U, // <u,6,u,1>: Cost 2 vext2 <0,2,u,6>, LHS
+  1506100840U, // <u,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1506101398U, // <u,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
+  432361002U, // <u,6,u,4>: Cost 1 vext1 RHS, RHS
+  1545476250U, // <u,6,u,5>: Cost 2 vext2 <0,2,u,6>, RHS
+  296144182U, // <u,6,u,6>: Cost 1 vdup2 RHS
+  1880370486U, // <u,6,u,7>: Cost 2 vzipr LHS, RHS
+  432363310U, // <u,6,u,u>: Cost 1 vext1 RHS, LHS
+  1571356672U, // <u,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+  497614950U, // <u,7,0,1>: Cost 1 vext2 RHS, LHS
+  1571356836U, // <u,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+  2573880146U, // <u,7,0,3>: Cost 3 vext1 <3,u,7,0>, <3,u,7,0>
+  1571357010U, // <u,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+  1512083716U, // <u,7,0,5>: Cost 2 vext1 <5,u,7,0>, <5,u,7,0>
+  2621874741U, // <u,7,0,6>: Cost 3 vext2 <0,6,u,7>, <0,6,u,7>
+  2585826298U, // <u,7,0,7>: Cost 3 vext1 <5,u,7,0>, <7,0,1,2>
+  497615517U, // <u,7,0,u>: Cost 1 vext2 RHS, LHS
+  1571357430U, // <u,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+  1571357492U, // <u,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+  1571357590U, // <u,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
+  1552114715U, // <u,7,1,3>: Cost 2 vext2 <1,3,u,7>, <1,3,u,7>
+  2573888822U, // <u,7,1,4>: Cost 3 vext1 <3,u,7,1>, RHS
+  1553441981U, // <u,7,1,5>: Cost 2 vext2 <1,5,u,7>, <1,5,u,7>
+  2627847438U, // <u,7,1,6>: Cost 3 vext2 <1,6,u,7>, <1,6,u,7>
+  2727408775U, // <u,7,1,7>: Cost 3 vext3 <7,1,7,u>, <7,1,7,u>
+  1555432880U, // <u,7,1,u>: Cost 2 vext2 <1,u,u,7>, <1,u,u,7>
+  2629838337U, // <u,7,2,0>: Cost 3 vext2 <2,0,u,7>, <2,0,u,7>
+  1188058754U, // <u,7,2,1>: Cost 2 vrev <7,u,1,2>
+  1571358312U, // <u,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+  1571358374U, // <u,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+  2632492869U, // <u,7,2,4>: Cost 3 vext2 <2,4,u,7>, <2,4,u,7>
+  2633156502U, // <u,7,2,5>: Cost 3 vext2 <2,5,u,7>, <2,5,u,7>
+  1560078311U, // <u,7,2,6>: Cost 2 vext2 <2,6,u,7>, <2,6,u,7>
+  2728072408U, // <u,7,2,7>: Cost 3 vext3 <7,2,7,u>, <7,2,7,u>
+  1561405577U, // <u,7,2,u>: Cost 2 vext2 <2,u,u,7>, <2,u,u,7>
+  1571358870U, // <u,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+  2627184913U, // <u,7,3,1>: Cost 3 vext2 <1,5,u,7>, <3,1,5,u>
+  2633820523U, // <u,7,3,2>: Cost 3 vext2 <2,6,u,7>, <3,2,6,u>
+  1571359132U, // <u,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+  1571359234U, // <u,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+  1512108295U, // <u,7,3,5>: Cost 2 vext1 <5,u,7,3>, <5,u,7,3>
+  1518080992U, // <u,7,3,6>: Cost 2 vext1 <6,u,7,3>, <6,u,7,3>
+  2640456465U, // <u,7,3,7>: Cost 3 vext2 <3,7,u,7>, <3,7,u,7>
+  1571359518U, // <u,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+  1571359634U, // <u,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+  2573911067U, // <u,7,4,1>: Cost 3 vext1 <3,u,7,4>, <1,3,u,7>
+  2645101622U, // <u,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
+  2573912918U, // <u,7,4,3>: Cost 3 vext1 <3,u,7,4>, <3,u,7,4>
+  1571359952U, // <u,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+  497618248U, // <u,7,4,5>: Cost 1 vext2 RHS, RHS
+  1571360116U, // <u,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  2645102024U, // <u,7,4,7>: Cost 3 vext2 RHS, <4,7,5,0>
+  497618473U, // <u,7,4,u>: Cost 1 vext2 RHS, RHS
+  2645102152U, // <u,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
+  1571360464U, // <u,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  2645102334U, // <u,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
+  2645102447U, // <u,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
+  1571360710U, // <u,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+  1571360772U, // <u,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+  1571360866U, // <u,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
+  1571360936U, // <u,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+  1571361017U, // <u,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
+  1530044518U, // <u,7,6,0>: Cost 2 vext1 <u,u,7,6>, LHS
+  2645103016U, // <u,7,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
+  1571361274U, // <u,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  2645103154U, // <u,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
+  1530047798U, // <u,7,6,4>: Cost 2 vext1 <u,u,7,6>, RHS
+  1188386474U, // <u,7,6,5>: Cost 2 vrev <7,u,5,6>
+  1571361592U, // <u,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
+  1571361614U, // <u,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+  1571361695U, // <u,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
+  1571361786U, // <u,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
+  2573935616U, // <u,7,7,1>: Cost 3 vext1 <3,u,7,7>, <1,3,5,7>
+  2645103781U, // <u,7,7,2>: Cost 3 vext2 RHS, <7,2,2,2>
+  2573937497U, // <u,7,7,3>: Cost 3 vext1 <3,u,7,7>, <3,u,7,7>
+  1571362150U, // <u,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
+  1512141067U, // <u,7,7,5>: Cost 2 vext1 <5,u,7,7>, <5,u,7,7>
+  1518113764U, // <u,7,7,6>: Cost 2 vext1 <6,u,7,7>, <6,u,7,7>
+  363253046U, // <u,7,7,7>: Cost 1 vdup3 RHS
+  363253046U, // <u,7,7,u>: Cost 1 vdup3 RHS
+  1571362515U, // <u,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
+  497620782U, // <u,7,u,1>: Cost 1 vext2 RHS, LHS
+  1571362693U, // <u,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
+  1571362748U, // <u,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+  1571362879U, // <u,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
+  497621146U, // <u,7,u,5>: Cost 1 vext2 RHS, RHS
+  1571363024U, // <u,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
+  363253046U, // <u,7,u,7>: Cost 1 vdup3 RHS
+  497621349U, // <u,7,u,u>: Cost 1 vext2 RHS, LHS
+  135053414U, // <u,u,0,0>: Cost 1 vdup0 LHS
+  471081121U, // <u,u,0,1>: Cost 1 vext2 LHS, LHS
+  1544822948U, // <u,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  1616140005U, // <u,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
+  1544823122U, // <u,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  1512157453U, // <u,u,0,5>: Cost 2 vext1 <5,u,u,0>, <5,u,u,0>
+  1662220032U, // <u,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
+  1194457487U, // <u,u,0,7>: Cost 2 vrev <u,u,7,0>
+  471081629U, // <u,u,0,u>: Cost 1 vext2 LHS, LHS
+  1544823542U, // <u,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  202162278U, // <u,u,1,1>: Cost 1 vdup1 LHS
+  537753390U, // <u,u,1,2>: Cost 1 vext3 LHS, LHS
+  1544823768U, // <u,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  1494248758U, // <u,u,1,4>: Cost 2 vext1 <2,u,u,1>, RHS
+  1544823952U, // <u,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  1518138343U, // <u,u,1,6>: Cost 2 vext1 <6,u,u,1>, <6,u,u,1>
+  1640322907U, // <u,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
+  537753444U, // <u,u,1,u>: Cost 1 vext3 LHS, LHS
+  1482309734U, // <u,u,2,0>: Cost 2 vext1 <0,u,u,2>, LHS
+  1194031451U, // <u,u,2,1>: Cost 2 vrev <u,u,1,2>
+  269271142U, // <u,u,2,2>: Cost 1 vdup2 LHS
+  835584U, // <u,u,2,3>: Cost 0 copy LHS
+  1482313014U, // <u,u,2,4>: Cost 2 vext1 <0,u,u,2>, RHS
+  2618566504U, // <u,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+  1544824762U, // <u,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  1638479788U, // <u,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
+  835584U, // <u,u,2,u>: Cost 0 copy LHS
+  408576723U, // <u,u,3,0>: Cost 1 vext1 LHS, LHS
+  1482318582U, // <u,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  120371557U, // <u,u,3,2>: Cost 1 vrev LHS
+  336380006U, // <u,u,3,3>: Cost 1 vdup3 LHS
+  408579382U, // <u,u,3,4>: Cost 1 vext1 LHS, RHS
+  1616140271U, // <u,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
+  1530098170U, // <u,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1880329544U, // <u,u,3,7>: Cost 2 vzipr LHS, RHS
+  408581934U, // <u,u,3,u>: Cost 1 vext1 LHS, LHS
+  1488298086U, // <u,u,4,0>: Cost 2 vext1 <1,u,u,4>, LHS
+  1488299437U, // <u,u,4,1>: Cost 2 vext1 <1,u,u,4>, <1,u,u,4>
+  1659271204U, // <u,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
+  1194195311U, // <u,u,4,3>: Cost 2 vrev <u,u,3,4>
+  161926454U, // <u,u,4,4>: Cost 1 vdup0 RHS
+  471084342U, // <u,u,4,5>: Cost 1 vext2 LHS, RHS
+  1571368308U, // <u,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  1640323153U, // <u,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
+  471084585U, // <u,u,4,u>: Cost 1 vext2 LHS, RHS
+  1494278246U, // <u,u,5,0>: Cost 2 vext1 <2,u,u,5>, LHS
+  1571368656U, // <u,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  1494280327U, // <u,u,5,2>: Cost 2 vext1 <2,u,u,5>, <2,u,u,5>
+  1616140415U, // <u,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
+  1494281526U, // <u,u,5,4>: Cost 2 vext1 <2,u,u,5>, RHS
+  229035318U, // <u,u,5,5>: Cost 1 vdup1 RHS
+  537753754U, // <u,u,5,6>: Cost 1 vext3 LHS, RHS
+  1750355254U, // <u,u,5,7>: Cost 2 vuzpr LHS, RHS
+  537753772U, // <u,u,5,u>: Cost 1 vext3 LHS, RHS
+  1482342502U, // <u,u,6,0>: Cost 2 vext1 <0,u,u,6>, LHS
+  2556084982U, // <u,u,6,1>: Cost 3 vext1 <0,u,u,6>, <1,0,3,2>
+  1571369466U, // <u,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  1611938000U, // <u,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
+  1482345782U, // <u,u,6,4>: Cost 2 vext1 <0,u,u,6>, RHS
+  1194359171U, // <u,u,6,5>: Cost 2 vrev <u,u,5,6>
+  296144182U, // <u,u,6,6>: Cost 1 vdup2 RHS
+  27705344U, // <u,u,6,7>: Cost 0 copy RHS
+  27705344U, // <u,u,6,u>: Cost 0 copy RHS
+  432496742U, // <u,u,7,0>: Cost 1 vext1 RHS, LHS
+  1488324016U, // <u,u,7,1>: Cost 2 vext1 <1,u,u,7>, <1,u,u,7>
+  1494296713U, // <u,u,7,2>: Cost 2 vext1 <2,u,u,7>, <2,u,u,7>
+  1906901148U, // <u,u,7,3>: Cost 2 vzipr RHS, LHS
+  432500283U, // <u,u,7,4>: Cost 1 vext1 RHS, RHS
+  1506242256U, // <u,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+  120699277U, // <u,u,7,6>: Cost 1 vrev RHS
+  363253046U, // <u,u,7,7>: Cost 1 vdup3 RHS
+  432502574U, // <u,u,7,u>: Cost 1 vext1 RHS, LHS
+  408617688U, // <u,u,u,0>: Cost 1 vext1 LHS, LHS
+  471086894U, // <u,u,u,1>: Cost 1 vext2 LHS, LHS
+  537753957U, // <u,u,u,2>: Cost 1 vext3 LHS, LHS
+  835584U, // <u,u,u,3>: Cost 0 copy LHS
+  408620342U, // <u,u,u,4>: Cost 1 vext1 LHS, RHS
+  471087258U, // <u,u,u,5>: Cost 1 vext2 LHS, RHS
+  537753997U, // <u,u,u,6>: Cost 1 vext3 LHS, RHS
+  27705344U, // <u,u,u,7>: Cost 0 copy RHS
+  835584U, // <u,u,u,u>: Cost 0 copy LHS
   0
 };
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
index 54bf82a..99418733 100644
--- a/lib/Target/ARM/ARMRegisterInfo.td
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -30,18 +30,6 @@ def ssub_0  : SubRegIndex;
 def ssub_1  : SubRegIndex;
 def ssub_2  : SubRegIndex; // In a Q reg.
 def ssub_3  : SubRegIndex;
-def ssub_4  : SubRegIndex; // In a QQ reg.
-def ssub_5  : SubRegIndex;
-def ssub_6  : SubRegIndex;
-def ssub_7  : SubRegIndex;
-def ssub_8  : SubRegIndex; // In a QQQQ reg.
-def ssub_9  : SubRegIndex;
-def ssub_10 : SubRegIndex;
-def ssub_11 : SubRegIndex;
-def ssub_12 : SubRegIndex;
-def ssub_13 : SubRegIndex;
-def ssub_14 : SubRegIndex;
-def ssub_15 : SubRegIndex;
 
 def dsub_0 : SubRegIndex;
 def dsub_1 : SubRegIndex;
@@ -169,43 +157,28 @@ def Q15 : ARMReg<15, "q15", [D30, D31]>;
 // starting D register number doesn't have to be multiple of 4, e.g.,
 // D1, D2, D3, D4 would be a legal quad, but that would make the subregister
 // stuff very messy.
-let SubRegIndices = [qsub_0, qsub_1] in {
-let CompositeIndices = [(dsub_2 qsub_1, dsub_0), (dsub_3 qsub_1, dsub_1),
-                        (ssub_4 qsub_1, ssub_0), (ssub_5 qsub_1, ssub_1),
-                        (ssub_6 qsub_1, ssub_2), (ssub_7 qsub_1, ssub_3)] in {
+let SubRegIndices = [qsub_0, qsub_1],
+ CompositeIndices = [(dsub_2 qsub_1, dsub_0), (dsub_3 qsub_1, dsub_1)] in {
 def QQ0 : ARMReg<0, "qq0", [Q0,  Q1]>;
 def QQ1 : ARMReg<1, "qq1", [Q2,  Q3]>;
 def QQ2 : ARMReg<2, "qq2", [Q4,  Q5]>;
 def QQ3 : ARMReg<3, "qq3", [Q6,  Q7]>;
-}
-let CompositeIndices = [(dsub_2 qsub_1, dsub_0), (dsub_3 qsub_1, dsub_1)] in {
 def QQ4 : ARMReg<4, "qq4", [Q8,  Q9]>;
 def QQ5 : ARMReg<5, "qq5", [Q10, Q11]>;
 def QQ6 : ARMReg<6, "qq6", [Q12, Q13]>;
 def QQ7 : ARMReg<7, "qq7", [Q14, Q15]>;
 }
-}
 
 // Pseudo 512-bit registers to represent four consecutive Q registers.
-let SubRegIndices = [qqsub_0, qqsub_1] in {
-let CompositeIndices = [(qsub_2  qqsub_1, qsub_0), (qsub_3  qqsub_1, qsub_1),
-                        (dsub_4  qqsub_1, dsub_0), (dsub_5  qqsub_1, dsub_1),
-                        (dsub_6  qqsub_1, dsub_2), (dsub_7  qqsub_1, dsub_3),
-                        (ssub_8  qqsub_1, ssub_0), (ssub_9  qqsub_1, ssub_1),
-                        (ssub_10 qqsub_1, ssub_2), (ssub_11 qqsub_1, ssub_3),
-                        (ssub_12 qqsub_1, ssub_4), (ssub_13 qqsub_1, ssub_5),
-                        (ssub_14 qqsub_1, ssub_6), (ssub_15 qqsub_1, ssub_7)] in
-{
+let SubRegIndices = [qqsub_0, qqsub_1],
+ CompositeIndices = [(qsub_2  qqsub_1, qsub_0), (qsub_3  qqsub_1, qsub_1),
+                     (dsub_4  qqsub_1, dsub_0), (dsub_5  qqsub_1, dsub_1),
+                     (dsub_6  qqsub_1, dsub_2), (dsub_7  qqsub_1, dsub_3)] in {
 def QQQQ0 : ARMReg<0, "qqqq0", [QQ0, QQ1]>;
 def QQQQ1 : ARMReg<1, "qqqq1", [QQ2, QQ3]>;
-}
-let CompositeIndices = [(qsub_2 qqsub_1, qsub_0), (qsub_3 qqsub_1, qsub_1),
-                        (dsub_4 qqsub_1, dsub_0), (dsub_5 qqsub_1, dsub_1),
-                        (dsub_6 qqsub_1, dsub_2), (dsub_7 qqsub_1, dsub_3)] in {
 def QQQQ2 : ARMReg<2, "qqqq2", [QQ4, QQ5]>;
 def QQQQ3 : ARMReg<3, "qqqq3", [QQ6, QQ7]>;
 }
-}
 
 // Current Program Status Register.
 def CPSR    : ARMReg<0, "cpsr">;
@@ -567,4 +540,6 @@ def QQQQPR : RegisterClass<"ARM", [v8i64],
 }
 
 // Condition code registers.
-def CCR : RegisterClass<"ARM", [i32], 32, [CPSR]>;
+def CCR : RegisterClass<"ARM", [i32], 32, [CPSR]> {
+  let isAllocatable = 0;
+}
diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index aa1e398..ef0aaf2 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -13,6 +13,8 @@
 
 #define DEBUG_TYPE "arm-selectiondag-info"
 #include "ARMTargetMachine.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/CodeGen/SelectionDAG.h"
 using namespace llvm;
 
 ARMSelectionDAGInfo::ARMSelectionDAGInfo(const TargetMachine &TM)
@@ -132,3 +134,65 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
   }
   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
 }
+
+// Adjust parameters for memset, EABI uses format (ptr, size, value),
+// GNU library uses (ptr, value, size)
+// See RTABI section 4.3.4
+SDValue
+ARMSelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+                                             SDValue Chain, SDValue Dst,
+                                             SDValue Src, SDValue Size,
+                                             unsigned Align, bool isVolatile,
+                                             MachinePointerInfo DstPtrInfo) const
+{
+  // Use default for non AAPCS subtargets
+  if (!Subtarget->isAAPCS_ABI())
+    return SDValue();
+
+  const ARMTargetLowering &TLI =
+    *static_cast<const ARMTargetLowering*>(DAG.getTarget().getTargetLowering());
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+
+  // First argument: data pointer
+  const Type *IntPtrTy = TLI.getTargetData()->getIntPtrType(*DAG.getContext());
+  Entry.Node = Dst;
+  Entry.Ty = IntPtrTy;
+  Args.push_back(Entry);
+
+  // Second argument: buffer size
+  Entry.Node = Size;
+  Entry.Ty = IntPtrTy;
+  Entry.isSExt = false;
+  Args.push_back(Entry);
+
+  // Extend or truncate the argument to be an i32 value for the call.
+  if (Src.getValueType().bitsGT(MVT::i32))
+    Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
+  else
+    Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);
+
+  // Third argument: value to fill
+  Entry.Node = Src;
+  Entry.Ty = Type::getInt32Ty(*DAG.getContext());
+  Entry.isSExt = true;
+  Args.push_back(Entry);
+
+  // Emit __eabi_memset call
+  std::pair<SDValue,SDValue> CallResult =
+    TLI.LowerCallTo(Chain,
+                    Type::getVoidTy(*DAG.getContext()), // return type
+                    false, // return sign ext
+                    false, // return zero ext
+                    false, // is var arg
+                    false, // is in regs
+                    0,     // number of fixed arguments
+                    TLI.getLibcallCallingConv(RTLIB::MEMSET), // call conv
+                    false, // is tail call
+                    false, // is return val used
+                    DAG.getExternalSymbol(TLI.getLibcallName(RTLIB::MEMSET),
+                                          TLI.getPointerTy()), // callee
+                    Args, DAG, dl); // arg list, DAG and debug
+
+  return CallResult.second;
+}
diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.h b/lib/Target/ARM/ARMSelectionDAGInfo.h
index 7533690..ec1bf5c 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.h
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.h
@@ -35,6 +35,15 @@ public:
                                   bool isVolatile, bool AlwaysInline,
                                   MachinePointerInfo DstPtrInfo,
                                   MachinePointerInfo SrcPtrInfo) const;
+
+  // Adjust parameters for memset, see RTABI section 4.3.4
+  virtual
+  SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+                                  SDValue Chain,
+                                  SDValue Op1, SDValue Op2,
+                                  SDValue Op3, unsigned Align,
+                                  bool isVolatile,
+                                  MachinePointerInfo DstPtrInfo) const;
 };
 
 }
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 29ecc18..4bc12c9 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -15,6 +15,7 @@
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCParser/MCAsmParser.h"
 #include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCExpr.h"
@@ -1240,6 +1241,8 @@ tryParseMSRMaskOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
         FlagsVal = 0; // No flag
     }
   } else if (SpecReg == "cpsr" || SpecReg == "spsr") {
+    if (Flags == "all") // cpsr_all is an alias for cpsr_fc
+      Flags = "fc";
     for (int i = 0, e = Flags.size(); i != e; ++i) {
       unsigned Flag = StringSwitch<unsigned>(Flags.substr(i, 1))
       .Case("c", 1)
@@ -1827,10 +1830,11 @@ GetMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet,
       Mnemonic == "rrx" || Mnemonic == "ror" || Mnemonic == "sub" ||
       Mnemonic == "smull" || Mnemonic == "add" || Mnemonic == "adc" ||
       Mnemonic == "mul" || Mnemonic == "bic" || Mnemonic == "asr" ||
-      Mnemonic == "umlal" || Mnemonic == "orr" || Mnemonic == "mov" ||
+      Mnemonic == "umlal" || Mnemonic == "orr" || Mnemonic == "mvn" ||
       Mnemonic == "rsb" || Mnemonic == "rsc" || Mnemonic == "orn" ||
       Mnemonic == "sbc" || Mnemonic == "mla" || Mnemonic == "umull" ||
-      Mnemonic == "eor" || Mnemonic == "smlal" || Mnemonic == "mvn") {
+      Mnemonic == "eor" || Mnemonic == "smlal" ||
+      (Mnemonic == "mov" && !isThumb)) {
     CanAcceptCarrySet = true;
   } else {
     CanAcceptCarrySet = false;
@@ -1849,7 +1853,8 @@ GetMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet,
 
   if (isThumb)
     if (Mnemonic == "bkpt" || Mnemonic == "mcr" || Mnemonic == "mcrr" ||
-        Mnemonic == "mrc" || Mnemonic == "mrrc" || Mnemonic == "cdp")
+        Mnemonic == "mrc" || Mnemonic == "mrrc" || Mnemonic == "cdp" ||
+        Mnemonic == "mov")
       CanAcceptPredicationCode = false;
 }
 
@@ -2099,15 +2104,29 @@ bool ARMAsmParser::ParseDirectiveThumb(SMLoc L) {
 /// ParseDirectiveThumbFunc
 ///  ::= .thumbfunc symbol_name
 bool ARMAsmParser::ParseDirectiveThumbFunc(SMLoc L) {
-  const AsmToken &Tok = Parser.getTok();
-  if (Tok.isNot(AsmToken::Identifier) && Tok.isNot(AsmToken::String))
-    return Error(L, "unexpected token in .thumb_func directive");
-  StringRef Name = Tok.getString();
-  Parser.Lex(); // Consume the identifier token.
+  const MCAsmInfo &MAI = getParser().getStreamer().getContext().getAsmInfo();
+  bool isMachO = MAI.hasSubsectionsViaSymbols();
+  StringRef Name;
+
+  // Darwin asm has function name after .thumb_func direction
+  // ELF doesn't
+  if (isMachO) {
+    const AsmToken &Tok = Parser.getTok();
+    if (Tok.isNot(AsmToken::Identifier) && Tok.isNot(AsmToken::String))
+      return Error(L, "unexpected token in .thumb_func directive");
+    Name = Tok.getString();
+    Parser.Lex(); // Consume the identifier token.
+  }
+
   if (getLexer().isNot(AsmToken::EndOfStatement))
     return Error(L, "unexpected token in directive");
   Parser.Lex();
 
+  // FIXME: assuming function name will be the line following .thumb_func
+  if (!isMachO) {
+    Name = Parser.getTok().getString();
+  }
+
   // Mark symbol as a thumb symbol.
   MCSymbol *Func = getParser().getContext().GetOrCreateSymbol(Name);
   getParser().getStreamer().EmitThumbFunc(Func);
diff --git a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp
index 642829c..271ca8c 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp
+++ b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp
@@ -895,8 +895,9 @@ static bool DisassembleBrFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
 }
 
 // Misc. Branch Instructions.
-// BLX, BLXi, BX
-// BX, BX_RET
+// BX_RET, MOVPCLR
+// BLX, BLX_pred, BX, BX_pred
+// BLXi
 static bool DisassembleBrMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
     unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
 
@@ -913,7 +914,7 @@ static bool DisassembleBrMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
 
   // BLX and BX take one GPR reg.
   if (Opcode == ARM::BLX || Opcode == ARM::BLX_pred ||
-      Opcode == ARM::BX) {
+      Opcode == ARM::BX || Opcode == ARM::BX_pred) {
     assert(NumOps >= 1 && OpInfo[OpIdx].RegClass == ARM::GPRRegClassID &&
            "Reg operand expected");
     MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
diff --git a/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h b/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h
index 8d39982..9639c8a 100644
--- a/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h
+++ b/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h
@@ -476,6 +476,7 @@ static bool DisassembleThumb1DP(MCInst &MI, unsigned Opcode, uint32_t insn,
 // tADDhirr: Rd Rd(TIED_TO) Rm
 // tCMPhir:  Rd Rm
 // tMOVr, tMOVgpr2gpr, tMOVgpr2tgpr, tMOVtgpr2gpr: Rd|tRd Rm|tRn
+// tBX: Rm
 // tBX_RET: 0 operand
 // tBX_RET_vararg: Rm
 // tBLXr_r9: Rm
@@ -488,16 +489,25 @@ static bool DisassembleThumb1Special(MCInst &MI, unsigned Opcode, uint32_t insn,
     return true;
 
   // BX/BLX/tBRIND (indirect branch, i.e, mov pc, Rm) has 1 reg operand: Rm.
-  if (Opcode==ARM::tBLXr_r9 || Opcode==ARM::tBX_Rm || Opcode==ARM::tBRIND) {
-    if (Opcode != ARM::tBRIND) {
+  if (Opcode==ARM::tBLXr_r9 || Opcode==ARM::tBX || Opcode==ARM::tBRIND) {
+    if (Opcode == ARM::tBLXr_r9) {
       // Handling the two predicate operands before the reg operand.
       if (!B->DoPredicateOperands(MI, Opcode, insn, NumOps))
         return false;
       NumOpsAdded += 2;
     }
+
     MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
                                                        getT1Rm(insn))));
     NumOpsAdded += 1;
+
+    if (Opcode == ARM::tBX) {
+      // Handling the two predicate operands after the reg operand.
+      if (!B->DoPredicateOperands(MI, Opcode, insn, NumOps))
+        return false;
+      NumOpsAdded += 2;
+    }
+
     return true;
   }
 
@@ -957,7 +967,7 @@ static bool DisassembleThumb1CondBr(MCInst &MI, unsigned Opcode, uint32_t insn,
 
   unsigned Imm8 = getT1Imm8(insn);
   MI.addOperand(MCOperand::CreateImm(
-                  Opcode == ARM::tBcc ? SignExtend32<9>(Imm8 << 1) + 4
+                  Opcode == ARM::tBcc ? SignExtend32<9>(Imm8 << 1)
                                       : (int)Imm8));
 
   // Predicate operands by ARMBasicMCBuilder::TryPredicateAndSBitModifier().
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
index fc2aa75..8ae87f8 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
@@ -29,8 +29,8 @@ StringRef ARMInstPrinter::getOpcodeName(unsigned Opcode) const {
   return getInstructionName(Opcode);
 }
 
-StringRef ARMInstPrinter::getRegName(unsigned RegNo) const {
-  return getRegisterName(RegNo);
+void ARMInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  OS << getRegisterName(RegNo);
 }
 
 void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O) {
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
index b3ac03a..bde0eb9 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
@@ -28,7 +28,7 @@ public:
 
   virtual void printInst(const MCInst *MI, raw_ostream &O);
   virtual StringRef getOpcodeName(unsigned Opcode) const;
-  virtual StringRef getRegName(unsigned RegNo) const;
+  virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
 
   static const char *getInstructionName(unsigned Opcode);
 
diff --git a/lib/Target/ARM/Thumb1RegisterInfo.cpp b/lib/Target/ARM/Thumb1RegisterInfo.cpp
index 33cefb6..6bf5650 100644
--- a/lib/Target/ARM/Thumb1RegisterInfo.cpp
+++ b/lib/Target/ARM/Thumb1RegisterInfo.cpp
@@ -49,7 +49,7 @@ Thumb1RegisterInfo::Thumb1RegisterInfo(const ARMBaseInstrInfo &tii,
 const TargetRegisterClass*
 Thumb1RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC)
                                                                          const {
-  if (RC == ARM::tGPRRegisterClass || RC->hasSuperClass(ARM::tGPRRegisterClass))
+  if (ARM::tGPRRegClass.hasSubClassEq(RC))
     return ARM::tGPRRegisterClass;
   return ARMBaseRegisterInfo::getLargestLegalSuperClass(RC);
 }
diff --git a/lib/Target/Alpha/AlphaISelLowering.cpp b/lib/Target/Alpha/AlphaISelLowering.cpp
index ee404f0..0875cfd 100644
--- a/lib/Target/Alpha/AlphaISelLowering.cpp
+++ b/lib/Target/Alpha/AlphaISelLowering.cpp
@@ -155,6 +155,8 @@ AlphaTargetLowering::AlphaTargetLowering(TargetMachine &TM)
   setJumpBufSize(272);
   setJumpBufAlignment(16);
 
+  setMinFunctionAlignment(4);
+
   computeRegisterProperties();
 }
 
@@ -180,11 +182,6 @@ const char *AlphaTargetLowering::getTargetNodeName(unsigned Opcode) const {
   }
 }
 
-/// getFunctionAlignment - Return the Log2 alignment of this function.
-unsigned AlphaTargetLowering::getFunctionAlignment(const Function *F) const {
-  return 4;
-}
-
 static SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) {
   EVT PtrVT = Op.getValueType();
   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
@@ -233,8 +230,8 @@ AlphaTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
 
   CCInfo.AnalyzeCallOperands(Outs, CC_Alpha);
 
@@ -347,8 +344,8 @@ AlphaTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
 
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
-                 *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), RVLocs, *DAG.getContext());
 
   CCInfo.AnalyzeCallResult(Ins, RetCC_Alpha);
 
diff --git a/lib/Target/Alpha/AlphaISelLowering.h b/lib/Target/Alpha/AlphaISelLowering.h
index cb98f92..d38c314 100644
--- a/lib/Target/Alpha/AlphaISelLowering.h
+++ b/lib/Target/Alpha/AlphaISelLowering.h
@@ -104,9 +104,6 @@ namespace llvm {
 
     virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
 
-    /// getFunctionAlignment - Return the Log2 alignment of this function.
-    virtual unsigned getFunctionAlignment(const Function *F) const;
-
     /// isFPImmLegal - Returns true if the target can instruction select the
     /// specified FP immediate natively. If false, the legalizer will
     /// materialize the FP immediate as a load from a constant pool.
diff --git a/lib/Target/Alpha/AlphaRegisterInfo.cpp b/lib/Target/Alpha/AlphaRegisterInfo.cpp
index 7667fd8..d6c3809 100644
--- a/lib/Target/Alpha/AlphaRegisterInfo.cpp
+++ b/lib/Target/Alpha/AlphaRegisterInfo.cpp
@@ -69,6 +69,7 @@ const unsigned* AlphaRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
 BitVector AlphaRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
   Reserved.set(Alpha::R15);
+  Reserved.set(Alpha::R29);
   Reserved.set(Alpha::R30);
   Reserved.set(Alpha::R31);
   return Reserved;
@@ -198,6 +199,11 @@ int AlphaRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
   return -1;
 }
 
+int AlphaRegisterInfo::getLLVMRegNum(unsigned DwarfRegNum, bool isEH) const {
+  llvm_unreachable("What is the dwarf register number");
+  return -1;
+}
+
 #include "AlphaGenRegisterInfo.inc"
 
 std::string AlphaRegisterInfo::getPrettyName(unsigned reg)
diff --git a/lib/Target/Alpha/AlphaRegisterInfo.h b/lib/Target/Alpha/AlphaRegisterInfo.h
index b0d4dd0..ffe6cf1 100644
--- a/lib/Target/Alpha/AlphaRegisterInfo.h
+++ b/lib/Target/Alpha/AlphaRegisterInfo.h
@@ -48,6 +48,7 @@ struct AlphaRegisterInfo : public AlphaGenRegisterInfo {
   unsigned getEHHandlerRegister() const;
 
   int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+  int getLLVMRegNum(unsigned RegNum, bool isEH) const;
 
   static std::string getPrettyName(unsigned reg);
 };
diff --git a/lib/Target/Alpha/AlphaRegisterInfo.td b/lib/Target/Alpha/AlphaRegisterInfo.td
index 35e6804..d644f05 100644
--- a/lib/Target/Alpha/AlphaRegisterInfo.td
+++ b/lib/Target/Alpha/AlphaRegisterInfo.td
@@ -121,51 +121,18 @@ def GPRC : RegisterClass<"Alpha", [i64], 64,
      // Non-volatile
      R9, R10, R11, R12, R13, R14,
 // Don't allocate 15, 30, 31
-     R15, R30, R31 ]> //zero
-{
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    GPRCClass::iterator
-    GPRCClass::allocation_order_end(const MachineFunction &MF) const {
-        return end()-3;
-    }
-  }];
-}
+     R15, R30, R31 ]>; //zero
 
 def F4RC : RegisterClass<"Alpha", [f32], 64, [F0, F1, 
         F10, F11, F12, F13, F14, F15, F16, F17, F18, F19,
         F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30,
         // Saved:
         F2, F3, F4, F5, F6, F7, F8, F9,
-        F31 ]> //zero
-{
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    F4RCClass::iterator
-    F4RCClass::allocation_order_end(const MachineFunction &MF) const {
-        return end()-1;
-    }
-  }];
-}
+        F31 ]>; //zero
 
 def F8RC : RegisterClass<"Alpha", [f64], 64, [F0, F1, 
         F10, F11, F12, F13, F14, F15, F16, F17, F18, F19,
         F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30,
         // Saved:
         F2, F3, F4, F5, F6, F7, F8, F9,
-        F31 ]> //zero
-{
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    F8RCClass::iterator
-    F8RCClass::allocation_order_end(const MachineFunction &MF) const {
-        return end()-1;
-    }
-  }];
-}
+        F31 ]>; //zero
diff --git a/lib/Target/Blackfin/BlackfinFrameLowering.cpp b/lib/Target/Blackfin/BlackfinFrameLowering.cpp
index 08bb952..0b0984d 100644
--- a/lib/Target/Blackfin/BlackfinFrameLowering.cpp
+++ b/lib/Target/Blackfin/BlackfinFrameLowering.cpp
@@ -31,6 +31,12 @@ bool BlackfinFrameLowering::hasFP(const MachineFunction &MF) const {
     MFI->adjustsStack() || MFI->hasVarSizedObjects();
 }
 
+// Always reserve a call frame. We dont have enough registers to adjust SP.
+bool BlackfinFrameLowering::
+hasReservedCallFrame(const MachineFunction &MF) const {
+  return true;
+}
+
 // Emit a prologue that sets up a stack frame.
 // On function entry, R0-R2 and P0 may hold arguments.
 // R3, P1, and P2 may be used as scratch registers
diff --git a/lib/Target/Blackfin/BlackfinFrameLowering.h b/lib/Target/Blackfin/BlackfinFrameLowering.h
index 3d2ee25..726fa2c 100644
--- a/lib/Target/Blackfin/BlackfinFrameLowering.h
+++ b/lib/Target/Blackfin/BlackfinFrameLowering.h
@@ -36,6 +36,7 @@ public:
   void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
 
   bool hasFP(const MachineFunction &MF) const;
+  bool hasReservedCallFrame(const MachineFunction &MF) const;
 
   void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
                                             RegScavenger *RS) const;
diff --git a/lib/Target/Blackfin/BlackfinISelDAGToDAG.cpp b/lib/Target/Blackfin/BlackfinISelDAGToDAG.cpp
index 9df2aee..42659ae 100644
--- a/lib/Target/Blackfin/BlackfinISelDAGToDAG.cpp
+++ b/lib/Target/Blackfin/BlackfinISelDAGToDAG.cpp
@@ -117,11 +117,11 @@ bool BlackfinDAGToDAGISel::SelectADDRspii(SDValue Addr,
 }
 
 static inline bool isCC(const TargetRegisterClass *RC) {
-  return RC == &BF::AnyCCRegClass || BF::AnyCCRegClass.hasSubClass(RC);
+  return BF::AnyCCRegClass.hasSubClassEq(RC);
 }
 
 static inline bool isDCC(const TargetRegisterClass *RC) {
-  return RC == &BF::DRegClass || BF::DRegClass.hasSubClass(RC) || isCC(RC);
+  return BF::DRegClass.hasSubClassEq(RC) || isCC(RC);
 }
 
 static void UpdateNodeOperand(SelectionDAG &DAG,
diff --git a/lib/Target/Blackfin/BlackfinISelLowering.cpp b/lib/Target/Blackfin/BlackfinISelLowering.cpp
index 1e1f8c9..588d9bd 100644
--- a/lib/Target/Blackfin/BlackfinISelLowering.cpp
+++ b/lib/Target/Blackfin/BlackfinISelLowering.cpp
@@ -121,6 +121,8 @@ BlackfinTargetLowering::BlackfinTargetLowering(TargetMachine &TM)
   setOperationAction(ISD::VAEND, MVT::Other, Expand);
   setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
   setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+
+  setMinFunctionAlignment(2);
 }
 
 const char *BlackfinTargetLowering::getTargetNodeName(unsigned Opcode) const {
@@ -169,8 +171,8 @@ BlackfinTargetLowering::LowerFormalArguments(SDValue Chain,
   MachineFrameInfo *MFI = MF.getFrameInfo();
 
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
   CCInfo.AllocateStack(12, 4);  // ABI requires 12 bytes stack space
   CCInfo.AnalyzeFormalArguments(Ins, CC_Blackfin);
 
@@ -227,8 +229,8 @@ BlackfinTargetLowering::LowerReturn(SDValue Chain,
   SmallVector<CCValAssign, 16> RVLocs;
 
   // CCState - Info about the registers and stack slot.
-  CCState CCInfo(CallConv, isVarArg, DAG.getTarget(),
-                 RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 DAG.getTarget(), RVLocs, *DAG.getContext());
 
   // Analize return values.
   CCInfo.AnalyzeReturn(Outs, RetCC_Blackfin);
@@ -288,8 +290,8 @@ BlackfinTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getTarget(), ArgLocs,
-                 *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 DAG.getTarget(), ArgLocs, *DAG.getContext());
   CCInfo.AllocateStack(12, 4);  // ABI requires 12 bytes stack space
   CCInfo.AnalyzeCallOperands(Outs, CC_Blackfin);
 
@@ -376,8 +378,8 @@ BlackfinTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
 
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState RVInfo(CallConv, isVarArg, DAG.getTarget(), RVLocs,
-                 *DAG.getContext());
+  CCState RVInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 DAG.getTarget(), RVLocs, *DAG.getContext());
 
   RVInfo.AnalyzeCallResult(Ins, RetCC_Blackfin);
 
@@ -497,11 +499,6 @@ BlackfinTargetLowering::ReplaceNodeResults(SDNode *N,
   }
 }
 
-/// getFunctionAlignment - Return the Log2 alignment of this function.
-unsigned BlackfinTargetLowering::getFunctionAlignment(const Function *F) const {
-  return 2;
-}
-
 //===----------------------------------------------------------------------===//
 //                         Blackfin Inline Assembly Support
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/Blackfin/BlackfinISelLowering.h b/lib/Target/Blackfin/BlackfinISelLowering.h
index 102c830..9a54557 100644
--- a/lib/Target/Blackfin/BlackfinISelLowering.h
+++ b/lib/Target/Blackfin/BlackfinISelLowering.h
@@ -53,7 +53,6 @@ namespace llvm {
                                       EVT VT) const;
     virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
     const char *getTargetNodeName(unsigned Opcode) const;
-    unsigned getFunctionAlignment(const Function *F) const;
 
   private:
     SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/Blackfin/BlackfinInstrInfo.cpp b/lib/Target/Blackfin/BlackfinInstrInfo.cpp
index e50d57a..598cf2a 100644
--- a/lib/Target/Blackfin/BlackfinInstrInfo.cpp
+++ b/lib/Target/Blackfin/BlackfinInstrInfo.cpp
@@ -160,7 +160,7 @@ static bool inClass(const TargetRegisterClass &Test,
   if (TargetRegisterInfo::isPhysicalRegister(Reg))
     return Test.contains(Reg);
   else
-    return &Test==RC || Test.hasSubClass(RC);
+    return Test.hasSubClassEq(RC);
 }
 
 void
diff --git a/lib/Target/Blackfin/BlackfinRegisterInfo.cpp b/lib/Target/Blackfin/BlackfinRegisterInfo.cpp
index b4a9b84..6ca460e 100644
--- a/lib/Target/Blackfin/BlackfinRegisterInfo.cpp
+++ b/lib/Target/Blackfin/BlackfinRegisterInfo.cpp
@@ -351,5 +351,11 @@ int BlackfinRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
   return -1;
 }
 
+int BlackfinRegisterInfo::getLLVMRegNum(unsigned DwarfRegNum,
+                                        bool isEH) const {
+  llvm_unreachable("What is the dwarf register number");
+  return -1;
+}
+
 #include "BlackfinGenRegisterInfo.inc"
 
diff --git a/lib/Target/Blackfin/BlackfinRegisterInfo.h b/lib/Target/Blackfin/BlackfinRegisterInfo.h
index 642b8ad..375d277 100644
--- a/lib/Target/Blackfin/BlackfinRegisterInfo.h
+++ b/lib/Target/Blackfin/BlackfinRegisterInfo.h
@@ -41,8 +41,6 @@ namespace llvm {
       return &BF::PRegClass;
     }
 
-    // bool hasReservedCallFrame(MachineFunction &MF) const;
-
     bool requiresRegisterScavenging(const MachineFunction &MF) const;
 
     void eliminateCallFramePseudoInstr(MachineFunction &MF,
@@ -60,6 +58,7 @@ namespace llvm {
     unsigned getEHHandlerRegister() const;
 
     int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+    int getLLVMRegNum(unsigned RegNum, bool isEH) const;
 
     // Utility functions
     void adjustRegister(MachineBasicBlock &MBB,
diff --git a/lib/Target/Blackfin/BlackfinRegisterInfo.td b/lib/Target/Blackfin/BlackfinRegisterInfo.td
index f5dd439..d8fd302 100644
--- a/lib/Target/Blackfin/BlackfinRegisterInfo.td
+++ b/lib/Target/Blackfin/BlackfinRegisterInfo.td
@@ -245,18 +245,6 @@ def D : RegisterClass<"BF", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, R7]> {
 
 def P : RegisterClass<"BF", [i32], 32, [P0, P1, P2, P3, P4, P5, FP, SP]> {
   let SubRegClasses = [(P16L lo16), (P16H hi16)];
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    PClass::iterator
-    PClass::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      return allocation_order_begin(MF)
-             + (TFI->hasFP(MF) ? 7 : 6);
-    }
-  }];
 }
 
 def I : RegisterClass<"BF", [i32], 32, [I0, I1, I2, I3]>;
@@ -268,18 +256,6 @@ def DP : RegisterClass<"BF", [i32], 32,
     [R0, R1, R2, R3, R4, R5, R6, R7,
      P0, P1, P2, P3, P4, P5, FP, SP]> {
   let SubRegClasses = [(DP16L lo16), (DP16H hi16)];
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    DPClass::iterator
-    DPClass::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      return allocation_order_begin(MF)
-             + (TFI->hasFP(MF) ? 15 : 14);
-    }
-  }];
 }
 
 def GR : RegisterClass<"BF", [i32], 32,
@@ -287,20 +263,7 @@ def GR : RegisterClass<"BF", [i32], 32,
      P0, P1, P2, P3, P4, P5,
      I0, I1, I2, I3, M0, M1, M2, M3,
      B0, B1, B2, B3, L0, L1, L2, L3,
-     FP, SP]> {
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    GRClass::iterator
-    GRClass::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      return allocation_order_begin(MF)
-             + (TFI->hasFP(MF) ? 31 : 30);
-    }
-  }];
-}
+     FP, SP]>;
 
 def ALL : RegisterClass<"BF", [i32], 32,
     [R0, R1, R2, R3, R4, R5, R6, R7,
@@ -310,36 +273,10 @@ def ALL : RegisterClass<"BF", [i32], 32,
      FP, SP,
      A0X, A0W, A1X, A1W, ASTAT, RETS,
      LC0, LT0, LB0, LC1, LT1, LB1, CYCLES, CYCLES2,
-     USP, SEQSTAT, SYSCFG, RETI, RETX, RETN, RETE, EMUDAT]> {
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    ALLClass::iterator
-    ALLClass::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      return allocation_order_begin(MF)
-             + (TFI->hasFP(MF) ? 31 : 30);
-    }
-  }];
-}
+     USP, SEQSTAT, SYSCFG, RETI, RETX, RETN, RETE, EMUDAT]>;
 
 def PI : RegisterClass<"BF", [i32], 32,
-    [P0, P1, P2, P3, P4, P5, I0, I1, I2, I3, FP, SP]> {
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    PIClass::iterator
-    PIClass::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      return allocation_order_begin(MF)
-             + (TFI->hasFP(MF) ? 11 : 10);
-    }
-  }];
-}
+    [P0, P1, P2, P3, P4, P5, I0, I1, I2, I3, FP, SP]>;
 
 // We are going to pretend that CC and !CC are 32-bit registers, even though
 // they only can hold 1 bit.
diff --git a/lib/Target/CBackend/CBackend.cpp b/lib/Target/CBackend/CBackend.cpp
index 358d1b3..fde2e29 100644
--- a/lib/Target/CBackend/CBackend.cpp
+++ b/lib/Target/CBackend/CBackend.cpp
@@ -373,7 +373,7 @@ static std::string CBEMangle(const std::string &S) {
 ///
 bool CBackendNameAllUsedStructsAndMergeFunctions::runOnModule(Module &M) {
   // Get a set of types that are used by the program...
-  std::set<const Type *> UT = getAnalysis<FindUsedTypes>().getTypes();
+  SetVector<const Type *> UT = getAnalysis<FindUsedTypes>().getTypes();
 
   // Loop over the module symbol table, removing types from UT that are
   // already named, and removing names for types that are not used.
@@ -390,11 +390,10 @@ bool CBackendNameAllUsedStructsAndMergeFunctions::runOnModule(Module &M) {
       TST.remove(I);
     } else {
       // If this is not used, remove it from the symbol table.
-      std::set<const Type *>::iterator UTI = UT.find(I->second);
-      if (UTI == UT.end())
+      if (!UT.count(I->second))
         TST.remove(I);
       else
-        UT.erase(UTI);    // Only keep one name for this type.
+        UT.remove(I->second); // Only keep one name for this type.
     }
   }
 
@@ -403,7 +402,7 @@ bool CBackendNameAllUsedStructsAndMergeFunctions::runOnModule(Module &M) {
   //
   bool Changed = false;
   unsigned RenameCounter = 0;
-  for (std::set<const Type *>::const_iterator I = UT.begin(), E = UT.end();
+  for (SetVector<const Type *>::const_iterator I = UT.begin(), E = UT.end();
        I != E; ++I)
     if ((*I)->isStructTy() || (*I)->isArrayTy()) {
       while (M.addTypeName("unnamed"+utostr(RenameCounter), *I))
diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp
index 8668da3..f9b5041 100644
--- a/lib/Target/CellSPU/SPUISelLowering.cpp
+++ b/lib/Target/CellSPU/SPUISelLowering.cpp
@@ -445,6 +445,8 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
   setTargetDAGCombine(ISD::SIGN_EXTEND);
   setTargetDAGCombine(ISD::ANY_EXTEND);
 
+  setMinFunctionAlignment(3);
+
   computeRegisterProperties();
 
   // Set pre-RA register scheduler default to BURR, which produces slightly
@@ -489,11 +491,6 @@ SPUTargetLowering::getTargetNodeName(unsigned Opcode) const
   return ((i != node_names.end()) ? i->second : 0);
 }
 
-/// getFunctionAlignment - Return the Log2 alignment of this function.
-unsigned SPUTargetLowering::getFunctionAlignment(const Function *) const {
-  return 3;
-}
-
 //===----------------------------------------------------------------------===//
 // Return the Cell SPU's SETCC result type
 //===----------------------------------------------------------------------===//
@@ -1120,8 +1117,8 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain,
   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
 
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
-                 *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
   // FIXME: allow for other calling conventions
   CCInfo.AnalyzeFormalArguments(Ins, CCC_SPU);
 
@@ -1218,7 +1215,7 @@ SPUTargetLowering::LowerFormalArguments(SDValue Chain,
       FuncInfo->setVarArgsFrameIndex(
         MFI->CreateFixedObject(StackSlotSize, ArgOffset, true));
       SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
-      unsigned VReg = MF.addLiveIn(ArgRegs[ArgRegIdx], &SPU::R32CRegClass);
+      unsigned VReg = MF.addLiveIn(ArgRegs[ArgRegIdx], &SPU::VECREGRegClass);
       SDValue ArgVal = DAG.getRegister(VReg, MVT::v16i8);
       SDValue Store = DAG.getStore(Chain, dl, ArgVal, FIN, MachinePointerInfo(),
                                    false, false, 0);
@@ -1267,8 +1264,8 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
   unsigned StackSlotSize = SPUFrameLowering::stackSlotSize();
 
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
-                 *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
   // FIXME: allow for other calling conventions
   CCInfo.AnalyzeCallOperands(Outs, CCC_SPU);
 
@@ -1428,8 +1425,8 @@ SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
 
   // Now handle the return value(s)
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCRetInfo(CallConv, isVarArg, getTargetMachine(),
-                    RVLocs, *DAG.getContext());
+  CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		    getTargetMachine(), RVLocs, *DAG.getContext());
   CCRetInfo.AnalyzeCallResult(Ins, CCC_SPU);
 
 
@@ -1455,8 +1452,8 @@ SPUTargetLowering::LowerReturn(SDValue Chain,
                                DebugLoc dl, SelectionDAG &DAG) const {
 
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), RVLocs, *DAG.getContext());
   CCInfo.AnalyzeReturn(Outs, RetCC_SPU);
 
   // If this is the first return lowered for this function, add the regs to the
@@ -3207,11 +3204,11 @@ SPUTargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
 // LowerAsmOperandForConstraint
 void
 SPUTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
-                                                char ConstraintLetter,
+                                                std::string &Constraint,
                                                 std::vector<SDValue> &Ops,
                                                 SelectionDAG &DAG) const {
   // Default, for the time being, to the base class handler
-  TargetLowering::LowerAsmOperandForConstraint(Op, ConstraintLetter, Ops, DAG);
+  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
 }
 
 /// isLegalAddressImmediate - Return true if the integer value can be used
diff --git a/lib/Target/CellSPU/SPUISelLowering.h b/lib/Target/CellSPU/SPUISelLowering.h
index cf883e2..d23f6cc 100644
--- a/lib/Target/CellSPU/SPUISelLowering.h
+++ b/lib/Target/CellSPU/SPUISelLowering.h
@@ -141,7 +141,7 @@ namespace llvm {
       getRegForInlineAsmConstraint(const std::string &Constraint,
                                    EVT VT) const;
 
-    void LowerAsmOperandForConstraint(SDValue Op, char ConstraintLetter,
+    void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
                                       std::vector<SDValue> &Ops,
                                       SelectionDAG &DAG) const;
 
@@ -152,9 +152,6 @@ namespace llvm {
 
     virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
 
-    /// getFunctionAlignment - Return the Log2 alignment of this function.
-    virtual unsigned getFunctionAlignment(const Function *F) const;
-
     virtual SDValue
       LowerFormalArguments(SDValue Chain,
                            CallingConv::ID CallConv, bool isVarArg,
diff --git a/lib/Target/CellSPU/SPURegisterInfo.cpp b/lib/Target/CellSPU/SPURegisterInfo.cpp
index 0bdd50a..623ae76 100644
--- a/lib/Target/CellSPU/SPURegisterInfo.cpp
+++ b/lib/Target/CellSPU/SPURegisterInfo.cpp
@@ -328,6 +328,10 @@ SPURegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
   return SPUGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
 }
 
+int SPURegisterInfo::getLLVMRegNum(unsigned RegNum, bool isEH) const {
+  return SPUGenRegisterInfo::getLLVMRegNumFull(RegNum, 0);
+}
+
 int
 SPURegisterInfo::convertDFormToXForm(int dFormOpcode) const
 {
diff --git a/lib/Target/CellSPU/SPURegisterInfo.h b/lib/Target/CellSPU/SPURegisterInfo.h
index 1708c59..6ecf0f2 100644
--- a/lib/Target/CellSPU/SPURegisterInfo.h
+++ b/lib/Target/CellSPU/SPURegisterInfo.h
@@ -83,6 +83,7 @@ namespace llvm {
 
     //! Get DWARF debugging register number
     int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+    int getLLVMRegNum(unsigned RegNum, bool isEH) const;
 
     //! Convert D-form load/store to X-form load/store
     /*!
diff --git a/lib/Target/CellSPU/SPURegisterInfo.td b/lib/Target/CellSPU/SPURegisterInfo.td
index 3e8f097..cce0c82 100644
--- a/lib/Target/CellSPU/SPURegisterInfo.td
+++ b/lib/Target/CellSPU/SPURegisterInfo.td
@@ -170,23 +170,7 @@ def GPRC : RegisterClass<"SPU", [i128], 128,
    R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87,
    R86, R85, R84, R83, R82, R81, R80, 
    /* environment ptr, SP, LR */ 
-   R2, R1, R0 ]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    GPRCClass::iterator
-    GPRCClass::allocation_order_begin(const MachineFunction &MF) const {
-      return begin();
-    }
-    GPRCClass::iterator
-    GPRCClass::allocation_order_end(const MachineFunction &MF) const {
-      return end()-3;  // don't allocate R2, R1, or R0 (envp, sp, lr)
-    }
-  }];
-}
+   R2, R1, R0 ]>;
 
 // The SPU's registers as 64-bit wide (double word integer) "preferred slot":
 def R64C : RegisterClass<"SPU", [i64], 128,
@@ -204,23 +188,7 @@ def R64C : RegisterClass<"SPU", [i64], 128,
    R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87,
    R86, R85, R84, R83, R82, R81, R80, 
    /* environment ptr, SP, LR */ 
-   R2, R1, R0 ]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    R64CClass::iterator
-    R64CClass::allocation_order_begin(const MachineFunction &MF) const {
-      return begin();
-    }
-    R64CClass::iterator
-    R64CClass::allocation_order_end(const MachineFunction &MF) const {
-      return end()-3;  // don't allocate R2, R1, or R0 (envp, sp, lr)
-    }
-  }];
-}
+   R2, R1, R0 ]>;
 
 // The SPU's registers as 64-bit wide (double word) FP "preferred slot":
 def R64FP : RegisterClass<"SPU", [f64], 128,
@@ -238,23 +206,7 @@ def R64FP : RegisterClass<"SPU", [f64], 128,
    R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87,
    R86, R85, R84, R83, R82, R81, R80, 
    /* environment ptr, SP, LR */ 
-   R2, R1, R0 ]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    R64FPClass::iterator
-    R64FPClass::allocation_order_begin(const MachineFunction &MF) const {
-      return begin();
-    }
-    R64FPClass::iterator
-    R64FPClass::allocation_order_end(const MachineFunction &MF) const {
-      return end()-3;  // don't allocate R2, R1, or R0 (envp, sp, lr)
-    }
-  }];
-}
+   R2, R1, R0 ]>;
 
 // The SPU's registers as 32-bit wide (word) "preferred slot":
 def R32C : RegisterClass<"SPU", [i32], 128,
@@ -272,23 +224,7 @@ def R32C : RegisterClass<"SPU", [i32], 128,
    R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87,
    R86, R85, R84, R83, R82, R81, R80, 
    /* environment ptr, SP, LR */ 
-   R2, R1, R0 ]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    R32CClass::iterator
-    R32CClass::allocation_order_begin(const MachineFunction &MF) const {
-      return begin();
-    }
-    R32CClass::iterator
-    R32CClass::allocation_order_end(const MachineFunction &MF) const {
-      return end()-3;  // don't allocate R2, R1, or R0 (envp, sp, lr)
-    }
-  }];
-}
+   R2, R1, R0 ]>;
 
 // The SPU's registers as single precision floating point "preferred slot":
 def R32FP : RegisterClass<"SPU", [f32], 128,
@@ -306,23 +242,7 @@ def R32FP : RegisterClass<"SPU", [f32], 128,
    R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87,
    R86, R85, R84, R83, R82, R81, R80, 
    /* environment ptr, SP, LR */ 
-   R2, R1, R0 ]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    R32FPClass::iterator
-    R32FPClass::allocation_order_begin(const MachineFunction &MF) const {
-      return begin();
-    }
-    R32FPClass::iterator
-    R32FPClass::allocation_order_end(const MachineFunction &MF) const {
-      return end()-3;  // don't allocate R2, R1, or R0 (envp, sp, lr)
-    }
-  }];
-}
+   R2, R1, R0 ]>;
 
 // The SPU's registers as 16-bit wide (halfword) "preferred slot":
 def R16C : RegisterClass<"SPU", [i16], 128,
@@ -340,23 +260,7 @@ def R16C : RegisterClass<"SPU", [i16], 128,
    R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87,
    R86, R85, R84, R83, R82, R81, R80, 
    /* environment ptr, SP, LR */ 
-   R2, R1, R0 ]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    R16CClass::iterator
-    R16CClass::allocation_order_begin(const MachineFunction &MF) const {
-      return begin();
-    }
-    R16CClass::iterator
-    R16CClass::allocation_order_end(const MachineFunction &MF) const {
-      return end()-3;  // don't allocate R2, R1, or R0 (envp, sp, lr)
-    }
-  }];
-}
+   R2, R1, R0 ]>;
 
 // The SPU's registers as 8-bit wide (byte) "preferred slot":
 def R8C : RegisterClass<"SPU", [i8], 128,
@@ -374,23 +278,7 @@ def R8C : RegisterClass<"SPU", [i8], 128,
    R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87,
    R86, R85, R84, R83, R82, R81, R80, 
    /* environment ptr, SP, LR */ 
-   R2, R1, R0 ]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    R8CClass::iterator
-    R8CClass::allocation_order_begin(const MachineFunction &MF) const {
-      return begin();
-    }
-    R8CClass::iterator
-    R8CClass::allocation_order_end(const MachineFunction &MF) const {
-      return end()-3;  // don't allocate R2, R1, or R0 (envp, sp, lr)
-    }
-  }];
-}
+   R2, R1, R0 ]>;
 
 // The SPU's registers as vector registers:
 def VECREG : RegisterClass<"SPU",
@@ -410,20 +298,4 @@ def VECREG : RegisterClass<"SPU",
    R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87,
    R86, R85, R84, R83, R82, R81, R80, 
    /* environment ptr, SP, LR */ 
-   R2, R1, R0 ]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    VECREGClass::iterator
-    VECREGClass::allocation_order_begin(const MachineFunction &MF) const {
-      return begin();
-    }
-    VECREGClass::iterator
-    VECREGClass::allocation_order_end(const MachineFunction &MF) const {
-      return end()-3;  // don't allocate R2, R1, or R0 (envp, sp, lr)
-    }
-  }];
-}
+   R2, R1, R0 ]>;
diff --git a/lib/Target/MBlaze/MBlazeISelLowering.cpp b/lib/Target/MBlaze/MBlazeISelLowering.cpp
index 21a5988..c5e0a89 100644
--- a/lib/Target/MBlaze/MBlazeISelLowering.cpp
+++ b/lib/Target/MBlaze/MBlazeISelLowering.cpp
@@ -180,6 +180,8 @@ MBlazeTargetLowering::MBlazeTargetLowering(MBlazeTargetMachine &TM)
   setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 
+  setMinFunctionAlignment(2);
+
   setStackPointerRegisterToSaveRestore(MBlaze::R1);
   computeRegisterProperties();
 }
@@ -188,11 +190,6 @@ MVT::SimpleValueType MBlazeTargetLowering::getSetCCResultType(EVT VT) const {
   return MVT::i32;
 }
 
-/// getFunctionAlignment - Return the Log2 alignment of this function.
-unsigned MBlazeTargetLowering::getFunctionAlignment(const Function *) const {
-  return 2;
-}
-
 SDValue MBlazeTargetLowering::LowerOperation(SDValue Op,
                                              SelectionDAG &DAG) const {
   switch (Op.getOpcode())
@@ -420,7 +417,7 @@ MBlazeTargetLowering::EmitCustomAtomic(MachineInstr *MI,
   // All atomic instructions on the Microblaze are implemented using the
   // load-linked / store-conditional style atomic instruction sequences.
   // Thus, all operations will look something like the following:
-  // 
+  //
   //  start:
   //    lwx     RV, RP, 0
   //    <do stuff>
@@ -701,8 +698,8 @@ LowerCall(SDValue Chain, SDValue Callee, CallingConv::ID CallConv,
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
-                 *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
   CCInfo.AnalyzeCallOperands(Outs, CC_MBlaze);
 
   // Get a count of how many bytes are to be pushed on the stack.
@@ -840,8 +837,8 @@ LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv,
                 SmallVectorImpl<SDValue> &InVals) const {
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), RVLocs, *DAG.getContext());
 
   CCInfo.AnalyzeCallResult(Ins, RetCC_MBlaze);
 
@@ -883,8 +880,8 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
 
   CCInfo.AnalyzeFormalArguments(Ins, CC_MBlaze);
   SDValue StackPtr;
@@ -1015,8 +1012,8 @@ LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
   SmallVector<CCValAssign, 16> RVLocs;
 
   // CCState - Info about the registers and stack slot.
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), RVLocs, *DAG.getContext());
 
   // Analize return values.
   CCInfo.AnalyzeReturn(Outs, RetCC_MBlaze);
@@ -1046,9 +1043,9 @@ LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
 
   // If this function is using the interrupt_handler calling convention
   // then use "rtid r14, 0" otherwise use "rtsd r15, 8"
-  unsigned Ret = (CallConv == llvm::CallingConv::MBLAZE_INTR) ? MBlazeISD::IRet 
+  unsigned Ret = (CallConv == llvm::CallingConv::MBLAZE_INTR) ? MBlazeISD::IRet
                                                               : MBlazeISD::Ret;
-  unsigned Reg = (CallConv == llvm::CallingConv::MBLAZE_INTR) ? MBlaze::R14 
+  unsigned Reg = (CallConv == llvm::CallingConv::MBLAZE_INTR) ? MBlaze::R14
                                                               : MBlaze::R15;
   SDValue DReg = DAG.getRegister(Reg, MVT::i32);
 
diff --git a/lib/Target/MBlaze/MBlazeISelLowering.h b/lib/Target/MBlaze/MBlazeISelLowering.h
index 91649bc..265c1a7 100644
--- a/lib/Target/MBlaze/MBlazeISelLowering.h
+++ b/lib/Target/MBlaze/MBlazeISelLowering.h
@@ -104,7 +104,6 @@ namespace llvm {
     /// getSetCCResultType - get the ISD::SETCC result ValueType
     MVT::SimpleValueType getSetCCResultType(EVT VT) const;
 
-    virtual unsigned getFunctionAlignment(const Function *F) const;
   private:
     // Subtarget Info
     const MBlazeSubtarget *Subtarget;
diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.td b/lib/Target/MBlaze/MBlazeInstrInfo.td
index 896e8ea..950f2d7 100644
--- a/lib/Target/MBlaze/MBlazeInstrInfo.td
+++ b/lib/Target/MBlaze/MBlazeInstrInfo.td
@@ -245,20 +245,25 @@ class PatCmp<bits<6> op, bits<11> flags, string instr_asm> :
 //===----------------------------------------------------------------------===//
 // Memory Access Instructions
 //===----------------------------------------------------------------------===//
+
+let mayLoad = 1 in {
 class LoadM<bits<6> op, bits<11> flags, string instr_asm> :
             TA<op, flags, (outs GPR:$dst), (ins memrr:$addr),
                !strconcat(instr_asm, "   $dst, $addr"),
                [], IIC_MEMl>;
+}
 
 class LoadMI<bits<6> op, string instr_asm, PatFrag OpNode> :
              TB<op, (outs GPR:$dst), (ins memri:$addr),
                 !strconcat(instr_asm, "   $dst, $addr"),
                 [(set (i32 GPR:$dst), (OpNode iaddr:$addr))], IIC_MEMl>;
 
+let mayStore = 1 in {
 class StoreM<bits<6> op, bits<11> flags, string instr_asm> :
              TA<op, flags, (outs), (ins GPR:$dst, memrr:$addr),
                 !strconcat(instr_asm, "   $dst, $addr"),
                 [], IIC_MEMs>;
+}
 
 class StoreMI<bits<6> op, string instr_asm, PatFrag OpNode> :
               TB<op, (outs), (ins GPR:$dst, memri:$addr),
diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp
index ed8511d..517279f 100644
--- a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp
+++ b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp
@@ -356,5 +356,9 @@ int MBlazeRegisterInfo::getDwarfRegNum(unsigned RegNo, bool isEH) const {
   return MBlazeGenRegisterInfo::getDwarfRegNumFull(RegNo,0);
 }
 
+int MBlazeRegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const {
+  return MBlazeGenRegisterInfo::getLLVMRegNumFull(DwarfRegNo,0);
+}
+
 #include "MBlazeGenRegisterInfo.inc"
 
diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.h b/lib/Target/MBlaze/MBlazeRegisterInfo.h
index 69ec5aa..3807839 100644
--- a/lib/Target/MBlaze/MBlazeRegisterInfo.h
+++ b/lib/Target/MBlaze/MBlazeRegisterInfo.h
@@ -75,6 +75,7 @@ struct MBlazeRegisterInfo : public MBlazeGenRegisterInfo {
   unsigned getEHHandlerRegister() const;
 
   int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+  int getLLVMRegNum(unsigned RegNum, bool isEH) const;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.td b/lib/Target/MBlaze/MBlazeRegisterInfo.td
index 1a695a7..bd396ed 100644
--- a/lib/Target/MBlaze/MBlazeRegisterInfo.td
+++ b/lib/Target/MBlaze/MBlazeRegisterInfo.td
@@ -43,7 +43,7 @@ let Namespace = "MBlaze" in {
   def R1  : MBlazeGPRReg< 1,  "r1">,   DwarfRegNum<[1]>;
   def R2  : MBlazeGPRReg< 2,  "r2">,   DwarfRegNum<[2]>;
   def R3  : MBlazeGPRReg< 3,  "r3">,   DwarfRegNum<[3]>;
-  def R4  : MBlazeGPRReg< 4,  "r4">,   DwarfRegNum<[5]>;
+  def R4  : MBlazeGPRReg< 4,  "r4">,   DwarfRegNum<[4]>;
   def R5  : MBlazeGPRReg< 5,  "r5">,   DwarfRegNum<[5]>;
   def R6  : MBlazeGPRReg< 6,  "r6">,   DwarfRegNum<[6]>;
   def R7  : MBlazeGPRReg< 7,  "r7">,   DwarfRegNum<[7]>;
@@ -102,7 +102,7 @@ let Namespace = "MBlaze" in {
   // The carry bit. In the Microblaze this is really bit 29 of the
   // MSR register but this is the only bit of that register that we
   // are interested in modeling.
-  def CARRY  : MBlazeSPRReg<0x0000, "rmsr[c]">, DwarfRegNum<[33]>;
+  def CARRY  : MBlazeSPRReg<0x0000, "rmsr[c]">;
 }
 
 //===----------------------------------------------------------------------===//
@@ -131,19 +131,7 @@ def GPR : RegisterClass<"MBlaze", [i32,f32], 32,
   R17, // Return address for exceptions
   R18, // Reserved for assembler
   R19  // The frame-pointer
-  ]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    GPRClass::iterator
-    GPRClass::allocation_order_end(const MachineFunction &MF) const {
-      // The last 10 registers on the list above are reserved
-      return end()-10;
-    }
-  }];
-}
+  ]>;
 
 def SPR : RegisterClass<"MBlaze", [i32], 32,
   [
@@ -174,16 +162,8 @@ def SPR : RegisterClass<"MBlaze", [i32], 32,
   RPVR11
   ]>
 {
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    SPRClass::iterator
-    SPRClass::allocation_order_end(const MachineFunction &MF) const {
-      // None of the special purpose registers are allocatable.
-      return end()-24;
-    }
-  }];
+  // None of the special purpose registers are allocatable.
+  let isAllocatable = 0;
 }
 
 def CRC : RegisterClass<"MBlaze", [i32], 32, [CARRY]> {
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index 006785b..0a3eab1 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -170,6 +170,9 @@ MSP430TargetLowering::MSP430TargetLowering(MSP430TargetMachine &tm) :
     setLibcallName(RTLIB::MUL_I8,  "__mulqi3hw_noint");
     setLibcallName(RTLIB::MUL_I16, "__mulhi3hw_noint");
   }
+
+  setMinFunctionAlignment(1);
+  setPrefFunctionAlignment(2);
 }
 
 SDValue MSP430TargetLowering::LowerOperation(SDValue Op,
@@ -193,11 +196,6 @@ SDValue MSP430TargetLowering::LowerOperation(SDValue Op,
   }
 }
 
-/// getFunctionAlignment - Return the Log2 alignment of this function.
-unsigned MSP430TargetLowering::getFunctionAlignment(const Function *F) const {
-  return F->hasFnAttr(Attribute::OptimizeForSize) ? 1 : 2;
-}
-
 //===----------------------------------------------------------------------===//
 //                       MSP430 Inline Assembly Support
 //===----------------------------------------------------------------------===//
@@ -314,8 +312,8 @@ MSP430TargetLowering::LowerCCCArguments(SDValue Chain,
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
   CCInfo.AnalyzeFormalArguments(Ins, CC_MSP430);
 
   assert(!isVarArg && "Varargs not supported yet");
@@ -397,8 +395,8 @@ MSP430TargetLowering::LowerReturn(SDValue Chain,
   }
 
   // CCState - Info about the registers and stack slot.
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), RVLocs, *DAG.getContext());
 
   // Analize return values.
   CCInfo.AnalyzeReturn(Outs, RetCC_MSP430);
@@ -451,8 +449,8 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
                                      SmallVectorImpl<SDValue> &InVals) const {
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
 
   CCInfo.AnalyzeCallOperands(Outs, CC_MSP430);
 
@@ -574,8 +572,8 @@ MSP430TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
 
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), RVLocs, *DAG.getContext());
 
   CCInfo.AnalyzeCallResult(Ins, RetCC_MSP430);
 
diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h
index 19c9eac..bd660a0 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.h
+++ b/lib/Target/MSP430/MSP430ISelLowering.h
@@ -82,9 +82,6 @@ namespace llvm {
     /// DAG node.
     virtual const char *getTargetNodeName(unsigned Opcode) const;
 
-    /// getFunctionAlignment - Return the Log2 alignment of this function.
-    virtual unsigned getFunctionAlignment(const Function *F) const;
-
     SDValue LowerShifts(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp
index 1da6d8d..53f4c2e 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.cpp
+++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp
@@ -76,7 +76,11 @@ BitVector MSP430RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
   const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
 
-  // Mark 4 special registers as reserved.
+  // Mark 4 special registers with subregisters as reserved.
+  Reserved.set(MSP430::PCB);
+  Reserved.set(MSP430::SPB);
+  Reserved.set(MSP430::SRB);
+  Reserved.set(MSP430::CGB);
   Reserved.set(MSP430::PCW);
   Reserved.set(MSP430::SPW);
   Reserved.set(MSP430::SRW);
@@ -242,4 +246,9 @@ int MSP430RegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
   return 0;
 }
 
+int MSP430RegisterInfo::getLLVMRegNum(unsigned RegNum, bool isEH) const {
+  llvm_unreachable("Not implemented yet!");
+  return 0;
+}
+
 #include "MSP430GenRegisterInfo.inc"
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.h b/lib/Target/MSP430/MSP430RegisterInfo.h
index 56744fa..e820558 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.h
+++ b/lib/Target/MSP430/MSP430RegisterInfo.h
@@ -39,6 +39,13 @@ public:
   BitVector getReservedRegs(const MachineFunction &MF) const;
   const TargetRegisterClass* getPointerRegClass(unsigned Kind = 0) const;
 
+  const TargetRegisterClass *
+  getMatchingSuperRegClass(const TargetRegisterClass *A,
+                           const TargetRegisterClass *B, unsigned Idx) const {
+    // No sub-classes makes this really easy.
+    return A;
+  }
+
   void eliminateCallFramePseudoInstr(MachineFunction &MF,
                                      MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator I) const;
@@ -54,6 +61,7 @@ public:
 
   //! Get DWARF debugging register number
   int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+  int getLLVMRegNum(unsigned RegNum, bool isEH) const;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.td b/lib/Target/MSP430/MSP430RegisterInfo.td
index ab7b59b..3ef6ab2 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.td
+++ b/lib/Target/MSP430/MSP430RegisterInfo.td
@@ -70,25 +70,7 @@ def GR8 : RegisterClass<"MSP430", [i8], 8,
    // Frame pointer, sometimes allocable
    FPB,
    // Volatile, but not allocable
-   PCB, SPB, SRB, CGB]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    GR8Class::iterator
-    GR8Class::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      // Depending on whether the function uses frame pointer or not, last 5 or 4
-      // registers on the list above are reserved
-      if (TFI->hasFP(MF))
-        return end()-5;
-      else
-        return end()-4;
-    }
-  }];
-}
+   PCB, SPB, SRB, CGB]>;
 
 def GR16 : RegisterClass<"MSP430", [i16], 16,
    // Volatile registers
@@ -99,21 +81,5 @@ def GR16 : RegisterClass<"MSP430", [i16], 16,
    PCW, SPW, SRW, CGW]>
 {
   let SubRegClasses = [(GR8 subreg_8bit)];
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    GR16Class::iterator
-    GR16Class::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      // Depending on whether the function uses frame pointer or not, last 5 or 4
-      // registers on the list above are reserved
-      if (TFI->hasFP(MF))
-        return end()-5;
-      else
-        return end()-4;
-    }
-  }];
 }
 
diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt
index 8939b0a..fd16516 100644
--- a/lib/Target/Mips/CMakeLists.txt
+++ b/lib/Target/Mips/CMakeLists.txt
@@ -13,6 +13,7 @@ tablegen(MipsGenSubtarget.inc -gen-subtarget)
 add_llvm_target(MipsCodeGen
   MipsAsmPrinter.cpp
   MipsDelaySlotFiller.cpp
+  MipsEmitGPRestore.cpp
   MipsExpandPseudo.cpp
   MipsInstrInfo.cpp
   MipsISelDAGToDAG.cpp
diff --git a/lib/Target/Mips/Mips.h b/lib/Target/Mips/Mips.h
index 05b4c5a..76a26a9 100644
--- a/lib/Target/Mips/Mips.h
+++ b/lib/Target/Mips/Mips.h
@@ -26,6 +26,7 @@ namespace llvm {
   FunctionPass *createMipsISelDag(MipsTargetMachine &TM);
   FunctionPass *createMipsDelaySlotFillerPass(MipsTargetMachine &TM);
   FunctionPass *createMipsExpandPseudoPass(MipsTargetMachine &TM);
+  FunctionPass *createMipsEmitGPRestorePass(MipsTargetMachine &TM);
 
   extern Target TheMipsTarget;
   extern Target TheMipselTarget;
diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp
index 502f744..8caa7cd 100644
--- a/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -126,44 +126,60 @@ namespace {
 // Create a bitmask with all callee saved registers for CPU or Floating Point
 // registers. For CPU registers consider RA, GP and FP for saving if necessary.
 void MipsAsmPrinter::printSavedRegsBitmask(raw_ostream &O) {
-  const TargetFrameLowering *TFI = TM.getFrameLowering();
-  const TargetRegisterInfo *RI = TM.getRegisterInfo();
-  const MipsFunctionInfo *MipsFI = MF->getInfo<MipsFunctionInfo>();
-
   // CPU and FPU Saved Registers Bitmasks
-  unsigned int CPUBitmask = 0;
-  unsigned int FPUBitmask = 0;
+  unsigned CPUBitmask = 0, FPUBitmask = 0;
+  int CPUTopSavedRegOff, FPUTopSavedRegOff;
 
   // Set the CPU and FPU Bitmasks
   const MachineFrameInfo *MFI = MF->getFrameInfo();
   const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
-  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+  // size of stack area to which FP callee-saved regs are saved.
+  unsigned CPURegSize = Mips::CPURegsRegisterClass->getSize();
+  unsigned FGR32RegSize = Mips::FGR32RegisterClass->getSize();
+  unsigned AFGR64RegSize = Mips::AFGR64RegisterClass->getSize();
+  bool HasAFGR64Reg = false;
+  unsigned CSFPRegsSize = 0;
+  unsigned i, e = CSI.size();
+
+  // Set FPU Bitmask.
+  for (i = 0; i != e; ++i) {
     unsigned Reg = CSI[i].getReg();
-    unsigned RegNum = MipsRegisterInfo::getRegisterNumbering(Reg);
     if (Mips::CPURegsRegisterClass->contains(Reg))
-      CPUBitmask |= (1 << RegNum);
-    else
-      FPUBitmask |= (1 << RegNum);
+      break;
+
+    unsigned RegNum = MipsRegisterInfo::getRegisterNumbering(Reg);
+    if (Mips::AFGR64RegisterClass->contains(Reg)) {
+      FPUBitmask |= (3 << RegNum);
+      CSFPRegsSize += AFGR64RegSize;
+      HasAFGR64Reg = true;
+      continue;
+    }
+
+    FPUBitmask |= (1 << RegNum);
+    CSFPRegsSize += FGR32RegSize;
+  }
+
+  // Set CPU Bitmask.
+  for (; i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    unsigned RegNum = MipsRegisterInfo::getRegisterNumbering(Reg);
+    CPUBitmask |= (1 << RegNum);
   }
 
-  // Return Address and Frame registers must also be set in CPUBitmask.
-  // FIXME: Do we really need hasFP() call here? When no FP is present SP is
-  // just returned -- will it be ok?
-  if (TFI->hasFP(*MF))
-    CPUBitmask |= (1 << MipsRegisterInfo::
-                getRegisterNumbering(RI->getFrameRegister(*MF)));
+  // FP Regs are saved right below where the virtual frame pointer points to.
+  FPUTopSavedRegOff = FPUBitmask ?
+    (HasAFGR64Reg ? -AFGR64RegSize : -FGR32RegSize) : 0;
 
-  if (MFI->adjustsStack())
-    CPUBitmask |= (1 << MipsRegisterInfo::
-                getRegisterNumbering(RI->getRARegister()));
+  // CPU Regs are saved below FP Regs.
+  CPUTopSavedRegOff = CPUBitmask ? -CSFPRegsSize - CPURegSize : 0;
 
   // Print CPUBitmask
   O << "\t.mask \t"; printHex32(CPUBitmask, O);
-  O << ',' << MipsFI->getCPUTopSavedRegOff() << '\n';
+  O << ',' << CPUTopSavedRegOff << '\n';
 
   // Print FPUBitmask
-  O << "\t.fmask\t"; printHex32(FPUBitmask, O); O << ","
-    << MipsFI->getFPUTopSavedRegOff() << '\n';
+  O << "\t.fmask\t"; printHex32(FPUBitmask, O);
+  O << "," << FPUTopSavedRegOff << '\n';
 }
 
 // Print a 32 bit hex number with all numbers.
@@ -302,6 +318,10 @@ void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
   case MipsII::MO_GOT:      O << "%got(";    break;
   case MipsII::MO_ABS_HI:   O << "%hi(";     break;
   case MipsII::MO_ABS_LO:   O << "%lo(";     break;
+  case MipsII::MO_TLSGD:    O << "%tlsgd(";  break;
+  case MipsII::MO_GOTTPREL: O << "%gottprel("; break;
+  case MipsII::MO_TPREL_HI: O << "%tprel_hi("; break;
+  case MipsII::MO_TPREL_LO: O << "%tprel_lo("; break;
   }
 
   switch (MO.getType()) {
@@ -310,7 +330,7 @@ void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
       break;
 
     case MachineOperand::MO_Immediate:
-      O << (short int)MO.getImm();
+      O << MO.getImm();
       break;
 
     case MachineOperand::MO_MachineBasicBlock:
diff --git a/lib/Target/Mips/MipsEmitGPRestore.cpp b/lib/Target/Mips/MipsEmitGPRestore.cpp
new file mode 100644
index 0000000..f49d490
--- /dev/null
+++ b/lib/Target/Mips/MipsEmitGPRestore.cpp
@@ -0,0 +1,94 @@
+//===-- MipsEmitGPRestore.cpp - Emit GP restore instruction----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass emits instructions that restore $gp right
+// after jalr instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "emit-gp-restore"
+
+#include "Mips.h"
+#include "MipsTargetMachine.h"
+#include "MipsMachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+
+using namespace llvm;
+
+namespace {
+  struct Inserter : public MachineFunctionPass {
+
+    TargetMachine &TM;
+    const TargetInstrInfo *TII;
+
+    static char ID;
+    Inserter(TargetMachine &tm)
+      : MachineFunctionPass(ID), TM(tm), TII(tm.getInstrInfo()) { }
+
+    virtual const char *getPassName() const {
+      return "Mips Emit GP Restore";
+    }
+
+    bool runOnMachineFunction(MachineFunction &F);
+  };
+  char Inserter::ID = 0;
+} // end of anonymous namespace
+
+bool Inserter::runOnMachineFunction(MachineFunction &F) {
+  if (TM.getRelocationModel() != Reloc::PIC_)
+    return false;
+
+  bool Changed = false;
+  int FI =  F.getInfo<MipsFunctionInfo>()->getGPFI();
+
+  for (MachineFunction::iterator MFI = F.begin(), MFE = F.end();
+       MFI != MFE; ++MFI) {
+    MachineBasicBlock& MBB = *MFI;
+    MachineBasicBlock::iterator I = MFI->begin();
+
+    // If MBB is a landing pad, insert instruction that restores $gp after
+    // EH_LABEL.
+    if (MBB.isLandingPad()) {
+      // Find EH_LABEL first.
+      for (; I->getOpcode() != TargetOpcode::EH_LABEL; ++I) ;
+      
+      // Insert lw.
+      ++I;
+      DebugLoc dl = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
+      BuildMI(MBB, I, dl, TII->get(Mips::LW), Mips::GP).addImm(0)
+                                                       .addFrameIndex(FI);
+      Changed = true;
+    }
+
+    while (I != MFI->end()) {
+      if (I->getOpcode() != Mips::JALR) {
+        ++I;
+        continue;
+      }
+
+      DebugLoc dl = I->getDebugLoc();
+      // emit lw $gp, ($gp save slot on stack) after jalr
+      BuildMI(MBB, ++I, dl, TII->get(Mips::LW), Mips::GP).addImm(0)
+        .addFrameIndex(FI);
+      Changed = true;
+    }
+  } 
+
+  return Changed;
+}
+
+/// createMipsEmitGPRestorePass - Returns a pass that emits instructions that
+/// restores $gp clobbered by jalr instructions.
+FunctionPass *llvm::createMipsEmitGPRestorePass(MipsTargetMachine &tm) {
+  return new Inserter(tm);
+}
+
diff --git a/lib/Target/Mips/MipsFrameLowering.cpp b/lib/Target/Mips/MipsFrameLowering.cpp
index 21e3314..a0f90a0 100644
--- a/lib/Target/Mips/MipsFrameLowering.cpp
+++ b/lib/Target/Mips/MipsFrameLowering.cpp
@@ -84,125 +84,17 @@ using namespace llvm;
 // if frame pointer elimination is disabled.
 bool MipsFrameLowering::hasFP(const MachineFunction &MF) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
-  return DisableFramePointerElim(MF) || MFI->hasVarSizedObjects();
+  return DisableFramePointerElim(MF) || MFI->hasVarSizedObjects()
+      || MFI->isFrameAddressTaken();
 }
 
-void MipsFrameLowering::adjustMipsStackFrame(MachineFunction &MF) const {
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
-  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
-  unsigned StackAlign = getStackAlignment();
-  unsigned RegSize = STI.isGP32bit() ? 4 : 8;
-  bool HasGP = MipsFI->needGPSaveRestore();
-
-  // Min and Max CSI FrameIndex.
-  int MinCSFI = -1, MaxCSFI = -1;
-
-  // See the description at MipsMachineFunction.h
-  int TopCPUSavedRegOff = -1, TopFPUSavedRegOff = -1;
-
-  // Replace the dummy '0' SPOffset by the negative offsets, as explained on
-  // LowerFormalArguments. Leaving '0' for while is necessary to avoid the
-  // approach done by calculateFrameObjectOffsets to the stack frame.
-  MipsFI->adjustLoadArgsFI(MFI);
-  MipsFI->adjustStoreVarArgsFI(MFI);
-
-  // It happens that the default stack frame allocation order does not directly
-  // map to the convention used for mips. So we must fix it. We move the callee
-  // save register slots after the local variables area, as described in the
-  // stack frame above.
-  unsigned CalleeSavedAreaSize = 0;
-  if (!CSI.empty()) {
-    MinCSFI = CSI[0].getFrameIdx();
-    MaxCSFI = CSI[CSI.size()-1].getFrameIdx();
-  }
-  for (unsigned i = 0, e = CSI.size(); i != e; ++i)
-    CalleeSavedAreaSize += MFI->getObjectAlignment(CSI[i].getFrameIdx());
-
-  unsigned StackOffset = HasGP ? (MipsFI->getGPStackOffset()+RegSize)
-                : (STI.isABI_O32() ? 16 : 0);
-
-  // Adjust local variables. They should come on the stack right
-  // after the arguments.
-  int LastOffsetFI = -1;
-  for (int i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
-    if (i >= MinCSFI && i <= MaxCSFI)
-      continue;
-    if (MFI->isDeadObjectIndex(i))
-      continue;
-    unsigned Offset =
-      StackOffset + MFI->getObjectOffset(i) - CalleeSavedAreaSize;
-    if (LastOffsetFI == -1)
-      LastOffsetFI = i;
-    if (Offset > MFI->getObjectOffset(LastOffsetFI))
-      LastOffsetFI = i;
-    MFI->setObjectOffset(i, Offset);
-  }
-
-  // Adjust CPU Callee Saved Registers Area. Registers RA and FP must
-  // be saved in this CPU Area. This whole area must be aligned to the
-  // default Stack Alignment requirements.
-  if (LastOffsetFI >= 0)
-    StackOffset = MFI->getObjectOffset(LastOffsetFI)+
-                  MFI->getObjectSize(LastOffsetFI);
-  StackOffset = ((StackOffset+StackAlign-1)/StackAlign*StackAlign);
-
-  for (unsigned i = 0, e = CSI.size(); i != e ; ++i) {
-    unsigned Reg = CSI[i].getReg();
-    if (!Mips::CPURegsRegisterClass->contains(Reg))
-      break;
-    MFI->setObjectOffset(CSI[i].getFrameIdx(), StackOffset);
-    TopCPUSavedRegOff = StackOffset;
-    StackOffset += MFI->getObjectAlignment(CSI[i].getFrameIdx());
-  }
-
-  // Stack locations for FP and RA. If only one of them is used,
-  // the space must be allocated for both, otherwise no space at all.
-  if (hasFP(MF) || MFI->adjustsStack()) {
-    // FP stack location
-    MFI->setObjectOffset(MFI->CreateStackObject(RegSize, RegSize, true),
-                         StackOffset);
-    MipsFI->setFPStackOffset(StackOffset);
-    TopCPUSavedRegOff = StackOffset;
-    StackOffset += RegSize;
-
-    // SP stack location
-    MFI->setObjectOffset(MFI->CreateStackObject(RegSize, RegSize, true),
-                         StackOffset);
-    MipsFI->setRAStackOffset(StackOffset);
-    StackOffset += RegSize;
-
-    if (MFI->adjustsStack())
-      TopCPUSavedRegOff += RegSize;
-  }
-
-  StackOffset = ((StackOffset+StackAlign-1)/StackAlign*StackAlign);
-
-  // Adjust FPU Callee Saved Registers Area. This Area must be
-  // aligned to the default Stack Alignment requirements.
-  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
-    unsigned Reg = CSI[i].getReg();
-    if (Mips::CPURegsRegisterClass->contains(Reg))
-      continue;
-    MFI->setObjectOffset(CSI[i].getFrameIdx(), StackOffset);
-    TopFPUSavedRegOff = StackOffset;
-    StackOffset += MFI->getObjectAlignment(CSI[i].getFrameIdx());
-  }
-  StackOffset = ((StackOffset+StackAlign-1)/StackAlign*StackAlign);
-
-  // Update frame info
-  MFI->setStackSize(StackOffset);
-
-  // Recalculate the final tops offset. The final values must be '0'
-  // if there isn't a callee saved register for CPU or FPU, otherwise
-  // a negative offset is needed.
-  if (TopCPUSavedRegOff >= 0)
-    MipsFI->setCPUTopSavedRegOff(TopCPUSavedRegOff-StackOffset);
-
-  if (TopFPUSavedRegOff >= 0)
-    MipsFI->setFPUTopSavedRegOff(TopFPUSavedRegOff-StackOffset);
+bool MipsFrameLowering::targetHandlesStackFrameRounding() const {
+  return true;
 }
 
+static unsigned AlignOffset(unsigned Offset, unsigned Align) {
+  return (Offset + Align - 1) / Align * Align; 
+} 
 
 // expand pair of register and immediate if the immediate doesn't fit in the
 // 16-bit offset field.
@@ -228,7 +120,7 @@ static bool expandRegLargeImmPair(unsigned OrigReg, int OrigImm,
   MachineFunction* MF = MBB.getParent();
   const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
   DebugLoc DL = I->getDebugLoc();
-  int ImmLo = OrigImm & 0xffff;
+  int ImmLo = (short)(OrigImm & 0xffff);
   int ImmHi = (((unsigned)OrigImm & 0xffff0000) >> 16) +
               ((OrigImm & 0x8000) != 0);
 
@@ -258,18 +150,18 @@ void MipsFrameLowering::emitPrologue(MachineFunction &MF) const {
   int NewImm = 0;
   bool ATUsed;
 
-  // Get the right frame order for Mips.
-  adjustMipsStackFrame(MF);
-
-  // Get the number of bytes to allocate from the FrameInfo.
-  unsigned StackSize = MFI->getStackSize();
-
-  // No need to allocate space on the stack.
-  if (StackSize == 0 && !MFI->adjustsStack()) return;
-
-  int FPOffset = MipsFI->getFPStackOffset();
-  int RAOffset = MipsFI->getRAStackOffset();
-
+  // First, compute final stack size.
+  unsigned RegSize = STI.isGP32bit() ? 4 : 8;
+  unsigned StackAlign = getStackAlignment();
+  unsigned LocalVarAreaOffset = MipsFI->needGPSaveRestore() ? 
+    (MFI->getObjectOffset(MipsFI->getGPFI()) + RegSize) :
+    MipsFI->getMaxCallFrameSize();
+  unsigned StackSize = AlignOffset(LocalVarAreaOffset, StackAlign) +
+    AlignOffset(MFI->getStackSize(), StackAlign);
+
+   // Update stack size
+  MFI->setStackSize(StackSize); 
+  
   BuildMI(MBB, MBBI, dl, TII.get(Mips::NOREORDER));
 
   // TODO: check need from GP here.
@@ -278,6 +170,13 @@ void MipsFrameLowering::emitPrologue(MachineFunction &MF) const {
       .addReg(RegInfo->getPICCallReg());
   BuildMI(MBB, MBBI, dl, TII.get(Mips::NOMACRO));
 
+  // No need to allocate space on the stack.
+  if (StackSize == 0 && !MFI->adjustsStack()) return;
+
+  MachineModuleInfo &MMI = MF.getMMI();
+  std::vector<MachineMove> &Moves = MMI.getFrameMoves();
+  MachineLocation DstML, SrcML;
+
   // Adjust stack : addi sp, sp, (-imm)
   ATUsed = expandRegLargeImmPair(Mips::SP, -StackSize, NewReg, NewImm, MBB,
                                  MBBI);
@@ -288,97 +187,109 @@ void MipsFrameLowering::emitPrologue(MachineFunction &MF) const {
   if (ATUsed)
     BuildMI(MBB, MBBI, dl, TII.get(Mips::ATMACRO));
 
-  // Save the return address only if the function isn't a leaf one.
-  // sw  $ra, stack_loc($sp)
-  if (MFI->adjustsStack()) {
-    ATUsed = expandRegLargeImmPair(Mips::SP, RAOffset, NewReg, NewImm, MBB,
-                                   MBBI);
-    BuildMI(MBB, MBBI, dl, TII.get(Mips::SW))
-      .addReg(Mips::RA).addImm(NewImm).addReg(NewReg);
+  // emit ".cfi_def_cfa_offset StackSize"
+  MCSymbol *AdjustSPLabel = MMI.getContext().CreateTempSymbol();
+  BuildMI(MBB, MBBI, dl,
+          TII.get(TargetOpcode::PROLOG_LABEL)).addSym(AdjustSPLabel);
+  DstML = MachineLocation(MachineLocation::VirtualFP);
+  SrcML = MachineLocation(MachineLocation::VirtualFP, -StackSize);
+  Moves.push_back(MachineMove(AdjustSPLabel, DstML, SrcML));
 
-    // FIXME: change this when mips goes MC".
-    if (ATUsed)
-      BuildMI(MBB, MBBI, dl, TII.get(Mips::ATMACRO));
-  }
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
 
-  // if framepointer enabled, save it and set it
-  // to point to the stack pointer
+  if (CSI.size()) {
+    // Find the instruction past the last instruction that saves a callee-saved
+    // register to the stack.
+    for (unsigned i = 0; i < CSI.size(); ++i)
+      ++MBBI;
+ 
+    // Iterate over list of callee-saved registers and emit .cfi_offset
+    // directives.
+    MCSymbol *CSLabel = MMI.getContext().CreateTempSymbol();
+    BuildMI(MBB, MBBI, dl,
+            TII.get(TargetOpcode::PROLOG_LABEL)).addSym(CSLabel);
+ 
+    for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
+           E = CSI.end(); I != E; ++I) {
+      int64_t Offset = MFI->getObjectOffset(I->getFrameIdx());
+      unsigned Reg = I->getReg();
+
+      // If Reg is a double precision register, emit two cfa_offsets,
+      // one for each of the paired single precision registers.
+      if (Mips::AFGR64RegisterClass->contains(Reg)) {
+        const unsigned *SubRegs = RegInfo->getSubRegisters(Reg);
+        MachineLocation DstML0(MachineLocation::VirtualFP, Offset);
+        MachineLocation DstML1(MachineLocation::VirtualFP, Offset + 4);
+        MachineLocation SrcML0(*SubRegs);
+        MachineLocation SrcML1(*(SubRegs + 1));
+
+        if (!STI.isLittle())
+          std::swap(SrcML0, SrcML1);
+
+        Moves.push_back(MachineMove(CSLabel, DstML0, SrcML0));
+        Moves.push_back(MachineMove(CSLabel, DstML1, SrcML1));
+      }
+      else {
+        // Reg is either in CPURegs or FGR32.
+        DstML = MachineLocation(MachineLocation::VirtualFP, Offset);
+        SrcML = MachineLocation(Reg);
+        Moves.push_back(MachineMove(CSLabel, DstML, SrcML));
+      }
+    }
+  }    
+
+  // if framepointer enabled, set it to point to the stack pointer.
   if (hasFP(MF)) {
-    // sw  $fp,stack_loc($sp)
-    ATUsed = expandRegLargeImmPair(Mips::SP, FPOffset, NewReg, NewImm, MBB,
-                                   MBBI);
-    BuildMI(MBB, MBBI, dl, TII.get(Mips::SW))
-      .addReg(Mips::FP).addImm(NewImm).addReg(NewReg);
-
-    // FIXME: change this when mips goes MC".
-    if (ATUsed)
-      BuildMI(MBB, MBBI, dl, TII.get(Mips::ATMACRO));
-
-    // move $fp, $sp
+    // Insert instruction "move $fp, $sp" at this location.    
     BuildMI(MBB, MBBI, dl, TII.get(Mips::ADDu), Mips::FP)
       .addReg(Mips::SP).addReg(Mips::ZERO);
+
+    // emit ".cfi_def_cfa_register $fp" 
+    MCSymbol *SetFPLabel = MMI.getContext().CreateTempSymbol();
+    BuildMI(MBB, MBBI, dl,
+            TII.get(TargetOpcode::PROLOG_LABEL)).addSym(SetFPLabel);
+    DstML = MachineLocation(Mips::FP);
+    SrcML = MachineLocation(MachineLocation::VirtualFP);
+    Moves.push_back(MachineMove(SetFPLabel, DstML, SrcML));
   }
 
   // Restore GP from the saved stack location
   if (MipsFI->needGPSaveRestore())
     BuildMI(MBB, MBBI, dl, TII.get(Mips::CPRESTORE))
-      .addImm(MipsFI->getGPStackOffset());
+      .addImm(MFI->getObjectOffset(MipsFI->getGPFI()));
 }
 
 void MipsFrameLowering::emitEpilogue(MachineFunction &MF,
                                  MachineBasicBlock &MBB) const {
   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
   MachineFrameInfo *MFI            = MF.getFrameInfo();
-  MipsFunctionInfo *MipsFI         = MF.getInfo<MipsFunctionInfo>();
   const MipsInstrInfo &TII =
     *static_cast<const MipsInstrInfo*>(MF.getTarget().getInstrInfo());
   DebugLoc dl = MBBI->getDebugLoc();
 
   // Get the number of bytes from FrameInfo
-  int NumBytes = (int) MFI->getStackSize();
-
-  // Get the FI's where RA and FP are saved.
-  int FPOffset = MipsFI->getFPStackOffset();
-  int RAOffset = MipsFI->getRAStackOffset();
+  unsigned StackSize = MFI->getStackSize();
 
   unsigned NewReg = 0;
   int NewImm = 0;
   bool ATUsed = false;
 
-  // if framepointer enabled, restore it and restore the
-  // stack pointer
+  // if framepointer enabled, restore the stack pointer.
   if (hasFP(MF)) {
-    // move $sp, $fp
-    BuildMI(MBB, MBBI, dl, TII.get(Mips::ADDu), Mips::SP)
+    // Find the first instruction that restores a callee-saved register.
+    MachineBasicBlock::iterator I = MBBI;
+    
+    for (unsigned i = 0; i < MFI->getCalleeSavedInfo().size(); ++i)
+      --I;
+
+    // Insert instruction "move $sp, $fp" at this location.
+    BuildMI(MBB, I, dl, TII.get(Mips::ADDu), Mips::SP)
       .addReg(Mips::FP).addReg(Mips::ZERO);
-
-    // lw  $fp,stack_loc($sp)
-    ATUsed = expandRegLargeImmPair(Mips::SP, FPOffset, NewReg, NewImm, MBB,
-                                   MBBI);
-    BuildMI(MBB, MBBI, dl, TII.get(Mips::LW), Mips::FP)
-      .addImm(NewImm).addReg(NewReg);
-
-    // FIXME: change this when mips goes MC".
-    if (ATUsed)
-      BuildMI(MBB, MBBI, dl, TII.get(Mips::ATMACRO));
-  }
-
-  // Restore the return address only if the function isn't a leaf one.
-  // lw  $ra, stack_loc($sp)
-  if (MFI->adjustsStack()) {
-    ATUsed = expandRegLargeImmPair(Mips::SP, RAOffset, NewReg, NewImm, MBB,
-                                   MBBI);
-    BuildMI(MBB, MBBI, dl, TII.get(Mips::LW), Mips::RA)
-      .addImm(NewImm).addReg(NewReg);
-
-    // FIXME: change this when mips goes MC".
-    if (ATUsed)
-      BuildMI(MBB, MBBI, dl, TII.get(Mips::ATMACRO));
   }
 
   // adjust stack  : insert addi sp, sp, (imm)
-  if (NumBytes) {
-    ATUsed = expandRegLargeImmPair(Mips::SP, NumBytes, NewReg, NewImm, MBB,
+  if (StackSize) {
+    ATUsed = expandRegLargeImmPair(Mips::SP, StackSize, NewReg, NewImm, MBB,
                                    MBBI);
     BuildMI(MBB, MBBI, dl, TII.get(Mips::ADDiu), Mips::SP)
       .addReg(NewReg).addImm(NewImm);
@@ -389,9 +300,32 @@ void MipsFrameLowering::emitEpilogue(MachineFunction &MF,
   }
 }
 
+void
+MipsFrameLowering::getInitialFrameState(std::vector<MachineMove> &Moves) const {
+  MachineLocation Dst(MachineLocation::VirtualFP);
+  MachineLocation Src(Mips::SP, 0);
+  Moves.push_back(MachineMove(0, Dst, Src));
+}
+
 void MipsFrameLowering::
-processFunctionBeforeFrameFinalized(MachineFunction &MF) const {
-  const MipsRegisterInfo *RegInfo =
-    static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
-  RegInfo->processFunctionBeforeFrameFinalized(MF);
+processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                     RegScavenger *RS) const {
+  MachineRegisterInfo& MRI = MF.getRegInfo();
+
+  // FIXME: remove this code if register allocator can correctly mark
+  //        $fp and $ra used or unused.
+
+  // Mark $fp and $ra as used or unused.
+  if (hasFP(MF))
+    MRI.setPhysRegUsed(Mips::FP);
+
+  // The register allocator might determine $ra is used after seeing 
+  // instruction "jr $ra", but we do not want PrologEpilogInserter to insert
+  // instructions to save/restore $ra unless there is a function call.
+  // To correct this, $ra is explicitly marked unused if there is no
+  // function call.
+  if (MF.getFrameInfo()->hasCalls())
+    MRI.setPhysRegUsed(Mips::RA);
+  else
+    MRI.setPhysRegUnused(Mips::RA);
 }
diff --git a/lib/Target/Mips/MipsFrameLowering.h b/lib/Target/Mips/MipsFrameLowering.h
index 34647df..78c78ee 100644
--- a/lib/Target/Mips/MipsFrameLowering.h
+++ b/lib/Target/Mips/MipsFrameLowering.h
@@ -27,11 +27,10 @@ protected:
 
 public:
   explicit MipsFrameLowering(const MipsSubtarget &sti)
-    // FIXME: Is this correct at all?
-    : TargetFrameLowering(StackGrowsUp, 8, 0), STI(sti) {
+    : TargetFrameLowering(StackGrowsDown, 8, 0), STI(sti) {
   }
 
-  void adjustMipsStackFrame(MachineFunction &MF) const;
+  bool targetHandlesStackFrameRounding() const;
 
   /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
   /// the function.
@@ -40,7 +39,10 @@ public:
 
   bool hasFP(const MachineFunction &MF) const;
 
-  void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
+  void getInitialFrameState(std::vector<MachineMove> &Moves) const;
+  
+  void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                            RegScavenger *RS) const;
 };
 
 } // End llvm namespace
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp
index 0382964..d8a84ce 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -119,39 +119,41 @@ SelectAddr(SDValue Addr, SDValue &Offset, SDValue &Base) {
 
   // on PIC code Load GA
   if (TM.getRelocationModel() == Reloc::PIC_) {
-    if ((Addr.getOpcode() == ISD::TargetGlobalAddress) ||
-        (Addr.getOpcode() == ISD::TargetConstantPool) ||
-        (Addr.getOpcode() == ISD::TargetJumpTable) ||
-        (Addr.getOpcode() == ISD::TargetBlockAddress) ||
-        (Addr.getOpcode() == ISD::TargetExternalSymbol)) {
+    if (Addr.getOpcode() == MipsISD::WrapperPIC) {
       Base   = CurDAG->getRegister(Mips::GP, MVT::i32);
-      Offset = Addr;
+      Offset = Addr.getOperand(0);
       return true;
     }
   } else {
     if ((Addr.getOpcode() == ISD::TargetExternalSymbol ||
         Addr.getOpcode() == ISD::TargetGlobalAddress))
       return false;
+    else if (Addr.getOpcode() == ISD::TargetGlobalTLSAddress) {
+      Base   = CurDAG->getRegister(Mips::GP, MVT::i32);
+      Offset = Addr;
+      return true;
+    }
   }
 
-  // Operand is a result from an ADD.
-  if (Addr.getOpcode() == ISD::ADD) {
-    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
-      if (isInt<16>(CN->getSExtValue())) {
-
-        // If the first operand is a FI, get the TargetFI Node
-        if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>
-                                    (Addr.getOperand(0))) {
-          Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
-        } else {
-          Base = Addr.getOperand(0);
-        }
-
-        Offset = CurDAG->getTargetConstant(CN->getZExtValue(), MVT::i32);
-        return true;
-      }
+  // Addresses of the form FI+const or FI|const
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+    if (isInt<16>(CN->getSExtValue())) {
+
+      // If the first operand is a FI, get the TargetFI Node
+      if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>
+                                  (Addr.getOperand(0)))
+        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+      else
+        Base = Addr.getOperand(0);
+
+      Offset = CurDAG->getTargetConstant(CN->getZExtValue(), MVT::i32);
+      return true;
     }
+  }
 
+  // Operand is a result from an ADD.
+  if (Addr.getOpcode() == ISD::ADD) {
     // When loading from constant pools, load the lower address part in
     // the instruction itself. Example, instead of:
     //  lui $2, %hi($CPI1_0)
@@ -321,7 +323,6 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
   // tablegen selection should be handled here.
   ///
   switch(Opcode) {
-
     default: break;
 
     case ISD::SUBE:
@@ -355,10 +356,7 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
                                   LHS, SDValue(AddCarry,0));
     }
 
-    /// Mul/Div with two results
-    case ISD::SDIVREM:
-    case ISD::UDIVREM:
-      break;
+    /// Mul with two results
     case ISD::SMUL_LOHI:
     case ISD::UMUL_LOHI: {
       SDValue Op1 = Node->getOperand(0);
@@ -405,13 +403,6 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
         return CurDAG->getMachineNode(Mips::MFHI, dl, MVT::i32, InFlag);
     }
 
-    /// Div/Rem operations
-    case ISD::SREM:
-    case ISD::UREM:
-    case ISD::SDIV:
-    case ISD::UDIV:
-      break;
-
     // Get target GOT address.
     case ISD::GLOBAL_OFFSET_TABLE:
       return getGlobalBaseReg();
@@ -445,6 +436,18 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
         return ResNode;
       // Other cases are autogenerated.
       break;
+
+    case MipsISD::ThreadPointer: {
+      unsigned SrcReg = Mips::HWR29;
+      unsigned DestReg = Mips::V1;
+      SDNode *Rdhwr = CurDAG->getMachineNode(Mips::RDHWR, Node->getDebugLoc(),
+          Node->getValueType(0), CurDAG->getRegister(SrcReg, MVT::i32));
+      SDValue Chain = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, DestReg,
+          SDValue(Rdhwr, 0));
+      SDValue ResNode = CurDAG->getCopyFromReg(Chain, dl, DestReg, MVT::i32);
+      ReplaceUses(SDValue(Node, 0), ResNode);
+      return ResNode.getNode();
+    }
   }
 
   // Select the default instruction
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index 1f1220f..fd90731 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -36,25 +36,30 @@ using namespace llvm;
 
 const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch (Opcode) {
-    case MipsISD::JmpLink    : return "MipsISD::JmpLink";
-    case MipsISD::Hi         : return "MipsISD::Hi";
-    case MipsISD::Lo         : return "MipsISD::Lo";
-    case MipsISD::GPRel      : return "MipsISD::GPRel";
-    case MipsISD::Ret        : return "MipsISD::Ret";
-    case MipsISD::FPBrcond   : return "MipsISD::FPBrcond";
-    case MipsISD::FPCmp      : return "MipsISD::FPCmp";
-    case MipsISD::CMovFP_T   : return "MipsISD::CMovFP_T";
-    case MipsISD::CMovFP_F   : return "MipsISD::CMovFP_F";
-    case MipsISD::FPRound    : return "MipsISD::FPRound";
-    case MipsISD::MAdd       : return "MipsISD::MAdd";
-    case MipsISD::MAddu      : return "MipsISD::MAddu";
-    case MipsISD::MSub       : return "MipsISD::MSub";
-    case MipsISD::MSubu      : return "MipsISD::MSubu";
-    case MipsISD::DivRem     : return "MipsISD::DivRem";
-    case MipsISD::DivRemU    : return "MipsISD::DivRemU";
-    case MipsISD::BuildPairF64: return "MipsISD::BuildPairF64";
-    case MipsISD::ExtractElementF64: return "MipsISD::ExtractElementF64";
-    default                  : return NULL;
+  case MipsISD::JmpLink:           return "MipsISD::JmpLink";
+  case MipsISD::Hi:                return "MipsISD::Hi";
+  case MipsISD::Lo:                return "MipsISD::Lo";
+  case MipsISD::GPRel:             return "MipsISD::GPRel";
+  case MipsISD::TlsGd:             return "MipsISD::TlsGd";
+  case MipsISD::TprelHi:           return "MipsISD::TprelHi";
+  case MipsISD::TprelLo:           return "MipsISD::TprelLo";
+  case MipsISD::ThreadPointer:     return "MipsISD::ThreadPointer";
+  case MipsISD::Ret:               return "MipsISD::Ret";
+  case MipsISD::FPBrcond:          return "MipsISD::FPBrcond";
+  case MipsISD::FPCmp:             return "MipsISD::FPCmp";
+  case MipsISD::CMovFP_T:          return "MipsISD::CMovFP_T";
+  case MipsISD::CMovFP_F:          return "MipsISD::CMovFP_F";
+  case MipsISD::FPRound:           return "MipsISD::FPRound";
+  case MipsISD::MAdd:              return "MipsISD::MAdd";
+  case MipsISD::MAddu:             return "MipsISD::MAddu";
+  case MipsISD::MSub:              return "MipsISD::MSub";
+  case MipsISD::MSubu:             return "MipsISD::MSubu";
+  case MipsISD::DivRem:            return "MipsISD::DivRem";
+  case MipsISD::DivRemU:           return "MipsISD::DivRemU";
+  case MipsISD::BuildPairF64:      return "MipsISD::BuildPairF64";
+  case MipsISD::ExtractElementF64: return "MipsISD::ExtractElementF64";
+  case MipsISD::WrapperPIC:        return "MipsISD::WrapperPIC";
+  default:                         return NULL;
   }
 }
 
@@ -102,7 +107,6 @@ MipsTargetLowering(MipsTargetMachine &TM)
   setOperationAction(ISD::SELECT,             MVT::i32,   Custom);
   setOperationAction(ISD::BRCOND,             MVT::Other, Custom);
   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32,   Custom);
-  setOperationAction(ISD::FP_TO_SINT,         MVT::i32,   Custom);
   setOperationAction(ISD::VASTART,            MVT::Other, Custom);
 
   setOperationAction(ISD::SDIV, MVT::i32, Expand);
@@ -127,20 +131,22 @@ MipsTargetLowering(MipsTargetMachine &TM)
   setOperationAction(ISD::SHL_PARTS,         MVT::i32,   Expand);
   setOperationAction(ISD::SRA_PARTS,         MVT::i32,   Expand);
   setOperationAction(ISD::SRL_PARTS,         MVT::i32,   Expand);
-  setOperationAction(ISD::FCOPYSIGN,         MVT::f32,   Expand);
-  setOperationAction(ISD::FCOPYSIGN,         MVT::f64,   Expand);
+  setOperationAction(ISD::FCOPYSIGN,         MVT::f32,   Custom);
+  setOperationAction(ISD::FCOPYSIGN,         MVT::f64,   Custom);
   setOperationAction(ISD::FSIN,              MVT::f32,   Expand);
   setOperationAction(ISD::FSIN,              MVT::f64,   Expand);
   setOperationAction(ISD::FCOS,              MVT::f32,   Expand);
   setOperationAction(ISD::FCOS,              MVT::f64,   Expand);
   setOperationAction(ISD::FPOWI,             MVT::f32,   Expand);
   setOperationAction(ISD::FPOW,              MVT::f32,   Expand);
+  setOperationAction(ISD::FPOW,              MVT::f64,   Expand);
   setOperationAction(ISD::FLOG,              MVT::f32,   Expand);
   setOperationAction(ISD::FLOG2,             MVT::f32,   Expand);
   setOperationAction(ISD::FLOG10,            MVT::f32,   Expand);
   setOperationAction(ISD::FEXP,              MVT::f32,   Expand);
 
-  setOperationAction(ISD::EH_LABEL,          MVT::Other, Expand);
+  setOperationAction(ISD::EXCEPTIONADDR,     MVT::i32, Expand);
+  setOperationAction(ISD::EHSELECTION,       MVT::i32, Expand);
 
   setOperationAction(ISD::VAARG,             MVT::Other, Expand);
   setOperationAction(ISD::VACOPY,            MVT::Other, Expand);
@@ -171,19 +177,19 @@ MipsTargetLowering(MipsTargetMachine &TM)
   setTargetDAGCombine(ISD::UDIVREM);
   setTargetDAGCombine(ISD::SETCC);
 
+  setMinFunctionAlignment(2);
+
   setStackPointerRegisterToSaveRestore(Mips::SP);
   computeRegisterProperties();
+
+  setExceptionPointerRegister(Mips::A0);
+  setExceptionSelectorRegister(Mips::A1);
 }
 
 MVT::SimpleValueType MipsTargetLowering::getSetCCResultType(EVT VT) const {
   return MVT::i32;
 }
 
-/// getFunctionAlignment - Return the Log2 alignment of this function.
-unsigned MipsTargetLowering::getFunctionAlignment(const Function *) const {
-  return 2;
-}
-
 // SelectMadd -
 // Transforms a subgraph in CurDAG if the following pattern is found:
 //  (addc multLo, Lo0), (adde multHi, Hi0),
@@ -383,7 +389,7 @@ static SDValue PerformDivRemCombine(SDNode *N, SelectionDAG& DAG,
   // insert MFHI
   if (N->hasAnyUseOfValue(1)) {
     SDValue CopyFromHi = DAG.getCopyFromReg(InChain, dl,
-                                               Mips::HI, MVT::i32, InGlue);
+                                            Mips::HI, MVT::i32, InGlue);
     DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), CopyFromHi);
   }
 
@@ -509,13 +515,14 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const
     case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
     case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
     case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
-    case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
     case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
     case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
     case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
     case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
     case ISD::SELECT:             return LowerSELECT(Op, DAG);
     case ISD::VASTART:            return LowerVASTART(Op, DAG);
+    case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
+    case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
   }
   return SDValue();
 }
@@ -547,45 +554,16 @@ static Mips::FPBranchCode GetFPBranchCodeFromCond(Mips::CondCode CC) {
   return Mips::BRANCH_INVALID;
 }
 
-MachineBasicBlock *
-MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
-                                                MachineBasicBlock *BB) const {
+static MachineBasicBlock* ExpandCondMov(MachineInstr *MI, MachineBasicBlock *BB,
+                                        DebugLoc dl,
+                                        const MipsSubtarget* Subtarget,
+                                        const TargetInstrInfo *TII,
+                                        bool isFPCmp, unsigned Opc) {
   // There is no need to expand CMov instructions if target has
   // conditional moves.
   if (Subtarget->hasCondMov())
     return BB;
 
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-  bool isFPCmp = false;
-  DebugLoc dl = MI->getDebugLoc();
-  unsigned Opc;
-
-  switch (MI->getOpcode()) {
-  default: assert(false && "Unexpected instr type to insert");
-  case Mips::MOVT:
-  case Mips::MOVT_S:
-  case Mips::MOVT_D:
-    isFPCmp = true;
-    Opc = Mips::BC1F;
-    break;
-  case Mips::MOVF:
-  case Mips::MOVF_S:
-  case Mips::MOVF_D:
-    isFPCmp = true;
-    Opc = Mips::BC1T;
-    break;
-  case Mips::MOVZ_I:
-  case Mips::MOVZ_S:
-  case Mips::MOVZ_D:
-    Opc = Mips::BNE;
-    break;
-  case Mips::MOVN_I:
-  case Mips::MOVN_S:
-  case Mips::MOVN_D:
-    Opc = Mips::BEQ;
-    break;
-  }
-
   // To "insert" a SELECT_CC instruction, we actually have to insert the
   // diamond control-flow pattern.  The incoming instruction knows the
   // destination vreg to set, the condition code register to branch on, the
@@ -624,7 +602,6 @@ MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
     BuildMI(BB, dl, TII->get(Opc)).addReg(MI->getOperand(2).getReg())
       .addReg(Mips::ZERO).addMBB(sinkMBB);
 
-
   //  copy0MBB:
   //   %FalseValue = ...
   //   # fallthrough to sinkMBB
@@ -653,46 +630,572 @@ MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   return BB;
 }
 
-//===----------------------------------------------------------------------===//
-//  Misc Lower Operation implementation
-//===----------------------------------------------------------------------===//
+MachineBasicBlock *
+MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
 
-SDValue MipsTargetLowering::
-LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const
-{
-  if (!Subtarget->isMips1())
-    return Op;
+  switch (MI->getOpcode()) {
+  default:
+    assert(false && "Unexpected instr type to insert");
+    return NULL;
+  case Mips::MOVT:
+  case Mips::MOVT_S:
+  case Mips::MOVT_D:
+    return ExpandCondMov(MI, BB, dl, Subtarget, TII, true, Mips::BC1F);
+  case Mips::MOVF:
+  case Mips::MOVF_S:
+  case Mips::MOVF_D:
+    return ExpandCondMov(MI, BB, dl, Subtarget, TII, true, Mips::BC1T);
+  case Mips::MOVZ_I:
+  case Mips::MOVZ_S:
+  case Mips::MOVZ_D:
+    return ExpandCondMov(MI, BB, dl, Subtarget, TII, false, Mips::BNE);
+  case Mips::MOVN_I:
+  case Mips::MOVN_S:
+  case Mips::MOVN_D:
+    return ExpandCondMov(MI, BB, dl, Subtarget, TII, false, Mips::BEQ);
+
+  case Mips::ATOMIC_LOAD_ADD_I8:
+    return EmitAtomicBinaryPartword(MI, BB, 1, Mips::ADDu);
+  case Mips::ATOMIC_LOAD_ADD_I16:
+    return EmitAtomicBinaryPartword(MI, BB, 2, Mips::ADDu);
+  case Mips::ATOMIC_LOAD_ADD_I32:
+    return EmitAtomicBinary(MI, BB, 4, Mips::ADDu);
+
+  case Mips::ATOMIC_LOAD_AND_I8:
+    return EmitAtomicBinaryPartword(MI, BB, 1, Mips::AND);
+  case Mips::ATOMIC_LOAD_AND_I16:
+    return EmitAtomicBinaryPartword(MI, BB, 2, Mips::AND);
+  case Mips::ATOMIC_LOAD_AND_I32:
+    return EmitAtomicBinary(MI, BB, 4, Mips::AND);
+
+  case Mips::ATOMIC_LOAD_OR_I8:
+    return EmitAtomicBinaryPartword(MI, BB, 1, Mips::OR);
+  case Mips::ATOMIC_LOAD_OR_I16:
+    return EmitAtomicBinaryPartword(MI, BB, 2, Mips::OR);
+  case Mips::ATOMIC_LOAD_OR_I32:
+    return EmitAtomicBinary(MI, BB, 4, Mips::OR);
+
+  case Mips::ATOMIC_LOAD_XOR_I8:
+    return EmitAtomicBinaryPartword(MI, BB, 1, Mips::XOR);
+  case Mips::ATOMIC_LOAD_XOR_I16:
+    return EmitAtomicBinaryPartword(MI, BB, 2, Mips::XOR);
+  case Mips::ATOMIC_LOAD_XOR_I32:
+    return EmitAtomicBinary(MI, BB, 4, Mips::XOR);
+
+  case Mips::ATOMIC_LOAD_NAND_I8:
+    return EmitAtomicBinaryPartword(MI, BB, 1, 0, true);
+  case Mips::ATOMIC_LOAD_NAND_I16:
+    return EmitAtomicBinaryPartword(MI, BB, 2, 0, true);
+  case Mips::ATOMIC_LOAD_NAND_I32:
+    return EmitAtomicBinary(MI, BB, 4, 0, true);
+
+  case Mips::ATOMIC_LOAD_SUB_I8:
+    return EmitAtomicBinaryPartword(MI, BB, 1, Mips::SUBu);
+  case Mips::ATOMIC_LOAD_SUB_I16:
+    return EmitAtomicBinaryPartword(MI, BB, 2, Mips::SUBu);
+  case Mips::ATOMIC_LOAD_SUB_I32:
+    return EmitAtomicBinary(MI, BB, 4, Mips::SUBu);
+
+  case Mips::ATOMIC_SWAP_I8:
+    return EmitAtomicBinaryPartword(MI, BB, 1, 0);
+  case Mips::ATOMIC_SWAP_I16:
+    return EmitAtomicBinaryPartword(MI, BB, 2, 0);
+  case Mips::ATOMIC_SWAP_I32:
+    return EmitAtomicBinary(MI, BB, 4, 0);
+
+  case Mips::ATOMIC_CMP_SWAP_I8:
+    return EmitAtomicCmpSwapPartword(MI, BB, 1);
+  case Mips::ATOMIC_CMP_SWAP_I16:
+    return EmitAtomicCmpSwapPartword(MI, BB, 2);
+  case Mips::ATOMIC_CMP_SWAP_I32:
+    return EmitAtomicCmpSwap(MI, BB, 4);
+  }
+}
 
-  MachineFunction &MF = DAG.getMachineFunction();
-  unsigned CCReg = AddLiveIn(MF, Mips::FCR31, Mips::CCRRegisterClass);
+// This function also handles Mips::ATOMIC_SWAP_I32 (when BinOpcode == 0), and
+// Mips::ATOMIC_LOAD_NAND_I32 (when Nand == true)
+MachineBasicBlock *
+MipsTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
+                                     unsigned Size, unsigned BinOpcode,
+                                     bool Nand) const {
+  assert(Size == 4 && "Unsupported size for EmitAtomicBinary.");
+
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
 
-  SDValue Chain = DAG.getEntryNode();
-  DebugLoc dl = Op.getDebugLoc();
-  SDValue Src = Op.getOperand(0);
-
-  // Set the condition register
-  SDValue CondReg = DAG.getCopyFromReg(Chain, dl, CCReg, MVT::i32);
-  CondReg = DAG.getCopyToReg(Chain, dl, Mips::AT, CondReg);
-  CondReg = DAG.getCopyFromReg(CondReg, dl, Mips::AT, MVT::i32);
-
-  SDValue Cst = DAG.getConstant(3, MVT::i32);
-  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i32, CondReg, Cst);
-  Cst = DAG.getConstant(2, MVT::i32);
-  SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i32, Or, Cst);
-
-  SDValue InFlag(0, 0);
-  CondReg = DAG.getCopyToReg(Chain, dl, Mips::FCR31, Xor, InFlag);
-
-  // Emit the round instruction and bit convert to integer
-  SDValue Trunc = DAG.getNode(MipsISD::FPRound, dl, MVT::f32,
-                              Src, CondReg.getValue(1));
-  SDValue BitCvt = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Trunc);
-  return BitCvt;
+  unsigned Dest = MI->getOperand(0).getReg();
+  unsigned Ptr = MI->getOperand(1).getReg();
+  unsigned Incr = MI->getOperand(2).getReg();
+
+  unsigned Oldval = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp1 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp2 = RegInfo.createVirtualRegister(RC);
+
+  // insert new blocks after the current block
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineFunction::iterator It = BB;
+  ++It;
+  MF->insert(It, loopMBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  //  thisMBB:
+  //    ...
+  //    sw incr, fi(sp)           // store incr to stack (when BinOpcode == 0)
+  //    fallthrough --> loopMBB
+
+  // Note: for atomic.swap (when BinOpcode == 0), storing incr to stack before
+  // the loop and then loading it from stack in block loopMBB is necessary to
+  // prevent MachineLICM pass to hoist "or" instruction out of the block
+  // loopMBB.
+
+  int fi = 0;
+  if (BinOpcode == 0 && !Nand) {
+    // Get or create a temporary stack location.
+    MipsFunctionInfo *MipsFI = MF->getInfo<MipsFunctionInfo>();
+    fi = MipsFI->getAtomicFrameIndex();
+    if (fi == -1) {
+      fi = MF->getFrameInfo()->CreateStackObject(Size, Size, false);
+      MipsFI->setAtomicFrameIndex(fi);
+    }
+
+    BuildMI(BB, dl, TII->get(Mips::SW))
+        .addReg(Incr).addImm(0).addFrameIndex(fi);
+  }
+  BB->addSuccessor(loopMBB);
+
+  //  loopMBB:
+  //    ll oldval, 0(ptr)
+  //    or dest, $0, oldval
+  //    <binop> tmp1, oldval, incr
+  //    sc tmp1, 0(ptr)
+  //    beq tmp1, $0, loopMBB
+  BB = loopMBB;
+  BuildMI(BB, dl, TII->get(Mips::LL), Oldval).addImm(0).addReg(Ptr);
+  BuildMI(BB, dl, TII->get(Mips::OR), Dest).addReg(Mips::ZERO).addReg(Oldval);
+  if (Nand) {
+    //  and tmp2, oldval, incr
+    //  nor tmp1, $0, tmp2
+    BuildMI(BB, dl, TII->get(Mips::AND), Tmp2).addReg(Oldval).addReg(Incr);
+    BuildMI(BB, dl, TII->get(Mips::NOR), Tmp1).addReg(Mips::ZERO).addReg(Tmp2);
+  } else if (BinOpcode) {
+    //  <binop> tmp1, oldval, incr
+    BuildMI(BB, dl, TII->get(BinOpcode), Tmp1).addReg(Oldval).addReg(Incr);
+  } else {
+    //  lw tmp2, fi(sp)              // load incr from stack
+    //  or tmp1, $zero, tmp2
+    BuildMI(BB, dl, TII->get(Mips::LW), Tmp2).addImm(0).addFrameIndex(fi);;
+    BuildMI(BB, dl, TII->get(Mips::OR), Tmp1).addReg(Mips::ZERO).addReg(Tmp2);
+  }
+  BuildMI(BB, dl, TII->get(Mips::SC), Tmp1).addReg(Tmp1).addImm(0).addReg(Ptr);
+  BuildMI(BB, dl, TII->get(Mips::BEQ))
+    .addReg(Tmp1).addReg(Mips::ZERO).addMBB(loopMBB);
+  BB->addSuccessor(loopMBB);
+  BB->addSuccessor(exitMBB);
+
+  MI->eraseFromParent();   // The instruction is gone now.
+
+  return BB;
 }
 
+MachineBasicBlock *
+MipsTargetLowering::EmitAtomicBinaryPartword(MachineInstr *MI,
+                                             MachineBasicBlock *BB,
+                                             unsigned Size, unsigned BinOpcode,
+                                             bool Nand) const {
+  assert((Size == 1 || Size == 2) &&
+      "Unsupported size for EmitAtomicBinaryPartial.");
+
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
+
+  unsigned Dest = MI->getOperand(0).getReg();
+  unsigned Ptr = MI->getOperand(1).getReg();
+  unsigned Incr = MI->getOperand(2).getReg();
+
+  unsigned Addr = RegInfo.createVirtualRegister(RC);
+  unsigned Shift = RegInfo.createVirtualRegister(RC);
+  unsigned Mask = RegInfo.createVirtualRegister(RC);
+  unsigned Mask2 = RegInfo.createVirtualRegister(RC);
+  unsigned Newval = RegInfo.createVirtualRegister(RC);
+  unsigned Oldval = RegInfo.createVirtualRegister(RC);
+  unsigned Incr2 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp1 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp2 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp3 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp4 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp5 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp6 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp7 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp8 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp9 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp10 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp11 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp12 = RegInfo.createVirtualRegister(RC);
+
+  // insert new blocks after the current block
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineFunction::iterator It = BB;
+  ++It;
+  MF->insert(It, loopMBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  //  thisMBB:
+  //    addiu   tmp1,$0,-4                # 0xfffffffc
+  //    and     addr,ptr,tmp1
+  //    andi    tmp2,ptr,3
+  //    sll     shift,tmp2,3
+  //    ori     tmp3,$0,255               # 0xff
+  //    sll     mask,tmp3,shift
+  //    nor     mask2,$0,mask
+  //    andi    tmp4,incr,255
+  //    sll     incr2,tmp4,shift
+  //    sw      incr2, fi(sp)      // store incr2 to stack (when BinOpcode == 0)
+
+  // Note: for atomic.swap (when BinOpcode == 0), storing incr2 to stack before
+  // the loop and then loading it from stack in block loopMBB is necessary to
+  // prevent MachineLICM pass to hoist "or" instruction out of the block
+  // loopMBB.
+
+  int64_t MaskImm = (Size == 1) ? 255 : 65535;
+  BuildMI(BB, dl, TII->get(Mips::ADDiu), Tmp1).addReg(Mips::ZERO).addImm(-4);
+  BuildMI(BB, dl, TII->get(Mips::AND), Addr).addReg(Ptr).addReg(Tmp1);
+  BuildMI(BB, dl, TII->get(Mips::ANDi), Tmp2).addReg(Ptr).addImm(3);
+  BuildMI(BB, dl, TII->get(Mips::SLL), Shift).addReg(Tmp2).addImm(3);
+  BuildMI(BB, dl, TII->get(Mips::ORi), Tmp3).addReg(Mips::ZERO).addImm(MaskImm);
+  BuildMI(BB, dl, TII->get(Mips::SLL), Mask).addReg(Tmp3).addReg(Shift);
+  BuildMI(BB, dl, TII->get(Mips::NOR), Mask2).addReg(Mips::ZERO).addReg(Mask);
+  if (BinOpcode != Mips::SUBu) {
+    BuildMI(BB, dl, TII->get(Mips::ANDi), Tmp4).addReg(Incr).addImm(MaskImm);
+    BuildMI(BB, dl, TII->get(Mips::SLL), Incr2).addReg(Tmp4).addReg(Shift);
+  } else {
+    BuildMI(BB, dl, TII->get(Mips::SUBu), Tmp4).addReg(Mips::ZERO).addReg(Incr);
+    BuildMI(BB, dl, TII->get(Mips::ANDi), Tmp5).addReg(Tmp4).addImm(MaskImm);
+    BuildMI(BB, dl, TII->get(Mips::SLL), Incr2).addReg(Tmp5).addReg(Shift);
+  }
+
+  int fi = 0;
+  if (BinOpcode == 0 && !Nand) {
+    // Get or create a temporary stack location.
+    MipsFunctionInfo *MipsFI = MF->getInfo<MipsFunctionInfo>();
+    fi = MipsFI->getAtomicFrameIndex();
+    if (fi == -1) {
+      fi = MF->getFrameInfo()->CreateStackObject(Size, Size, false);
+      MipsFI->setAtomicFrameIndex(fi);
+    }
+
+    BuildMI(BB, dl, TII->get(Mips::SW))
+        .addReg(Incr2).addImm(0).addFrameIndex(fi);
+  }
+  BB->addSuccessor(loopMBB);
+
+  // loopMBB:
+  //   ll      oldval,0(addr)
+  //   binop   tmp7,oldval,incr2
+  //   and     newval,tmp7,mask
+  //   and     tmp8,oldval,mask2
+  //   or      tmp9,tmp8,newval
+  //   sc      tmp9,0(addr)
+  //   beq     tmp9,$0,loopMBB
+  BB = loopMBB;
+  BuildMI(BB, dl, TII->get(Mips::LL), Oldval).addImm(0).addReg(Addr);
+  if (Nand) {
+    //  and tmp6, oldval, incr2
+    //  nor tmp7, $0, tmp6
+    BuildMI(BB, dl, TII->get(Mips::AND), Tmp6).addReg(Oldval).addReg(Incr2);
+    BuildMI(BB, dl, TII->get(Mips::NOR), Tmp7).addReg(Mips::ZERO).addReg(Tmp6);
+  } else if (BinOpcode == Mips::SUBu) {
+    //  addu tmp7, oldval, incr2
+    BuildMI(BB, dl, TII->get(Mips::ADDu), Tmp7).addReg(Oldval).addReg(Incr2);
+  } else if (BinOpcode) {
+    //  <binop> tmp7, oldval, incr2
+    BuildMI(BB, dl, TII->get(BinOpcode), Tmp7).addReg(Oldval).addReg(Incr2);
+  } else {
+    //  lw tmp6, fi(sp)              // load incr2 from stack
+    //  or tmp7, $zero, tmp6
+    BuildMI(BB, dl, TII->get(Mips::LW), Tmp6).addImm(0).addFrameIndex(fi);;
+    BuildMI(BB, dl, TII->get(Mips::OR), Tmp7).addReg(Mips::ZERO).addReg(Tmp6);
+  }
+  BuildMI(BB, dl, TII->get(Mips::AND), Newval).addReg(Tmp7).addReg(Mask);
+  BuildMI(BB, dl, TII->get(Mips::AND), Tmp8).addReg(Oldval).addReg(Mask2);
+  BuildMI(BB, dl, TII->get(Mips::OR), Tmp9).addReg(Tmp8).addReg(Newval);
+  BuildMI(BB, dl, TII->get(Mips::SC), Tmp9).addReg(Tmp9).addImm(0).addReg(Addr);
+  BuildMI(BB, dl, TII->get(Mips::BEQ))
+      .addReg(Tmp9).addReg(Mips::ZERO).addMBB(loopMBB);
+  BB->addSuccessor(loopMBB);
+  BB->addSuccessor(exitMBB);
+
+  //  exitMBB:
+  //    and     tmp10,oldval,mask
+  //    srl     tmp11,tmp10,shift
+  //    sll     tmp12,tmp11,24
+  //    sra     dest,tmp12,24
+  BB = exitMBB;
+  int64_t ShiftImm = (Size == 1) ? 24 : 16;
+  // reverse order
+  BuildMI(*BB, BB->begin(), dl, TII->get(Mips::SRA), Dest)
+      .addReg(Tmp12).addImm(ShiftImm);
+  BuildMI(*BB, BB->begin(), dl, TII->get(Mips::SLL), Tmp12)
+      .addReg(Tmp11).addImm(ShiftImm);
+  BuildMI(*BB, BB->begin(), dl, TII->get(Mips::SRL), Tmp11)
+      .addReg(Tmp10).addReg(Shift);
+  BuildMI(*BB, BB->begin(), dl, TII->get(Mips::AND), Tmp10)
+    .addReg(Oldval).addReg(Mask);
+
+  MI->eraseFromParent();   // The instruction is gone now.
+
+  return BB;
+}
+
+MachineBasicBlock *
+MipsTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
+                                      MachineBasicBlock *BB,
+                                      unsigned Size) const {
+  assert(Size == 4 && "Unsupported size for EmitAtomicCmpSwap.");
+
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
+
+  unsigned Dest    = MI->getOperand(0).getReg();
+  unsigned Ptr     = MI->getOperand(1).getReg();
+  unsigned Oldval  = MI->getOperand(2).getReg();
+  unsigned Newval  = MI->getOperand(3).getReg();
+
+  unsigned Tmp1 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp2 = RegInfo.createVirtualRegister(RC);
+
+  // insert new blocks after the current block
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineFunction::iterator It = BB;
+  ++It;
+  MF->insert(It, loop1MBB);
+  MF->insert(It, loop2MBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Get or create a temporary stack location.
+  MipsFunctionInfo *MipsFI = MF->getInfo<MipsFunctionInfo>();
+  int fi = MipsFI->getAtomicFrameIndex();
+  if (fi == -1) {
+    fi = MF->getFrameInfo()->CreateStackObject(Size, Size, false);
+    MipsFI->setAtomicFrameIndex(fi);
+  }
+
+  //  thisMBB:
+  //    ...
+  //    sw newval, fi(sp)           // store newval to stack
+  //    fallthrough --> loop1MBB
+
+  // Note: storing newval to stack before the loop and then loading it from
+  // stack in block loop2MBB is necessary to prevent MachineLICM pass to
+  // hoist "or" instruction out of the block loop2MBB.
+
+  BuildMI(BB, dl, TII->get(Mips::SW))
+      .addReg(Newval).addImm(0).addFrameIndex(fi);
+  BB->addSuccessor(loop1MBB);
+
+  // loop1MBB:
+  //   ll dest, 0(ptr)
+  //   bne dest, oldval, exitMBB
+  BB = loop1MBB;
+  BuildMI(BB, dl, TII->get(Mips::LL), Dest).addImm(0).addReg(Ptr);
+  BuildMI(BB, dl, TII->get(Mips::BNE))
+    .addReg(Dest).addReg(Oldval).addMBB(exitMBB);
+  BB->addSuccessor(exitMBB);
+  BB->addSuccessor(loop2MBB);
+
+  // loop2MBB:
+  //   lw tmp2, fi(sp)              // load newval from stack
+  //   or tmp1, $0, tmp2
+  //   sc tmp1, 0(ptr)
+  //   beq tmp1, $0, loop1MBB
+  BB = loop2MBB;
+  BuildMI(BB, dl, TII->get(Mips::LW), Tmp2).addImm(0).addFrameIndex(fi);;
+  BuildMI(BB, dl, TII->get(Mips::OR), Tmp1).addReg(Mips::ZERO).addReg(Tmp2);
+  BuildMI(BB, dl, TII->get(Mips::SC), Tmp1).addReg(Tmp1).addImm(0).addReg(Ptr);
+  BuildMI(BB, dl, TII->get(Mips::BEQ))
+    .addReg(Tmp1).addReg(Mips::ZERO).addMBB(loop1MBB);
+  BB->addSuccessor(loop1MBB);
+  BB->addSuccessor(exitMBB);
+
+  MI->eraseFromParent();   // The instruction is gone now.
+
+  return BB;
+}
+
+MachineBasicBlock *
+MipsTargetLowering::EmitAtomicCmpSwapPartword(MachineInstr *MI,
+                                              MachineBasicBlock *BB,
+                                              unsigned Size) const {
+  assert((Size == 1 || Size == 2) &&
+      "Unsupported size for EmitAtomicCmpSwapPartial.");
+
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
+
+  unsigned Dest    = MI->getOperand(0).getReg();
+  unsigned Ptr     = MI->getOperand(1).getReg();
+  unsigned Oldval  = MI->getOperand(2).getReg();
+  unsigned Newval  = MI->getOperand(3).getReg();
+
+  unsigned Addr = RegInfo.createVirtualRegister(RC);
+  unsigned Shift = RegInfo.createVirtualRegister(RC);
+  unsigned Mask = RegInfo.createVirtualRegister(RC);
+  unsigned Mask2 = RegInfo.createVirtualRegister(RC);
+  unsigned Oldval2 = RegInfo.createVirtualRegister(RC);
+  unsigned Oldval3 = RegInfo.createVirtualRegister(RC);
+  unsigned Oldval4 = RegInfo.createVirtualRegister(RC);
+  unsigned Newval2 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp1 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp2 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp3 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp4 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp5 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp6 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp7 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp8 = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp9 = RegInfo.createVirtualRegister(RC);
+
+  // insert new blocks after the current block
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineFunction::iterator It = BB;
+  ++It;
+  MF->insert(It, loop1MBB);
+  MF->insert(It, loop2MBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  //  thisMBB:
+  //    addiu   tmp1,$0,-4                # 0xfffffffc
+  //    and     addr,ptr,tmp1
+  //    andi    tmp2,ptr,3
+  //    sll     shift,tmp2,3
+  //    ori     tmp3,$0,255               # 0xff
+  //    sll     mask,tmp3,shift
+  //    nor     mask2,$0,mask
+  //    andi    tmp4,oldval,255
+  //    sll     oldval2,tmp4,shift
+  //    andi    tmp5,newval,255
+  //    sll     newval2,tmp5,shift
+  int64_t MaskImm = (Size == 1) ? 255 : 65535;
+  BuildMI(BB, dl, TII->get(Mips::ADDiu), Tmp1).addReg(Mips::ZERO).addImm(-4);
+  BuildMI(BB, dl, TII->get(Mips::AND), Addr).addReg(Ptr).addReg(Tmp1);
+  BuildMI(BB, dl, TII->get(Mips::ANDi), Tmp2).addReg(Ptr).addImm(3);
+  BuildMI(BB, dl, TII->get(Mips::SLL), Shift).addReg(Tmp2).addImm(3);
+  BuildMI(BB, dl, TII->get(Mips::ORi), Tmp3).addReg(Mips::ZERO).addImm(MaskImm);
+  BuildMI(BB, dl, TII->get(Mips::SLL), Mask).addReg(Tmp3).addReg(Shift);
+  BuildMI(BB, dl, TII->get(Mips::NOR), Mask2).addReg(Mips::ZERO).addReg(Mask);
+  BuildMI(BB, dl, TII->get(Mips::ANDi), Tmp4).addReg(Oldval).addImm(MaskImm);
+  BuildMI(BB, dl, TII->get(Mips::SLL), Oldval2).addReg(Tmp4).addReg(Shift);
+  BuildMI(BB, dl, TII->get(Mips::ANDi), Tmp5).addReg(Newval).addImm(MaskImm);
+  BuildMI(BB, dl, TII->get(Mips::SLL), Newval2).addReg(Tmp5).addReg(Shift);
+  BB->addSuccessor(loop1MBB);
+
+  //  loop1MBB:
+  //    ll      oldval3,0(addr)
+  //    and     oldval4,oldval3,mask
+  //    bne     oldval4,oldval2,exitMBB
+  BB = loop1MBB;
+  BuildMI(BB, dl, TII->get(Mips::LL), Oldval3).addImm(0).addReg(Addr);
+  BuildMI(BB, dl, TII->get(Mips::AND), Oldval4).addReg(Oldval3).addReg(Mask);
+  BuildMI(BB, dl, TII->get(Mips::BNE))
+      .addReg(Oldval4).addReg(Oldval2).addMBB(exitMBB);
+  BB->addSuccessor(exitMBB);
+  BB->addSuccessor(loop2MBB);
+
+  //  loop2MBB:
+  //    and     tmp6,oldval3,mask2
+  //    or      tmp7,tmp6,newval2
+  //    sc      tmp7,0(addr)
+  //    beq     tmp7,$0,loop1MBB
+  BB = loop2MBB;
+  BuildMI(BB, dl, TII->get(Mips::AND), Tmp6).addReg(Oldval3).addReg(Mask2);
+  BuildMI(BB, dl, TII->get(Mips::OR), Tmp7).addReg(Tmp6).addReg(Newval2);
+  BuildMI(BB, dl, TII->get(Mips::SC), Tmp7)
+      .addReg(Tmp7).addImm(0).addReg(Addr);
+  BuildMI(BB, dl, TII->get(Mips::BEQ))
+      .addReg(Tmp7).addReg(Mips::ZERO).addMBB(loop1MBB);
+  BB->addSuccessor(loop1MBB);
+  BB->addSuccessor(exitMBB);
+
+  //  exitMBB:
+  //    srl     tmp8,oldval4,shift
+  //    sll     tmp9,tmp8,24
+  //    sra     dest,tmp9,24
+  BB = exitMBB;
+  int64_t ShiftImm = (Size == 1) ? 24 : 16;
+  // reverse order
+  BuildMI(*BB, BB->begin(), dl, TII->get(Mips::SRA), Dest)
+      .addReg(Tmp9).addImm(ShiftImm);
+  BuildMI(*BB, BB->begin(), dl, TII->get(Mips::SLL), Tmp9)
+      .addReg(Tmp8).addImm(ShiftImm);
+  BuildMI(*BB, BB->begin(), dl, TII->get(Mips::SRL), Tmp8)
+      .addReg(Oldval4).addReg(Shift);
+
+  MI->eraseFromParent();   // The instruction is gone now.
+
+  return BB;
+}
+
+//===----------------------------------------------------------------------===//
+//  Misc Lower Operation implementation
+//===----------------------------------------------------------------------===//
 SDValue MipsTargetLowering::
 LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
 {
+  unsigned StackAlignment =
+    getTargetMachine().getFrameLowering()->getStackAlignment();
+  assert(StackAlignment >=
+         cast<ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue() &&
+         "Cannot lower if the alignment of the allocated space is larger than \
+          that of the stack.");
+
   SDValue Chain = Op.getOperand(0);
   SDValue Size = Op.getOperand(1);
   DebugLoc dl = Op.getDebugLoc();
@@ -706,11 +1209,25 @@ LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
 
   // The Sub result contains the new stack start address, so it
   // must be placed in the stack pointer register.
-  Chain = DAG.getCopyToReg(StackPointer.getValue(1), dl, Mips::SP, Sub);
+  Chain = DAG.getCopyToReg(StackPointer.getValue(1), dl, Mips::SP, Sub,
+                           SDValue());
+  // Retrieve updated $sp. There is a glue input to prevent instructions that
+  // clobber $sp from being inserted between copytoreg and copyfromreg.
+  SDValue NewSP = DAG.getCopyFromReg(Chain, dl, Mips::SP, MVT::i32,
+                                     Chain.getValue(1));
+
+  // The stack space reserved by alloca is located right above the argument
+  // area. It is aligned on a boundary that is a multiple of StackAlignment.
+  MachineFunction &MF = DAG.getMachineFunction();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+  unsigned SPOffset = (MipsFI->getMaxCallFrameSize() + StackAlignment - 1) /
+                      StackAlignment * StackAlignment;
+  SDValue AllocPtr = DAG.getNode(ISD::ADD, dl, MVT::i32, NewSP,
+                                 DAG.getConstant(SPOffset, MVT::i32));
 
   // This node always has two return values: a new stack pointer
   // value and a chain
-  SDValue Ops[2] = { Sub, Chain };
+  SDValue Ops[2] = { AllocPtr, NewSP.getValue(1) };
   return DAG.getMergeValues(Ops, 2, dl);
 }
 
@@ -778,25 +1295,23 @@ SDValue MipsTargetLowering::LowerGlobalAddress(SDValue Op,
     SDValue HiPart = DAG.getNode(MipsISD::Hi, dl, VTs, &GAHi, 1);
     SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, GALo);
     return DAG.getNode(ISD::ADD, dl, MVT::i32, HiPart, Lo);
-  } else {
-    SDValue GA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0,
-                                            MipsII::MO_GOT);
-    SDValue ResNode = DAG.getLoad(MVT::i32, dl,
-                                  DAG.getEntryNode(), GA, MachinePointerInfo(),
-                                  false, false, 0);
-    // On functions and global targets not internal linked only
-    // a load from got/GP is necessary for PIC to work.
-    if (!GV->hasInternalLinkage() &&
-        (!GV->hasLocalLinkage() || isa<Function>(GV)))
-      return ResNode;
-    SDValue GALo = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0,
-                                              MipsII::MO_ABS_LO);
-    SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, GALo);
-    return DAG.getNode(ISD::ADD, dl, MVT::i32, ResNode, Lo);
   }
 
-  llvm_unreachable("Dont know how to handle GlobalAddress");
-  return SDValue(0,0);
+  SDValue GA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0,
+                                          MipsII::MO_GOT);
+  GA = DAG.getNode(MipsISD::WrapperPIC, dl, MVT::i32, GA);
+  SDValue ResNode = DAG.getLoad(MVT::i32, dl,
+                                DAG.getEntryNode(), GA, MachinePointerInfo(),
+                                false, false, 0);
+  // On functions and global targets not internal linked only
+  // a load from got/GP is necessary for PIC to work.
+  if (!GV->hasInternalLinkage() &&
+      (!GV->hasLocalLinkage() || isa<Function>(GV)))
+    return ResNode;
+  SDValue GALo = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0,
+                                            MipsII::MO_ABS_LO);
+  SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, GALo);
+  return DAG.getNode(ISD::ADD, dl, MVT::i32, ResNode, Lo);
 }
 
 SDValue MipsTargetLowering::LowerBlockAddress(SDValue Op,
@@ -818,6 +1333,7 @@ SDValue MipsTargetLowering::LowerBlockAddress(SDValue Op,
 
   SDValue BAGOTOffset = DAG.getBlockAddress(BA, MVT::i32, true,
                                             MipsII::MO_GOT);
+  BAGOTOffset = DAG.getNode(MipsISD::WrapperPIC, dl, MVT::i32, BAGOTOffset);
   SDValue BALOOffset = DAG.getBlockAddress(BA, MVT::i32, true,
                                            MipsII::MO_ABS_LO);
   SDValue Load = DAG.getLoad(MVT::i32, dl,
@@ -830,8 +1346,60 @@ SDValue MipsTargetLowering::LowerBlockAddress(SDValue Op,
 SDValue MipsTargetLowering::
 LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
 {
-  llvm_unreachable("TLS not implemented for MIPS.");
-  return SDValue(); // Not reached
+  // If the relocation model is PIC, use the General Dynamic TLS Model,
+  // otherwise use the Initial Exec or Local Exec TLS Model.
+  // TODO: implement Local Dynamic TLS model
+
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+  DebugLoc dl = GA->getDebugLoc();
+  const GlobalValue *GV = GA->getGlobal();
+  EVT PtrVT = getPointerTy();
+
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
+    // General Dynamic TLS Model
+    SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32,
+                                                 0, MipsII::MO_TLSGD);
+    SDValue Tlsgd = DAG.getNode(MipsISD::TlsGd, dl, MVT::i32, TGA);
+    SDValue GP = DAG.getRegister(Mips::GP, MVT::i32);
+    SDValue Argument = DAG.getNode(ISD::ADD, dl, MVT::i32, GP, Tlsgd);
+
+    ArgListTy Args;
+    ArgListEntry Entry;
+    Entry.Node = Argument;
+    Entry.Ty = (const Type *) Type::getInt32Ty(*DAG.getContext());
+    Args.push_back(Entry);
+    std::pair<SDValue, SDValue> CallResult =
+        LowerCallTo(DAG.getEntryNode(),
+                 (const Type *) Type::getInt32Ty(*DAG.getContext()),
+                 false, false, false, false,
+                 0, CallingConv::C, false, true,
+                 DAG.getExternalSymbol("__tls_get_addr", PtrVT), Args, DAG, dl);
+
+    return CallResult.first;
+  } else {
+    SDValue Offset;
+    if (GV->isDeclaration()) {
+      // Initial Exec TLS Model
+      SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0,
+                                              MipsII::MO_GOTTPREL);
+      Offset = DAG.getLoad(MVT::i32, dl,
+                                  DAG.getEntryNode(), TGA, MachinePointerInfo(),
+                                  false, false, 0);
+    } else {
+      // Local Exec TLS Model
+      SDVTList VTs = DAG.getVTList(MVT::i32);
+      SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0,
+                                              MipsII::MO_TPREL_HI);
+      SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0,
+                                              MipsII::MO_TPREL_LO);
+      SDValue Hi = DAG.getNode(MipsISD::TprelHi, dl, VTs, &TGAHi, 1);
+      SDValue Lo = DAG.getNode(MipsISD::TprelLo, dl, MVT::i32, TGALo);
+      Offset = DAG.getNode(ISD::ADD, dl, MVT::i32, Hi, Lo);
+    }
+
+    SDValue ThreadPointer = DAG.getNode(MipsISD::ThreadPointer, dl, PtrVT);
+    return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
+  }
 }
 
 SDValue MipsTargetLowering::
@@ -852,10 +1420,12 @@ LowerJumpTable(SDValue Op, SelectionDAG &DAG) const
   if (!IsPIC) {
     SDValue Ops[] = { JTI };
     HiPart = DAG.getNode(MipsISD::Hi, dl, DAG.getVTList(MVT::i32), Ops, 1);
-  } else // Emit Load from Global Pointer
+  } else {// Emit Load from Global Pointer
+    JTI = DAG.getNode(MipsISD::WrapperPIC, dl, MVT::i32, JTI);
     HiPart = DAG.getLoad(MVT::i32, dl, DAG.getEntryNode(), JTI,
                          MachinePointerInfo(),
                          false, false, 0);
+  }
 
   SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
                                          MipsII::MO_ABS_LO);
@@ -895,6 +1465,7 @@ LowerConstantPool(SDValue Op, SelectionDAG &DAG) const
   } else {
     SDValue CP = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment(),
                                            N->getOffset(), MipsII::MO_GOT);
+    CP = DAG.getNode(MipsISD::WrapperPIC, dl, MVT::i32, CP);
     SDValue Load = DAG.getLoad(MVT::i32, dl, DAG.getEntryNode(),
                                CP, MachinePointerInfo::getConstantPool(),
                                false, false, 0);
@@ -923,6 +1494,74 @@ SDValue MipsTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
                       false, false, 0);
 }
 
+static SDValue LowerFCOPYSIGN32(SDValue Op, SelectionDAG &DAG) {
+  // FIXME: Use ext/ins instructions if target architecture is Mips32r2.
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op.getOperand(0));
+  SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op.getOperand(1));
+  SDValue And0 = DAG.getNode(ISD::AND, dl, MVT::i32, Op0,
+                             DAG.getConstant(0x7fffffff, MVT::i32));
+  SDValue And1 = DAG.getNode(ISD::AND, dl, MVT::i32, Op1,
+                             DAG.getConstant(0x80000000, MVT::i32));
+  SDValue Result = DAG.getNode(ISD::OR, dl, MVT::i32, And0, And1);
+  return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Result);
+}
+
+static SDValue LowerFCOPYSIGN64(SDValue Op, SelectionDAG &DAG, bool isLittle) {
+  // FIXME:
+  //  Use ext/ins instructions if target architecture is Mips32r2.
+  //  Eliminate redundant mfc1 and mtc1 instructions.
+  unsigned LoIdx = 0, HiIdx = 1;
+
+  if (!isLittle)
+    std::swap(LoIdx, HiIdx);
+
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Word0 = DAG.getNode(MipsISD::ExtractElementF64, dl, MVT::i32,
+                              Op.getOperand(0),
+                              DAG.getConstant(LoIdx, MVT::i32));
+  SDValue Hi0 = DAG.getNode(MipsISD::ExtractElementF64, dl, MVT::i32,
+                            Op.getOperand(0), DAG.getConstant(HiIdx, MVT::i32));
+  SDValue Hi1 = DAG.getNode(MipsISD::ExtractElementF64, dl, MVT::i32,
+                            Op.getOperand(1), DAG.getConstant(HiIdx, MVT::i32));
+  SDValue And0 = DAG.getNode(ISD::AND, dl, MVT::i32, Hi0,
+                             DAG.getConstant(0x7fffffff, MVT::i32));
+  SDValue And1 = DAG.getNode(ISD::AND, dl, MVT::i32, Hi1,
+                             DAG.getConstant(0x80000000, MVT::i32));
+  SDValue Word1 = DAG.getNode(ISD::OR, dl, MVT::i32, And0, And1);
+
+  if (!isLittle)
+    std::swap(Word0, Word1);
+
+  return DAG.getNode(MipsISD::BuildPairF64, dl, MVT::f64, Word0, Word1);
+}
+
+SDValue MipsTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG)
+  const {
+  EVT Ty = Op.getValueType();
+
+  assert(Ty == MVT::f32 || Ty == MVT::f64);
+
+  if (Ty == MVT::f32)
+    return LowerFCOPYSIGN32(Op, DAG);
+  else
+    return LowerFCOPYSIGN64(Op, DAG, Subtarget->isLittle());
+}
+
+SDValue MipsTargetLowering::
+LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  assert((Depth == 0) &&
+         "Frame address can only be determined for current frame.");
+
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI->setFrameAddressIsTaken(true);
+  EVT VT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, Mips::FP, VT);
+  return FrameAddr;
+}
+
 //===----------------------------------------------------------------------===//
 //                      Calling Convention Implementation
 //===----------------------------------------------------------------------===//
@@ -940,6 +1579,8 @@ SDValue MipsTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
 //       yet to hold an argument. Otherwise, use A2, A3 and stack. If A1 is
 //       not used, it must be shadowed. If only A3 is avaiable, shadow it and
 //       go to stack.
+//
+//  For vararg functions, all arguments are passed in A0, A1, A2, A3 and stack.
 //===----------------------------------------------------------------------===//
 
 static bool CC_MipsO32(unsigned ValNo, MVT ValVT,
@@ -958,90 +1599,17 @@ static bool CC_MipsO32(unsigned ValNo, MVT ValVT,
       Mips::D6, Mips::D7
   };
 
-  unsigned Reg = 0;
-  static bool IntRegUsed = false;
-
-  // This must be the first arg of the call if no regs have been allocated.
-  // Initialize IntRegUsed in that case.
-  if (IntRegs[State.getFirstUnallocated(IntRegs, IntRegsSize)] == Mips::A0 &&
-      F32Regs[State.getFirstUnallocated(F32Regs, FloatRegsSize)] == Mips::F12 &&
-      F64Regs[State.getFirstUnallocated(F64Regs, FloatRegsSize)] == Mips::D6)
-    IntRegUsed = false;
-
-  // Promote i8 and i16
-  if (LocVT == MVT::i8 || LocVT == MVT::i16) {
-    LocVT = MVT::i32;
-    if (ArgFlags.isSExt())
-      LocInfo = CCValAssign::SExt;
-    else if (ArgFlags.isZExt())
-      LocInfo = CCValAssign::ZExt;
-    else
-      LocInfo = CCValAssign::AExt;
+  // ByVal Args
+  if (ArgFlags.isByVal()) {
+    State.HandleByVal(ValNo, ValVT, LocVT, LocInfo,
+                      1 /*MinSize*/, 4 /*MinAlign*/, ArgFlags);
+    unsigned NextReg = (State.getNextStackOffset() + 3) / 4;
+    for (unsigned r = State.getFirstUnallocated(IntRegs, IntRegsSize);
+         r < std::min(IntRegsSize, NextReg); ++r)
+      State.AllocateReg(IntRegs[r]);
+    return false;
   }
 
-  if (ValVT == MVT::i32) {
-    Reg = State.AllocateReg(IntRegs, IntRegsSize);
-    IntRegUsed = true;
-  } else if (ValVT == MVT::f32) {
-    // An int reg has to be marked allocated regardless of whether or not
-    // IntRegUsed is true.
-    Reg = State.AllocateReg(IntRegs, IntRegsSize);
-
-    if (IntRegUsed) {
-      if (Reg) // Int reg is available
-        LocVT = MVT::i32;
-    } else {
-      unsigned FReg = State.AllocateReg(F32Regs, FloatRegsSize);
-      if (FReg) // F32 reg is available
-        Reg = FReg;
-      else if (Reg) // No F32 regs are available, but an int reg is available.
-        LocVT = MVT::i32;
-    }
-  } else if (ValVT == MVT::f64) {
-    // Int regs have to be marked allocated regardless of whether or not
-    // IntRegUsed is true.
-    Reg = State.AllocateReg(IntRegs, IntRegsSize);
-    if (Reg == Mips::A1)
-      Reg = State.AllocateReg(IntRegs, IntRegsSize);
-    else if (Reg == Mips::A3)
-      Reg = 0;
-    State.AllocateReg(IntRegs, IntRegsSize);
-
-    // At this point, Reg is A0, A2 or 0, and all the unavailable integer regs
-    // are marked as allocated.
-    if (IntRegUsed) {
-      if (Reg)// if int reg is available
-        LocVT = MVT::i32;
-    } else {
-      unsigned FReg = State.AllocateReg(F64Regs, FloatRegsSize);
-      if (FReg) // F64 reg is available.
-        Reg = FReg;
-      else if (Reg) // No F64 regs are available, but an int reg is available.
-        LocVT = MVT::i32;
-    }
-  } else
-    assert(false && "cannot handle this ValVT");
-
-  if (!Reg) {
-    unsigned SizeInBytes = ValVT.getSizeInBits() >> 3;
-    unsigned Offset = State.AllocateStack(SizeInBytes, SizeInBytes);
-    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
-  } else
-    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
-
-  return false; // CC must always match
-}
-
-static bool CC_MipsO32_VarArgs(unsigned ValNo, MVT ValVT,
-                       MVT LocVT, CCValAssign::LocInfo LocInfo,
-                       ISD::ArgFlagsTy ArgFlags, CCState &State) {
-
-  static const unsigned IntRegsSize=4;
-
-  static const unsigned IntRegs[] = {
-      Mips::A0, Mips::A1, Mips::A2, Mips::A3
-  };
-
   // Promote i8 and i16
   if (LocVT == MVT::i8 || LocVT == MVT::i16) {
     LocVT = MVT::i32;
@@ -1055,23 +1623,52 @@ static bool CC_MipsO32_VarArgs(unsigned ValNo, MVT ValVT,
 
   unsigned Reg;
 
-  if (ValVT == MVT::i32 || ValVT == MVT::f32) {
+  // f32 and f64 are allocated in A0, A1, A2, A3 when either of the following
+  // is true: function is vararg, argument is 3rd or higher, there is previous
+  // argument which is not f32 or f64.
+  bool AllocateFloatsInIntReg = State.isVarArg() || ValNo > 1
+      || State.getFirstUnallocated(F32Regs, FloatRegsSize) != ValNo;
+  unsigned OrigAlign = ArgFlags.getOrigAlign();
+  bool isI64 = (ValVT == MVT::i32 && OrigAlign == 8);
+
+  if (ValVT == MVT::i32 || (ValVT == MVT::f32 && AllocateFloatsInIntReg)) {
     Reg = State.AllocateReg(IntRegs, IntRegsSize);
+    // If this is the first part of an i64 arg,
+    // the allocated register must be either A0 or A2.
+    if (isI64 && (Reg == Mips::A1 || Reg == Mips::A3))
+      Reg = State.AllocateReg(IntRegs, IntRegsSize);
     LocVT = MVT::i32;
-  } else if (ValVT == MVT::f64) {
+  } else if (ValVT == MVT::f64 && AllocateFloatsInIntReg) {
+    // Allocate int register and shadow next int register. If first
+    // available register is Mips::A1 or Mips::A3, shadow it too.
     Reg = State.AllocateReg(IntRegs, IntRegsSize);
     if (Reg == Mips::A1 || Reg == Mips::A3)
       Reg = State.AllocateReg(IntRegs, IntRegsSize);
     State.AllocateReg(IntRegs, IntRegsSize);
     LocVT = MVT::i32;
+  } else if (ValVT.isFloatingPoint() && !AllocateFloatsInIntReg) {
+    // we are guaranteed to find an available float register
+    if (ValVT == MVT::f32) {
+      Reg = State.AllocateReg(F32Regs, FloatRegsSize);
+      // Shadow int register
+      State.AllocateReg(IntRegs, IntRegsSize);
+    } else {
+      Reg = State.AllocateReg(F64Regs, FloatRegsSize);
+      // Shadow int registers
+      unsigned Reg2 = State.AllocateReg(IntRegs, IntRegsSize);
+      if (Reg2 == Mips::A1 || Reg2 == Mips::A3)
+        State.AllocateReg(IntRegs, IntRegsSize);
+      State.AllocateReg(IntRegs, IntRegsSize);
+    }
   } else
     llvm_unreachable("Cannot handle this ValVT.");
 
-  if (!Reg) {
-    unsigned SizeInBytes = ValVT.getSizeInBits() >> 3;
-    unsigned Offset = State.AllocateStack(SizeInBytes, SizeInBytes);
+  unsigned SizeInBytes = ValVT.getSizeInBits() >> 3;
+  unsigned Offset = State.AllocateStack(SizeInBytes, OrigAlign);
+
+  if (!Reg)
     State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
-  } else
+  else
     State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
 
   return false; // CC must always match
@@ -1081,6 +1678,56 @@ static bool CC_MipsO32_VarArgs(unsigned ValNo, MVT ValVT,
 //                  Call Calling Convention Implementation
 //===----------------------------------------------------------------------===//
 
+static const unsigned O32IntRegsSize = 4;
+
+static const unsigned O32IntRegs[] = {
+  Mips::A0, Mips::A1, Mips::A2, Mips::A3
+};
+
+// Write ByVal Arg to arg registers and stack.
+static void
+WriteByValArg(SDValue& Chain, DebugLoc dl,
+              SmallVector<std::pair<unsigned, SDValue>, 16>& RegsToPass,
+              SmallVector<SDValue, 8>& MemOpChains, int& LastFI,
+              MachineFrameInfo *MFI, SelectionDAG &DAG, SDValue Arg,
+              const CCValAssign &VA, const ISD::ArgFlagsTy& Flags,
+              MVT PtrType) {
+  unsigned FirstWord = VA.getLocMemOffset() / 4;
+  unsigned NumWords = (Flags.getByValSize() + 3) / 4;
+  unsigned LastWord = FirstWord + NumWords;
+  unsigned CurWord;
+
+  // copy the first 4 words of byval arg to registers A0 - A3
+  for (CurWord = FirstWord; CurWord < std::min(LastWord, O32IntRegsSize);
+       ++CurWord) {
+    SDValue LoadPtr = DAG.getNode(ISD::ADD, dl, MVT::i32, Arg,
+                                  DAG.getConstant((CurWord - FirstWord) * 4,
+                                                  MVT::i32));
+    SDValue LoadVal = DAG.getLoad(MVT::i32, dl, Chain, LoadPtr,
+                                  MachinePointerInfo(),
+                                  false, false, 0);
+    MemOpChains.push_back(LoadVal.getValue(1));
+    unsigned DstReg = O32IntRegs[CurWord];
+    RegsToPass.push_back(std::make_pair(DstReg, LoadVal));
+  }
+
+  // copy remaining part of byval arg to stack.
+  if (CurWord < LastWord) {
+    unsigned SizeInBytes = (LastWord - CurWord) * 4;
+    SDValue Src = DAG.getNode(ISD::ADD, dl, MVT::i32, Arg,
+                              DAG.getConstant((CurWord - FirstWord) * 4,
+                                              MVT::i32));
+    LastFI = MFI->CreateFixedObject(SizeInBytes, CurWord * 4, true);
+    SDValue Dst = DAG.getFrameIndex(LastFI, PtrType);
+    Chain = DAG.getMemcpy(Chain, dl, Dst, Src,
+                          DAG.getConstant(SizeInBytes, MVT::i32),
+                          /*Align*/4,
+                          /*isVolatile=*/false, /*AlwaysInline=*/false,
+                          MachinePointerInfo(0), MachinePointerInfo(0));
+    MemOpChains.push_back(Chain);
+  }
+}
+
 /// LowerCall - functions arguments are copied from virtual regs to
 /// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
 /// TODO: isTailCall.
@@ -1098,35 +1745,57 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
 
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
+  const TargetFrameLowering *TFL = MF.getTarget().getFrameLowering();
   bool IsPIC = getTargetMachine().getRelocationModel() == Reloc::PIC_;
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
-                 *DAG.getContext());
-
-  // To meet O32 ABI, Mips must always allocate 16 bytes on
-  // the stack (even if less than 4 are used as arguments)
-  if (Subtarget->isABI_O32()) {
-    int VTsize = MVT(MVT::i32).getSizeInBits()/8;
-    MFI->CreateFixedObject(VTsize, (VTsize*3), true);
-    CCInfo.AnalyzeCallOperands(Outs,
-                     isVarArg ? CC_MipsO32_VarArgs : CC_MipsO32);
-  } else
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
+
+  if (Subtarget->isABI_O32())
+    CCInfo.AnalyzeCallOperands(Outs, CC_MipsO32);
+  else
     CCInfo.AnalyzeCallOperands(Outs, CC_Mips);
 
   // Get a count of how many bytes are to be pushed on the stack.
-  unsigned NumBytes = CCInfo.getNextStackOffset();
-  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+  unsigned NextStackOffset = CCInfo.getNextStackOffset();
+
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NextStackOffset,
+                                                            true));
+
+  // If this is the first call, create a stack frame object that points to
+  // a location to which .cprestore saves $gp.
+  if (IsPIC && !MipsFI->getGPFI())
+    MipsFI->setGPFI(MFI->CreateFixedObject(4, 0, true));
+
+  // Update size of the maximum argument space.
+  // For O32, a minimum of four words (16 bytes) of argument space is
+  // allocated.
+  if (Subtarget->isABI_O32())
+    NextStackOffset = std::max(NextStackOffset, (unsigned)16);
+
+  unsigned MaxCallFrameSize = MipsFI->getMaxCallFrameSize();
+
+  if (MaxCallFrameSize < NextStackOffset) {
+    MipsFI->setMaxCallFrameSize(NextStackOffset);
+
+    if (IsPIC) {
+      // $gp restore slot must be aligned.
+      unsigned StackAlignment = TFL->getStackAlignment();
+      NextStackOffset = (NextStackOffset + StackAlignment - 1) /
+                        StackAlignment * StackAlignment;
+      int GPFI = MipsFI->getGPFI();
+      MFI->setObjectOffset(GPFI, NextStackOffset);
+    }
+  }
 
   // With EABI is it possible to have 16 args on registers.
   SmallVector<std::pair<unsigned, SDValue>, 16> RegsToPass;
   SmallVector<SDValue, 8> MemOpChains;
 
-  // First/LastArgStackLoc contains the first/last
-  // "at stack" argument location.
-  int LastArgStackLoc = 0;
-  unsigned FirstStackArgLoc = (Subtarget->isABI_EABI() ? 0 : 16);
+  int FirstFI = -MFI->getNumFixedObjects() - 1, LastFI = 0;
 
   // Walk the register/memloc assignments, inserting copies/loads.
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
@@ -1174,15 +1843,22 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
     // Register can't get to this point...
     assert(VA.isMemLoc());
 
-    // Create the frame index object for this incoming parameter
-    // This guarantees that when allocating Local Area the firsts
-    // 16 bytes which are alwayes reserved won't be overwritten
-    // if O32 ABI is used. For EABI the first address is zero.
-    LastArgStackLoc = (FirstStackArgLoc + VA.getLocMemOffset());
-    int FI = MFI->CreateFixedObject(VA.getValVT().getSizeInBits()/8,
-                                    LastArgStackLoc, true);
+    // ByVal Arg.
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+    if (Flags.isByVal()) {
+      assert(Subtarget->isABI_O32() &&
+             "No support for ByVal args by ABIs other than O32 yet.");
+      assert(Flags.getByValSize() &&
+             "ByVal args of size 0 should have been ignored by front-end.");
+      WriteByValArg(Chain, dl, RegsToPass, MemOpChains, LastFI, MFI, DAG, Arg,
+                    VA, Flags, getPointerTy());
+      continue;
+    }
 
-    SDValue PtrOff = DAG.getFrameIndex(FI,getPointerTy());
+    // Create the frame index object for this incoming parameter
+    LastFI = MFI->CreateFixedObject(VA.getValVT().getSizeInBits()/8,
+                                    VA.getLocMemOffset(), true);
+    SDValue PtrOff = DAG.getFrameIndex(LastFI, getPointerTy());
 
     // emit ISD::STORE whichs stores the
     // parameter value to a stack Location
@@ -1191,23 +1867,18 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
                                        false, false, 0));
   }
 
+  // Extend range of indices of frame objects for outgoing arguments that were
+  // created during this function call. Skip this step if no such objects were
+  // created.
+  if (LastFI)
+    MipsFI->extendOutArgFIRange(FirstFI, LastFI);
+
   // Transform all store nodes into one single node because all store
   // nodes are independent of each other.
   if (!MemOpChains.empty())
     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                         &MemOpChains[0], MemOpChains.size());
 
-  // Build a sequence of copy-to-reg nodes chained together with token
-  // chain and flag operands which copy the outgoing args into registers.
-  // The InFlag in necessary since all emitted instructions must be
-  // stuck together.
-  SDValue InFlag;
-  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
-    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
-                             RegsToPass[i].second, InFlag);
-    InFlag = Chain.getValue(1);
-  }
-
   // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
   // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
   // node so that legalize doesn't hack it.
@@ -1234,10 +1905,13 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
     LoadSymAddr = true;
   }
 
+  SDValue InFlag;
+
   // Create nodes that load address of callee and copy it to T9
   if (IsPIC) {
     if (LoadSymAddr) {
       // Load callee address
+      Callee = DAG.getNode(MipsISD::WrapperPIC, dl, MVT::i32, Callee);
       SDValue LoadValue = DAG.getLoad(MVT::i32, dl, Chain, Callee,
                                       MachinePointerInfo::getGOT(),
                                       false, false, 0);
@@ -1249,7 +1923,7 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
       } else
         Callee = LoadValue;
 
-      // Use chain output from LoadValue 
+      // Use chain output from LoadValue
       Chain = LoadValue.getValue(1);
     }
 
@@ -1259,6 +1933,16 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
     Callee = DAG.getRegister(Mips::T9, MVT::i32);
   }
 
+  // Build a sequence of copy-to-reg nodes chained together with token
+  // chain and flag operands which copy the outgoing args into registers.
+  // The InFlag in necessary since all emitted instructions must be
+  // stuck together.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
   // MipsJmpLink = #chain, #target_address, #opt_in_flags...
   //             = Chain, Callee, Reg#1, Reg#2, ...
   //
@@ -1280,39 +1964,8 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
   Chain  = DAG.getNode(MipsISD::JmpLink, dl, NodeTys, &Ops[0], Ops.size());
   InFlag = Chain.getValue(1);
 
-  // Create a stack location to hold GP when PIC is used. This stack
-  // location is used on function prologue to save GP and also after all
-  // emitted CALL's to restore GP.
-  if (IsPIC) {
-      // Function can have an arbitrary number of calls, so
-      // hold the LastArgStackLoc with the biggest offset.
-      int FI;
-      MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
-      if (LastArgStackLoc >= MipsFI->getGPStackOffset()) {
-        LastArgStackLoc = (!LastArgStackLoc) ? (16) : (LastArgStackLoc+4);
-        // Create the frame index only once. SPOffset here can be anything
-        // (this will be fixed on processFunctionBeforeFrameFinalized)
-        if (MipsFI->getGPStackOffset() == -1) {
-          FI = MFI->CreateFixedObject(4, 0, true);
-          MipsFI->setGPFI(FI);
-        }
-        MipsFI->setGPStackOffset(LastArgStackLoc);
-      }
-
-      // Reload GP value.
-      FI = MipsFI->getGPFI();
-      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
-      SDValue GPLoad = DAG.getLoad(MVT::i32, dl, Chain, FIN,
-                                   MachinePointerInfo::getFixedStack(FI),
-                                   false, false, 0);
-      Chain = GPLoad.getValue(1);
-      Chain = DAG.getCopyToReg(Chain, dl, DAG.getRegister(Mips::GP, MVT::i32),
-                               GPLoad, SDValue(0,0));
-      InFlag = Chain.getValue(1);
-  }
-
   // Create the CALLSEQ_END node.
-  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NextStackOffset, true),
                              DAG.getIntPtrConstant(0, true), InFlag);
   InFlag = Chain.getValue(1);
 
@@ -1330,11 +1983,10 @@ MipsTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                     const SmallVectorImpl<ISD::InputArg> &Ins,
                                     DebugLoc dl, SelectionDAG &DAG,
                                     SmallVectorImpl<SDValue> &InVals) const {
-
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), RVLocs, *DAG.getContext());
 
   CCInfo.AnalyzeCallResult(Ins, RetCC_Mips);
 
@@ -1352,6 +2004,29 @@ MipsTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
 //===----------------------------------------------------------------------===//
 //             Formal Arguments Calling Convention Implementation
 //===----------------------------------------------------------------------===//
+static void ReadByValArg(MachineFunction &MF, SDValue Chain, DebugLoc dl,
+                         std::vector<SDValue>& OutChains,
+                         SelectionDAG &DAG, unsigned NumWords, SDValue FIN,
+                         const CCValAssign &VA, const ISD::ArgFlagsTy& Flags) {
+  unsigned LocMem = VA.getLocMemOffset();
+  unsigned FirstWord = LocMem / 4;
+
+  // copy register A0 - A3 to frame object
+  for (unsigned i = 0; i < NumWords; ++i) {
+    unsigned CurWord = FirstWord + i;
+    if (CurWord >= O32IntRegsSize)
+      break;
+
+    unsigned SrcReg = O32IntRegs[CurWord];
+    unsigned Reg = AddLiveIn(MF, SrcReg, Mips::CPURegsRegisterClass);
+    SDValue StorePtr = DAG.getNode(ISD::ADD, dl, MVT::i32, FIN,
+                                   DAG.getConstant(i * 4, MVT::i32));
+    SDValue Store = DAG.getStore(Chain, dl, DAG.getRegister(Reg, MVT::i32),
+                                 StorePtr, MachinePointerInfo(), false,
+                                 false, 0);
+    OutChains.push_back(Store);
+  }
+}
 
 /// LowerFormalArguments - transform physical registers into virtual registers
 /// and generate load operations for arguments places on the stack.
@@ -1364,7 +2039,6 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
                                          DebugLoc dl, SelectionDAG &DAG,
                                          SmallVectorImpl<SDValue> &InVals)
                                           const {
-
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
@@ -1374,23 +2048,17 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
   // Used with vargs to acumulate store chains.
   std::vector<SDValue> OutChains;
 
-  // Keep track of the last register used for arguments
-  unsigned ArgRegEnd = 0;
-
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
 
   if (Subtarget->isABI_O32())
-    CCInfo.AnalyzeFormalArguments(Ins,
-                        isVarArg ? CC_MipsO32_VarArgs : CC_MipsO32);
+    CCInfo.AnalyzeFormalArguments(Ins, CC_MipsO32);
   else
     CCInfo.AnalyzeFormalArguments(Ins, CC_Mips);
 
-  unsigned FirstStackArgLoc = (Subtarget->isABI_EABI() ? 0 : 16);
-  unsigned LastStackArgEndOffset = 0;
-  EVT LastRegArgValVT;
+  int LastFI = 0;// MipsFI->LastInArgFI is 0 at the entry of this function.
 
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
     CCValAssign &VA = ArgLocs[i];
@@ -1398,8 +2066,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
     // Arguments stored on registers
     if (VA.isRegLoc()) {
       EVT RegVT = VA.getLocVT();
-      ArgRegEnd = VA.getLocReg();
-      LastRegArgValVT = VA.getValVT();
+      unsigned ArgReg = VA.getLocReg();
       TargetRegisterClass *RC = 0;
 
       if (RegVT == MVT::i32)
@@ -1414,7 +2081,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
 
       // Transform the arguments stored on
       // physical registers into virtual ones
-      unsigned Reg = AddLiveIn(DAG.getMachineFunction(), ArgRegEnd, RC);
+      unsigned Reg = AddLiveIn(DAG.getMachineFunction(), ArgReg, RC);
       SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
 
       // If this is an 8 or 16-bit value, it has been passed promoted
@@ -1453,26 +2120,31 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
       // sanity check
       assert(VA.isMemLoc());
 
-      // The last argument is not a register anymore
-      ArgRegEnd = 0;
+      ISD::ArgFlagsTy Flags = Ins[i].Flags;
+
+      if (Flags.isByVal()) {
+        assert(Subtarget->isABI_O32() &&
+               "No support for ByVal args by ABIs other than O32 yet.");
+        assert(Flags.getByValSize() &&
+               "ByVal args of size 0 should have been ignored by front-end.");
+        unsigned NumWords = (Flags.getByValSize() + 3) / 4;
+        LastFI = MFI->CreateFixedObject(NumWords * 4, VA.getLocMemOffset(),
+                                        true);
+        SDValue FIN = DAG.getFrameIndex(LastFI, getPointerTy());
+        InVals.push_back(FIN);
+        ReadByValArg(MF, Chain, dl, OutChains, DAG, NumWords, FIN, VA, Flags);
+
+        continue;
+      }
 
       // The stack pointer offset is relative to the caller stack frame.
-      // Since the real stack size is unknown here, a negative SPOffset
-      // is used so there's a way to adjust these offsets when the stack
-      // size get known (on EliminateFrameIndex). A dummy SPOffset is
-      // used instead of a direct negative address (which is recorded to
-      // be used on emitPrologue) to avoid mis-calc of the first stack
-      // offset on PEI::calculateFrameObjectOffsets.
-      unsigned ArgSize = VA.getValVT().getSizeInBits()/8;
-      LastStackArgEndOffset = FirstStackArgLoc + VA.getLocMemOffset() + ArgSize;
-      int FI = MFI->CreateFixedObject(ArgSize, 0, true);
-      MipsFI->recordLoadArgsFI(FI, -(4 +
-        (FirstStackArgLoc + VA.getLocMemOffset())));
+      LastFI = MFI->CreateFixedObject(VA.getValVT().getSizeInBits()/8,
+                                      VA.getLocMemOffset(), true);
 
       // Create load nodes to retrieve arguments from the stack
-      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+      SDValue FIN = DAG.getFrameIndex(LastFI, getPointerTy());
       InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
-                                   MachinePointerInfo::getFixedStack(FI),
+                                   MachinePointerInfo::getFixedStack(LastFI),
                                    false, false, 0));
     }
   }
@@ -1490,58 +2162,33 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
   }
 
-  // To meet ABI, when VARARGS are passed on registers, the registers
-  // must have their values written to the caller stack frame. If the last
-  // argument was placed in the stack, there's no need to save any register.
   if (isVarArg && Subtarget->isABI_O32()) {
-    if (ArgRegEnd) {
-      // Last named formal argument is passed in register.
-
-      // The last register argument that must be saved is Mips::A3
+    // Record the frame index of the first variable argument
+    // which is a value necessary to VASTART.
+    unsigned NextStackOffset = CCInfo.getNextStackOffset();
+    assert(NextStackOffset % 4 == 0 &&
+           "NextStackOffset must be aligned to 4-byte boundaries.");
+    LastFI = MFI->CreateFixedObject(4, NextStackOffset, true);
+    MipsFI->setVarArgsFrameIndex(LastFI);
+
+    // If NextStackOffset is smaller than o32's 16-byte reserved argument area,
+    // copy the integer registers that have not been used for argument passing
+    // to the caller's stack frame.
+    for (; NextStackOffset < 16; NextStackOffset += 4) {
       TargetRegisterClass *RC = Mips::CPURegsRegisterClass;
-      if (LastRegArgValVT == MVT::f64)
-        ArgRegEnd++;
-
-      if (ArgRegEnd < Mips::A3) {
-        // Both the last named formal argument and the first variable
-        // argument are passed in registers.
-        for (++ArgRegEnd; ArgRegEnd <= Mips::A3; ++ArgRegEnd) {
-          unsigned Reg = AddLiveIn(DAG.getMachineFunction(), ArgRegEnd, RC);
-          SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, MVT::i32);
-
-          int FI = MFI->CreateFixedObject(4, 0, true);
-          MipsFI->recordStoreVarArgsFI(FI, -(4+(ArgRegEnd-Mips::A0)*4));
-          SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy());
-          OutChains.push_back(DAG.getStore(Chain, dl, ArgValue, PtrOff,
-                                           MachinePointerInfo(),
-                                           false, false, 0));
-
-          // Record the frame index of the first variable argument
-          // which is a value necessary to VASTART.
-          if (!MipsFI->getVarArgsFrameIndex()) {
-            MFI->setObjectAlignment(FI, 4);
-            MipsFI->setVarArgsFrameIndex(FI);
-          }
-        }
-      } else {
-        // Last named formal argument is in register Mips::A3, and the first
-        // variable argument is on stack. Record the frame index of the first
-        // variable argument.
-        int FI = MFI->CreateFixedObject(4, 0, true);
-        MFI->setObjectAlignment(FI, 4);
-        MipsFI->recordStoreVarArgsFI(FI, -20);
-        MipsFI->setVarArgsFrameIndex(FI);
-      }
-    } else {
-      // Last named formal argument and all the variable arguments are passed
-      // on stack. Record the frame index of the first variable argument.
-      int FI = MFI->CreateFixedObject(4, 0, true);
-      MFI->setObjectAlignment(FI, 4);
-      MipsFI->recordStoreVarArgsFI(FI, -(4+LastStackArgEndOffset));
-      MipsFI->setVarArgsFrameIndex(FI);
+      unsigned Idx = NextStackOffset / 4;
+      unsigned Reg = AddLiveIn(DAG.getMachineFunction(), O32IntRegs[Idx], RC);
+      SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, MVT::i32);
+      LastFI = MFI->CreateFixedObject(4, NextStackOffset, true);
+      SDValue PtrOff = DAG.getFrameIndex(LastFI, getPointerTy());
+      OutChains.push_back(DAG.getStore(Chain, dl, ArgValue, PtrOff,
+                                       MachinePointerInfo(),
+                                       false, false, 0));
     }
   }
 
+  MipsFI->setLastInArgFI(LastFI);
+
   // All stores are grouped in one node to allow the matching between
   // the size of Ins and InVals. This only happens when on varg functions
   if (!OutChains.empty()) {
@@ -1569,8 +2216,8 @@ MipsTargetLowering::LowerReturn(SDValue Chain,
   SmallVector<CCValAssign, 16> RVLocs;
 
   // CCState - Info about the registers and stack slot.
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), RVLocs, *DAG.getContext());
 
   // Analize return values.
   CCInfo.AnalyzeReturn(Outs, RetCC_Mips);
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
index e4d0c3d..fbcedfd 100644
--- a/lib/Target/Mips/MipsISelLowering.h
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -40,6 +40,16 @@ namespace llvm {
       // Handle gp_rel (small data/bss sections) relocation.
       GPRel,
 
+      // General Dynamic TLS
+      TlsGd,
+
+      // Local Exec TLS
+      TprelHi,
+      TprelLo,
+
+      // Thread Pointer
+      ThreadPointer,
+
       // Floating Point Branch Conditional
       FPBrcond,
 
@@ -67,7 +77,9 @@ namespace llvm {
       DivRemU,
 
       BuildPairF64,
-      ExtractElementF64
+      ExtractElementF64,
+
+      WrapperPIC
     };
   }
 
@@ -89,9 +101,6 @@ namespace llvm {
     /// getSetCCResultType - get the ISD::SETCC result ValueType
     MVT::SimpleValueType getSetCCResultType(EVT VT) const;
 
-    /// getFunctionAlignment - Return the Log2 alignment of this function.
-    virtual unsigned getFunctionAlignment(const Function *F) const;
-
     virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   private:
     // Subtarget Info
@@ -109,13 +118,14 @@ namespace llvm {
     SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
 
     virtual SDValue
       LowerFormalArguments(SDValue Chain,
@@ -167,6 +177,16 @@ namespace llvm {
     /// specified FP immediate natively. If false, the legalizer will
     /// materialize the FP immediate as a load from a constant pool.
     virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
+
+    MachineBasicBlock *EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
+                    unsigned Size, unsigned BinOpcode, bool Nand = false) const;
+    MachineBasicBlock *EmitAtomicBinaryPartword(MachineInstr *MI,
+                    MachineBasicBlock *BB, unsigned Size, unsigned BinOpcode,
+                    bool Nand = false) const;
+    MachineBasicBlock *EmitAtomicCmpSwap(MachineInstr *MI,
+                                  MachineBasicBlock *BB, unsigned Size) const;
+    MachineBasicBlock *EmitAtomicCmpSwapPartword(MachineInstr *MI,
+                                  MachineBasicBlock *BB, unsigned Size) const;
   };
 }
 
diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td
index a86c5c7..021c167 100644
--- a/lib/Target/Mips/MipsInstrFPU.td
+++ b/lib/Target/Mips/MipsInstrFPU.td
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file contains the Mips implementation of the TargetInstrInfo class.
+// This file describes the Mips FPU instruction set.
 //
 //===----------------------------------------------------------------------===//
 
@@ -77,40 +77,42 @@ def IsNotMipsI       : Predicate<"!Subtarget.isMips1()">;
 multiclass FFR1_1<bits<6> funct, string asmstr>
 {
   def _S32 : FFR<0x11, funct, 0x0, (outs FGR32:$fd), (ins FGR32:$fs),
-      !strconcat(asmstr, ".s $fd, $fs"), []>;
+      !strconcat(asmstr, ".s\t$fd, $fs"), []>;
 
   def _D32  : FFR<0x11, funct, 0x1, (outs FGR32:$fd), (ins AFGR64:$fs),
-      !strconcat(asmstr, ".d $fd, $fs"), []>, Requires<[In32BitMode]>;
+      !strconcat(asmstr, ".d\t$fd, $fs"), []>, Requires<[In32BitMode]>;
 }
 
 multiclass FFR1_2<bits<6> funct, string asmstr, SDNode FOp>
 {
   def _S32 : FFR<0x11, funct, 0x0, (outs FGR32:$fd), (ins FGR32:$fs),
-                 !strconcat(asmstr, ".s $fd, $fs"),
+                 !strconcat(asmstr, ".s\t$fd, $fs"),
                  [(set FGR32:$fd, (FOp FGR32:$fs))]>;
 
   def _D32  : FFR<0x11, funct, 0x1, (outs AFGR64:$fd), (ins AFGR64:$fs),
-                 !strconcat(asmstr, ".d $fd, $fs"),
+                 !strconcat(asmstr, ".d\t$fd, $fs"),
                  [(set AFGR64:$fd, (FOp AFGR64:$fs))]>, Requires<[In32BitMode]>;
 }
 
 class FFR1_3<bits<6> funct, bits<5> fmt, RegisterClass RcSrc,
               RegisterClass RcDst, string asmstr>:
   FFR<0x11, funct, fmt, (outs RcSrc:$fd), (ins RcDst:$fs),
-      !strconcat(asmstr, " $fd, $fs"), []>;
+      !strconcat(asmstr, "\t$fd, $fs"), []>;
 
 
-multiclass FFR1_4<bits<6> funct, string asmstr, SDNode FOp> {
+multiclass FFR1_4<bits<6> funct, string asmstr, SDNode FOp, bit isComm = 0> {
+  let isCommutable = isComm in {
   def _S32 : FFR<0x11, funct, 0x0, (outs FGR32:$fd),
                  (ins FGR32:$fs, FGR32:$ft),
-                 !strconcat(asmstr, ".s $fd, $fs, $ft"),
+                 !strconcat(asmstr, ".s\t$fd, $fs, $ft"),
                  [(set FGR32:$fd, (FOp FGR32:$fs, FGR32:$ft))]>;
 
   def _D32 : FFR<0x11, funct, 0x1, (outs AFGR64:$fd),
                  (ins AFGR64:$fs, AFGR64:$ft),
-                 !strconcat(asmstr, ".d $fd, $fs, $ft"),
+                 !strconcat(asmstr, ".d\t$fd, $fs, $ft"),
                  [(set AFGR64:$fd, (FOp AFGR64:$fs, AFGR64:$ft))]>,
                  Requires<[In32BitMode]>;
+  }
 }
 
 //===----------------------------------------------------------------------===//
@@ -170,42 +172,42 @@ let ft = 0 in {
 let fd = 0 in {
   /// Move Control Registers From/To CPU Registers
   def CFC1  : FFR<0x11, 0x0, 0x2, (outs CPURegs:$rt), (ins CCR:$fs),
-                  "cfc1 $rt, $fs", []>;
+                  "cfc1\t$rt, $fs", []>;
 
   def CTC1  : FFR<0x11, 0x0, 0x6, (outs CCR:$rt), (ins CPURegs:$fs),
-                  "ctc1 $fs, $rt", []>;
+                  "ctc1\t$fs, $rt", []>;
 
   def MFC1  : FFR<0x11, 0x00, 0x00, (outs CPURegs:$rt), (ins FGR32:$fs),
-                  "mfc1 $rt, $fs", []>;
+                  "mfc1\t$rt, $fs", []>;
 
   def MTC1  : FFR<0x11, 0x00, 0x04, (outs FGR32:$fs), (ins CPURegs:$rt),
-                  "mtc1 $rt, $fs", []>;
+                  "mtc1\t$rt, $fs", []>;
 }
 
 def FMOV_S32 : FFR<0x11, 0b000110, 0x0, (outs FGR32:$fd), (ins FGR32:$fs),
-                   "mov.s $fd, $fs", []>;
+                   "mov.s\t$fd, $fs", []>;
 def FMOV_D32 : FFR<0x11, 0b000110, 0x1, (outs AFGR64:$fd), (ins AFGR64:$fs),
-                   "mov.d $fd, $fs", []>;
+                   "mov.d\t$fd, $fs", []>;
 
 /// Floating Point Memory Instructions
 let Predicates = [IsNotSingleFloat, IsNotMipsI] in {
   def LDC1 : FFI<0b110101, (outs AFGR64:$ft), (ins mem:$addr),
-                 "ldc1 $ft, $addr", [(set AFGR64:$ft, (load addr:$addr))]>;
+                 "ldc1\t$ft, $addr", [(set AFGR64:$ft, (load addr:$addr))]>;
 
   def SDC1 : FFI<0b111101, (outs), (ins AFGR64:$ft, mem:$addr),
-                 "sdc1 $ft, $addr", [(store AFGR64:$ft, addr:$addr)]>;
+                 "sdc1\t$ft, $addr", [(store AFGR64:$ft, addr:$addr)]>;
 }
 
 // LWC1 and SWC1 can always be emitted with odd registers.
-def LWC1  : FFI<0b110001, (outs FGR32:$ft), (ins mem:$addr), "lwc1 $ft, $addr",
+def LWC1  : FFI<0b110001, (outs FGR32:$ft), (ins mem:$addr), "lwc1\t$ft, $addr",
                [(set FGR32:$ft, (load addr:$addr))]>;
-def SWC1  : FFI<0b111001, (outs), (ins FGR32:$ft, mem:$addr), "swc1 $ft, $addr",
-               [(store FGR32:$ft, addr:$addr)]>;
+def SWC1  : FFI<0b111001, (outs), (ins FGR32:$ft, mem:$addr),
+               "swc1\t$ft, $addr", [(store FGR32:$ft, addr:$addr)]>;
 
 /// Floating-point Aritmetic
-defm FADD : FFR1_4<0x10, "add", fadd>;
+defm FADD : FFR1_4<0x10, "add", fadd, 1>;
 defm FDIV : FFR1_4<0x03, "div", fdiv>;
-defm FMUL : FFR1_4<0x02, "mul", fmul>;
+defm FMUL : FFR1_4<0x02, "mul", fmul, 1>;
 defm FSUB : FFR1_4<0x01, "sub", fsub>;
 
 //===----------------------------------------------------------------------===//
@@ -221,7 +223,7 @@ def MIPS_BRANCH_TL : PatLeaf<(i32 3)>;
 /// Floating Point Branch of False/True (Likely)
 let isBranch=1, isTerminator=1, hasDelaySlot=1, base=0x8, Uses=[FCR31] in
   class FBRANCH<PatLeaf op, string asmstr> : FFI<0x11, (outs),
-        (ins brtarget:$dst), !strconcat(asmstr, " $dst"),
+        (ins brtarget:$dst), !strconcat(asmstr, "\t$dst"),
         [(MipsFPBrcond op, bb:$dst)]>;
 
 def BC1F  : FBRANCH<MIPS_BRANCH_F,  "bc1f">;
@@ -254,11 +256,11 @@ def MIPS_FCOND_NGT  : PatLeaf<(i32 15)>;
 /// Floating Point Compare
 let hasDelaySlot = 1, Defs=[FCR31] in {
   def FCMP_S32 : FCC<0x0, (outs), (ins FGR32:$fs, FGR32:$ft, condcode:$cc),
-                     "c.$cc.s $fs, $ft",
+                     "c.$cc.s\t$fs, $ft",
                      [(MipsFPCmp FGR32:$fs, FGR32:$ft, imm:$cc)]>;
 
   def FCMP_D32 : FCC<0x1, (outs), (ins AFGR64:$fs, AFGR64:$ft, condcode:$cc),
-                     "c.$cc.d $fs, $ft",
+                     "c.$cc.d\t$fs, $ft",
                      [(MipsFPCmp AFGR64:$fs, AFGR64:$ft, imm:$cc)]>,
                      Requires<[In32BitMode]>;
 }
@@ -357,6 +359,7 @@ def : Pat<(f32 (sint_to_fp CPURegs:$src)), (CVTS_W32 (MTC1 CPURegs:$src))>;
 def : Pat<(f64 (sint_to_fp CPURegs:$src)), (CVTD_W32 (MTC1 CPURegs:$src))>;
 
 def : Pat<(i32 (fp_to_sint FGR32:$src)), (MFC1 (TRUNC_W_S32 FGR32:$src))>;
+def : Pat<(i32 (fp_to_sint AFGR64:$src)), (MFC1 (TRUNC_W_D32 AFGR64:$src))>;
 
 def : Pat<(i32 (bitconvert FGR32:$src)),  (MFC1 FGR32:$src)>;
 def : Pat<(f32 (bitconvert CPURegs:$src)), (MTC1 CPURegs:$src)>;
diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td
index 9dfcdfb..9f55fb3 100644
--- a/lib/Target/Mips/MipsInstrFormats.td
+++ b/lib/Target/Mips/MipsInstrFormats.td
@@ -1,4 +1,4 @@
-//===- MipsRegisterInfo.td - Mips Register defs ------------*- tablegen -*-===//
+//===- MipsInstrFormats.td - Mips Instruction Formats ------*- tablegen -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h
index 5fdbf1f..abf6773 100644
--- a/lib/Target/Mips/MipsInstrInfo.h
+++ b/lib/Target/Mips/MipsInstrInfo.h
@@ -146,7 +146,21 @@ namespace MipsII {
     /// MO_ABS_HI/LO - Represents the hi or low part of an absolute symbol
     /// address.
     MO_ABS_HI,
-    MO_ABS_LO
+    MO_ABS_LO,
+
+    /// MO_TLSGD - Represents the offset into the global offset table at which
+    // the module ID and TSL block offset reside during execution (General
+    // Dynamic TLS).
+    MO_TLSGD,
+
+    /// MO_GOTTPREL - Represents the offset from the thread pointer (Initial
+    // Exec TLS).
+    MO_GOTTPREL,
+
+    /// MO_TPREL_HI/LO - Represents the hi and low part of the offset from
+    // the thread pointer (Local Exec TLS).
+    MO_TPREL_HI,
+    MO_TPREL_LO
   };
 }
 
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index 19b9c35..329a002 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -1,4 +1,4 @@
-//===- MipsInstrInfo.td - Mips Register defs ---------------*- tablegen -*-===//
+//===- MipsInstrInfo.td - Target Description for Mips Target -*- tablegen -*-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -6,6 +6,10 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
+//
+// This file contains the Mips implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
 
 //===----------------------------------------------------------------------===//
 // Instruction format superclass
@@ -33,6 +37,8 @@ def SDT_MipsDivRem       : SDTypeProfile<0, 2,
                                          [SDTCisVT<0, i32>,
                                           SDTCisSameAs<0, 1>]>;
 
+def SDT_MipsThreadPointer : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>;
+
 // Call
 def MipsJmpLink : SDNode<"MipsISD::JmpLink",SDT_MipsJmpLink,
                          [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
@@ -45,6 +51,16 @@ def MipsHi    : SDNode<"MipsISD::Hi", SDTIntUnaryOp>;
 def MipsLo    : SDNode<"MipsISD::Lo", SDTIntUnaryOp>;
 def MipsGPRel : SDNode<"MipsISD::GPRel", SDTIntUnaryOp>;
 
+// TlsGd node is used to handle General Dynamic TLS
+def MipsTlsGd : SDNode<"MipsISD::TlsGd", SDTIntUnaryOp>;
+
+// TprelHi and TprelLo nodes are used to handle Local Exec TLS
+def MipsTprelHi    : SDNode<"MipsISD::TprelHi", SDTIntUnaryOp>;
+def MipsTprelLo    : SDNode<"MipsISD::TprelLo", SDTIntUnaryOp>;
+
+// Thread pointer
+def MipsThreadPointer: SDNode<"MipsISD::ThreadPointer", SDT_MipsThreadPointer>;
+
 // Return
 def MipsRet : SDNode<"MipsISD::Ret", SDT_MipsRet, [SDNPHasChain,
                      SDNPOptInGlue]>;
@@ -71,6 +87,18 @@ def MipsDivRem    : SDNode<"MipsISD::DivRem", SDT_MipsDivRem,
 def MipsDivRemU   : SDNode<"MipsISD::DivRemU", SDT_MipsDivRem,
                            [SDNPOutGlue]>;
 
+// Target constant nodes that are not part of any isel patterns and remain
+// unchanged can cause instructions with illegal operands to be emitted.
+// Wrapper node patterns give the instruction selector a chance to replace
+// target constant nodes that would otherwise remain unchanged with ADDiu
+// nodes. Without these wrapper node patterns, the following conditional move
+// instrucion is emitted when function cmov2 in test/CodeGen/Mips/cmov.ll is
+// compiled: 
+//  movn  %got(d)($gp), %got(c)($gp), $4
+// This instruction is illegal since movn can take only register operands.
+
+def MipsWrapperPIC    : SDNode<"MipsISD::WrapperPIC",  SDTIntUnaryOp>;
+
 //===----------------------------------------------------------------------===//
 // Mips Instruction Predicate Definitions.
 //===----------------------------------------------------------------------===//
@@ -141,17 +169,20 @@ def addr : ComplexPattern<iPTR, 2, "SelectAddr", [frameindex], []>;
 //===----------------------------------------------------------------------===//
 
 // Arithmetic 3 register operands
-let isCommutable = 1 in
 class ArithR<bits<6> op, bits<6> func, string instr_asm, SDNode OpNode,
-             InstrItinClass itin>:
+             InstrItinClass itin, bit isComm = 0>:
   FR<op, func, (outs CPURegs:$dst), (ins CPURegs:$b, CPURegs:$c),
      !strconcat(instr_asm, "\t$dst, $b, $c"),
-     [(set CPURegs:$dst, (OpNode CPURegs:$b, CPURegs:$c))], itin>;
+     [(set CPURegs:$dst, (OpNode CPURegs:$b, CPURegs:$c))], itin> {
+  let isCommutable = isComm;
+}
 
-let isCommutable = 1 in
-class ArithOverflowR<bits<6> op, bits<6> func, string instr_asm>:
+class ArithOverflowR<bits<6> op, bits<6> func, string instr_asm,
+                     bit isComm = 0>:
   FR<op, func, (outs CPURegs:$dst), (ins CPURegs:$b, CPURegs:$c),
-     !strconcat(instr_asm, "\t$dst, $b, $c"), [], IIAlu>;
+     !strconcat(instr_asm, "\t$dst, $b, $c"), [], IIAlu> {
+  let isCommutable = isComm;
+}
 
 // Arithmetic 2 register operands
 class ArithI<bits<6> op, string instr_asm, SDNode OpNode,
@@ -167,12 +198,15 @@ class ArithOverflowI<bits<6> op, string instr_asm, SDNode OpNode,
 
 // Arithmetic Multiply ADD/SUB
 let rd = 0, shamt = 0, Defs = [HI, LO], Uses = [HI, LO] in
-class MArithR<bits<6> func, string instr_asm, SDNode op> :
+class MArithR<bits<6> func, string instr_asm, SDNode op, bit isComm = 0> :
   FR<0x1c, func, (outs), (ins CPURegs:$rs, CPURegs:$rt),
      !strconcat(instr_asm, "\t$rs, $rt"),
-     [(op CPURegs:$rs, CPURegs:$rt, LO, HI)], IIImul>;
+     [(op CPURegs:$rs, CPURegs:$rt, LO, HI)], IIImul> {
+  let isCommutable = isComm;
+}
 
 //  Logical
+let isCommutable = 1 in
 class LogicR<bits<6> func, string instr_asm, SDNode OpNode>:
   FR<0x00, func, (outs CPURegs:$dst), (ins CPURegs:$b, CPURegs:$c),
      !strconcat(instr_asm, "\t$dst, $b, $c"),
@@ -183,6 +217,7 @@ class LogicI<bits<6> op, string instr_asm, SDNode OpNode>:
      !strconcat(instr_asm, "\t$dst, $b, $c"),
      [(set CPURegs:$dst, (OpNode CPURegs:$b, immZExt16:$c))], IIAlu>;
 
+let isCommutable = 1 in
 class LogicNOR<bits<6> op, bits<6> func, string instr_asm>:
   FR<op, func, (outs CPURegs:$dst), (ins CPURegs:$b, CPURegs:$c),
      !strconcat(instr_asm, "\t$dst, $b, $c"),
@@ -288,6 +323,7 @@ let isCall=1, hasDelaySlot=1,
 
 // Mul, Div
 let Defs = [HI, LO] in {
+  let isCommutable = 1 in
   class Mul<bits<6> func, string instr_asm, InstrItinClass itin>:
     FR<0x00, func, (outs), (ins CPURegs:$a, CPURegs:$b),
        !strconcat(instr_asm, "\t$a, $b"), [], itin>;
@@ -338,6 +374,13 @@ class CondMov<bits<6> func, string instr_asm, PatLeaf MovCode>:
      CPURegs:$cond), !strconcat(instr_asm, "\t$dst, $T, $cond"),
      [], NoItinerary>;
 
+// Read Hardware
+class ReadHardware: FR<0x1f, 0x3b, (outs CPURegs:$dst), (ins HWRegs:$src),
+    "rdhwr\t$dst, $src", [], IIAlu> {
+  let rs = 0;
+  let shamt = 0;
+}
+
 //===----------------------------------------------------------------------===//
 // Pseudo instructions
 //===----------------------------------------------------------------------===//
@@ -369,7 +412,116 @@ def ATMACRO   : MipsPseudo<(outs), (ins), ".set\tat", []>;
 // are used, we have the same behavior, but get also a bunch of warnings
 // from the assembler.
 def CPLOAD : MipsPseudo<(outs), (ins CPURegs:$picreg), ".cpload\t$picreg", []>;
-def CPRESTORE : MipsPseudo<(outs), (ins uimm16:$loc), ".cprestore\t$loc\n", []>;
+def CPRESTORE : MipsPseudo<(outs), (ins i32imm:$loc), ".cprestore\t$loc\n", []>;
+
+let usesCustomInserter = 1 in {
+  def ATOMIC_LOAD_ADD_I8 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_add_8\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_add_8 CPURegs:$ptr, CPURegs:$incr))]>;
+  def ATOMIC_LOAD_ADD_I16 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_add_16\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_add_16 CPURegs:$ptr, CPURegs:$incr))]>;
+  def ATOMIC_LOAD_ADD_I32 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_add_32\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_add_32 CPURegs:$ptr, CPURegs:$incr))]>;
+
+  def ATOMIC_LOAD_SUB_I8 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_sub_8\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_sub_8 CPURegs:$ptr, CPURegs:$incr))]>;
+  def ATOMIC_LOAD_SUB_I16 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_sub_16\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_sub_16 CPURegs:$ptr, CPURegs:$incr))]>;
+  def ATOMIC_LOAD_SUB_I32 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_sub_32\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_sub_32 CPURegs:$ptr, CPURegs:$incr))]>;
+
+  def ATOMIC_LOAD_AND_I8 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_and_8\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_and_8 CPURegs:$ptr, CPURegs:$incr))]>;
+  def ATOMIC_LOAD_AND_I16 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_and_16\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_and_16 CPURegs:$ptr, CPURegs:$incr))]>;
+  def ATOMIC_LOAD_AND_I32 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_and_32\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_and_32 CPURegs:$ptr, CPURegs:$incr))]>;
+
+  def ATOMIC_LOAD_OR_I8 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_or_8\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_or_8 CPURegs:$ptr, CPURegs:$incr))]>;
+  def ATOMIC_LOAD_OR_I16 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_or_16\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_or_16 CPURegs:$ptr, CPURegs:$incr))]>;
+  def ATOMIC_LOAD_OR_I32 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_or_32\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_or_32 CPURegs:$ptr, CPURegs:$incr))]>;
+
+  def ATOMIC_LOAD_XOR_I8 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_xor_8\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_xor_8 CPURegs:$ptr, CPURegs:$incr))]>;
+  def ATOMIC_LOAD_XOR_I16 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_xor_16\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_xor_16 CPURegs:$ptr, CPURegs:$incr))]>;
+  def ATOMIC_LOAD_XOR_I32 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_xor_32\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_xor_32 CPURegs:$ptr, CPURegs:$incr))]>;
+
+  def ATOMIC_LOAD_NAND_I8 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_nand_8\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_nand_8 CPURegs:$ptr, CPURegs:$incr))]>;
+  def ATOMIC_LOAD_NAND_I16 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_nand_16\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_nand_16 CPURegs:$ptr, CPURegs:$incr))]>;
+  def ATOMIC_LOAD_NAND_I32 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+    "atomic_load_nand_32\t$dst, $ptr, $incr",
+    [(set CPURegs:$dst, (atomic_load_nand_32 CPURegs:$ptr, CPURegs:$incr))]>;
+
+  def ATOMIC_SWAP_I8 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$val),
+    "atomic_swap_8\t$dst, $ptr, $val",
+    [(set CPURegs:$dst, (atomic_swap_8 CPURegs:$ptr, CPURegs:$val))]>;
+  def ATOMIC_SWAP_I16 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$val),
+    "atomic_swap_16\t$dst, $ptr, $val",
+    [(set CPURegs:$dst, (atomic_swap_16 CPURegs:$ptr, CPURegs:$val))]>;
+  def ATOMIC_SWAP_I32 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$val),
+    "atomic_swap_32\t$dst, $ptr, $val",
+    [(set CPURegs:$dst, (atomic_swap_32 CPURegs:$ptr, CPURegs:$val))]>;
+
+  def ATOMIC_CMP_SWAP_I8 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$oldval, CPURegs:$newval),
+    "atomic_cmp_swap_8\t$dst, $ptr, $oldval, $newval",
+    [(set CPURegs:$dst,
+         (atomic_cmp_swap_8 CPURegs:$ptr, CPURegs:$oldval, CPURegs:$newval))]>;
+  def ATOMIC_CMP_SWAP_I16 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$oldval, CPURegs:$newval),
+    "atomic_cmp_swap_16\t$dst, $ptr, $oldval, $newval",
+    [(set CPURegs:$dst,
+         (atomic_cmp_swap_16 CPURegs:$ptr, CPURegs:$oldval, CPURegs:$newval))]>;
+  def ATOMIC_CMP_SWAP_I32 : MipsPseudo<
+    (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$oldval, CPURegs:$newval),
+    "atomic_cmp_swap_32\t$dst, $ptr, $oldval, $newval",
+    [(set CPURegs:$dst,
+         (atomic_cmp_swap_32 CPURegs:$ptr, CPURegs:$oldval, CPURegs:$newval))]>;
+}
 
 //===----------------------------------------------------------------------===//
 // Instruction definition
@@ -390,9 +542,9 @@ def XORi    : LogicI<0x0e, "xori",  xor>;
 def LUi     : LoadUpper<0x0f, "lui">;
 
 /// Arithmetic Instructions (3-Operand, R-Type)
-def ADDu    : ArithR<0x00, 0x21, "addu", add, IIAlu>;
+def ADDu    : ArithR<0x00, 0x21, "addu", add, IIAlu, 1>;
 def SUBu    : ArithR<0x00, 0x23, "subu", sub, IIAlu>;
-def ADD     : ArithOverflowR<0x00, 0x20, "add">;
+def ADD     : ArithOverflowR<0x00, 0x20, "add", 1>;
 def SUB     : ArithOverflowR<0x00, 0x22, "sub">;
 def SLT     : SetCC_R<0x00, 0x2a, "slt", setlt>;
 def SLTu    : SetCC_R<0x00, 0x2b, "sltu", setult>;
@@ -425,6 +577,14 @@ def SB      : StoreM<0x28, "sb", truncstorei8>;
 def SH      : StoreM<0x29, "sh", truncstorei16>;
 def SW      : StoreM<0x2b, "sw", store>;
 
+/// Load-linked, Store-conditional
+let hasDelaySlot = 1 in
+  def LL    : FI<0x30, (outs CPURegs:$dst), (ins mem:$addr),
+              "ll\t$dst, $addr", [], IILoad>;
+let Constraints = "$src = $dst" in
+  def SC    : FI<0x38, (outs CPURegs:$dst), (ins CPURegs:$src, mem:$addr),
+              "sc\t$src, $addr", [], IIStore>;
+
 /// Jump and Branch Instructions
 def J       : JumpFJ<0x02, "j">;
 def JR      : JumpFR<0x00, 0x08, "jr">;
@@ -516,14 +676,16 @@ let addr=0 in
 def LEA_ADDiu : EffectiveAddress<"addiu\t$dst, ${addr:stackloc}">;
 
 // MADD*/MSUB*
-def MADD  : MArithR<0, "madd", MipsMAdd>;
-def MADDU : MArithR<1, "maddu", MipsMAddu>;
+def MADD  : MArithR<0, "madd", MipsMAdd, 1>;
+def MADDU : MArithR<1, "maddu", MipsMAddu, 1>;
 def MSUB  : MArithR<4, "msub", MipsMSub>;
 def MSUBU : MArithR<5, "msubu", MipsMSubu>;
 
 // MUL is a assembly macro in the current used ISAs. In recent ISA's
 // it is a real instruction.
-def MUL   : ArithR<0x1c, 0x02, "mul", mul, IIImul>, Requires<[IsMips32]>;
+def MUL   : ArithR<0x1c, 0x02, "mul", mul, IIImul, 1>, Requires<[IsMips32]>;
+
+def RDHWR : ReadHardware;
 
 //===----------------------------------------------------------------------===//
 //  Arbitrary patterns that map to one or more instructions
@@ -577,6 +739,26 @@ def : Pat<(add CPURegs:$gp, (MipsGPRel tglobaladdr:$in)),
 def : Pat<(add CPURegs:$gp, (MipsGPRel tconstpool:$in)),
           (ADDiu CPURegs:$gp, tconstpool:$in)>;
 
+// tlsgd
+def : Pat<(add CPURegs:$gp, (MipsTlsGd tglobaltlsaddr:$in)),
+          (ADDiu CPURegs:$gp, tglobaltlsaddr:$in)>;
+
+// tprel hi/lo
+def : Pat<(MipsTprelHi tglobaltlsaddr:$in), (LUi tglobaltlsaddr:$in)>;
+def : Pat<(add CPURegs:$hi, (MipsTprelLo tglobaltlsaddr:$lo)),
+          (ADDiu CPURegs:$hi, tglobaltlsaddr:$lo)>;
+
+// wrapper_pic
+class WrapperPICPat<SDNode node>:
+      Pat<(MipsWrapperPIC node:$in),
+          (ADDiu GP, node:$in)>;
+
+def : WrapperPICPat<tglobaladdr>;
+def : WrapperPICPat<tconstpool>;
+def : WrapperPICPat<texternalsym>;
+def : WrapperPICPat<tblockaddress>;
+def : WrapperPICPat<tjumptable>;
+
 // Mips does not have "not", so we expand our way
 def : Pat<(not CPURegs:$in),
           (NOR CPURegs:$in, ZERO)>;
@@ -644,13 +826,6 @@ multiclass MovnPats<RegisterClass RC, Instruction MOVNInst> {
 defm : MovzPats<CPURegs, MOVZ_I>;
 defm : MovnPats<CPURegs, MOVN_I>;
 
-// select patterns with got access
-let AddedComplexity = 10 in
-  def : Pat<(select (setne CPURegs:$lhs, CPURegs:$rhs),
-                    (i32 tglobaladdr:$T), CPURegs:$F),
-            (MOVN_I CPURegs:$F, (ADDiu GP, tglobaladdr:$T),
-                    (XOR CPURegs:$lhs, CPURegs:$rhs))>;
-
 // setcc patterns
 def : Pat<(seteq CPURegs:$lhs, CPURegs:$rhs),
           (SLTu (XOR CPURegs:$lhs, CPURegs:$rhs), 1)>;
diff --git a/lib/Target/Mips/MipsMCAsmInfo.cpp b/lib/Target/Mips/MipsMCAsmInfo.cpp
index fe48ab7..c86bf405 100644
--- a/lib/Target/Mips/MipsMCAsmInfo.cpp
+++ b/lib/Target/Mips/MipsMCAsmInfo.cpp
@@ -17,11 +17,15 @@ using namespace llvm;
 MipsMCAsmInfo::MipsMCAsmInfo(const Target &T, StringRef TT) {
   AlignmentIsInBytes          = false;
   Data16bitsDirective         = "\t.half\t";
-  Data32bitsDirective         = "\t.word\t";
+  Data32bitsDirective         = "\t.4byte\t";
   Data64bitsDirective         = 0;
   PrivateGlobalPrefix         = "$";
   CommentString               = "#";
   ZeroDirective               = "\t.space\t";
   GPRel32Directive            = "\t.gpword\t";
-  HasSetDirective             = false;
+  WeakRefDirective            = "\t.weak\t";
+
+  SupportsDebugInformation = true;
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+  HasLEB128 = true;
 }
diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h
index 1e8e4fe..df40e6c 100644
--- a/lib/Target/Mips/MipsMachineFunction.h
+++ b/lib/Target/Mips/MipsMachineFunction.h
@@ -14,6 +14,7 @@
 #ifndef MIPS_MACHINE_FUNCTION_INFO_H
 #define MIPS_MACHINE_FUNCTION_INFO_H
 
+#include <utility>
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/VectorExtras.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -26,50 +27,6 @@ namespace llvm {
 class MipsFunctionInfo : public MachineFunctionInfo {
 
 private:
-  /// Holds for each function where on the stack the Frame Pointer must be
-  /// saved. This is used on Prologue and Epilogue to emit FP save/restore
-  int FPStackOffset;
-
-  /// Holds for each function where on the stack the Return Address must be
-  /// saved. This is used on Prologue and Epilogue to emit RA save/restore
-  int RAStackOffset;
-
-  /// At each function entry, two special bitmask directives must be emitted
-  /// to help debugging, for CPU and FPU callee saved registers. Both need
-  /// the negative offset from the final stack size and its higher registers
-  /// location on the stack.
-  int CPUTopSavedRegOff;
-  int FPUTopSavedRegOff;
-
-  /// MipsFIHolder - Holds a FrameIndex and it's Stack Pointer Offset
-  struct MipsFIHolder {
-
-    int FI;
-    int SPOffset;
-
-    MipsFIHolder(int FrameIndex, int StackPointerOffset)
-      : FI(FrameIndex), SPOffset(StackPointerOffset) {}
-  };
-
-  /// When PIC is used the GP must be saved on the stack on the function
-  /// prologue and must be reloaded from this stack location after every
-  /// call. A reference to its stack location and frame index must be kept
-  /// to be used on emitPrologue and processFunctionBeforeFrameFinalized.
-  MipsFIHolder GPHolder;
-
-  /// On LowerFormalArguments the stack size is unknown, so the Stack
-  /// Pointer Offset calculation of "not in register arguments" must be
-  /// postponed to emitPrologue.
-  SmallVector<MipsFIHolder, 16> FnLoadArgs;
-  bool HasLoadArgs;
-
-  // When VarArgs, we must write registers back to caller stack, preserving
-  // on register arguments. Since the stack size is unknown on
-  // LowerFormalArguments, the Stack Pointer Offset calculation must be
-  // postponed to emitPrologue.
-  SmallVector<MipsFIHolder, 4> FnStoreVarArgs;
-  bool HasStoreVarArgs;
-
   /// SRetReturnReg - Some subtargets require that sret lowering includes
   /// returning the value of the returned struct in a register. This field
   /// holds the virtual register into which the sret argument is passed.
@@ -83,55 +40,47 @@ private:
   /// VarArgsFrameIndex - FrameIndex for start of varargs area.
   int VarArgsFrameIndex;
 
+  // Range of frame object indices.
+  // InArgFIRange: Range of indices of all frame objects created during call to
+  //               LowerFormalArguments.
+  // OutArgFIRange: Range of indices of all frame objects created during call to
+  //                LowerCall except for the frame object for restoring $gp. 
+  std::pair<int, int> InArgFIRange, OutArgFIRange;
+  int GPFI; // Index of the frame object for restoring $gp 
+  unsigned MaxCallFrameSize;
+
+  /// AtomicFrameIndex - To implement atomic.swap and atomic.cmp.swap
+  /// intrinsics, it is necessary to use a temporary stack location.
+  /// This field holds the frame index of this location.
+  int AtomicFrameIndex;
 public:
   MipsFunctionInfo(MachineFunction& MF)
-  : FPStackOffset(0), RAStackOffset(0), CPUTopSavedRegOff(0),
-    FPUTopSavedRegOff(0), GPHolder(-1,-1), HasLoadArgs(false),
-    HasStoreVarArgs(false), SRetReturnReg(0), GlobalBaseReg(0),
-    VarArgsFrameIndex(0)
+  : SRetReturnReg(0), GlobalBaseReg(0),
+    VarArgsFrameIndex(0), InArgFIRange(std::make_pair(-1, 0)),
+    OutArgFIRange(std::make_pair(-1, 0)), GPFI(0), MaxCallFrameSize(0),
+    AtomicFrameIndex(-1)
   {}
 
-  int getFPStackOffset() const { return FPStackOffset; }
-  void setFPStackOffset(int Off) { FPStackOffset = Off; }
-
-  int getRAStackOffset() const { return RAStackOffset; }
-  void setRAStackOffset(int Off) { RAStackOffset = Off; }
-
-  int getCPUTopSavedRegOff() const { return CPUTopSavedRegOff; }
-  void setCPUTopSavedRegOff(int Off) { CPUTopSavedRegOff = Off; }
-
-  int getFPUTopSavedRegOff() const { return FPUTopSavedRegOff; }
-  void setFPUTopSavedRegOff(int Off) { FPUTopSavedRegOff = Off; }
-
-  int getGPStackOffset() const { return GPHolder.SPOffset; }
-  int getGPFI() const { return GPHolder.FI; }
-  void setGPStackOffset(int Off) { GPHolder.SPOffset = Off; }
-  void setGPFI(int FI) { GPHolder.FI = FI; }
-  bool needGPSaveRestore() const { return GPHolder.SPOffset != -1; }
-
-  bool hasLoadArgs() const { return HasLoadArgs; }
-  bool hasStoreVarArgs() const { return HasStoreVarArgs; }
-
-  void recordLoadArgsFI(int FI, int SPOffset) {
-    if (!HasLoadArgs) HasLoadArgs=true;
-    FnLoadArgs.push_back(MipsFIHolder(FI, SPOffset));
-  }
-  void recordStoreVarArgsFI(int FI, int SPOffset) {
-    if (!HasStoreVarArgs) HasStoreVarArgs=true;
-    FnStoreVarArgs.push_back(MipsFIHolder(FI, SPOffset));
+  bool isInArgFI(int FI) const {
+    return FI <= InArgFIRange.first && FI >= InArgFIRange.second;
   }
+  void setLastInArgFI(int FI) { InArgFIRange.second = FI; }
 
-  void adjustLoadArgsFI(MachineFrameInfo *MFI) const {
-    if (!hasLoadArgs()) return;
-    for (unsigned i = 0, e = FnLoadArgs.size(); i != e; ++i)
-      MFI->setObjectOffset( FnLoadArgs[i].FI, FnLoadArgs[i].SPOffset );
+  bool isOutArgFI(int FI) const { 
+    return FI <= OutArgFIRange.first && FI >= OutArgFIRange.second;
   }
-  void adjustStoreVarArgsFI(MachineFrameInfo *MFI) const {
-    if (!hasStoreVarArgs()) return;
-    for (unsigned i = 0, e = FnStoreVarArgs.size(); i != e; ++i)
-      MFI->setObjectOffset( FnStoreVarArgs[i].FI, FnStoreVarArgs[i].SPOffset );
+  void extendOutArgFIRange(int FirstFI, int LastFI) {
+    if (!OutArgFIRange.second)
+      // this must be the first time this function was called.
+      OutArgFIRange.first = FirstFI;
+    OutArgFIRange.second = LastFI;
   }
 
+  int getGPFI() const { return GPFI; }
+  void setGPFI(int FI) { GPFI = FI; }
+  bool needGPSaveRestore() const { return getGPFI(); }
+  bool isGPFI(int FI) const { return GPFI && GPFI == FI; }
+
   unsigned getSRetReturnReg() const { return SRetReturnReg; }
   void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
 
@@ -140,6 +89,12 @@ public:
 
   int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
   void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
+
+  unsigned getMaxCallFrameSize() const { return MaxCallFrameSize; }
+  void setMaxCallFrameSize(unsigned S) { MaxCallFrameSize = S; }
+
+  int getAtomicFrameIndex() const { return AtomicFrameIndex; }
+  void setAtomicFrameIndex(int Index) { AtomicFrameIndex = Index; }
 };
 
 } // end of namespace llvm
diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp
index c09b129..b0984af 100644
--- a/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -65,16 +65,16 @@ getRegisterNumbering(unsigned RegEnum)
     case Mips::T5   : case Mips::F13: return 13;
     case Mips::T6   : case Mips::F14: case Mips::D7: return 14;
     case Mips::T7   : case Mips::F15: return 15;
-    case Mips::T8   : case Mips::F16: case Mips::D8: return 16;
-    case Mips::T9   : case Mips::F17: return 17;
-    case Mips::S0   : case Mips::F18: case Mips::D9: return 18;
-    case Mips::S1   : case Mips::F19: return 19;
-    case Mips::S2   : case Mips::F20: case Mips::D10: return 20;
-    case Mips::S3   : case Mips::F21: return 21;
-    case Mips::S4   : case Mips::F22: case Mips::D11: return 22;
-    case Mips::S5   : case Mips::F23: return 23;
-    case Mips::S6   : case Mips::F24: case Mips::D12: return 24;
-    case Mips::S7   : case Mips::F25: return 25;
+    case Mips::S0   : case Mips::F16: case Mips::D8: return 16;
+    case Mips::S1   : case Mips::F17: return 17;
+    case Mips::S2   : case Mips::F18: case Mips::D9: return 18;
+    case Mips::S3   : case Mips::F19: return 19;
+    case Mips::S4   : case Mips::F20: case Mips::D10: return 20;
+    case Mips::S5   : case Mips::F21: return 21;
+    case Mips::S6   : case Mips::F22: case Mips::D11: return 22;
+    case Mips::S7   : case Mips::F23: return 23;
+    case Mips::T8   : case Mips::F24: case Mips::D12: return 24;
+    case Mips::T9   : case Mips::F25: return 25;
     case Mips::K0   : case Mips::F26: case Mips::D13: return 26;
     case Mips::K1   : case Mips::F27: return 27;
     case Mips::GP   : case Mips::F28: case Mips::D14: return 28;
@@ -98,22 +98,22 @@ getCalleeSavedRegs(const MachineFunction *MF) const
 {
   // Mips callee-save register range is $16-$23, $f20-$f30
   static const unsigned SingleFloatOnlyCalleeSavedRegs[] = {
-    Mips::S0, Mips::S1, Mips::S2, Mips::S3,
-    Mips::S4, Mips::S5, Mips::S6, Mips::S7,
-    Mips::F20, Mips::F21, Mips::F22, Mips::F23, Mips::F24, Mips::F25,
-    Mips::F26, Mips::F27, Mips::F28, Mips::F29, Mips::F30, 0
+    Mips::F30, Mips::F29, Mips::F28, Mips::F27, Mips::F26,
+    Mips::F25, Mips::F24, Mips::F23, Mips::F22, Mips::F21, Mips::F20,
+    Mips::RA, Mips::FP, Mips::S7, Mips::S6, Mips::S5, Mips::S4,
+    Mips::S3, Mips::S2, Mips::S1, Mips::S0, 0
   };
 
-  static const unsigned BitMode32CalleeSavedRegs[] = {
-    Mips::S0, Mips::S1, Mips::S2, Mips::S3,
-    Mips::S4, Mips::S5, Mips::S6, Mips::S7,
-    Mips::F20, Mips::F22, Mips::F24, Mips::F26, Mips::F28, Mips::F30, 0
+  static const unsigned Mips32CalleeSavedRegs[] = {
+    Mips::D15, Mips::D14, Mips::D13, Mips::D12, Mips::D11, Mips::D10,
+    Mips::RA, Mips::FP, Mips::S7, Mips::S6, Mips::S5, Mips::S4,
+    Mips::S3, Mips::S2, Mips::S1, Mips::S0, 0
   };
 
   if (Subtarget.isSingleFloat())
     return SingleFloatOnlyCalleeSavedRegs;
   else
-    return BitMode32CalleeSavedRegs;
+    return Mips32CalleeSavedRegs;
 }
 
 BitVector MipsRegisterInfo::
@@ -127,9 +127,11 @@ getReservedRegs(const MachineFunction &MF) const {
   Reserved.set(Mips::SP);
   Reserved.set(Mips::FP);
   Reserved.set(Mips::RA);
+  Reserved.set(Mips::F31);
+  Reserved.set(Mips::D15);
 
   // SRV4 requires that odd register can't be used.
-  if (!Subtarget.isSingleFloat())
+  if (!Subtarget.isSingleFloat() && !Subtarget.isMips32())
     for (unsigned FReg=(Mips::F0)+1; FReg < Mips::F30; FReg+=2)
       Reserved.set(FReg);
 
@@ -153,6 +155,8 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
                     RegScavenger *RS) const {
   MachineInstr &MI = *II;
   MachineFunction &MF = *MI.getParent()->getParent();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
 
   unsigned i = 0;
   while (!MI.getOperand(i).isFI()) {
@@ -172,9 +176,19 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
                << "spOffset   : " << spOffset << "\n"
                << "stackSize  : " << stackSize << "\n");
 
-  // as explained on LowerFormalArguments, detect negative offsets
-  // and adjust SPOffsets considering the final stack size.
-  int Offset = ((spOffset < 0) ? (stackSize + (-(spOffset+4))) : (spOffset));
+  int Offset;
+
+  // Calculate final offset.
+  // - There is no need to change the offset if the frame object is an outgoing
+  //   argument or a $gp restore location,
+  // - If the frame object is any of the following, its offset must be adjusted
+  //   by adding the size of the stack:
+  //   incoming argument, callee-saved register location or local variable.  
+  if (MipsFI->isOutArgFI(FrameIndex) || MipsFI->isGPFI(FrameIndex))
+    Offset = spOffset;
+  else
+    Offset = spOffset + stackSize;
+
   Offset    += MI.getOperand(i-1).getImm();
 
   DEBUG(errs() << "Offset     : " << Offset << "\n" << "<--------->\n");
@@ -183,26 +197,45 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
   int NewImm = 0;
   MachineBasicBlock &MBB = *MI.getParent();
   bool ATUsed;
-  unsigned OrigReg = getFrameRegister(MF);
-  int OrigImm = Offset;
+  unsigned FrameReg;
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+  int MinCSFI = 0;
+  int MaxCSFI = -1;
+
+  if (CSI.size()) {
+    MinCSFI = CSI[0].getFrameIdx();
+    MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
+  }
 
-// OrigImm fits in the 16-bit field
-  if (OrigImm < 0x8000 && OrigImm >= -0x8000) {
-    NewReg = OrigReg;
-    NewImm = OrigImm;
+  // The following stack frame objects are always referenced relative to $sp:
+  //  1. Outgoing arguments.
+  //  2. Pointer to dynamically allocated stack space.
+  //  3. Locations for callee-saved registers.
+  // Everything else is referenced relative to whatever register 
+  // getFrameRegister() returns.
+  if (MipsFI->isOutArgFI(FrameIndex) ||
+      (FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI))
+    FrameReg = Mips::SP;
+  else
+    FrameReg = getFrameRegister(MF); 
+  
+  // Offset fits in the 16-bit field
+  if (Offset < 0x8000 && Offset >= -0x8000) {
+    NewReg = FrameReg;
+    NewImm = Offset;
     ATUsed = false;
   }
   else {
     const TargetInstrInfo *TII = MF.getTarget().getInstrInfo();
     DebugLoc DL = II->getDebugLoc();
-    int ImmLo = OrigImm & 0xffff;
-    int ImmHi = (((unsigned)OrigImm & 0xffff0000) >> 16) +
-                ((OrigImm & 0x8000) != 0);
+    int ImmLo = (short)(Offset & 0xffff);
+    int ImmHi = (((unsigned)Offset & 0xffff0000) >> 16) +
+                ((Offset & 0x8000) != 0);
 
     // FIXME: change this when mips goes MC".
     BuildMI(MBB, II, DL, TII->get(Mips::NOAT));
     BuildMI(MBB, II, DL, TII->get(Mips::LUi), Mips::AT).addImm(ImmHi);
-    BuildMI(MBB, II, DL, TII->get(Mips::ADDu), Mips::AT).addReg(OrigReg)
+    BuildMI(MBB, II, DL, TII->get(Mips::ADDu), Mips::AT).addReg(FrameReg)
                                                         .addReg(Mips::AT);
     NewReg = Mips::AT;
     NewImm = ImmLo;
@@ -218,15 +251,6 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
   MI.getOperand(i-1).ChangeToImmediate(NewImm);
 }
 
-void MipsRegisterInfo::
-processFunctionBeforeFrameFinalized(MachineFunction &MF) const {
-  // Set the stack offset where GP must be saved/loaded from.
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
-  if (MipsFI->needGPSaveRestore())
-    MFI->setObjectOffset(MipsFI->getGPFI(), MipsFI->getGPStackOffset());
-}
-
 unsigned MipsRegisterInfo::
 getRARegister() const {
   return Mips::RA;
@@ -253,8 +277,11 @@ getEHHandlerRegister() const {
 
 int MipsRegisterInfo::
 getDwarfRegNum(unsigned RegNum, bool isEH) const {
-  llvm_unreachable("What is the dwarf register number");
-  return -1;
+  return MipsGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
+}
+
+int MipsRegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const {
+  return MipsGenRegisterInfo::getLLVMRegNumFull(DwarfRegNo,0);
 }
 
 #include "MipsGenRegisterInfo.inc"
diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h
index 767359f..76b0035 100644
--- a/lib/Target/Mips/MipsRegisterInfo.h
+++ b/lib/Target/Mips/MipsRegisterInfo.h
@@ -63,6 +63,7 @@ struct MipsRegisterInfo : public MipsGenRegisterInfo {
   unsigned getEHHandlerRegister() const;
 
   int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+  int getLLVMRegNum(unsigned RegNum, bool isEH) const;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td
index 9f9cae7..e97d450 100644
--- a/lib/Target/Mips/MipsRegisterInfo.td
+++ b/lib/Target/Mips/MipsRegisterInfo.td
@@ -44,6 +44,11 @@ class AFPR<bits<5> num, string n, list<Register> subregs>
   let SubRegIndices = [sub_fpeven, sub_fpodd];
 }
 
+// Mips Hardware Registers
+class HWR<bits<5> num, string n> : MipsReg<n> {
+  let Num = num;
+}
+
 //===----------------------------------------------------------------------===//
 //  Registers
 //===----------------------------------------------------------------------===//
@@ -55,7 +60,7 @@ let Namespace = "Mips" in {
   def AT   : MipsGPRReg< 1, "AT">,   DwarfRegNum<[1]>;
   def V0   : MipsGPRReg< 2, "2">,    DwarfRegNum<[2]>;
   def V1   : MipsGPRReg< 3, "3">,    DwarfRegNum<[3]>;
-  def A0   : MipsGPRReg< 4, "4">,    DwarfRegNum<[5]>;
+  def A0   : MipsGPRReg< 4, "4">,    DwarfRegNum<[4]>;
   def A1   : MipsGPRReg< 5, "5">,    DwarfRegNum<[5]>;
   def A2   : MipsGPRReg< 6, "6">,    DwarfRegNum<[6]>;
   def A3   : MipsGPRReg< 7, "7">,    DwarfRegNum<[7]>;
@@ -120,22 +125,22 @@ let Namespace = "Mips" in {
 
   /// Mips Double point precision FPU Registers (aliased
   /// with the single precision to hold 64 bit values)
-  def D0  : AFPR< 0,  "F0", [F0,   F1]>, DwarfRegNum<[32]>;
-  def D1  : AFPR< 2,  "F2", [F2,   F3]>, DwarfRegNum<[34]>;
-  def D2  : AFPR< 4,  "F4", [F4,   F5]>, DwarfRegNum<[36]>;
-  def D3  : AFPR< 6,  "F6", [F6,   F7]>, DwarfRegNum<[38]>;
-  def D4  : AFPR< 8,  "F8", [F8,   F9]>, DwarfRegNum<[40]>;
-  def D5  : AFPR<10, "F10", [F10, F11]>, DwarfRegNum<[42]>;
-  def D6  : AFPR<12, "F12", [F12, F13]>, DwarfRegNum<[44]>;
-  def D7  : AFPR<14, "F14", [F14, F15]>, DwarfRegNum<[46]>;
-  def D8  : AFPR<16, "F16", [F16, F17]>, DwarfRegNum<[48]>;
-  def D9  : AFPR<18, "F18", [F18, F19]>, DwarfRegNum<[50]>;
-  def D10 : AFPR<20, "F20", [F20, F21]>, DwarfRegNum<[52]>;
-  def D11 : AFPR<22, "F22", [F22, F23]>, DwarfRegNum<[54]>;
-  def D12 : AFPR<24, "F24", [F24, F25]>, DwarfRegNum<[56]>;
-  def D13 : AFPR<26, "F26", [F26, F27]>, DwarfRegNum<[58]>;
-  def D14 : AFPR<28, "F28", [F28, F29]>, DwarfRegNum<[60]>;
-  def D15 : AFPR<30, "F30", [F30, F31]>, DwarfRegNum<[62]>;
+  def D0  : AFPR< 0,  "F0", [F0,   F1]>;
+  def D1  : AFPR< 2,  "F2", [F2,   F3]>;
+  def D2  : AFPR< 4,  "F4", [F4,   F5]>;
+  def D3  : AFPR< 6,  "F6", [F6,   F7]>;
+  def D4  : AFPR< 8,  "F8", [F8,   F9]>;
+  def D5  : AFPR<10, "F10", [F10, F11]>;
+  def D6  : AFPR<12, "F12", [F12, F13]>;
+  def D7  : AFPR<14, "F14", [F14, F15]>;
+  def D8  : AFPR<16, "F16", [F16, F17]>;
+  def D9  : AFPR<18, "F18", [F18, F19]>;
+  def D10 : AFPR<20, "F20", [F20, F21]>;
+  def D11 : AFPR<22, "F22", [F22, F23]>;
+  def D12 : AFPR<24, "F24", [F24, F25]>;
+  def D13 : AFPR<26, "F26", [F26, F27]>;
+  def D14 : AFPR<28, "F28", [F28, F29]>;
+  def D15 : AFPR<30, "F30", [F30, F31]>;
 
   // Hi/Lo registers
   def HI  : Register<"hi">, DwarfRegNum<[64]>;
@@ -143,6 +148,9 @@ let Namespace = "Mips" in {
 
   // Status flags register
   def FCR31 : Register<"31">;
+
+  // Hardware register $29
+  def HWR29 : Register<"29">;
 }
 
 //===----------------------------------------------------------------------===//
@@ -157,19 +165,7 @@ def CPURegs : RegisterClass<"Mips", [i32], 32,
   // Callee save
   S0, S1, S2, S3, S4, S5, S6, S7,
   // Reserved
-  ZERO, AT, K0, K1, GP, SP, FP, RA]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    CPURegsClass::iterator
-    CPURegsClass::allocation_order_end(const MachineFunction &MF) const {
-      // The last 8 registers on the list above are reserved
-      return end()-8;
-    }
-  }];
-}
+  ZERO, AT, K0, K1, GP, SP, FP, RA]>;
 
 // 64bit fp:
 // * FGR64  - 32 64-bit registers
@@ -186,52 +182,7 @@ def FGR32 : RegisterClass<"Mips", [f32], 32,
   // Callee save
   F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30,
   // Reserved
-  F31]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-
-    static const unsigned MIPS_FGR32[] = {
-      Mips::F0,  Mips::F1,  Mips::F2,  Mips::F3,  Mips::F12,  Mips::F13,
-      Mips::F14, Mips::F15, Mips::F4,  Mips::F5,  Mips::F6,   Mips::F7,
-      Mips::F8,  Mips::F9,  Mips::F10, Mips::F11, Mips::F16,  Mips::F17,
-      Mips::F18, Mips::F19, Mips::F20, Mips::F21, Mips::F22,  Mips::F23,
-      Mips::F24, Mips::F25, Mips::F26, Mips::F27, Mips::F28,  Mips::F29,
-      Mips::F30
-    };
-
-    static const unsigned MIPS_SVR4_FGR32[] = {
-      Mips::F0,  Mips::F2,  Mips::F12, Mips::F14, Mips::F4,
-      Mips::F6,  Mips::F8,  Mips::F10, Mips::F16, Mips::F18,
-      Mips::F20, Mips::F22, Mips::F24, Mips::F26, Mips::F28, Mips::F30,
-    };
-
-    FGR32Class::iterator
-    FGR32Class::allocation_order_begin(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
-
-      if (Subtarget.isSingleFloat())
-        return MIPS_FGR32;
-      else
-        return MIPS_SVR4_FGR32;
-    }
-
-    FGR32Class::iterator
-    FGR32Class::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
-
-      if (Subtarget.isSingleFloat())
-        return MIPS_FGR32 + (sizeof(MIPS_FGR32) / sizeof(unsigned));
-      else
-        return MIPS_SVR4_FGR32 + (sizeof(MIPS_SVR4_FGR32) / sizeof(unsigned));
-    }
-  }];
-}
+  F31]>;
 
 def AFGR64 : RegisterClass<"Mips", [f64], 64,
   // Return Values and Arguments
@@ -241,19 +192,8 @@ def AFGR64 : RegisterClass<"Mips", [f64], 64,
   // Callee save
   D10, D11, D12, D13, D14,
   // Reserved
-  D15]>
-{
+  D15]> {
   let SubRegClasses = [(FGR32 sub_fpeven, sub_fpodd)];
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    AFGR64Class::iterator
-    AFGR64Class::allocation_order_end(const MachineFunction &MF) const {
-      // The last register on the list above is reserved
-      return end()-1;
-    }
-  }];
 }
 
 // Condition Register for floating point operations
@@ -262,3 +202,5 @@ def CCR  : RegisterClass<"Mips", [i32], 32, [FCR31]>;
 // Hi/Lo Registers
 def HILO : RegisterClass<"Mips", [i32], 32, [HI, LO]>;
 
+// Hardware registers
+def HWRegs : RegisterClass<"Mips", [i32], 32, [HWR29]>;
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index 53190b4..cfbb92c 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -38,8 +38,9 @@ MipsTargetMachine(const Target &T, const std::string &TT, const std::string &FS,
                   bool isLittle=false):
   LLVMTargetMachine(T, TT),
   Subtarget(TT, FS, isLittle),
-  DataLayout(isLittle ? std::string("e-p:32:32:32-i8:8:32-i16:16:32-n32") :
-                        std::string("E-p:32:32:32-i8:8:32-i16:16:32-n32")),
+  DataLayout(isLittle ? 
+             std::string("e-p:32:32:32-i8:8:32-i16:16:32-i64:64:64-n32") :
+             std::string("E-p:32:32:32-i8:8:32-i16:16:32-i64:64:64-n32")),
   InstrInfo(*this),
   FrameLowering(Subtarget),
   TLInfo(*this), TSInfo(*this) {
@@ -77,6 +78,12 @@ addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel)
 }
 
 bool MipsTargetMachine::
+addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel) {
+  PM.add(createMipsEmitGPRestorePass(*this));
+  return true;
+}
+
+bool MipsTargetMachine::
 addPostRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel) {
   PM.add(createMipsExpandPseudoPass(*this));
   return true;
diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h
index badb652..102dd85 100644
--- a/lib/Target/Mips/MipsTargetMachine.h
+++ b/lib/Target/Mips/MipsTargetMachine.h
@@ -63,6 +63,8 @@ namespace llvm {
                                  CodeGenOpt::Level OptLevel);
     virtual bool addPreEmitPass(PassManagerBase &PM,
                                 CodeGenOpt::Level OptLevel);
+    virtual bool addPreRegAlloc(PassManagerBase &PM,
+                                CodeGenOpt::Level OptLevel);
     virtual bool addPostRegAlloc(PassManagerBase &, CodeGenOpt::Level);
   };
 
diff --git a/lib/Target/PTX/PTX.td b/lib/Target/PTX/PTX.td
index ae8326e..231866a 100644
--- a/lib/Target/PTX/PTX.td
+++ b/lib/Target/PTX/PTX.td
@@ -24,6 +24,9 @@ include "llvm/Target/Target.td"
 def FeatureDouble : SubtargetFeature<"double", "SupportsDouble", "true",
                                      "Do not demote .f64 to .f32">;
 
+def FeatureNoFMA  : SubtargetFeature<"no-fma","SupportsFMA", "false",
+                                     "Disable Fused-Multiply Add">;
+
 //===- PTX Version --------------------------------------------------------===//
 
 def FeaturePTX20 : SubtargetFeature<"ptx20", "PTXVersion", "PTX_VERSION_2_0",
@@ -38,6 +41,10 @@ def FeaturePTX22 : SubtargetFeature<"ptx22", "PTXVersion", "PTX_VERSION_2_2",
                                     "Use PTX Language Version 2.2",
                                     [FeaturePTX21]>;
 
+def FeaturePTX23 : SubtargetFeature<"ptx23", "PTXVersion", "PTX_VERSION_2_3",
+                                    "Use PTX Language Version 2.3",
+                                    [FeaturePTX22]>;
+
 //===- PTX Shader Model ---------------------------------------------------===//
 
 def FeatureSM10 : SubtargetFeature<"sm10", "PTXShaderModel", "PTX_SM_1_0",
diff --git a/lib/Target/PTX/PTXISelLowering.cpp b/lib/Target/PTX/PTXISelLowering.cpp
index 23b93da..e9b1d8c 100644
--- a/lib/Target/PTX/PTXISelLowering.cpp
+++ b/lib/Target/PTX/PTXISelLowering.cpp
@@ -65,7 +65,9 @@ PTXTargetLowering::PTXTargetLowering(TargetMachine &TM)
   
   // need to lower SETCC of Preds into bitwise logic
   setOperationAction(ISD::SETCC, MVT::i1, Custom);
-  
+
+  setMinFunctionAlignment(2);
+
   // Compute derived properties from the register classes
   computeRegisterProperties();
 }
diff --git a/lib/Target/PTX/PTXISelLowering.h b/lib/Target/PTX/PTXISelLowering.h
index 6a7e3e6..225c000 100644
--- a/lib/Target/PTX/PTXISelLowering.h
+++ b/lib/Target/PTX/PTXISelLowering.h
@@ -37,9 +37,6 @@ class PTXTargetLowering : public TargetLowering {
 
     virtual const char *getTargetNodeName(unsigned Opcode) const;
 
-    virtual unsigned getFunctionAlignment(const Function *F) const {
-      return 2; }
-
     virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
 
     virtual SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/PTX/PTXInstrInfo.td b/lib/Target/PTX/PTXInstrInfo.td
index 1ac9d3f..d5d08be 100644
--- a/lib/Target/PTX/PTXInstrInfo.td
+++ b/lib/Target/PTX/PTXInstrInfo.td
@@ -36,6 +36,12 @@ def SupportsPTX21       : Predicate<"getSubtarget().supportsPTX21()">;
 def DoesNotSupportPTX21 : Predicate<"!getSubtarget().supportsPTX21()">;
 def SupportsPTX22       : Predicate<"getSubtarget().supportsPTX22()">;
 def DoesNotSupportPTX22 : Predicate<"!getSubtarget().supportsPTX22()">;
+def SupportsPTX23       : Predicate<"getSubtarget().supportsPTX23()">;
+def DoesNotSupportPTX23 : Predicate<"!getSubtarget().supportsPTX23()">;
+
+// Fused-Multiply Add
+def SupportsFMA         : Predicate<"getSubtarget().supportsFMA()">;
+def DoesNotSupportFMA   : Predicate<"!getSubtarget().supportsFMA()">;
 
 //===----------------------------------------------------------------------===//
 // Instruction Pattern Stuff
@@ -627,8 +633,8 @@ def FDIVri64SM10 : InstPTX<(outs RRegf64:$d),
 // In the short term, mad is supported on all PTX versions and we use a
 // default rounding mode no matter what shader model or PTX version.
 // TODO: Allow the rounding mode to be selectable through llc.
-defm FMADSM13 : PTX_FLOAT_4OP<"mad.rn", fmul, fadd>, Requires<[SupportsSM13]>;
-defm FMAD : PTX_FLOAT_4OP<"mad", fmul, fadd>, Requires<[DoesNotSupportSM13]>;
+defm FMADSM13 : PTX_FLOAT_4OP<"mad.rn", fmul, fadd>, Requires<[SupportsSM13, SupportsFMA]>;
+defm FMAD : PTX_FLOAT_4OP<"mad", fmul, fadd>, Requires<[DoesNotSupportSM13, SupportsFMA]>;
 
 ///===- Floating-Point Intrinsic Instructions -----------------------------===//
 
@@ -665,6 +671,8 @@ def FCOS64 : InstPTX<(outs RRegf64:$d),
 
 ///===- Comparison and Selection Instructions -----------------------------===//
 
+// .setp
+
 // Compare u16
 
 defm SETPEQu16 : PTX_SETP_I<RRegu16, "u16", i16imm, SETEQ,  "eq">;
diff --git a/lib/Target/PTX/PTXRegisterInfo.h b/lib/Target/PTX/PTXRegisterInfo.h
index 67e130f..dc56352 100644
--- a/lib/Target/PTX/PTXRegisterInfo.h
+++ b/lib/Target/PTX/PTXRegisterInfo.h
@@ -57,6 +57,9 @@ struct PTXRegisterInfo : public PTXGenRegisterInfo {
   virtual int getDwarfRegNum(unsigned RegNum, bool isEH) const {
     return PTXGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
   }
+  virtual int getLLVMRegNum(unsigned RegNum, bool isEH) const {
+    return PTXGenRegisterInfo::getLLVMRegNumFull(RegNum, 0);
+  }
 }; // struct PTXRegisterInfo
 } // namespace llvm
 
diff --git a/lib/Target/PTX/PTXSubtarget.cpp b/lib/Target/PTX/PTXSubtarget.cpp
index a224f2b..e8a1dfe 100644
--- a/lib/Target/PTX/PTXSubtarget.cpp
+++ b/lib/Target/PTX/PTXSubtarget.cpp
@@ -21,7 +21,8 @@ PTXSubtarget::PTXSubtarget(const std::string &TT, const std::string &FS,
   : PTXShaderModel(PTX_SM_1_0),
     PTXVersion(PTX_VERSION_2_0),
     SupportsDouble(false),
-    Is64Bit(is64Bit) {
+    SupportsFMA(true),
+    Is64Bit(is64Bit) {	
   std::string TARGET = "generic";
   ParseSubtargetFeatures(FS, TARGET);
 }
@@ -41,6 +42,7 @@ std::string PTXSubtarget::getPTXVersionString() const {
     case PTX_VERSION_2_0: return "2.0";
     case PTX_VERSION_2_1: return "2.1";
     case PTX_VERSION_2_2: return "2.2";
+    case PTX_VERSION_2_3: return "2.3";
   }
 }
 
diff --git a/lib/Target/PTX/PTXSubtarget.h b/lib/Target/PTX/PTXSubtarget.h
index 47d9842..59fa696 100644
--- a/lib/Target/PTX/PTXSubtarget.h
+++ b/lib/Target/PTX/PTXSubtarget.h
@@ -37,7 +37,8 @@ namespace llvm {
       enum PTXVersionEnum {
         PTX_VERSION_2_0,  /*< PTX Version 2.0 */
         PTX_VERSION_2_1,  /*< PTX Version 2.1 */
-        PTX_VERSION_2_2   /*< PTX Version 2.2 */
+        PTX_VERSION_2_2,  /*< PTX Version 2.2 */
+        PTX_VERSION_2_3   /*< PTX Version 2.3 */
       };
 
       /// Shader Model supported on the target GPU.
@@ -48,7 +49,10 @@ namespace llvm {
 
       // The native .f64 type is supported on the hardware.
       bool SupportsDouble;
-
+      
+      // Support the fused-multiply add (FMA) and multiply-add (MAD) instructions
+      bool SupportsFMA;
+      
       // Use .u64 instead of .u32 for addresses.
       bool Is64Bit;
 
@@ -63,6 +67,8 @@ namespace llvm {
 
       bool is64Bit() const { return Is64Bit; }
 
+      bool supportsFMA() const { return SupportsFMA; }
+      
       bool supportsSM13() const { return PTXShaderModel >= PTX_SM_1_3; }
 
       bool supportsSM20() const { return PTXShaderModel >= PTX_SM_2_0; }
@@ -71,6 +77,8 @@ namespace llvm {
 
       bool supportsPTX22() const { return PTXVersion >= PTX_VERSION_2_2; }
 
+      bool supportsPTX23() const { return PTXVersion >= PTX_VERSION_2_3; }
+
       std::string ParseSubtargetFeatures(const std::string &FS,
                                          const std::string &CPU);
   }; // class PTXSubtarget
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
index c8db0c4..1a9bd76 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
@@ -26,6 +26,9 @@ StringRef PPCInstPrinter::getOpcodeName(unsigned Opcode) const {
   return getInstructionName(Opcode);
 }
 
+void PPCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  OS << getRegisterName(RegNo);
+}
 
 void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O) {
   // Check for slwi/srwi mnemonics.
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
index 9cf9db9..adfa0aa 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
@@ -33,6 +33,7 @@ public:
     return SyntaxVariant == 1;
   }
   
+  virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
   virtual void printInst(const MCInst *MI, raw_ostream &O);
   virtual StringRef getOpcodeName(unsigned Opcode) const;
   
diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h
index 7242f3a..92672b5 100644
--- a/lib/Target/PowerPC/PPC.h
+++ b/lib/Target/PowerPC/PPC.h
@@ -43,7 +43,7 @@ namespace llvm {
   TargetAsmBackend *createPPCAsmBackend(const Target &, const std::string &);
   
   void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
-                                    AsmPrinter &AP);
+                                    AsmPrinter &AP, bool isDarwin);
   
   extern Target ThePPC32Target;
   extern Target ThePPC64Target;
diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 09a9be9..b795db9 100644
--- a/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -344,7 +344,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   }
   case PPC::LDtoc: {
     // Transform %X3 = LDtoc <ga:@min1>, %X2
-    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
+    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
       
     // Change the opcode to LD, and the global address operand to be a
     // reference to the TOC entry we will synthesize later.
@@ -376,7 +376,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     return;
   }
 
-  LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
+  LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
   OutStreamer.EmitInstruction(TmpInst);
 }
 
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index 6aca6b0..375e000 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -259,8 +259,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
   MachineModuleInfo &MMI = MF.getMMI();
   DebugLoc dl;
   bool needsFrameMoves = MMI.hasDebugInfo() ||
-       !MF.getFunction()->doesNotThrow() ||
-       UnwindTablesMandatory;
+    MF.getFunction()->needsUnwindTableEntry();
 
   // Prepare for frame info.
   MCSymbol *FrameLabel = 0;
@@ -488,6 +487,14 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
       int Offset = MFI->getObjectOffset(CSI[I].getFrameIdx());
       unsigned Reg = CSI[I].getReg();
       if (Reg == PPC::LR || Reg == PPC::LR8 || Reg == PPC::RM) continue;
+
+      // This is a bit of a hack: CR2LT, CR2GT, CR2EQ and CR2UN are just
+      // subregisters of CR2. We just need to emit a move of CR2.
+      if (Reg == PPC::CR2LT || Reg == PPC::CR2GT || Reg == PPC::CR2EQ)
+        continue;
+      if (Reg == PPC::CR2UN)
+        Reg = PPC::CR2;
+
       MachineLocation CSDst(MachineLocation::VirtualFP, Offset);
       MachineLocation CSSrc(Reg);
       Moves.push_back(MachineMove(Label, CSDst, CSSrc));
diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/lib/Target/PowerPC/PPCHazardRecognizers.cpp
index 0de5844..74ecff5 100644
--- a/lib/Target/PowerPC/PPCHazardRecognizers.cpp
+++ b/lib/Target/PowerPC/PPCHazardRecognizers.cpp
@@ -233,7 +233,7 @@ void PPCHazardRecognizer970::EmitInstruction(SUnit *SU) {
   unsigned Opcode = Node->getMachineOpcode();
 
   // Update structural hazard information.
-  if (Opcode == PPC::MTCTR) HasCTRSet = true;
+  if (Opcode == PPC::MTCTR || Opcode == PPC::MTCTR8) HasCTRSet = true;
 
   // Track the address stored to.
   if (isStore) {
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index faae9b2..511bb22 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -240,11 +240,11 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
 
     if (PPCLowering.getPointerTy() == MVT::i32) {
       GlobalBaseReg = RegInfo->createVirtualRegister(PPC::GPRCRegisterClass);
-      BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR), PPC::LR);
+      BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR));
       BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg);
     } else {
       GlobalBaseReg = RegInfo->createVirtualRegister(PPC::G8RCRegisterClass);
-      BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR8), PPC::LR8);
+      BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR8));
       BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR8), GlobalBaseReg);
     }
   }
@@ -1057,9 +1057,10 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
     SDValue Chain = N->getOperand(0);
     SDValue Target = N->getOperand(1);
     unsigned Opc = Target.getValueType() == MVT::i32 ? PPC::MTCTR : PPC::MTCTR8;
+    unsigned Reg = Target.getValueType() == MVT::i32 ? PPC::BCTR : PPC::BCTR8;
     Chain = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Target,
                                            Chain), 0);
-    return CurDAG->SelectNodeTo(N, PPC::BCTR, MVT::Other, Chain);
+    return CurDAG->SelectNodeTo(N, Reg, MVT::Other, Chain);
   }
   }
 
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 128522c..dbb184c 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -394,6 +394,10 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
     setLibcallName(RTLIB::EXP2_PPCF128, "exp2l$LDBL128");
   }
 
+  setMinFunctionAlignment(2);
+  if (PPCSubTarget.isDarwin())
+    setPrefFunctionAlignment(4);
+
   computeRegisterProperties();
 }
 
@@ -460,14 +464,6 @@ MVT::SimpleValueType PPCTargetLowering::getSetCCResultType(EVT VT) const {
   return MVT::i32;
 }
 
-/// getFunctionAlignment - Return the Log2 alignment of this function.
-unsigned PPCTargetLowering::getFunctionAlignment(const Function *F) const {
-  if (getTargetMachine().getSubtarget<PPCSubtarget>().isDarwin())
-    return F->hasFnAttr(Attribute::OptimizeForSize) ? 2 : 4;
-  else
-    return 2;
-}
-
 //===----------------------------------------------------------------------===//
 // Node matching predicates, for use by the tblgen matching code.
 //===----------------------------------------------------------------------===//
@@ -1014,7 +1010,8 @@ bool PPCTargetLowering::SelectAddressRegImmShift(SDValue N, SDValue &Disp,
       short Imm;
       if (isIntS16Immediate(CN, Imm)) {
         Disp = DAG.getTargetConstant((unsigned short)Imm >> 2, getPointerTy());
-        Base = DAG.getRegister(PPC::R0, CN->getValueType(0));
+        Base = DAG.getRegister(PPCSubTarget.isPPC64() ? PPC::X0 : PPC::R0,
+                               CN->getValueType(0));
         return true;
       }
 
@@ -1561,8 +1558,8 @@ PPCTargetLowering::LowerFormalArguments_SVR4(
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
-                 *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
 
   // Reserve space for the linkage area on the stack.
   CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false), PtrByteSize);
@@ -1622,8 +1619,8 @@ PPCTargetLowering::LowerFormalArguments_SVR4(
   // Aggregates passed by value are stored in the local variable space of the
   // caller's stack frame, right above the parameter list area.
   SmallVector<CCValAssign, 16> ByValArgLocs;
-  CCState CCByValInfo(CallConv, isVarArg, getTargetMachine(),
-                      ByValArgLocs, *DAG.getContext());
+  CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		      getTargetMachine(), ByValArgLocs, *DAG.getContext());
 
   // Reserve stack space for the allocations in CCInfo.
   CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
@@ -2565,7 +2562,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
     Callee.setNode(0);
     // Add CTR register as callee so a bctr can be emitted later.
     if (isTailCall)
-      Ops.push_back(DAG.getRegister(PPC::CTR, PtrVT));
+      Ops.push_back(DAG.getRegister(isPPC64 ? PPC::CTR8 : PPC::CTR, PtrVT));
   }
 
   // If this is a direct call, pass the chain and the callee.
@@ -2594,8 +2591,8 @@ PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                    SmallVectorImpl<SDValue> &InVals) const {
 
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCRetInfo(CallConv, isVarArg, getTargetMachine(),
-                    RVLocs, *DAG.getContext());
+  CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		    getTargetMachine(), RVLocs, *DAG.getContext());
   CCRetInfo.AnalyzeCallResult(Ins, RetCC_PPC);
 
   // Copy all of the result registers out of their specified physreg.
@@ -2644,8 +2641,8 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl,
     // to the liveout set for the function.
     if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
       SmallVector<CCValAssign, 16> RVLocs;
-      CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
-                     *DAG.getContext());
+      CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		     getTargetMachine(), RVLocs, *DAG.getContext());
       CCInfo.AnalyzeCallResult(Ins, RetCC_PPC);
       for (unsigned i = 0; i != RVLocs.size(); ++i)
         DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
@@ -2758,8 +2755,8 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
 
   // Assign locations to all of the outgoing arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
 
   // Reserve space for the linkage area on the stack.
   CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false), PtrByteSize);
@@ -2798,8 +2795,8 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
 
   // Assign locations to all of the outgoing aggregate by value arguments.
   SmallVector<CCValAssign, 16> ByValArgLocs;
-  CCState CCByValInfo(CallConv, isVarArg, getTargetMachine(), ByValArgLocs,
-                      *DAG.getContext());
+  CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		      getTargetMachine(), ByValArgLocs, *DAG.getContext());
 
   // Reserve stack space for the allocations in CCInfo.
   CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
@@ -3306,8 +3303,8 @@ PPCTargetLowering::LowerReturn(SDValue Chain,
                                DebugLoc dl, SelectionDAG &DAG) const {
 
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), RVLocs, *DAG.getContext());
   CCInfo.AnalyzeReturn(Outs, RetCC_PPC);
 
   // If this is the first return lowered for this function, add the regs to the
@@ -5442,10 +5439,16 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
 
 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
 /// vector.  If it is invalid, don't add anything to Ops.
-void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op, char Letter,
+void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+                                                     std::string &Constraint,
                                                      std::vector<SDValue>&Ops,
                                                      SelectionDAG &DAG) const {
   SDValue Result(0,0);
+
+  // Only support length 1 constraints.
+  if (Constraint.length() > 1) return;
+
+  char Letter = Constraint[0];
   switch (Letter) {
   default: break;
   case 'I':
@@ -5501,7 +5504,7 @@ void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op, char Letter,
   }
 
   // Handle standard constraint letters.
-  TargetLowering::LowerAsmOperandForConstraint(Op, Letter, Ops, DAG);
+  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
 }
 
 // isLegalAddressingMode - Return true if the addressing mode represented
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 33daae9..986b4e7 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -328,7 +328,7 @@ namespace llvm {
     /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
     /// vector.  If it is invalid, don't add anything to Ops.
     virtual void LowerAsmOperandForConstraint(SDValue Op,
-                                              char ConstraintLetter,
+                                              std::string &Constraint,
                                               std::vector<SDValue> &Ops,
                                               SelectionDAG &DAG) const;
 
@@ -364,9 +364,6 @@ namespace llvm {
                         bool NonScalarIntSafe, bool MemcpyStrSrc,
                         MachineFunction &MF) const;
 
-    /// getFunctionAlignment - Return the Log2 alignment of this function.
-    virtual unsigned getFunctionAlignment(const Function *F) const;
-
   private:
     SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const;
     SDValue getReturnAddrFrameIndex(SelectionDAG & DAG) const;
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index 9f0fae5..e88ad37 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -60,7 +60,7 @@ def HI48_64 : SDNodeXForm<imm, [{
 //
 
 let Defs = [LR8] in
-  def MovePCtoLR8 : Pseudo<(outs), (ins piclabel:$label), "", []>,
+  def MovePCtoLR8 : Pseudo<(outs), (ins), "", []>,
                     PPC970_Unit_BRU;
 
 // Darwin ABI Calls.
@@ -190,10 +190,15 @@ def TCRETURNri8 : Pseudo<(outs), (ins CTRRC8:$dst, i32imm:$offset, variable_ops)
 
 
 let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7, isBranch = 1,
-    isIndirectBranch = 1, isCall = 1, isReturn = 1, Uses = [CTR, RM] in
-def TAILBCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", BrB, []>,
-    Requires<[In64BitMode]>;
+    isIndirectBranch = 1, isCall = 1, Uses = [CTR8, RM] in {
+  let isReturn = 1 in {
+    def TAILBCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", BrB, []>,
+        Requires<[In64BitMode]>;
+  }
 
+  def BCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", BrB, []>,
+      Requires<[In64BitMode]>;
+}
 
 
 let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index 24071b7..773578c 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -300,7 +300,6 @@ def calltarget : Operand<iPTR> {
 def aaddr : Operand<iPTR> {
   let PrintMethod = "printAbsAddrOperand";
 }
-def piclabel: Operand<iPTR> {}
 def symbolHi: Operand<i32> {
   let PrintMethod = "printSymbolHi";
   let EncoderMethod = "getHA16Encoding";
@@ -413,7 +412,7 @@ let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in {
 }
 
 let Defs = [LR] in
-  def MovePCtoLR : Pseudo<(outs), (ins piclabel:$label), "", []>,
+  def MovePCtoLR : Pseudo<(outs), (ins), "", []>,
                    PPC970_Unit_BRU;
 
 let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
diff --git a/lib/Target/PowerPC/PPCMCAsmInfo.cpp b/lib/Target/PowerPC/PPCMCAsmInfo.cpp
index 9e508cc..2d5c880 100644
--- a/lib/Target/PowerPC/PPCMCAsmInfo.cpp
+++ b/lib/Target/PowerPC/PPCMCAsmInfo.cpp
@@ -48,7 +48,7 @@ PPCLinuxMCAsmInfo::PPCLinuxMCAsmInfo(bool is64Bit) {
 
   // Exceptions handling
   if (!is64Bit)
-    ExceptionsType = ExceptionHandling::DwarfTable;
+    ExceptionsType = ExceptionHandling::DwarfCFI;
     
   ZeroDirective = "\t.space\t";
   Data64bitsDirective = is64Bit ? "\t.quad\t" : 0;
diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp
index 6082587..33af426 100644
--- a/lib/Target/PowerPC/PPCMCInstLower.cpp
+++ b/lib/Target/PowerPC/PPCMCInstLower.cpp
@@ -95,14 +95,14 @@ static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){
 }
 
 static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
-                              AsmPrinter &Printer) {
+                              AsmPrinter &Printer, bool isDarwin) {
   MCContext &Ctx = Printer.OutContext;
   MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
 
   if (MO.getTargetFlags() & PPCII::MO_LO16)
-    RefKind = MCSymbolRefExpr::VK_PPC_LO16;
+    RefKind = isDarwin ? MCSymbolRefExpr::VK_PPC_DARWIN_LO16 : MCSymbolRefExpr::VK_PPC_GAS_LO16;
   else if (MO.getTargetFlags() & PPCII::MO_HA16)
-    RefKind = MCSymbolRefExpr::VK_PPC_HA16;
+    RefKind = isDarwin ? MCSymbolRefExpr::VK_PPC_DARWIN_HA16 : MCSymbolRefExpr::VK_PPC_GAS_HA16;
 
   // FIXME: This isn't right, but we don't have a good way to express this in
   // the MC Level, see below.
@@ -130,7 +130,7 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
 }
 
 void llvm::LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
-                                        AsmPrinter &AP) {
+                                        AsmPrinter &AP, bool isDarwin) {
   OutMI.setOpcode(MI->getOpcode());
   
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
@@ -154,16 +154,17 @@ void llvm::LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
       break;
     case MachineOperand::MO_GlobalAddress:
     case MachineOperand::MO_ExternalSymbol:
-      MCOp = GetSymbolRef(MO, GetSymbolFromOperand(MO, AP), AP);
+      MCOp = GetSymbolRef(MO, GetSymbolFromOperand(MO, AP), AP, isDarwin);
       break;
     case MachineOperand::MO_JumpTableIndex:
-      MCOp = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP);
+      MCOp = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP, isDarwin);
       break;
     case MachineOperand::MO_ConstantPoolIndex:
-      MCOp = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP);
+      MCOp = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP, isDarwin);
       break;
     case MachineOperand::MO_BlockAddress:
-      MCOp = GetSymbolRef(MO,AP.GetBlockAddressSymbol(MO.getBlockAddress()),AP);
+      MCOp = GetSymbolRef(MO,AP.GetBlockAddressSymbol(MO.getBlockAddress()),AP,
+                          isDarwin);
       break;
     }
     
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 45d8b6b..3374e9b 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -686,9 +686,28 @@ unsigned PPCRegisterInfo::getEHHandlerRegister() const {
   return !Subtarget.isPPC64() ? PPC::R4 : PPC::X4;
 }
 
+/// DWARFFlavour - Flavour of dwarf regnumbers
+///
+namespace DWARFFlavour {
+  enum {
+    PPC64 = 0, PPC32 = 1
+  };
+}
+
 int PPCRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
   // FIXME: Most probably dwarf numbers differs for Linux and Darwin
-  return PPCGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
+  unsigned Flavour = Subtarget.isPPC64() ?
+    DWARFFlavour::PPC64 : DWARFFlavour::PPC32;
+
+  return PPCGenRegisterInfo::getDwarfRegNumFull(RegNum, Flavour);
+}
+
+int PPCRegisterInfo::getLLVMRegNum(unsigned RegNum, bool isEH) const {
+  // FIXME: Most probably dwarf numbers differs for Linux and Darwin
+  unsigned Flavour = Subtarget.isPPC64() ?
+    DWARFFlavour::PPC64 : DWARFFlavour::PPC32;
+
+  return PPCGenRegisterInfo::getLLVMRegNumFull(RegNum, Flavour);
 }
 
 #include "PPCGenRegisterInfo.inc"
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h
index aa29ffe..48c2562 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -68,6 +68,7 @@ public:
   unsigned getEHHandlerRegister() const;
 
   int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+  int getLLVMRegNum(unsigned RegNum, bool isEH) const;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.td b/lib/Target/PowerPC/PPCRegisterInfo.td
index 2639165..3c01901 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.td
+++ b/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -65,203 +65,203 @@ class CRBIT<bits<5> num, string n> : PPCReg<n> {
 
 
 // General-purpose registers
-def R0  : GPR< 0,  "r0">, DwarfRegNum<[0]>;
-def R1  : GPR< 1,  "r1">, DwarfRegNum<[1]>;
-def R2  : GPR< 2,  "r2">, DwarfRegNum<[2]>;
-def R3  : GPR< 3,  "r3">, DwarfRegNum<[3]>;
-def R4  : GPR< 4,  "r4">, DwarfRegNum<[4]>;
-def R5  : GPR< 5,  "r5">, DwarfRegNum<[5]>;
-def R6  : GPR< 6,  "r6">, DwarfRegNum<[6]>;
-def R7  : GPR< 7,  "r7">, DwarfRegNum<[7]>;
-def R8  : GPR< 8,  "r8">, DwarfRegNum<[8]>;
-def R9  : GPR< 9,  "r9">, DwarfRegNum<[9]>;
-def R10 : GPR<10, "r10">, DwarfRegNum<[10]>;
-def R11 : GPR<11, "r11">, DwarfRegNum<[11]>;
-def R12 : GPR<12, "r12">, DwarfRegNum<[12]>;
-def R13 : GPR<13, "r13">, DwarfRegNum<[13]>;
-def R14 : GPR<14, "r14">, DwarfRegNum<[14]>;
-def R15 : GPR<15, "r15">, DwarfRegNum<[15]>;
-def R16 : GPR<16, "r16">, DwarfRegNum<[16]>;
-def R17 : GPR<17, "r17">, DwarfRegNum<[17]>;
-def R18 : GPR<18, "r18">, DwarfRegNum<[18]>;
-def R19 : GPR<19, "r19">, DwarfRegNum<[19]>;
-def R20 : GPR<20, "r20">, DwarfRegNum<[20]>;
-def R21 : GPR<21, "r21">, DwarfRegNum<[21]>;
-def R22 : GPR<22, "r22">, DwarfRegNum<[22]>;
-def R23 : GPR<23, "r23">, DwarfRegNum<[23]>;
-def R24 : GPR<24, "r24">, DwarfRegNum<[24]>;
-def R25 : GPR<25, "r25">, DwarfRegNum<[25]>;
-def R26 : GPR<26, "r26">, DwarfRegNum<[26]>;
-def R27 : GPR<27, "r27">, DwarfRegNum<[27]>;
-def R28 : GPR<28, "r28">, DwarfRegNum<[28]>;
-def R29 : GPR<29, "r29">, DwarfRegNum<[29]>;
-def R30 : GPR<30, "r30">, DwarfRegNum<[30]>;
-def R31 : GPR<31, "r31">, DwarfRegNum<[31]>;
+def R0  : GPR< 0,  "r0">, DwarfRegNum<[-2, 0]>;
+def R1  : GPR< 1,  "r1">, DwarfRegNum<[-2, 1]>;
+def R2  : GPR< 2,  "r2">, DwarfRegNum<[-2, 2]>;
+def R3  : GPR< 3,  "r3">, DwarfRegNum<[-2, 3]>;
+def R4  : GPR< 4,  "r4">, DwarfRegNum<[-2, 4]>;
+def R5  : GPR< 5,  "r5">, DwarfRegNum<[-2, 5]>;
+def R6  : GPR< 6,  "r6">, DwarfRegNum<[-2, 6]>;
+def R7  : GPR< 7,  "r7">, DwarfRegNum<[-2, 7]>;
+def R8  : GPR< 8,  "r8">, DwarfRegNum<[-2, 8]>;
+def R9  : GPR< 9,  "r9">, DwarfRegNum<[-2, 9]>;
+def R10 : GPR<10, "r10">, DwarfRegNum<[-2, 10]>;
+def R11 : GPR<11, "r11">, DwarfRegNum<[-2, 11]>;
+def R12 : GPR<12, "r12">, DwarfRegNum<[-2, 12]>;
+def R13 : GPR<13, "r13">, DwarfRegNum<[-2, 13]>;
+def R14 : GPR<14, "r14">, DwarfRegNum<[-2, 14]>;
+def R15 : GPR<15, "r15">, DwarfRegNum<[-2, 15]>;
+def R16 : GPR<16, "r16">, DwarfRegNum<[-2, 16]>;
+def R17 : GPR<17, "r17">, DwarfRegNum<[-2, 17]>;
+def R18 : GPR<18, "r18">, DwarfRegNum<[-2, 18]>;
+def R19 : GPR<19, "r19">, DwarfRegNum<[-2, 19]>;
+def R20 : GPR<20, "r20">, DwarfRegNum<[-2, 20]>;
+def R21 : GPR<21, "r21">, DwarfRegNum<[-2, 21]>;
+def R22 : GPR<22, "r22">, DwarfRegNum<[-2, 22]>;
+def R23 : GPR<23, "r23">, DwarfRegNum<[-2, 23]>;
+def R24 : GPR<24, "r24">, DwarfRegNum<[-2, 24]>;
+def R25 : GPR<25, "r25">, DwarfRegNum<[-2, 25]>;
+def R26 : GPR<26, "r26">, DwarfRegNum<[-2, 26]>;
+def R27 : GPR<27, "r27">, DwarfRegNum<[-2, 27]>;
+def R28 : GPR<28, "r28">, DwarfRegNum<[-2, 28]>;
+def R29 : GPR<29, "r29">, DwarfRegNum<[-2, 29]>;
+def R30 : GPR<30, "r30">, DwarfRegNum<[-2, 30]>;
+def R31 : GPR<31, "r31">, DwarfRegNum<[-2, 31]>;
 
 // 64-bit General-purpose registers
-def X0  : GP8< R0,  "r0">, DwarfRegNum<[0]>;
-def X1  : GP8< R1,  "r1">, DwarfRegNum<[1]>;
-def X2  : GP8< R2,  "r2">, DwarfRegNum<[2]>;
-def X3  : GP8< R3,  "r3">, DwarfRegNum<[3]>;
-def X4  : GP8< R4,  "r4">, DwarfRegNum<[4]>;
-def X5  : GP8< R5,  "r5">, DwarfRegNum<[5]>;
-def X6  : GP8< R6,  "r6">, DwarfRegNum<[6]>;
-def X7  : GP8< R7,  "r7">, DwarfRegNum<[7]>;
-def X8  : GP8< R8,  "r8">, DwarfRegNum<[8]>;
-def X9  : GP8< R9,  "r9">, DwarfRegNum<[9]>;
-def X10 : GP8<R10, "r10">, DwarfRegNum<[10]>;
-def X11 : GP8<R11, "r11">, DwarfRegNum<[11]>;
-def X12 : GP8<R12, "r12">, DwarfRegNum<[12]>;
-def X13 : GP8<R13, "r13">, DwarfRegNum<[13]>;
-def X14 : GP8<R14, "r14">, DwarfRegNum<[14]>;
-def X15 : GP8<R15, "r15">, DwarfRegNum<[15]>;
-def X16 : GP8<R16, "r16">, DwarfRegNum<[16]>;
-def X17 : GP8<R17, "r17">, DwarfRegNum<[17]>;
-def X18 : GP8<R18, "r18">, DwarfRegNum<[18]>;
-def X19 : GP8<R19, "r19">, DwarfRegNum<[19]>;
-def X20 : GP8<R20, "r20">, DwarfRegNum<[20]>;
-def X21 : GP8<R21, "r21">, DwarfRegNum<[21]>;
-def X22 : GP8<R22, "r22">, DwarfRegNum<[22]>;
-def X23 : GP8<R23, "r23">, DwarfRegNum<[23]>;
-def X24 : GP8<R24, "r24">, DwarfRegNum<[24]>;
-def X25 : GP8<R25, "r25">, DwarfRegNum<[25]>;
-def X26 : GP8<R26, "r26">, DwarfRegNum<[26]>;
-def X27 : GP8<R27, "r27">, DwarfRegNum<[27]>;
-def X28 : GP8<R28, "r28">, DwarfRegNum<[28]>;
-def X29 : GP8<R29, "r29">, DwarfRegNum<[29]>;
-def X30 : GP8<R30, "r30">, DwarfRegNum<[30]>;
-def X31 : GP8<R31, "r31">, DwarfRegNum<[31]>;
+def X0  : GP8< R0,  "r0">, DwarfRegNum<[0, -2]>;
+def X1  : GP8< R1,  "r1">, DwarfRegNum<[1, -2]>;
+def X2  : GP8< R2,  "r2">, DwarfRegNum<[2, -2]>;
+def X3  : GP8< R3,  "r3">, DwarfRegNum<[3, -2]>;
+def X4  : GP8< R4,  "r4">, DwarfRegNum<[4, -2]>;
+def X5  : GP8< R5,  "r5">, DwarfRegNum<[5, -2]>;
+def X6  : GP8< R6,  "r6">, DwarfRegNum<[6, -2]>;
+def X7  : GP8< R7,  "r7">, DwarfRegNum<[7, -2]>;
+def X8  : GP8< R8,  "r8">, DwarfRegNum<[8, -2]>;
+def X9  : GP8< R9,  "r9">, DwarfRegNum<[9, -2]>;
+def X10 : GP8<R10, "r10">, DwarfRegNum<[10, -2]>;
+def X11 : GP8<R11, "r11">, DwarfRegNum<[11, -2]>;
+def X12 : GP8<R12, "r12">, DwarfRegNum<[12, -2]>;
+def X13 : GP8<R13, "r13">, DwarfRegNum<[13, -2]>;
+def X14 : GP8<R14, "r14">, DwarfRegNum<[14, -2]>;
+def X15 : GP8<R15, "r15">, DwarfRegNum<[15, -2]>;
+def X16 : GP8<R16, "r16">, DwarfRegNum<[16, -2]>;
+def X17 : GP8<R17, "r17">, DwarfRegNum<[17, -2]>;
+def X18 : GP8<R18, "r18">, DwarfRegNum<[18, -2]>;
+def X19 : GP8<R19, "r19">, DwarfRegNum<[19, -2]>;
+def X20 : GP8<R20, "r20">, DwarfRegNum<[20, -2]>;
+def X21 : GP8<R21, "r21">, DwarfRegNum<[21, -2]>;
+def X22 : GP8<R22, "r22">, DwarfRegNum<[22, -2]>;
+def X23 : GP8<R23, "r23">, DwarfRegNum<[23, -2]>;
+def X24 : GP8<R24, "r24">, DwarfRegNum<[24, -2]>;
+def X25 : GP8<R25, "r25">, DwarfRegNum<[25, -2]>;
+def X26 : GP8<R26, "r26">, DwarfRegNum<[26, -2]>;
+def X27 : GP8<R27, "r27">, DwarfRegNum<[27, -2]>;
+def X28 : GP8<R28, "r28">, DwarfRegNum<[28, -2]>;
+def X29 : GP8<R29, "r29">, DwarfRegNum<[29, -2]>;
+def X30 : GP8<R30, "r30">, DwarfRegNum<[30, -2]>;
+def X31 : GP8<R31, "r31">, DwarfRegNum<[31, -2]>;
 
 // Floating-point registers
-def F0  : FPR< 0,  "f0">, DwarfRegNum<[32]>;
-def F1  : FPR< 1,  "f1">, DwarfRegNum<[33]>;
-def F2  : FPR< 2,  "f2">, DwarfRegNum<[34]>;
-def F3  : FPR< 3,  "f3">, DwarfRegNum<[35]>;
-def F4  : FPR< 4,  "f4">, DwarfRegNum<[36]>;
-def F5  : FPR< 5,  "f5">, DwarfRegNum<[37]>;
-def F6  : FPR< 6,  "f6">, DwarfRegNum<[38]>;
-def F7  : FPR< 7,  "f7">, DwarfRegNum<[39]>;
-def F8  : FPR< 8,  "f8">, DwarfRegNum<[40]>;
-def F9  : FPR< 9,  "f9">, DwarfRegNum<[41]>;
-def F10 : FPR<10, "f10">, DwarfRegNum<[42]>;
-def F11 : FPR<11, "f11">, DwarfRegNum<[43]>;
-def F12 : FPR<12, "f12">, DwarfRegNum<[44]>;
-def F13 : FPR<13, "f13">, DwarfRegNum<[45]>;
-def F14 : FPR<14, "f14">, DwarfRegNum<[46]>;
-def F15 : FPR<15, "f15">, DwarfRegNum<[47]>;
-def F16 : FPR<16, "f16">, DwarfRegNum<[48]>;
-def F17 : FPR<17, "f17">, DwarfRegNum<[49]>;
-def F18 : FPR<18, "f18">, DwarfRegNum<[50]>;
-def F19 : FPR<19, "f19">, DwarfRegNum<[51]>;
-def F20 : FPR<20, "f20">, DwarfRegNum<[52]>;
-def F21 : FPR<21, "f21">, DwarfRegNum<[53]>;
-def F22 : FPR<22, "f22">, DwarfRegNum<[54]>;
-def F23 : FPR<23, "f23">, DwarfRegNum<[55]>;
-def F24 : FPR<24, "f24">, DwarfRegNum<[56]>;
-def F25 : FPR<25, "f25">, DwarfRegNum<[57]>;
-def F26 : FPR<26, "f26">, DwarfRegNum<[58]>;
-def F27 : FPR<27, "f27">, DwarfRegNum<[59]>;
-def F28 : FPR<28, "f28">, DwarfRegNum<[60]>;
-def F29 : FPR<29, "f29">, DwarfRegNum<[61]>;
-def F30 : FPR<30, "f30">, DwarfRegNum<[62]>;
-def F31 : FPR<31, "f31">, DwarfRegNum<[63]>;
+def F0  : FPR< 0,  "f0">, DwarfRegNum<[32, 32]>;
+def F1  : FPR< 1,  "f1">, DwarfRegNum<[33, 33]>;
+def F2  : FPR< 2,  "f2">, DwarfRegNum<[34, 34]>;
+def F3  : FPR< 3,  "f3">, DwarfRegNum<[35, 35]>;
+def F4  : FPR< 4,  "f4">, DwarfRegNum<[36, 36]>;
+def F5  : FPR< 5,  "f5">, DwarfRegNum<[37, 37]>;
+def F6  : FPR< 6,  "f6">, DwarfRegNum<[38, 38]>;
+def F7  : FPR< 7,  "f7">, DwarfRegNum<[39, 39]>;
+def F8  : FPR< 8,  "f8">, DwarfRegNum<[40, 40]>;
+def F9  : FPR< 9,  "f9">, DwarfRegNum<[41, 41]>;
+def F10 : FPR<10, "f10">, DwarfRegNum<[42, 42]>;
+def F11 : FPR<11, "f11">, DwarfRegNum<[43, 43]>;
+def F12 : FPR<12, "f12">, DwarfRegNum<[44, 44]>;
+def F13 : FPR<13, "f13">, DwarfRegNum<[45, 45]>;
+def F14 : FPR<14, "f14">, DwarfRegNum<[46, 46]>;
+def F15 : FPR<15, "f15">, DwarfRegNum<[47, 47]>;
+def F16 : FPR<16, "f16">, DwarfRegNum<[48, 48]>;
+def F17 : FPR<17, "f17">, DwarfRegNum<[49, 49]>;
+def F18 : FPR<18, "f18">, DwarfRegNum<[50, 50]>;
+def F19 : FPR<19, "f19">, DwarfRegNum<[51, 51]>;
+def F20 : FPR<20, "f20">, DwarfRegNum<[52, 52]>;
+def F21 : FPR<21, "f21">, DwarfRegNum<[53, 53]>;
+def F22 : FPR<22, "f22">, DwarfRegNum<[54, 54]>;
+def F23 : FPR<23, "f23">, DwarfRegNum<[55, 55]>;
+def F24 : FPR<24, "f24">, DwarfRegNum<[56, 56]>;
+def F25 : FPR<25, "f25">, DwarfRegNum<[57, 57]>;
+def F26 : FPR<26, "f26">, DwarfRegNum<[58, 58]>;
+def F27 : FPR<27, "f27">, DwarfRegNum<[59, 59]>;
+def F28 : FPR<28, "f28">, DwarfRegNum<[60, 60]>;
+def F29 : FPR<29, "f29">, DwarfRegNum<[61, 61]>;
+def F30 : FPR<30, "f30">, DwarfRegNum<[62, 62]>;
+def F31 : FPR<31, "f31">, DwarfRegNum<[63, 63]>;
 
 // Vector registers
-def V0  : VR< 0,  "v0">, DwarfRegNum<[77]>;
-def V1  : VR< 1,  "v1">, DwarfRegNum<[78]>;
-def V2  : VR< 2,  "v2">, DwarfRegNum<[79]>;
-def V3  : VR< 3,  "v3">, DwarfRegNum<[80]>;
-def V4  : VR< 4,  "v4">, DwarfRegNum<[81]>;
-def V5  : VR< 5,  "v5">, DwarfRegNum<[82]>;
-def V6  : VR< 6,  "v6">, DwarfRegNum<[83]>;
-def V7  : VR< 7,  "v7">, DwarfRegNum<[84]>;
-def V8  : VR< 8,  "v8">, DwarfRegNum<[85]>;
-def V9  : VR< 9,  "v9">, DwarfRegNum<[86]>;
-def V10 : VR<10, "v10">, DwarfRegNum<[87]>;
-def V11 : VR<11, "v11">, DwarfRegNum<[88]>;
-def V12 : VR<12, "v12">, DwarfRegNum<[89]>;
-def V13 : VR<13, "v13">, DwarfRegNum<[90]>;
-def V14 : VR<14, "v14">, DwarfRegNum<[91]>;
-def V15 : VR<15, "v15">, DwarfRegNum<[92]>;
-def V16 : VR<16, "v16">, DwarfRegNum<[93]>;
-def V17 : VR<17, "v17">, DwarfRegNum<[94]>;
-def V18 : VR<18, "v18">, DwarfRegNum<[95]>;
-def V19 : VR<19, "v19">, DwarfRegNum<[96]>;
-def V20 : VR<20, "v20">, DwarfRegNum<[97]>;
-def V21 : VR<21, "v21">, DwarfRegNum<[98]>;
-def V22 : VR<22, "v22">, DwarfRegNum<[99]>;
-def V23 : VR<23, "v23">, DwarfRegNum<[100]>;
-def V24 : VR<24, "v24">, DwarfRegNum<[101]>;
-def V25 : VR<25, "v25">, DwarfRegNum<[102]>;
-def V26 : VR<26, "v26">, DwarfRegNum<[103]>;
-def V27 : VR<27, "v27">, DwarfRegNum<[104]>;
-def V28 : VR<28, "v28">, DwarfRegNum<[105]>;
-def V29 : VR<29, "v29">, DwarfRegNum<[106]>;
-def V30 : VR<30, "v30">, DwarfRegNum<[107]>;
-def V31 : VR<31, "v31">, DwarfRegNum<[108]>;
+def V0  : VR< 0,  "v0">, DwarfRegNum<[77, 77]>;
+def V1  : VR< 1,  "v1">, DwarfRegNum<[78, 78]>;
+def V2  : VR< 2,  "v2">, DwarfRegNum<[79, 79]>;
+def V3  : VR< 3,  "v3">, DwarfRegNum<[80, 80]>;
+def V4  : VR< 4,  "v4">, DwarfRegNum<[81, 81]>;
+def V5  : VR< 5,  "v5">, DwarfRegNum<[82, 82]>;
+def V6  : VR< 6,  "v6">, DwarfRegNum<[83, 83]>;
+def V7  : VR< 7,  "v7">, DwarfRegNum<[84, 84]>;
+def V8  : VR< 8,  "v8">, DwarfRegNum<[85, 85]>;
+def V9  : VR< 9,  "v9">, DwarfRegNum<[86, 86]>;
+def V10 : VR<10, "v10">, DwarfRegNum<[87, 87]>;
+def V11 : VR<11, "v11">, DwarfRegNum<[88, 88]>;
+def V12 : VR<12, "v12">, DwarfRegNum<[89, 89]>;
+def V13 : VR<13, "v13">, DwarfRegNum<[90, 90]>;
+def V14 : VR<14, "v14">, DwarfRegNum<[91, 91]>;
+def V15 : VR<15, "v15">, DwarfRegNum<[92, 92]>;
+def V16 : VR<16, "v16">, DwarfRegNum<[93, 93]>;
+def V17 : VR<17, "v17">, DwarfRegNum<[94, 94]>;
+def V18 : VR<18, "v18">, DwarfRegNum<[95, 95]>;
+def V19 : VR<19, "v19">, DwarfRegNum<[96, 96]>;
+def V20 : VR<20, "v20">, DwarfRegNum<[97, 97]>;
+def V21 : VR<21, "v21">, DwarfRegNum<[98, 98]>;
+def V22 : VR<22, "v22">, DwarfRegNum<[99, 99]>;
+def V23 : VR<23, "v23">, DwarfRegNum<[100, 100]>;
+def V24 : VR<24, "v24">, DwarfRegNum<[101, 101]>;
+def V25 : VR<25, "v25">, DwarfRegNum<[102, 102]>;
+def V26 : VR<26, "v26">, DwarfRegNum<[103, 103]>;
+def V27 : VR<27, "v27">, DwarfRegNum<[104, 104]>;
+def V28 : VR<28, "v28">, DwarfRegNum<[105, 105]>;
+def V29 : VR<29, "v29">, DwarfRegNum<[106, 106]>;
+def V30 : VR<30, "v30">, DwarfRegNum<[107, 107]>;
+def V31 : VR<31, "v31">, DwarfRegNum<[108, 108]>;
 
 // Condition register bits
-def CR0LT : CRBIT< 0, "0">, DwarfRegNum<[0]>;
-def CR0GT : CRBIT< 1, "1">, DwarfRegNum<[0]>;
-def CR0EQ : CRBIT< 2, "2">, DwarfRegNum<[0]>;
-def CR0UN : CRBIT< 3, "3">, DwarfRegNum<[0]>;
-def CR1LT : CRBIT< 4, "4">, DwarfRegNum<[0]>;
-def CR1GT : CRBIT< 5, "5">, DwarfRegNum<[0]>;
-def CR1EQ : CRBIT< 6, "6">, DwarfRegNum<[0]>;
-def CR1UN : CRBIT< 7, "7">, DwarfRegNum<[0]>;
-def CR2LT : CRBIT< 8, "8">, DwarfRegNum<[0]>;
-def CR2GT : CRBIT< 9, "9">, DwarfRegNum<[0]>;
-def CR2EQ : CRBIT<10, "10">, DwarfRegNum<[0]>;
-def CR2UN : CRBIT<11, "11">, DwarfRegNum<[0]>;
-def CR3LT : CRBIT<12, "12">, DwarfRegNum<[0]>;
-def CR3GT : CRBIT<13, "13">, DwarfRegNum<[0]>;
-def CR3EQ : CRBIT<14, "14">, DwarfRegNum<[0]>;
-def CR3UN : CRBIT<15, "15">, DwarfRegNum<[0]>;
-def CR4LT : CRBIT<16, "16">, DwarfRegNum<[0]>;
-def CR4GT : CRBIT<17, "17">, DwarfRegNum<[0]>;
-def CR4EQ : CRBIT<18, "18">, DwarfRegNum<[0]>;
-def CR4UN : CRBIT<19, "19">, DwarfRegNum<[0]>;
-def CR5LT : CRBIT<20, "20">, DwarfRegNum<[0]>;
-def CR5GT : CRBIT<21, "21">, DwarfRegNum<[0]>;
-def CR5EQ : CRBIT<22, "22">, DwarfRegNum<[0]>;
-def CR5UN : CRBIT<23, "23">, DwarfRegNum<[0]>;
-def CR6LT : CRBIT<24, "24">, DwarfRegNum<[0]>;
-def CR6GT : CRBIT<25, "25">, DwarfRegNum<[0]>;
-def CR6EQ : CRBIT<26, "26">, DwarfRegNum<[0]>;
-def CR6UN : CRBIT<27, "27">, DwarfRegNum<[0]>;
-def CR7LT : CRBIT<28, "28">, DwarfRegNum<[0]>;
-def CR7GT : CRBIT<29, "29">, DwarfRegNum<[0]>;
-def CR7EQ : CRBIT<30, "30">, DwarfRegNum<[0]>;
-def CR7UN : CRBIT<31, "31">, DwarfRegNum<[0]>;
+def CR0LT : CRBIT< 0, "0">;
+def CR0GT : CRBIT< 1, "1">;
+def CR0EQ : CRBIT< 2, "2">;
+def CR0UN : CRBIT< 3, "3">;
+def CR1LT : CRBIT< 4, "4">;
+def CR1GT : CRBIT< 5, "5">;
+def CR1EQ : CRBIT< 6, "6">;
+def CR1UN : CRBIT< 7, "7">;
+def CR2LT : CRBIT< 8, "8">;
+def CR2GT : CRBIT< 9, "9">;
+def CR2EQ : CRBIT<10, "10">;
+def CR2UN : CRBIT<11, "11">;
+def CR3LT : CRBIT<12, "12">;
+def CR3GT : CRBIT<13, "13">;
+def CR3EQ : CRBIT<14, "14">;
+def CR3UN : CRBIT<15, "15">;
+def CR4LT : CRBIT<16, "16">;
+def CR4GT : CRBIT<17, "17">;
+def CR4EQ : CRBIT<18, "18">;
+def CR4UN : CRBIT<19, "19">;
+def CR5LT : CRBIT<20, "20">;
+def CR5GT : CRBIT<21, "21">;
+def CR5EQ : CRBIT<22, "22">;
+def CR5UN : CRBIT<23, "23">;
+def CR6LT : CRBIT<24, "24">;
+def CR6GT : CRBIT<25, "25">;
+def CR6EQ : CRBIT<26, "26">;
+def CR6UN : CRBIT<27, "27">;
+def CR7LT : CRBIT<28, "28">;
+def CR7GT : CRBIT<29, "29">;
+def CR7EQ : CRBIT<30, "30">;
+def CR7UN : CRBIT<31, "31">;
 
 // Condition registers
 let SubRegIndices = [sub_lt, sub_gt, sub_eq, sub_un] in {
-def CR0 : CR<0, "cr0", [CR0LT, CR0GT, CR0EQ, CR0UN]>, DwarfRegNum<[68]>;
-def CR1 : CR<1, "cr1", [CR1LT, CR1GT, CR1EQ, CR1UN]>, DwarfRegNum<[69]>;
-def CR2 : CR<2, "cr2", [CR2LT, CR2GT, CR2EQ, CR2UN]>, DwarfRegNum<[70]>;
-def CR3 : CR<3, "cr3", [CR3LT, CR3GT, CR3EQ, CR3UN]>, DwarfRegNum<[71]>;
-def CR4 : CR<4, "cr4", [CR4LT, CR4GT, CR4EQ, CR4UN]>, DwarfRegNum<[72]>;
-def CR5 : CR<5, "cr5", [CR5LT, CR5GT, CR5EQ, CR5UN]>, DwarfRegNum<[73]>;
-def CR6 : CR<6, "cr6", [CR6LT, CR6GT, CR6EQ, CR6UN]>, DwarfRegNum<[74]>;
-def CR7 : CR<7, "cr7", [CR7LT, CR7GT, CR7EQ, CR7UN]>, DwarfRegNum<[75]>;
+def CR0 : CR<0, "cr0", [CR0LT, CR0GT, CR0EQ, CR0UN]>, DwarfRegNum<[68, 68]>;
+def CR1 : CR<1, "cr1", [CR1LT, CR1GT, CR1EQ, CR1UN]>, DwarfRegNum<[69, 69]>;
+def CR2 : CR<2, "cr2", [CR2LT, CR2GT, CR2EQ, CR2UN]>, DwarfRegNum<[70, 70]>;
+def CR3 : CR<3, "cr3", [CR3LT, CR3GT, CR3EQ, CR3UN]>, DwarfRegNum<[71, 71]>;
+def CR4 : CR<4, "cr4", [CR4LT, CR4GT, CR4EQ, CR4UN]>, DwarfRegNum<[72, 72]>;
+def CR5 : CR<5, "cr5", [CR5LT, CR5GT, CR5EQ, CR5UN]>, DwarfRegNum<[73, 73]>;
+def CR6 : CR<6, "cr6", [CR6LT, CR6GT, CR6EQ, CR6UN]>, DwarfRegNum<[74, 74]>;
+def CR7 : CR<7, "cr7", [CR7LT, CR7GT, CR7EQ, CR7UN]>, DwarfRegNum<[75, 75]>;
 }
 
 // Link register
-def LR  : SPR<8, "lr">, DwarfRegNum<[65]>;
+def LR  : SPR<8, "lr">, DwarfRegNum<[-2, 65]>;
 //let Aliases = [LR] in
-def LR8 : SPR<8, "lr">, DwarfRegNum<[65]>;
+def LR8 : SPR<8, "lr">, DwarfRegNum<[65, -2]>;
 
 // Count register
-def CTR  : SPR<9, "ctr">, DwarfRegNum<[66]>;
-def CTR8 : SPR<9, "ctr">, DwarfRegNum<[66]>;
+def CTR  : SPR<9, "ctr">, DwarfRegNum<[-2, 66]>;
+def CTR8 : SPR<9, "ctr">, DwarfRegNum<[66, -2]>;
 
 // VRsave register
-def VRSAVE: SPR<256, "VRsave">, DwarfRegNum<[107]>;
+def VRSAVE: SPR<256, "VRsave">, DwarfRegNum<[109]>;
 
 // Carry bit.  In the architecture this is really bit 0 of the XER register
 // (which really is SPR register 1);  this is the only bit interesting to a
 // compiler.
-def CARRY: SPR<1, "ca">, DwarfRegNum<[0]>;
+def CARRY: SPR<1, "ca">;
 
 // FP rounding mode:  bits 30 and 31 of the FP status and control register
 // This is not allocated as a normal register; it appears only in
@@ -271,7 +271,7 @@ def CARRY: SPR<1, "ca">, DwarfRegNum<[0]>;
 // return and call instructions are described as Uses of RM, so instructions
 // that do nothing but change RM will not get deleted.
 // Also, in the architecture it is not really a SPR; 512 is arbitrary.
-def RM: SPR<512, "**ROUNDING MODE**">, DwarfRegNum<[0]>;
+def RM: SPR<512, "**ROUNDING MODE**">;
 
 /// Register classes
 // Allocate volatiles first
@@ -279,68 +279,12 @@ def RM: SPR<512, "**ROUNDING MODE**">, DwarfRegNum<[0]>;
 def GPRC : RegisterClass<"PPC", [i32], 32,
      [R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12,
       R30, R29, R28, R27, R26, R25, R24, R23, R22, R21, R20, R19, R18, R17,
-      R16, R15, R14, R13, R31, R0, R1, LR]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    GPRCClass::iterator
-    GPRCClass::allocation_order_begin(const MachineFunction &MF) const {
-      // 32-bit SVR4 ABI: r2 is reserved for the OS.
-      // 64-bit SVR4 ABI: r2 is reserved for the TOC pointer.
-      // Darwin: R2 is reserved for CR save/restore sequence.
-      return begin()+1;
-    }
-    GPRCClass::iterator
-    GPRCClass::allocation_order_end(const MachineFunction &MF) const {
-      // On PPC64, r13 is the thread pointer.  Never allocate this register.
-      // Note that this is overconservative, as it also prevents allocation of
-      // R31 when the FP is not needed.
-      // When using the 32-bit SVR4 ABI, r13 is reserved for the Small Data Area
-      // pointer.
-      const PPCSubtarget &Subtarget = MF.getTarget().getSubtarget<PPCSubtarget>();
-      const PPCFrameLowering *PPCFI =
-        static_cast<const PPCFrameLowering*>(MF.getTarget().getFrameLowering());
-   
-      if (Subtarget.isPPC64() || Subtarget.isSVR4ABI())
-        return end()-5;  // don't allocate R13, R31, R0, R1, LR
-        
-      if (PPCFI->needsFP(MF))
-        return end()-4;  // don't allocate R31, R0, R1, LR
-      else
-        return end()-3;  // don't allocate R0, R1, LR
-    }
-  }];
-}
+      R16, R15, R14, R13, R31, R0, R1, LR]>;
+
 def G8RC : RegisterClass<"PPC", [i64], 64,
      [X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X12,
       X30, X29, X28, X27, X26, X25, X24, X23, X22, X21, X20, X19, X18, X17,
-      X16, X15, X14, X31, X13, X0, X1, LR8]>
-{
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    G8RCClass::iterator
-    G8RCClass::allocation_order_begin(const MachineFunction &MF) const {
-      // 64-bit SVR4 ABI: r2 is reserved for the TOC pointer.
-      // Darwin: r2 is reserved for CR save/restore sequence.
-      return begin()+1;
-    }
-    G8RCClass::iterator
-    G8RCClass::allocation_order_end(const MachineFunction &MF) const {
-      const PPCFrameLowering *PPCFI =
-        static_cast<const PPCFrameLowering*>(MF.getTarget().getFrameLowering());
-      if (PPCFI->needsFP(MF))
-        return end()-5;
-      else
-        return end()-4;
-    }
-  }];
-}
+      X16, X15, X14, X31, X13, X0, X1, LR8]>;
 
 // Allocate volatiles first, then non-volatiles in reverse order. With the SVR4
 // ABI the size of the Floating-point register save area is determined by the
diff --git a/lib/Target/README.txt b/lib/Target/README.txt
index ffe3fa4..fcec368 100644
--- a/lib/Target/README.txt
+++ b/lib/Target/README.txt
@@ -2305,4 +2305,56 @@ The two or/and's should be merged into one each.
 
 //===---------------------------------------------------------------------===//
 
+Machine level code hoisting can be useful in some cases.  For example, PR9408
+is about:
 
+typedef union {
+ void (*f1)(int);
+ void (*f2)(long);
+} funcs;
+
+void foo(funcs f, int which) {
+ int a = 5;
+ if (which) {
+   f.f1(a);
+ } else {
+   f.f2(a);
+ }
+}
+
+which we compile to:
+
+foo:                                    # @foo
+# BB#0:                                 # %entry
+       pushq   %rbp
+       movq    %rsp, %rbp
+       testl   %esi, %esi
+       movq    %rdi, %rax
+       je      .LBB0_2
+# BB#1:                                 # %if.then
+       movl    $5, %edi
+       callq   *%rax
+       popq    %rbp
+       ret
+.LBB0_2:                                # %if.else
+       movl    $5, %edi
+       callq   *%rax
+       popq    %rbp
+       ret
+
+Note that bb1 and bb2 are the same.  This doesn't happen at the IR level
+because one call is passing an i32 and the other is passing an i64.
+
+//===---------------------------------------------------------------------===//
+
+I see this sort of pattern in 176.gcc in a few places (e.g. the start of
+store_bit_field).  The rem should be replaced with a multiply and subtract:
+
+  %3 = sdiv i32 %A, %B
+  %4 = srem i32 %A, %B
+
+Similarly for udiv/urem.  Note that this shouldn't be done on X86 or ARM,
+which can do this in a single operation (instruction or libcall).  It is
+probably best to do this in the code generator.
+
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index edb62fa..0b4612d 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -91,8 +91,8 @@ SparcTargetLowering::LowerReturn(SDValue Chain,
   SmallVector<CCValAssign, 16> RVLocs;
 
   // CCState - Info about the registers and stack slot.
-  CCState CCInfo(CallConv, isVarArg, DAG.getTarget(),
-                 RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 DAG.getTarget(), RVLocs, *DAG.getContext());
 
   // Analize return values.
   CCInfo.AnalyzeReturn(Outs, RetCC_Sparc32);
@@ -139,7 +139,7 @@ SparcTargetLowering::LowerReturn(SDValue Chain,
   if (Flag.getNode())
     return DAG.getNode(SPISD::RET_FLAG, dl, MVT::Other, Chain,
                        RetAddrOffsetNode, Flag);
-  return DAG.getNode(SPISD::RET_FLAG, dl, MVT::Other, Chain, 
+  return DAG.getNode(SPISD::RET_FLAG, dl, MVT::Other, Chain,
                      RetAddrOffsetNode);
 }
 
@@ -161,8 +161,8 @@ SparcTargetLowering::LowerFormalArguments(SDValue Chain,
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
   CCInfo.AnalyzeFormalArguments(Ins, CC_Sparc32);
 
   const unsigned StackOffset = 92;
@@ -182,8 +182,6 @@ SparcTargetLowering::LowerFormalArguments(SDValue Chain,
     }
 
     if (VA.isRegLoc()) {
-      EVT RegVT = VA.getLocVT();
-
       if (VA.needsCustom()) {
         assert(VA.getLocVT() == MVT::f64);
         unsigned VRegHi = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
@@ -362,8 +360,8 @@ SparcTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getTarget(), ArgLocs,
-                 *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 DAG.getTarget(), ArgLocs, *DAG.getContext());
   CCInfo.AnalyzeCallOperands(Outs, CC_Sparc32);
 
   // Get the size of the outgoing arguments stack space requirement.
@@ -593,8 +591,8 @@ SparcTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
 
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState RVInfo(CallConv, isVarArg, DAG.getTarget(),
-                 RVLocs, *DAG.getContext());
+  CCState RVInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 DAG.getTarget(), RVLocs, *DAG.getContext());
 
   RVInfo.AnalyzeCallResult(Ins, RetCC_Sparc32);
 
@@ -801,6 +799,8 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
   if (TM.getSubtarget<SparcSubtarget>().isV9())
     setOperationAction(ISD::CTPOP, MVT::i32, Legal);
 
+  setMinFunctionAlignment(2);
+
   computeRegisterProperties();
 }
 
@@ -1290,8 +1290,3 @@ SparcTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
   // The Sparc target isn't yet aware of offsets.
   return false;
 }
-
-/// getFunctionAlignment - Return the Log2 alignment of this function.
-unsigned SparcTargetLowering::getFunctionAlignment(const Function *) const {
-  return 2;
-}
diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h
index 7d02df8..9ea6e16 100644
--- a/lib/Target/Sparc/SparcISelLowering.h
+++ b/lib/Target/Sparc/SparcISelLowering.h
@@ -71,9 +71,6 @@ namespace llvm {
 
     virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
 
-    /// getFunctionAlignment - Return the Log2 alignment of this function.
-    virtual unsigned getFunctionAlignment(const Function *F) const;
-
     virtual SDValue
       LowerFormalArguments(SDValue Chain,
                            CallingConv::ID CallConv,
diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp
index b010d04..9fcf028 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.cpp
+++ b/lib/Target/Sparc/SparcRegisterInfo.cpp
@@ -39,6 +39,8 @@ const unsigned* SparcRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
 
 BitVector SparcRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
+  // FIXME: G1 reserved for now for large imm generation by frame code.
+  Reserved.set(SP::G1);
   Reserved.set(SP::G2);
   Reserved.set(SP::G3);
   Reserved.set(SP::G4);
@@ -130,5 +132,9 @@ int SparcRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
   return SparcGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
 }
 
+int SparcRegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const {
+  return SparcGenRegisterInfo::getLLVMRegNumFull(DwarfRegNo,0);
+}
+
 #include "SparcGenRegisterInfo.inc"
 
diff --git a/lib/Target/Sparc/SparcRegisterInfo.h b/lib/Target/Sparc/SparcRegisterInfo.h
index d930b53..56c8068 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.h
+++ b/lib/Target/Sparc/SparcRegisterInfo.h
@@ -52,6 +52,7 @@ struct SparcRegisterInfo : public SparcGenRegisterInfo {
   unsigned getEHHandlerRegister() const;
 
   int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+  int getLLVMRegNum(unsigned RegNum, bool isEH) const;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/Sparc/SparcRegisterInfo.td b/lib/Target/Sparc/SparcRegisterInfo.td
index 5ef4dae..0729818 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.td
+++ b/lib/Target/Sparc/SparcRegisterInfo.td
@@ -117,22 +117,22 @@ def F30 : Rf<30, "F30">, DwarfRegNum<[62]>;
 def F31 : Rf<31, "F31">, DwarfRegNum<[63]>;
 
 // Aliases of the F* registers used to hold 64-bit fp values (doubles)
-def D0  : Rd< 0,  "F0", [F0,   F1]>, DwarfRegNum<[32]>;
-def D1  : Rd< 2,  "F2", [F2,   F3]>, DwarfRegNum<[34]>; 
-def D2  : Rd< 4,  "F4", [F4,   F5]>, DwarfRegNum<[36]>;
-def D3  : Rd< 6,  "F6", [F6,   F7]>, DwarfRegNum<[38]>; 
-def D4  : Rd< 8,  "F8", [F8,   F9]>, DwarfRegNum<[40]>;
-def D5  : Rd<10, "F10", [F10, F11]>, DwarfRegNum<[42]>;
-def D6  : Rd<12, "F12", [F12, F13]>, DwarfRegNum<[44]>;
-def D7  : Rd<14, "F14", [F14, F15]>, DwarfRegNum<[46]>; 
-def D8  : Rd<16, "F16", [F16, F17]>, DwarfRegNum<[48]>;
-def D9  : Rd<18, "F18", [F18, F19]>, DwarfRegNum<[50]>; 
-def D10 : Rd<20, "F20", [F20, F21]>, DwarfRegNum<[52]>;
-def D11 : Rd<22, "F22", [F22, F23]>, DwarfRegNum<[54]>;
-def D12 : Rd<24, "F24", [F24, F25]>, DwarfRegNum<[56]>;
-def D13 : Rd<26, "F26", [F26, F27]>, DwarfRegNum<[58]>; 
-def D14 : Rd<28, "F28", [F28, F29]>, DwarfRegNum<[60]>;
-def D15 : Rd<30, "F30", [F30, F31]>, DwarfRegNum<[62]>;
+def D0  : Rd< 0,  "F0", [F0,   F1]>, DwarfRegNum<[72]>;
+def D1  : Rd< 2,  "F2", [F2,   F3]>, DwarfRegNum<[73]>;
+def D2  : Rd< 4,  "F4", [F4,   F5]>, DwarfRegNum<[74]>;
+def D3  : Rd< 6,  "F6", [F6,   F7]>, DwarfRegNum<[75]>;
+def D4  : Rd< 8,  "F8", [F8,   F9]>, DwarfRegNum<[76]>;
+def D5  : Rd<10, "F10", [F10, F11]>, DwarfRegNum<[77]>;
+def D6  : Rd<12, "F12", [F12, F13]>, DwarfRegNum<[78]>;
+def D7  : Rd<14, "F14", [F14, F15]>, DwarfRegNum<[79]>;
+def D8  : Rd<16, "F16", [F16, F17]>, DwarfRegNum<[80]>;
+def D9  : Rd<18, "F18", [F18, F19]>, DwarfRegNum<[81]>;
+def D10 : Rd<20, "F20", [F20, F21]>, DwarfRegNum<[82]>;
+def D11 : Rd<22, "F22", [F22, F23]>, DwarfRegNum<[83]>;
+def D12 : Rd<24, "F24", [F24, F25]>, DwarfRegNum<[84]>;
+def D13 : Rd<26, "F26", [F26, F27]>, DwarfRegNum<[85]>;
+def D14 : Rd<28, "F28", [F28, F29]>, DwarfRegNum<[86]>;
+def D15 : Rd<30, "F30", [F30, F31]>, DwarfRegNum<[87]>;
 
 // Register classes.
 //
@@ -142,8 +142,6 @@ def D15 : Rd<30, "F30", [F30, F31]>, DwarfRegNum<[62]>;
 def IntRegs : RegisterClass<"SP", [i32], 32, [L0, L1, L2, L3, L4, L5, L6, L7,
                                      I0, I1, I2, I3, I4, I5,
                                      O0, O1, O2, O3, O4, O5, O7,
-
-   // FIXME: G1 reserved for now for large imm generation by frame code.
                                      G1,
                                      // Non-allocatable regs:
                                      G2, G3, G4, // FIXME: OK for use only in
@@ -153,19 +151,7 @@ def IntRegs : RegisterClass<"SP", [i32], 32, [L0, L1, L2, L3, L4, L5, L6, L7,
                                      I7, // return address
                                      G0, // constant zero
                                      G5, G6, G7 // reserved for kernel
-                                     ]> {
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    IntRegsClass::iterator
-    IntRegsClass::allocation_order_end(const MachineFunction &MF) const {
-      // FIXME: These special regs should be taken out of the regclass!
-      return end()-10  // Don't allocate special registers
-         -1;  // FIXME: G1 reserved for large imm generation by frame code.
-    }
-  }];
-}
+                                     ]>;
 
 def FPRegs : RegisterClass<"SP", [f32], 32, [F0, F1, F2, F3, F4, F5, F6, F7, F8,
   F9, F10, F11, F12, F13, F14, F15, F16, F17, F18, F19, F20, F21, F22,
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index d331614..af85df5 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -153,6 +153,8 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm) :
   setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Expand);
 
   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+  setMinFunctionAlignment(1);
 }
 
 SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
@@ -289,8 +291,8 @@ SystemZTargetLowering::LowerCCCArguments(SDValue Chain,
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
   CCInfo.AnalyzeFormalArguments(Ins, CC_SystemZ);
 
   if (isVarArg)
@@ -382,8 +384,8 @@ SystemZTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
 
   CCInfo.AnalyzeCallOperands(Outs, CC_SystemZ);
 
@@ -511,8 +513,8 @@ SystemZTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
 
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
-                 *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), RVLocs, *DAG.getContext());
 
   CCInfo.AnalyzeCallResult(Ins, RetCC_SystemZ);
 
@@ -556,8 +558,8 @@ SystemZTargetLowering::LowerReturn(SDValue Chain,
   SmallVector<CCValAssign, 16> RVLocs;
 
   // CCState - Info about the registers and stack slot.
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), RVLocs, *DAG.getContext());
 
   // Analize return values.
   CCInfo.AnalyzeReturn(Outs, RetCC_SystemZ);
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index 3019242..bab3dc2 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -66,11 +66,6 @@ namespace llvm {
     /// DAG node.
     virtual const char *getTargetNodeName(unsigned Opcode) const;
 
-    /// getFunctionAlignment - Return the Log2 alignment of this function.
-    virtual unsigned getFunctionAlignment(const Function *F) const {
-      return 1;
-    }
-
     std::pair<unsigned, const TargetRegisterClass*>
     getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const;
     TargetLowering::ConstraintType
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
index 28f94f4..ed62cff 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -58,6 +58,20 @@ BitVector SystemZRegisterInfo::getReservedRegs(const MachineFunction &MF) const
   return Reserved;
 }
 
+const TargetRegisterClass*
+SystemZRegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
+                                              const TargetRegisterClass *B,
+                                              unsigned Idx) const {
+  switch(Idx) {
+  // Exact sub-classes don't exist for the other sub-register indexes.
+  default: return 0;
+  case SystemZ::subreg_32bit:
+    if (B == SystemZ::ADDR32RegisterClass)
+      return A->getSize() == 8 ? SystemZ::ADDR64RegisterClass : 0;
+    return A;
+  }
+}
+
 void SystemZRegisterInfo::
 eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I) const {
@@ -125,4 +139,10 @@ int SystemZRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
   return -1;
 }
 
+int SystemZRegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const {
+  assert(0 && "What is the dwarf register number");
+  return -1;
+}
+
+
 #include "SystemZGenRegisterInfo.inc"
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.h b/lib/Target/SystemZ/SystemZRegisterInfo.h
index b450798..cd8f20f 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.h
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -34,6 +34,10 @@ struct SystemZRegisterInfo : public SystemZGenRegisterInfo {
 
   BitVector getReservedRegs(const MachineFunction &MF) const;
 
+  const TargetRegisterClass*
+  getMatchingSuperRegClass(const TargetRegisterClass *A,
+                           const TargetRegisterClass *B, unsigned Idx) const;
+
   void eliminateCallFramePseudoInstr(MachineFunction &MF,
                                      MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator I) const;
@@ -50,6 +54,7 @@ struct SystemZRegisterInfo : public SystemZGenRegisterInfo {
   unsigned getEHHandlerRegister() const;
 
   int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+  int getLLVMRegNum(unsigned RegNum, bool isEH) const;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.td b/lib/Target/SystemZ/SystemZRegisterInfo.td
index 0028c85..9313ffd 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.td
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.td
@@ -61,22 +61,22 @@ def subreg_odd    : SubRegIndex;
 }
 
 // General-purpose registers
-def R0W  : GPR32< 0,  "r0">, DwarfRegNum<[0]>;
-def R1W  : GPR32< 1,  "r1">, DwarfRegNum<[1]>;
-def R2W  : GPR32< 2,  "r2">, DwarfRegNum<[2]>;
-def R3W  : GPR32< 3,  "r3">, DwarfRegNum<[3]>;
-def R4W  : GPR32< 4,  "r4">, DwarfRegNum<[4]>;
-def R5W  : GPR32< 5,  "r5">, DwarfRegNum<[5]>;
-def R6W  : GPR32< 6,  "r6">, DwarfRegNum<[6]>;
-def R7W  : GPR32< 7,  "r7">, DwarfRegNum<[7]>;
-def R8W  : GPR32< 8,  "r8">, DwarfRegNum<[8]>;
-def R9W  : GPR32< 9,  "r9">, DwarfRegNum<[9]>;
-def R10W : GPR32<10, "r10">, DwarfRegNum<[10]>;
-def R11W : GPR32<11, "r11">, DwarfRegNum<[11]>;
-def R12W : GPR32<12, "r12">, DwarfRegNum<[12]>;
-def R13W : GPR32<13, "r13">, DwarfRegNum<[13]>;
-def R14W : GPR32<14, "r14">, DwarfRegNum<[14]>;
-def R15W : GPR32<15, "r15">, DwarfRegNum<[15]>;
+def R0W  : GPR32< 0,  "r0">;
+def R1W  : GPR32< 1,  "r1">;
+def R2W  : GPR32< 2,  "r2">;
+def R3W  : GPR32< 3,  "r3">;
+def R4W  : GPR32< 4,  "r4">;
+def R5W  : GPR32< 5,  "r5">;
+def R6W  : GPR32< 6,  "r6">;
+def R7W  : GPR32< 7,  "r7">;
+def R8W  : GPR32< 8,  "r8">;
+def R9W  : GPR32< 9,  "r9">;
+def R10W : GPR32<10, "r10">;
+def R11W : GPR32<11, "r11">;
+def R12W : GPR32<12, "r12">;
+def R13W : GPR32<13, "r13">;
+def R14W : GPR32<14, "r14">;
+def R15W : GPR32<15, "r15">;
 
 let SubRegIndices = [subreg_32bit] in {
 def R0D  : GPR64< 0,  "r0", [R0W]>,  DwarfRegNum<[0]>;
@@ -99,26 +99,26 @@ def R15D : GPR64<15, "r15", [R15W]>, DwarfRegNum<[15]>;
 
 // Register pairs
 let SubRegIndices = [subreg_32bit, subreg_odd32] in {
-def R0P  : GPR64< 0,  "r0", [R0W,  R1W],  [R0D,  R1D]>,  DwarfRegNum<[0]>;
-def R2P  : GPR64< 2,  "r2", [R2W,  R3W],  [R2D,  R3D]>,  DwarfRegNum<[2]>;
-def R4P  : GPR64< 4,  "r4", [R4W,  R5W],  [R4D,  R5D]>,  DwarfRegNum<[4]>;
-def R6P  : GPR64< 6,  "r6", [R6W,  R7W],  [R6D,  R7D]>,  DwarfRegNum<[6]>;
-def R8P  : GPR64< 8,  "r8", [R8W,  R9W],  [R8D,  R9D]>,  DwarfRegNum<[8]>;
-def R10P : GPR64<10, "r10", [R10W, R11W], [R10D, R11D]>, DwarfRegNum<[10]>;
-def R12P : GPR64<12, "r12", [R12W, R13W], [R12D, R13D]>, DwarfRegNum<[12]>;
-def R14P : GPR64<14, "r14", [R14W, R15W], [R14D, R15D]>, DwarfRegNum<[14]>;
+def R0P  : GPR64< 0,  "r0", [R0W,  R1W],  [R0D,  R1D]>;
+def R2P  : GPR64< 2,  "r2", [R2W,  R3W],  [R2D,  R3D]>;
+def R4P  : GPR64< 4,  "r4", [R4W,  R5W],  [R4D,  R5D]>;
+def R6P  : GPR64< 6,  "r6", [R6W,  R7W],  [R6D,  R7D]>;
+def R8P  : GPR64< 8,  "r8", [R8W,  R9W],  [R8D,  R9D]>;
+def R10P : GPR64<10, "r10", [R10W, R11W], [R10D, R11D]>;
+def R12P : GPR64<12, "r12", [R12W, R13W], [R12D, R13D]>;
+def R14P : GPR64<14, "r14", [R14W, R15W], [R14D, R15D]>;
 }
 
 let SubRegIndices = [subreg_even, subreg_odd],
  CompositeIndices = [(subreg_odd32  subreg_odd,  subreg_32bit)] in {
-def R0Q  : GPR128< 0,  "r0", [R0D,  R1D],  [R0P]>,  DwarfRegNum<[0]>;
-def R2Q  : GPR128< 2,  "r2", [R2D,  R3D],  [R2P]>,  DwarfRegNum<[2]>;
-def R4Q  : GPR128< 4,  "r4", [R4D,  R5D],  [R4P]>,  DwarfRegNum<[4]>;
-def R6Q  : GPR128< 6,  "r6", [R6D,  R7D],  [R6P]>,  DwarfRegNum<[6]>;
-def R8Q  : GPR128< 8,  "r8", [R8D,  R9D],  [R8P]>,  DwarfRegNum<[8]>;
-def R10Q : GPR128<10, "r10", [R10D, R11D], [R10P]>, DwarfRegNum<[10]>;
-def R12Q : GPR128<12, "r12", [R12D, R13D], [R12P]>, DwarfRegNum<[12]>;
-def R14Q : GPR128<14, "r14", [R14D, R15D], [R14P]>, DwarfRegNum<[14]>;
+def R0Q  : GPR128< 0,  "r0", [R0D,  R1D],  [R0P]>;
+def R2Q  : GPR128< 2,  "r2", [R2D,  R3D],  [R2P]>;
+def R4Q  : GPR128< 4,  "r4", [R4D,  R5D],  [R4P]>;
+def R6Q  : GPR128< 6,  "r6", [R6D,  R7D],  [R6P]>;
+def R8Q  : GPR128< 8,  "r8", [R8D,  R9D],  [R8P]>;
+def R10Q : GPR128<10, "r10", [R10D, R11D], [R10P]>;
+def R12Q : GPR128<12, "r12", [R12D, R13D], [R12P]>;
+def R14Q : GPR128<14, "r14", [R14D, R15D], [R14P]>;
 }
 
 // Floating-point registers
@@ -140,22 +140,22 @@ def F14S : FPRS<14, "f14">, DwarfRegNum<[30]>;
 def F15S : FPRS<15, "f15">, DwarfRegNum<[31]>;
 
 let SubRegIndices = [subreg_32bit] in {
-def F0L  : FPRL< 0,  "f0", [F0S]>,  DwarfRegNum<[16]>;
-def F1L  : FPRL< 1,  "f1", [F1S]>,  DwarfRegNum<[17]>;
-def F2L  : FPRL< 2,  "f2", [F2S]>,  DwarfRegNum<[18]>;
-def F3L  : FPRL< 3,  "f3", [F3S]>,  DwarfRegNum<[19]>;
-def F4L  : FPRL< 4,  "f4", [F4S]>,  DwarfRegNum<[20]>;
-def F5L  : FPRL< 5,  "f5", [F5S]>,  DwarfRegNum<[21]>;
-def F6L  : FPRL< 6,  "f6", [F6S]>,  DwarfRegNum<[22]>;
-def F7L  : FPRL< 7,  "f7", [F7S]>,  DwarfRegNum<[23]>;
-def F8L  : FPRL< 8,  "f8", [F8S]>,  DwarfRegNum<[24]>;
-def F9L  : FPRL< 9,  "f9", [F9S]>,  DwarfRegNum<[25]>;
-def F10L : FPRL<10, "f10", [F10S]>, DwarfRegNum<[26]>;
-def F11L : FPRL<11, "f11", [F11S]>, DwarfRegNum<[27]>;
-def F12L : FPRL<12, "f12", [F12S]>, DwarfRegNum<[28]>;
-def F13L : FPRL<13, "f13", [F13S]>, DwarfRegNum<[29]>;
-def F14L : FPRL<14, "f14", [F14S]>, DwarfRegNum<[30]>;
-def F15L : FPRL<15, "f15", [F15S]>, DwarfRegNum<[31]>;
+def F0L  : FPRL< 0,  "f0", [F0S]>;
+def F1L  : FPRL< 1,  "f1", [F1S]>;
+def F2L  : FPRL< 2,  "f2", [F2S]>;
+def F3L  : FPRL< 3,  "f3", [F3S]>;
+def F4L  : FPRL< 4,  "f4", [F4S]>;
+def F5L  : FPRL< 5,  "f5", [F5S]>;
+def F6L  : FPRL< 6,  "f6", [F6S]>;
+def F7L  : FPRL< 7,  "f7", [F7S]>;
+def F8L  : FPRL< 8,  "f8", [F8S]>;
+def F9L  : FPRL< 9,  "f9", [F9S]>;
+def F10L : FPRL<10, "f10", [F10S]>;
+def F11L : FPRL<11, "f11", [F11S]>;
+def F12L : FPRL<12, "f12", [F12S]>;
+def F13L : FPRL<13, "f13", [F13S]>;
+def F14L : FPRL<14, "f14", [F14S]>;
+def F15L : FPRL<15, "f15", [F15S]>;
 }
 
 // Status register
diff --git a/lib/Target/TargetLibraryInfo.cpp b/lib/Target/TargetLibraryInfo.cpp
index e336b09..709dfd2 100644
--- a/lib/Target/TargetLibraryInfo.cpp
+++ b/lib/Target/TargetLibraryInfo.cpp
@@ -38,8 +38,8 @@ static void initialize(TargetLibraryInfo &TLI, const Triple &T) {
     TLI.setUnavailable(LibFunc::memset_pattern16);
   }
 
-  // iprintf and friends are only available on XCore.
-  if (T.getArch() != Triple::xcore) {
+  // iprintf and friends are only available on XCore and TCE.
+  if (T.getArch() != Triple::xcore && T.getArch() != Triple::tce) {
     TLI.setUnavailable(LibFunc::iprintf);
     TLI.setUnavailable(LibFunc::siprintf);
     TLI.setUnavailable(LibFunc::fiprintf);
@@ -61,6 +61,12 @@ TargetLibraryInfo::TargetLibraryInfo(const Triple &T) : ImmutablePass(ID) {
   initialize(*this, T);
 }
 
+TargetLibraryInfo::TargetLibraryInfo(const TargetLibraryInfo &TLI)
+  : ImmutablePass(ID) {
+  memcpy(AvailableArray, TLI.AvailableArray, sizeof(AvailableArray));
+}
+
+
 /// disableAllFunctions - This disables all builtins, which is used for options
 /// like -fno-builtin.
 void TargetLibraryInfo::disableAllFunctions() {
diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp
index 717ad41..3343384 100644
--- a/lib/Target/TargetLoweringObjectFile.cpp
+++ b/lib/Target/TargetLoweringObjectFile.cpp
@@ -58,7 +58,6 @@ TargetLoweringObjectFile::TargetLoweringObjectFile() : Ctx(0) {
   DwarfRangesSection = 0;
   DwarfMacroInfoSection = 0;
   
-  IsFunctionEHSymbolGlobal = false;
   IsFunctionEHFrameSymbolPrivate = true;
   SupportsWeakOmittedEHFrame = true;
 }
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp
index 76ccc09..863b811 100644
--- a/lib/Target/TargetMachine.cpp
+++ b/lib/Target/TargetMachine.cpp
@@ -40,7 +40,6 @@ namespace llvm {
   bool JITExceptionHandling;
   bool JITEmitDebugInfo;
   bool JITEmitDebugInfoToDisk;
-  bool UnwindTablesMandatory;
   Reloc::Model RelocationModel;
   CodeModel::Model CMModel;
   bool GuaranteedTailCallOpt;
@@ -143,11 +142,6 @@ EmitJitDebugInfoToDisk("jit-emit-debug-to-disk",
   cl::desc("Emit debug info objfiles to disk"),
   cl::location(JITEmitDebugInfoToDisk),
   cl::init(false));
-static cl::opt<bool, true>
-EnableUnwindTables("unwind-tables",
-  cl::desc("Generate unwinding tables for all functions"),
-  cl::location(UnwindTablesMandatory),
-  cl::init(false));
 
 static cl::opt<llvm::Reloc::Model, true>
 DefRelocationModel("relocation-model",
diff --git a/lib/Target/TargetRegisterInfo.cpp b/lib/Target/TargetRegisterInfo.cpp
index 4811ba5..1c3f2dd 100644
--- a/lib/Target/TargetRegisterInfo.cpp
+++ b/lib/Target/TargetRegisterInfo.cpp
@@ -96,7 +96,8 @@ BitVector TargetRegisterInfo::getAllocatableSet(const MachineFunction &MF,
   } else {
     for (TargetRegisterInfo::regclass_iterator I = regclass_begin(),
          E = regclass_end(); I != E; ++I)
-      getAllocatableSetForRC(MF, *I, Allocatable);
+      if ((*I)->isAllocatable())
+        getAllocatableSetForRC(MF, *I, Allocatable);
   }
 
   // Mask out the reserved registers
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
index dd6e353..68247d2 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -41,6 +41,11 @@ X86ATTInstPrinter::X86ATTInstPrinter(TargetMachine &TM, const MCAsmInfo &MAI)
             &TM.getSubtarget<X86Subtarget>()));
 }
 
+void X86ATTInstPrinter::printRegName(raw_ostream &OS,
+                                     unsigned RegNo) const {
+  OS << '%' << getRegisterName(RegNo);
+}
+
 void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS) {
   // Try to print any aliases first.
   if (!printAliasInstr(MI, OS))
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
index 8d69391..5f939b6 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
@@ -26,6 +26,7 @@ class X86ATTInstPrinter : public MCInstPrinter {
 public:
   X86ATTInstPrinter(TargetMachine &TM, const MCAsmInfo &MAI);
   
+  virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
   virtual void printInst(const MCInst *MI, raw_ostream &OS);
   virtual StringRef getOpcodeName(unsigned Opcode) const;
 
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
index 47253eb..5f581ba 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -29,6 +29,10 @@ using namespace llvm;
 #define GET_INSTRUCTION_NAME
 #include "X86GenAsmWriter1.inc"
 
+void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  OS << getRegisterName(RegNo);
+}
+
 void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS) {
   printInstruction(MI, OS);
   
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
index ca99dc0..c8030c3 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
@@ -27,6 +27,7 @@ public:
   X86IntelInstPrinter(TargetMachine &TM, const MCAsmInfo &MAI)
     : MCInstPrinter(MAI) {}
 
+  virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
   virtual void printInst(const MCInst *MI, raw_ostream &OS);
   virtual StringRef getOpcodeName(unsigned Opcode) const;
   
diff --git a/lib/Target/X86/README-X86-64.txt b/lib/Target/X86/README-X86-64.txt
index e7429a3..bcfdf0b 100644
--- a/lib/Target/X86/README-X86-64.txt
+++ b/lib/Target/X86/README-X86-64.txt
@@ -124,51 +124,6 @@ if we have whole-function selectiondags.
 
 //===---------------------------------------------------------------------===//
 
-Take the following C code
-(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43640):
-
-struct u1
-{
-        float x;
-        float y;
-};
-
-float foo(struct u1 u)
-{
-        return u.x + u.y;
-}
-
-Optimizes to the following IR:
-define float @foo(double %u.0) nounwind readnone {
-entry:
-  %tmp8 = bitcast double %u.0 to i64              ; <i64> [#uses=2]
-  %tmp6 = trunc i64 %tmp8 to i32                  ; <i32> [#uses=1]
-  %tmp7 = bitcast i32 %tmp6 to float              ; <float> [#uses=1]
-  %tmp2 = lshr i64 %tmp8, 32                      ; <i64> [#uses=1]
-  %tmp3 = trunc i64 %tmp2 to i32                  ; <i32> [#uses=1]
-  %tmp4 = bitcast i32 %tmp3 to float              ; <float> [#uses=1]
-  %0 = fadd float %tmp7, %tmp4                    ; <float> [#uses=1]
-  ret float %0
-}
-
-And current llvm-gcc/clang output:
-	movd	%xmm0, %rax
-	movd	%eax, %xmm1
-	shrq	$32, %rax
-	movd	%eax, %xmm0
-	addss	%xmm1, %xmm0
-	ret
-
-We really shouldn't move the floats to RAX, only to immediately move them
-straight back to the XMM registers.
-
-There really isn't any good way to handle this purely in IR optimizers; it
-could possibly be handled by changing the output of the fronted, though.  It
-would also be feasible to add a x86-specific DAGCombine to optimize the
-bitcast+trunc+(lshr+)bitcast combination.
-
-//===---------------------------------------------------------------------===//
-
 Take the following code
 (from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34653):
 extern unsigned long table[];
diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
index ea3014e..560947a 100644
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -2040,3 +2040,29 @@ _clamp2:                                ## @clamp2
 The move of 0 could be scheduled above the test to make it is xor reg,reg.
 
 //===---------------------------------------------------------------------===//
+
+GCC PR48986.  We currently compile this:
+
+void bar(void);
+void yyy(int* p) {
+    if (__sync_fetch_and_add(p, -1) == 1)
+      bar();
+}
+
+into:
+	movl	$-1, %eax
+	lock
+	xaddl	%eax, (%rdi)
+	cmpl	$1, %eax
+	je	LBB0_2
+
+Instead we could generate:
+
+	lock
+	dec %rdi
+	je LBB0_2
+
+The trick is to match "fetch_and_add(X, -C) == C".
+
+//===---------------------------------------------------------------------===//
+
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 25b8d3e..7bb9676 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -101,8 +101,10 @@ def : Proc<"i686",            []>;
 def : Proc<"pentiumpro",      [FeatureCMOV]>;
 def : Proc<"pentium2",        [FeatureMMX, FeatureCMOV]>;
 def : Proc<"pentium3",        [FeatureSSE1]>;
+def : Proc<"pentium3m",       [FeatureSSE1, FeatureSlowBTMem]>;
 def : Proc<"pentium-m",       [FeatureSSE2, FeatureSlowBTMem]>;
 def : Proc<"pentium4",        [FeatureSSE2]>;
+def : Proc<"pentium4m",       [FeatureSSE2, FeatureSlowBTMem]>;
 def : Proc<"x86-64",          [FeatureSSE2,   Feature64Bit, FeatureSlowBTMem]>;
 def : Proc<"yonah",           [FeatureSSE3, FeatureSlowBTMem]>;
 def : Proc<"prescott",        [FeatureSSE3, FeatureSlowBTMem]>;
@@ -122,7 +124,7 @@ def : Proc<"westmere",        [FeatureSSE42,  Feature64Bit, FeatureSlowBTMem,
 // SSE is not listed here since llvm treats AVX as a reimplementation of SSE,
 // rather than a superset.
 // FIXME: Disabling AVX for now since it's not ready.
-def : Proc<"sandybridge",     [FeatureSSE42, Feature64Bit,
+def : Proc<"corei7-avx",      [FeatureSSE42, Feature64Bit,
                                FeatureAES, FeatureCLMUL]>;
 
 def : Proc<"k6",              [FeatureMMX]>;
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index 1382f18..f1b9972 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -108,11 +108,11 @@ private:
   bool X86SelectFPExt(const Instruction *I);
   bool X86SelectFPTrunc(const Instruction *I);
 
-  bool X86SelectExtractValue(const Instruction *I);
-
   bool X86VisitIntrinsicCall(const IntrinsicInst &I);
   bool X86SelectCall(const Instruction *I);
 
+  bool DoSelectCall(const Instruction *I, const char *MemIntName);
+
   const X86InstrInfo *getInstrInfo() const {
     return getTargetMachine()->getInstrInfo();
   }
@@ -135,6 +135,8 @@ private:
 
   bool isTypeLegal(const Type *Ty, MVT &VT, bool AllowI1 = false);
 
+  bool IsMemcpySmall(uint64_t Len);
+
   bool TryEmitSmallMemcpy(X86AddressMode DestAM,
                           X86AddressMode SrcAM, uint64_t Len);
 };
@@ -401,7 +403,7 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
         Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue());
         continue;
       }
-      
+
       // A array/variable index is always of the form i*S where S is the
       // constant scale size.  See if we can push the scale into immediates.
       uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType());
@@ -469,7 +471,7 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
     if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
       if (GVar->isThreadLocal())
         return false;
-    
+
     // RIP-relative addresses can't have additional register operands, so if
     // we've already folded stuff into the addressing mode, just force the
     // global value into its own register, which we can use as the basereg.
@@ -704,7 +706,8 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
 
     // Analyze operands of the call, assigning locations to each operand.
     SmallVector<CCValAssign, 16> ValLocs;
-    CCState CCInfo(CC, F.isVarArg(), TM, ValLocs, I->getContext());
+    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs,
+		   I->getContext());
     CCInfo.AnalyzeReturn(Outs, RetCC_X86);
 
     const Value *RV = Ret->getOperand(0);
@@ -724,18 +727,38 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
     // Only handle register returns for now.
     if (!VA.isRegLoc())
       return false;
-    // TODO: For now, don't try to handle cases where getLocInfo()
-    // says Full but the types don't match.
-    if (TLI.getValueType(RV->getType()) != VA.getValVT())
-      return false;
 
     // The calling-convention tables for x87 returns don't tell
     // the whole story.
     if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
       return false;
 
-    // Make the copy.
     unsigned SrcReg = Reg + VA.getValNo();
+    EVT SrcVT = TLI.getValueType(RV->getType());
+    EVT DstVT = VA.getValVT();
+    // Special handling for extended integers.
+    if (SrcVT != DstVT) {
+      if (SrcVT != MVT::i1 && SrcVT != MVT::i8 && SrcVT != MVT::i16)
+        return false;
+
+      if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt())
+        return false;
+
+      assert(DstVT == MVT::i32 && "X86 should always ext to i32");
+
+      if (SrcVT == MVT::i1) {
+        if (Outs[0].Flags.isSExt())
+          return false;
+        SrcReg = FastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false);
+        SrcVT = MVT::i8;
+      }
+      unsigned Op = Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND :
+                                             ISD::SIGN_EXTEND;
+      SrcReg = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op,
+                          SrcReg, /*TODO: Kill=*/false);
+    }
+
+    // Make the copy.
     unsigned DstReg = VA.getLocReg();
     const TargetRegisterClass* SrcRC = MRI.getRegClass(SrcReg);
     // Avoid a cross-class copy. This is very unlikely.
@@ -916,18 +939,31 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
 
 bool X86FastISel::X86SelectZExt(const Instruction *I) {
   // Handle zero-extension from i1 to i8, which is common.
-  if (I->getType()->isIntegerTy(8) &&
-      I->getOperand(0)->getType()->isIntegerTy(1)) {
-    unsigned ResultReg = getRegForValue(I->getOperand(0));
-    if (ResultReg == 0) return false;
-    // Set the high bits to zero.
-    ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
-    if (ResultReg == 0) return false;
-    UpdateValueMap(I, ResultReg);
-    return true;
+  if (!I->getOperand(0)->getType()->isIntegerTy(1))
+    return false;
+
+  EVT DstVT = TLI.getValueType(I->getType());
+  if (!TLI.isTypeLegal(DstVT))
+    return false;
+
+  unsigned ResultReg = getRegForValue(I->getOperand(0));
+  if (ResultReg == 0)
+    return false;
+
+  // Set the high bits to zero.
+  ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
+  if (ResultReg == 0)
+    return false;
+
+  if (DstVT != MVT::i8) {
+    ResultReg = FastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND,
+                           ResultReg, /*Kill=*/true);
+    if (ResultReg == 0)
+      return false;
   }
 
-  return false;
+  UpdateValueMap(I, ResultReg);
+  return true;
 }
 
 
@@ -1010,63 +1046,6 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
       FuncInfo.MBB->addSuccessor(TrueMBB);
       return true;
     }
-  } else if (ExtractValueInst *EI =
-             dyn_cast<ExtractValueInst>(BI->getCondition())) {
-    // Check to see if the branch instruction is from an "arithmetic with
-    // overflow" intrinsic. The main way these intrinsics are used is:
-    //
-    //   %t = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
-    //   %sum = extractvalue { i32, i1 } %t, 0
-    //   %obit = extractvalue { i32, i1 } %t, 1
-    //   br i1 %obit, label %overflow, label %normal
-    //
-    // The %sum and %obit are converted in an ADD and a SETO/SETB before
-    // reaching the branch. Therefore, we search backwards through the MBB
-    // looking for the SETO/SETB instruction. If an instruction modifies the
-    // EFLAGS register before we reach the SETO/SETB instruction, then we can't
-    // convert the branch into a JO/JB instruction.
-    if (const IntrinsicInst *CI =
-          dyn_cast<IntrinsicInst>(EI->getAggregateOperand())){
-      if (CI->getIntrinsicID() == Intrinsic::sadd_with_overflow ||
-          CI->getIntrinsicID() == Intrinsic::uadd_with_overflow) {
-        const MachineInstr *SetMI = 0;
-        unsigned Reg = getRegForValue(EI);
-
-        for (MachineBasicBlock::const_reverse_iterator
-               RI = FuncInfo.MBB->rbegin(), RE = FuncInfo.MBB->rend();
-             RI != RE; ++RI) {
-          const MachineInstr &MI = *RI;
-
-          if (MI.definesRegister(Reg)) {
-            if (MI.isCopy()) {
-              Reg = MI.getOperand(1).getReg();
-              continue;
-            }
-
-            SetMI = &MI;
-            break;
-          }
-
-          const TargetInstrDesc &TID = MI.getDesc();
-          if (TID.hasImplicitDefOfPhysReg(X86::EFLAGS) ||
-              MI.hasUnmodeledSideEffects())
-            break;
-        }
-
-        if (SetMI) {
-          unsigned OpCode = SetMI->getOpcode();
-
-          if (OpCode == X86::SETOr || OpCode == X86::SETBr) {
-            BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
-                    TII.get(OpCode == X86::SETOr ?  X86::JO_4 : X86::JB_4))
-              .addMBB(TrueMBB);
-            FastEmitBranch(FalseMBB, DL);
-            FuncInfo.MBB->addSuccessor(TrueMBB);
-            return true;
-          }
-        }
-      }
-    }
   } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
     // Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which
     // typically happen for _Bool and C++ bools.
@@ -1086,13 +1065,13 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
         if (OpReg == 0) return false;
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TestOpc))
           .addReg(OpReg).addImm(1);
-        
+
         unsigned JmpOpc = X86::JNE_4;
         if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
           std::swap(TrueMBB, FalseMBB);
           JmpOpc = X86::JE_4;
         }
-        
+
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(JmpOpc))
           .addMBB(TrueMBB);
         FastEmitBranch(FalseMBB, DL);
@@ -1266,18 +1245,13 @@ bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
 }
 
 bool X86FastISel::X86SelectTrunc(const Instruction *I) {
-  if (Subtarget->is64Bit())
-    // All other cases should be handled by the tblgen generated code.
-    return false;
   EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
   EVT DstVT = TLI.getValueType(I->getType());
 
-  // This code only handles truncation to byte right now.
+  // This code only handles truncation to byte.
   if (DstVT != MVT::i8 && DstVT != MVT::i1)
-    // All other cases should be handled by the tblgen generated code.
     return false;
-  if (SrcVT != MVT::i16 && SrcVT != MVT::i32)
-    // All other cases should be handled by the tblgen generated code.
+  if (!TLI.isTypeLegal(SrcVT))
     return false;
 
   unsigned InputReg = getRegForValue(I->getOperand(0));
@@ -1285,16 +1259,26 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) {
     // Unhandled operand.  Halt "fast" selection and bail.
     return false;
 
-  // First issue a copy to GR16_ABCD or GR32_ABCD.
-  const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16)
-    ? X86::GR16_ABCDRegisterClass : X86::GR32_ABCDRegisterClass;
-  unsigned CopyReg = createResultReg(CopyRC);
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
-          CopyReg).addReg(InputReg);
+  if (SrcVT == MVT::i8) {
+    // Truncate from i8 to i1; no code needed.
+    UpdateValueMap(I, InputReg);
+    return true;
+  }
+
+  if (!Subtarget->is64Bit()) {
+    // If we're on x86-32; we can't extract an i8 from a general register.
+    // First issue a copy to GR16_ABCD or GR32_ABCD.
+    const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16)
+      ? X86::GR16_ABCDRegisterClass : X86::GR32_ABCDRegisterClass;
+    unsigned CopyReg = createResultReg(CopyRC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            CopyReg).addReg(InputReg);
+    InputReg = CopyReg;
+  }
 
-  // Then issue an extract_subreg.
+  // Issue an extract_subreg.
   unsigned ResultReg = FastEmitInst_extractsubreg(MVT::i8,
-                                                  CopyReg, /*Kill=*/true,
+                                                  InputReg, /*Kill=*/true,
                                                   X86::sub_8bit);
   if (!ResultReg)
     return false;
@@ -1303,36 +1287,18 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) {
   return true;
 }
 
-bool X86FastISel::X86SelectExtractValue(const Instruction *I) {
-  const ExtractValueInst *EI = cast<ExtractValueInst>(I);
-  const Value *Agg = EI->getAggregateOperand();
-
-  if (const IntrinsicInst *CI = dyn_cast<IntrinsicInst>(Agg)) {
-    switch (CI->getIntrinsicID()) {
-    default: break;
-    case Intrinsic::sadd_with_overflow:
-    case Intrinsic::uadd_with_overflow: {
-      // Cheat a little. We know that the registers for "add" and "seto" are
-      // allocated sequentially. However, we only keep track of the register
-      // for "add" in the value map. Use extractvalue's index to get the
-      // correct register for "seto".
-      unsigned OpReg = getRegForValue(Agg);
-      if (OpReg == 0)
-        return false;
-      UpdateValueMap(I, OpReg + *EI->idx_begin());
-      return true;
-    }
-    }
-  }
-
-  return false;
+bool X86FastISel::IsMemcpySmall(uint64_t Len) {
+  return Len <= (Subtarget->is64Bit() ? 32 : 16);
 }
 
 bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM,
                                      X86AddressMode SrcAM, uint64_t Len) {
+
   // Make sure we don't bloat code by inlining very large memcpy's.
-  bool i64Legal = TLI.isTypeLegal(MVT::i64);
-  if (Len > (i64Legal ? 32 : 16)) return false;
+  if (!IsMemcpySmall(Len))
+    return false;
+
+  bool i64Legal = Subtarget->is64Bit();
 
   // We don't care about alignment here since we just emit integer accesses.
   while (Len) {
@@ -1369,20 +1335,44 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
   case Intrinsic::memcpy: {
     const MemCpyInst &MCI = cast<MemCpyInst>(I);
     // Don't handle volatile or variable length memcpys.
-    if (MCI.isVolatile() || !isa<ConstantInt>(MCI.getLength()))
+    if (MCI.isVolatile())
+      return false;
+
+    if (isa<ConstantInt>(MCI.getLength())) {
+      // Small memcpy's are common enough that we want to do them
+      // without a call if possible.
+      uint64_t Len = cast<ConstantInt>(MCI.getLength())->getZExtValue();
+      if (IsMemcpySmall(Len)) {
+        X86AddressMode DestAM, SrcAM;
+        if (!X86SelectAddress(MCI.getRawDest(), DestAM) ||
+            !X86SelectAddress(MCI.getRawSource(), SrcAM))
+          return false;
+        TryEmitSmallMemcpy(DestAM, SrcAM, Len);
+        return true;
+      }
+    }
+
+    unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
+    if (!MCI.getLength()->getType()->isIntegerTy(SizeWidth))
       return false;
 
-    uint64_t Len = cast<ConstantInt>(MCI.getLength())->getZExtValue();
-    
-    // Get the address of the dest and source addresses.
-    X86AddressMode DestAM, SrcAM;
-    if (!X86SelectAddress(MCI.getRawDest(), DestAM) ||
-        !X86SelectAddress(MCI.getRawSource(), SrcAM))
+    if (MCI.getSourceAddressSpace() > 255 || MCI.getDestAddressSpace() > 255)
       return false;
 
-    return TryEmitSmallMemcpy(DestAM, SrcAM, Len);
+    return DoSelectCall(&I, "memcpy");
+  }
+  case Intrinsic::memset: {
+    const MemSetInst &MSI = cast<MemSetInst>(I);
+
+    unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
+    if (!MSI.getLength()->getType()->isIntegerTy(SizeWidth))
+      return false;
+
+    if (MSI.getDestAddressSpace() > 255)
+      return false;
+
+    return DoSelectCall(&I, "memset");
   }
-      
   case Intrinsic::stackprotector: {
     // Emit code inline code to store the stack guard onto the stack.
     EVT PtrTy = TLI.getPointerTy();
@@ -1396,29 +1386,6 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
     if (!X86FastEmitStore(PtrTy, Op1, AM)) return false;
     return true;
   }
-  case Intrinsic::objectsize: {
-    // FIXME: This should be moved to generic code!
-    ConstantInt *CI = cast<ConstantInt>(I.getArgOperand(1));
-    const Type *Ty = I.getCalledFunction()->getReturnType();
-
-    MVT VT;
-    if (!isTypeLegal(Ty, VT))
-      return false;
-
-    unsigned OpC = 0;
-    if (VT == MVT::i32)
-      OpC = X86::MOV32ri;
-    else if (VT == MVT::i64)
-      OpC = X86::MOV64ri;
-    else
-      return false;
-
-    unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpC), ResultReg).
-                                  addImm(CI->isZero() ? -1ULL : 0);
-    UpdateValueMap(&I, ResultReg);
-    return true;
-  }
   case Intrinsic::dbg_declare: {
     const DbgDeclareInst *DI = cast<DbgDeclareInst>(&I);
     X86AddressMode AM;
@@ -1439,12 +1406,9 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
   case Intrinsic::sadd_with_overflow:
   case Intrinsic::uadd_with_overflow: {
     // FIXME: Should fold immediates.
-    
+
     // Replace "add with overflow" intrinsics with an "add" instruction followed
-    // by a seto/setc instruction. Later on, when the "extractvalue"
-    // instructions are encountered, we use the fact that two registers were
-    // created sequentially to get the correct registers for the "sum" and the
-    // "overflow bit".
+    // by a seto/setc instruction.
     const Function *Callee = I.getCalledFunction();
     const Type *RetTy =
       cast<StructType>(Callee->getReturnType())->getTypeAtIndex(unsigned(0));
@@ -1470,27 +1434,18 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
     else
       return false;
 
-    unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+    // The call to CreateRegs builds two sequential registers, to store the
+    // both the the returned values.
+    unsigned ResultReg = FuncInfo.CreateRegs(I.getType());
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpC), ResultReg)
       .addReg(Reg1).addReg(Reg2);
-    unsigned DestReg1 = UpdateValueMap(&I, ResultReg);
-
-    // If the add with overflow is an intra-block value then we just want to
-    // create temporaries for it like normal.  If it is a cross-block value then
-    // UpdateValueMap will return the cross-block register used.  Since we
-    // *really* want the value to be live in the register pair known by
-    // UpdateValueMap, we have to use DestReg1+1 as the destination register in
-    // the cross block case.  In the non-cross-block case, we should just make
-    // another register for the value.
-    if (DestReg1 != ResultReg)
-      ResultReg = DestReg1+1;
-    else
-      ResultReg = createResultReg(TLI.getRegClassFor(MVT::i8));
 
     unsigned Opc = X86::SETBr;
     if (I.getIntrinsicID() == Intrinsic::sadd_with_overflow)
       Opc = X86::SETOr;
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg+1);
+
+    UpdateValueMap(&I, ResultReg, 2);
     return true;
   }
   }
@@ -1508,6 +1463,14 @@ bool X86FastISel::X86SelectCall(const Instruction *I) {
   if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI))
     return X86VisitIntrinsicCall(*II);
 
+  return DoSelectCall(I, 0);
+}
+
+// Select either a call, or an llvm.memcpy/memmove/memset intrinsic
+bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
+  const CallInst *CI = cast<CallInst>(I);
+  const Value *Callee = CI->getCalledValue();
+
   // Handle only C and fastcc calling conventions for now.
   ImmutableCallSite CS(CI);
   CallingConv::ID CC = CS.getCallingConv();
@@ -1533,12 +1496,15 @@ bool X86FastISel::X86SelectCall(const Instruction *I) {
   if (Subtarget->IsCalleePop(isVarArg, CC))
     return false;
 
-  // Handle *simple* calls for now.
-  const Type *RetTy = CS.getType();
-  MVT RetVT;
-  if (RetTy->isVoidTy())
-    RetVT = MVT::isVoid;
-  else if (!isTypeLegal(RetTy, RetVT, true))
+  // Check whether the function can return without sret-demotion.
+  SmallVector<ISD::OutputArg, 4> Outs;
+  SmallVector<uint64_t, 4> Offsets;
+  GetReturnInfo(I->getType(), CS.getAttributes().getRetAttributes(),
+                Outs, TLI, &Offsets);
+  bool CanLowerReturn = TLI.CanLowerReturn(CS.getCallingConv(),
+					   *FuncInfo.MF, FTy->isVarArg(),
+					   Outs, FTy->getContext());
+  if (!CanLowerReturn)
     return false;
 
   // Materialize callee address in a register. FIXME: GV address can be
@@ -1555,13 +1521,6 @@ bool X86FastISel::X86SelectCall(const Instruction *I) {
   } else
     return false;
 
-  // Allow calls which produce i1 results.
-  bool AndToI1 = false;
-  if (RetVT == MVT::i1) {
-    RetVT = MVT::i8;
-    AndToI1 = true;
-  }
-
   // Deal with call operands first.
   SmallVector<const Value *, 8> ArgVals;
   SmallVector<unsigned, 8> Args;
@@ -1573,6 +1532,10 @@ bool X86FastISel::X86SelectCall(const Instruction *I) {
   ArgFlags.reserve(CS.arg_size());
   for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
        i != e; ++i) {
+    // If we're lowering a mem intrinsic instead of a regular call, skip the
+    // last two arguments, which should not passed to the underlying functions.
+    if (MemIntName && e-i <= 2)
+      break;
     Value *ArgVal = *i;
     ISD::ArgFlagsTy Flags;
     unsigned AttrInd = i - CS.arg_begin() + 1;
@@ -1581,6 +1544,25 @@ bool X86FastISel::X86SelectCall(const Instruction *I) {
     if (CS.paramHasAttr(AttrInd, Attribute::ZExt))
       Flags.setZExt();
 
+    if (CS.paramHasAttr(AttrInd, Attribute::ByVal)) {
+      const PointerType *Ty = cast<PointerType>(ArgVal->getType());
+      const Type *ElementTy = Ty->getElementType();
+      unsigned FrameSize = TD.getTypeAllocSize(ElementTy);
+      unsigned FrameAlign = CS.getParamAlignment(AttrInd);
+      if (!FrameAlign)
+        FrameAlign = TLI.getByValTypeAlignment(ElementTy);
+      Flags.setByVal();
+      Flags.setByValSize(FrameSize);
+      Flags.setByValAlign(FrameAlign);
+      if (!IsMemcpySmall(FrameSize))
+        return false;
+    }
+
+    if (CS.paramHasAttr(AttrInd, Attribute::InReg))
+      Flags.setInReg();
+    if (CS.paramHasAttr(AttrInd, Attribute::Nest))
+      Flags.setNest();
+
     // If this is an i1/i8/i16 argument, promote to i32 to avoid an extra
     // instruction.  This is safe because it is common to all fastisel supported
     // calling conventions on x86.
@@ -1593,9 +1575,9 @@ bool X86FastISel::X86SelectCall(const Instruction *I) {
           ArgVal = ConstantExpr::getZExt(CI,Type::getInt32Ty(CI->getContext()));
       }
     }
-    
+
     unsigned ArgReg;
-    
+
     // Passing bools around ends up doing a trunc to i1 and passing it.
     // Codegen this as an argument + "and 1".
     if (ArgVal->getType()->isIntegerTy(1) && isa<TruncInst>(ArgVal) &&
@@ -1604,10 +1586,10 @@ bool X86FastISel::X86SelectCall(const Instruction *I) {
       ArgVal = cast<TruncInst>(ArgVal)->getOperand(0);
       ArgReg = getRegForValue(ArgVal);
       if (ArgReg == 0) return false;
-      
+
       MVT ArgVT;
       if (!isTypeLegal(ArgVal->getType(), ArgVT)) return false;
-      
+
       ArgReg = FastEmit_ri(ArgVT, ArgVT, ISD::AND, ArgReg,
                            ArgVal->hasOneUse(), 1);
     } else {
@@ -1616,16 +1598,12 @@ bool X86FastISel::X86SelectCall(const Instruction *I) {
 
     if (ArgReg == 0) return false;
 
-    // FIXME: Only handle *easy* calls for now.
-    if (CS.paramHasAttr(AttrInd, Attribute::InReg) ||
-        CS.paramHasAttr(AttrInd, Attribute::Nest) ||
-        CS.paramHasAttr(AttrInd, Attribute::ByVal))
-      return false;
-
     const Type *ArgTy = ArgVal->getType();
     MVT ArgVT;
     if (!isTypeLegal(ArgTy, ArgVT))
       return false;
+    if (ArgVT == MVT::x86mmx)
+      return false;
     unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy);
     Flags.setOrigAlign(OriginalAlignment);
 
@@ -1637,7 +1615,8 @@ bool X86FastISel::X86SelectCall(const Instruction *I) {
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CC, isVarArg, TM, ArgLocs, I->getParent()->getContext());
+  CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, ArgLocs,
+		 I->getParent()->getContext());
 
   // Allocate shadow area for Win64
   if (Subtarget->isTargetWin64())
@@ -1666,6 +1645,8 @@ bool X86FastISel::X86SelectCall(const Instruction *I) {
     default: llvm_unreachable("Unknown loc info!");
     case CCValAssign::Full: break;
     case CCValAssign::SExt: {
+      assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
+             "Unexpected extend");
       bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(),
                                        Arg, ArgVT, Arg);
       assert(Emitted && "Failed to emit a sext!"); (void)Emitted;
@@ -1673,6 +1654,8 @@ bool X86FastISel::X86SelectCall(const Instruction *I) {
       break;
     }
     case CCValAssign::ZExt: {
+      assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
+             "Unexpected extend");
       bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(),
                                        Arg, ArgVT, Arg);
       assert(Emitted && "Failed to emit a zext!"); (void)Emitted;
@@ -1680,9 +1663,8 @@ bool X86FastISel::X86SelectCall(const Instruction *I) {
       break;
     }
     case CCValAssign::AExt: {
-      // We don't handle MMX parameters yet.
-      if (VA.getLocVT().isVector() && VA.getLocVT().getSizeInBits() == 128)
-        return false;
+      assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
+             "Unexpected extend");
       bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(),
                                        Arg, ArgVT, Arg);
       if (!Emitted)
@@ -1716,14 +1698,21 @@ bool X86FastISel::X86SelectCall(const Instruction *I) {
       AM.Base.Reg = StackPtr;
       AM.Disp = LocMemOffset;
       const Value *ArgVal = ArgVals[VA.getValNo()];
-
-      // If this is a really simple value, emit this with the Value* version of
-      // X86FastEmitStore.  If it isn't simple, we don't want to do this, as it
-      // can cause us to reevaluate the argument.
-      if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal))
+      ISD::ArgFlagsTy Flags = ArgFlags[VA.getValNo()];
+
+      if (Flags.isByVal()) {
+        X86AddressMode SrcAM;
+        SrcAM.Base.Reg = Arg;
+        bool Res = TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize());
+        assert(Res && "memcpy length already checked!"); (void)Res;
+      } else if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) {
+        // If this is a really simple value, emit this with the Value* version
+        //of X86FastEmitStore.  If it isn't simple, we don't want to do this,
+        // as it can cause us to reevaluate the argument.
         X86FastEmitStore(ArgVT, ArgVal, AM);
-      else
+      } else {
         X86FastEmitStore(ArgVT, Arg, AM);
+      }
     }
   }
 
@@ -1793,8 +1782,11 @@ bool X86FastISel::X86SelectCall(const Instruction *I) {
     }
 
 
-    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CallOpc))
-      .addGlobalAddress(GV, 0, OpFlags);
+    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CallOpc));
+    if (MemIntName)
+      MIB.addExternalSymbol(MemIntName, OpFlags);
+    else
+      MIB.addGlobalAddress(GV, 0, OpFlags);
   }
 
   // Add an implicit use GOT pointer in EBX.
@@ -1816,63 +1808,74 @@ bool X86FastISel::X86SelectCall(const Instruction *I) {
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(AdjStackUp))
     .addImm(NumBytes).addImm(NumBytesCallee);
 
-  // Now handle call return value (if any).
-  SmallVector<unsigned, 4> UsedRegs;
-  if (RetVT != MVT::isVoid) {
-    SmallVector<CCValAssign, 16> RVLocs;
-    CCState CCInfo(CC, false, TM, RVLocs, I->getParent()->getContext());
-    CCInfo.AnalyzeCallResult(RetVT, RetCC_X86);
+  // Build info for return calling conv lowering code.
+  // FIXME: This is practically a copy-paste from TargetLowering::LowerCallTo.
+  SmallVector<ISD::InputArg, 32> Ins;
+  SmallVector<EVT, 4> RetTys;
+  ComputeValueVTs(TLI, I->getType(), RetTys);
+  for (unsigned i = 0, e = RetTys.size(); i != e; ++i) {
+    EVT VT = RetTys[i];
+    EVT RegisterVT = TLI.getRegisterType(I->getParent()->getContext(), VT);
+    unsigned NumRegs = TLI.getNumRegisters(I->getParent()->getContext(), VT);
+    for (unsigned j = 0; j != NumRegs; ++j) {
+      ISD::InputArg MyFlags;
+      MyFlags.VT = RegisterVT.getSimpleVT();
+      MyFlags.Used = !CS.getInstruction()->use_empty();
+      if (CS.paramHasAttr(0, Attribute::SExt))
+        MyFlags.Flags.setSExt();
+      if (CS.paramHasAttr(0, Attribute::ZExt))
+        MyFlags.Flags.setZExt();
+      if (CS.paramHasAttr(0, Attribute::InReg))
+        MyFlags.Flags.setInReg();
+      Ins.push_back(MyFlags);
+    }
+  }
 
-    // Copy all of the result registers out of their specified physreg.
-    assert(RVLocs.size() == 1 && "Can't handle multi-value calls!");
-    EVT CopyVT = RVLocs[0].getValVT();
-    TargetRegisterClass* DstRC = TLI.getRegClassFor(CopyVT);
+  // Now handle call return values.
+  SmallVector<unsigned, 4> UsedRegs;
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCRetInfo(CC, false, *FuncInfo.MF, TM, RVLocs,
+		    I->getParent()->getContext());
+  unsigned ResultReg = FuncInfo.CreateRegs(I->getType());
+  CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86);
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    EVT CopyVT = RVLocs[i].getValVT();
+    unsigned CopyReg = ResultReg + i;
 
     // If this is a call to a function that returns an fp value on the x87 fp
     // stack, but where we prefer to use the value in xmm registers, copy it
     // out as F80 and use a truncate to move it from fp stack reg to xmm reg.
-    if ((RVLocs[0].getLocReg() == X86::ST0 ||
-         RVLocs[0].getLocReg() == X86::ST1) &&
+    if ((RVLocs[i].getLocReg() == X86::ST0 ||
+         RVLocs[i].getLocReg() == X86::ST1) &&
         isScalarFPTypeInSSEReg(RVLocs[0].getValVT())) {
       CopyVT = MVT::f80;
-      DstRC = X86::RFP80RegisterClass;
+      CopyReg = createResultReg(X86::RFP80RegisterClass);
     }
 
-    unsigned ResultReg = createResultReg(DstRC);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
-            ResultReg).addReg(RVLocs[0].getLocReg());
-    UsedRegs.push_back(RVLocs[0].getLocReg());
+            CopyReg).addReg(RVLocs[i].getLocReg());
+    UsedRegs.push_back(RVLocs[i].getLocReg());
 
-    if (CopyVT != RVLocs[0].getValVT()) {
+    if (CopyVT != RVLocs[i].getValVT()) {
       // Round the F80 the right size, which also moves to the appropriate xmm
       // register. This is accomplished by storing the F80 value in memory and
       // then loading it back. Ewww...
-      EVT ResVT = RVLocs[0].getValVT();
+      EVT ResVT = RVLocs[i].getValVT();
       unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
       unsigned MemSize = ResVT.getSizeInBits()/8;
       int FI = MFI.CreateStackObject(MemSize, MemSize, false);
       addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                                 TII.get(Opc)), FI)
-        .addReg(ResultReg);
-      DstRC = ResVT == MVT::f32
-        ? X86::FR32RegisterClass : X86::FR64RegisterClass;
+        .addReg(CopyReg);
       Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm;
-      ResultReg = createResultReg(DstRC);
       addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
-                                TII.get(Opc), ResultReg), FI);
+                                TII.get(Opc), ResultReg + i), FI);
     }
-
-    if (AndToI1) {
-      // Mask out all but lowest bit for some call which produces an i1.
-      unsigned AndResult = createResultReg(X86::GR8RegisterClass);
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
-              TII.get(X86::AND8ri), AndResult).addReg(ResultReg).addImm(1);
-      ResultReg = AndResult;
-    }
-
-    UpdateValueMap(I, ResultReg);
   }
 
+  if (RVLocs.size())
+    UpdateValueMap(I, ResultReg, RVLocs.size());
+
   // Set all unused physreg defs as dead.
   static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI);
 
@@ -1911,8 +1914,6 @@ X86FastISel::TargetSelectInstruction(const Instruction *I)  {
     return X86SelectFPExt(I);
   case Instruction::FPTrunc:
     return X86SelectFPTrunc(I);
-  case Instruction::ExtractValue:
-    return X86SelectExtractValue(I);
   case Instruction::IntToPtr: // Deliberate fall-through.
   case Instruction::PtrToInt: {
     EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
@@ -1990,7 +1991,7 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) {
       if (AM.BaseType == X86AddressMode::RegBase &&
           AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == 0)
         return AM.Base.Reg;
-      
+
       Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r;
       unsigned ResultReg = createResultReg(RC);
       addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 06d12fc..2e95300 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -355,7 +355,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
   MachineModuleInfo &MMI = MF.getMMI();
   X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
   bool needsFrameMoves = MMI.hasDebugInfo() ||
-                          !Fn->doesNotThrow() || UnwindTablesMandatory;
+    Fn->needsUnwindTableEntry();
   uint64_t MaxAlign  = MFI->getMaxAlignment(); // Desired stack alignment.
   uint64_t StackSize = MFI->getStackSize();    // Number of bytes to allocate.
   bool HasFP = hasFP(MF);
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 4534e85..1fcc274 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -189,6 +189,7 @@ namespace {
     SDNode *Select(SDNode *N);
     SDNode *SelectAtomic64(SDNode *Node, unsigned Opc);
     SDNode *SelectAtomicLoadAdd(SDNode *Node, EVT NVT);
+    SDNode *SelectAtomicLoadArith(SDNode *Node, EVT NVT);
 
     bool MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
     bool MatchWrapper(SDValue N, X86ISelAddressMode &AM);
@@ -1329,6 +1330,8 @@ SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
   return ResNode;
 }
 
+// FIXME: Figure out some way to unify this with the 'or' and other code
+// below.
 SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) {
   if (Node->hasAnyUseOfValue(0))
     return 0;
@@ -1479,6 +1482,158 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) {
   }
 }
 
+enum AtomicOpc {
+  OR,
+  AND,
+  XOR,
+  AtomicOpcEnd
+};
+
+enum AtomicSz {
+  ConstantI8,
+  I8,
+  SextConstantI16,
+  ConstantI16,
+  I16,
+  SextConstantI32,
+  ConstantI32,
+  I32,
+  SextConstantI64,
+  ConstantI64,
+  I64,
+  AtomicSzEnd
+};
+
+static const unsigned int AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = {
+  {
+    X86::LOCK_OR8mi,
+    X86::LOCK_OR8mr,
+    X86::LOCK_OR16mi8,
+    X86::LOCK_OR16mi,
+    X86::LOCK_OR16mr,
+    X86::LOCK_OR32mi8,
+    X86::LOCK_OR32mi,
+    X86::LOCK_OR32mr,
+    X86::LOCK_OR64mi8,
+    X86::LOCK_OR64mi32,
+    X86::LOCK_OR64mr
+  },
+  {
+    X86::LOCK_AND8mi,
+    X86::LOCK_AND8mr,
+    X86::LOCK_AND16mi8,
+    X86::LOCK_AND16mi,
+    X86::LOCK_AND16mr,
+    X86::LOCK_AND32mi8,
+    X86::LOCK_AND32mi,
+    X86::LOCK_AND32mr,
+    X86::LOCK_AND64mi8,
+    X86::LOCK_AND64mi32,
+    X86::LOCK_AND64mr
+  },
+  {
+    X86::LOCK_XOR8mi,
+    X86::LOCK_XOR8mr,
+    X86::LOCK_XOR16mi8,
+    X86::LOCK_XOR16mi,
+    X86::LOCK_XOR16mr,
+    X86::LOCK_XOR32mi8,
+    X86::LOCK_XOR32mi,
+    X86::LOCK_XOR32mr,
+    X86::LOCK_XOR64mi8,
+    X86::LOCK_XOR64mi32,
+    X86::LOCK_XOR64mr
+  }
+};
+
+SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) {
+  if (Node->hasAnyUseOfValue(0))
+    return 0;
+  
+  // Optimize common patterns for __sync_or_and_fetch and similar arith
+  // operations where the result is not used. This allows us to use the "lock"
+  // version of the arithmetic instruction.
+  // FIXME: Same as for 'add' and 'sub', try to merge those down here.
+  SDValue Chain = Node->getOperand(0);
+  SDValue Ptr = Node->getOperand(1);
+  SDValue Val = Node->getOperand(2);
+  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+  if (!SelectAddr(Node, Ptr, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4))
+    return 0;
+
+  // Which index into the table.
+  enum AtomicOpc Op;
+  switch (Node->getOpcode()) {
+    case ISD::ATOMIC_LOAD_OR:
+      Op = OR;
+      break;
+    case ISD::ATOMIC_LOAD_AND:
+      Op = AND;
+      break;
+    case ISD::ATOMIC_LOAD_XOR:
+      Op = XOR;
+      break;
+    default:
+      return 0;
+  }
+  
+  bool isCN = false;
+  ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val);
+  if (CN) {
+    isCN = true;
+    Val = CurDAG->getTargetConstant(CN->getSExtValue(), NVT);
+  }
+  
+  unsigned Opc = 0;
+  switch (NVT.getSimpleVT().SimpleTy) {
+    default: return 0;
+    case MVT::i8:
+      if (isCN)
+        Opc = AtomicOpcTbl[Op][ConstantI8];
+      else
+        Opc = AtomicOpcTbl[Op][I8];
+      break;
+    case MVT::i16:
+      if (isCN) {
+        if (immSext8(Val.getNode()))
+          Opc = AtomicOpcTbl[Op][SextConstantI16];
+        else
+          Opc = AtomicOpcTbl[Op][ConstantI16];
+      } else
+        Opc = AtomicOpcTbl[Op][I16];
+      break;
+    case MVT::i32:
+      if (isCN) {
+        if (immSext8(Val.getNode()))
+          Opc = AtomicOpcTbl[Op][SextConstantI32];
+        else
+          Opc = AtomicOpcTbl[Op][ConstantI32];
+      } else
+        Opc = AtomicOpcTbl[Op][I32];
+      break;
+    case MVT::i64:
+      if (isCN) {
+        if (immSext8(Val.getNode()))
+          Opc = AtomicOpcTbl[Op][SextConstantI64];
+        else if (i64immSExt32(Val.getNode()))
+          Opc = AtomicOpcTbl[Op][ConstantI64];
+      } else
+        Opc = AtomicOpcTbl[Op][I64];
+      break;
+  }
+  
+  DebugLoc dl = Node->getDebugLoc();
+  SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
+                                                 dl, NVT), 0);
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
+  SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Val, Chain };
+  SDValue Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 7), 0);
+  cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1);
+  SDValue RetVals[] = { Undef, Ret };
+  return CurDAG->getMergeValues(RetVals, 2, dl).getNode();
+}
+
 /// HasNoSignedComparisonUses - Test whether the given X86ISD::CMP node has
 /// any uses which require the SF or OF bits to be accurate.
 static bool HasNoSignedComparisonUses(SDNode *N) {
@@ -1580,6 +1735,14 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
       return RetVal;
     break;
   }
+  case ISD::ATOMIC_LOAD_XOR:
+  case ISD::ATOMIC_LOAD_AND:
+  case ISD::ATOMIC_LOAD_OR: {
+    SDNode *RetVal = SelectAtomicLoadArith(Node, NVT);
+    if (RetVal)
+      return RetVal;
+    break;
+  }
   case ISD::AND:
   case ISD::OR:
   case ISD::XOR: {
@@ -1843,17 +2006,17 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
       if (TryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
         SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
         Move =
-          SDValue(CurDAG->getMachineNode(X86::MOVZX16rm8, dl, MVT::i16,
+          SDValue(CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32,
                                          MVT::Other, Ops,
                                          array_lengthof(Ops)), 0);
         Chain = Move.getValue(1);
         ReplaceUses(N0.getValue(1), Chain);
       } else {
         Move =
-          SDValue(CurDAG->getMachineNode(X86::MOVZX16rr8, dl, MVT::i16, N0),0);
+          SDValue(CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0),0);
         Chain = CurDAG->getEntryNode();
       }
-      Chain  = CurDAG->getCopyToReg(Chain, dl, X86::AX, Move, SDValue());
+      Chain  = CurDAG->getCopyToReg(Chain, dl, X86::EAX, Move, SDValue());
       InFlag = Chain.getValue(1);
     } else {
       InFlag =
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 703c01d..294a6a7 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -222,7 +222,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
 
   // X86 is weird, it always uses i8 for shift amounts and setcc results.
   setBooleanContents(ZeroOrOneBooleanContent);
-    
+
   // For 64-bit since we have so many registers use the ILP scheduler, for
   // 32-bit code use the register pressure specific scheduling.
   if (Subtarget->is64Bit())
@@ -574,6 +574,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 
+    // Lower this to FGETSIGNx86 plus an AND.
+    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
+    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
+
     // We don't support sin/cos/fmod
     setOperationAction(ISD::FSIN , MVT::f64, Expand);
     setOperationAction(ISD::FCOS , MVT::f64, Expand);
@@ -927,7 +931,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     // Can turn SHL into an integer multiply.
     setOperationAction(ISD::SHL,                MVT::v4i32, Custom);
     setOperationAction(ISD::SHL,                MVT::v16i8, Custom);
-    setOperationAction(ISD::SRL,                MVT::v4i32, Legal);
 
     // i8 and i16 vectors are custom , because the source register and source
     // source memory operand types are not the same width.  f32 vectors are
@@ -949,6 +952,19 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     }
   }
 
+  if (Subtarget->hasSSE2()) {
+    setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
+    setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
+    setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
+
+    setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
+    setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
+    setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
+
+    setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
+    setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
+  }
+
   if (Subtarget->hasSSE42())
     setOperationAction(ISD::VSETCC,             MVT::v2i64, Custom);
 
@@ -1081,6 +1097,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   setTargetDAGCombine(ISD::SUB);
   setTargetDAGCombine(ISD::STORE);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
+  setTargetDAGCombine(ISD::SINT_TO_FP);
   if (Subtarget->is64Bit())
     setTargetDAGCombine(ISD::MUL);
 
@@ -1096,6 +1113,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
   setPrefLoopAlignment(16);
   benefitFromCodePlacementOpt = true;
+
+  setPrefFunctionAlignment(4);
 }
 
 
@@ -1247,11 +1266,6 @@ getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
 }
 
-/// getFunctionAlignment - Return the Log2 alignment of this function.
-unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
-  return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4;
-}
-
 // FIXME: Why this routine is here? Move to RegInfo!
 std::pair<const TargetRegisterClass*, uint8_t>
 X86TargetLowering::findRepresentativeClass(EVT VT) const{
@@ -1306,11 +1320,12 @@ bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
 #include "X86GenCallingConv.inc"
 
 bool
-X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
+X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
+				  MachineFunction &MF, bool isVarArg,
                         const SmallVectorImpl<ISD::OutputArg> &Outs,
                         LLVMContext &Context) const {
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
                  RVLocs, Context);
   return CCInfo.CheckReturn(Outs, RetCC_X86);
 }
@@ -1325,7 +1340,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
 
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
                  RVLocs, *DAG.getContext());
   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
 
@@ -1476,8 +1491,8 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
   bool Is64Bit = Subtarget->is64Bit();
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), RVLocs, *DAG.getContext());
   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
 
   // Copy all of the result registers out of their specified physreg.
@@ -1518,20 +1533,6 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
         Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
                           // This truncation won't change the value.
                           DAG.getIntPtrConstant(1));
-    } else if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) {
-      // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64.
-      if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
-        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
-                                   MVT::v2i64, InFlag).getValue(1);
-        Val = Chain.getValue(0);
-        Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
-                          Val, DAG.getConstant(0, MVT::i64));
-      } else {
-        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
-                                   MVT::i64, InFlag).getValue(1);
-        Val = Chain.getValue(0);
-      }
-      Val = DAG.getNode(ISD::BITCAST, dl, CopyVT, Val);
     } else {
       Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                  CopyVT, InFlag).getValue(1);
@@ -1680,7 +1681,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
                  ArgLocs, *DAG.getContext());
 
   // Allocate shadow area for Win64
@@ -2007,7 +2008,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
                  ArgLocs, *DAG.getContext());
 
   // Allocate shadow area for Win64
@@ -2530,16 +2531,30 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
   if (RegInfo->needsStackRealignment(MF))
     return false;
 
-  // Do not sibcall optimize vararg calls unless the call site is not passing
-  // any arguments.
-  if (isVarArg && !Outs.empty())
-    return false;
-
   // Also avoid sibcall optimization if either caller or callee uses struct
   // return semantics.
   if (isCalleeStructRet || isCallerStructRet)
     return false;
 
+  // Do not sibcall optimize vararg calls unless all arguments are passed via
+  // registers.
+  if (isVarArg && !Outs.empty()) {
+
+    // Optimizing for varargs on Win64 is unlikely to be safe without
+    // additional testing.
+    if (Subtarget->isTargetWin64())
+      return false;
+
+    SmallVector<CCValAssign, 16> ArgLocs;
+    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
+		   getTargetMachine(), ArgLocs, *DAG.getContext());
+
+    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
+    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
+      if (!ArgLocs[i].isRegLoc())
+        return false;
+  }
+
   // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack.
   // Therefore if it's not used by the call it is not safe to optimize this into
   // a sibcall.
@@ -2552,8 +2567,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
   }
   if (Unused) {
     SmallVector<CCValAssign, 16> RVLocs;
-    CCState CCInfo(CalleeCC, false, getTargetMachine(),
-                   RVLocs, *DAG.getContext());
+    CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
+		   getTargetMachine(), RVLocs, *DAG.getContext());
     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
       CCValAssign &VA = RVLocs[i];
@@ -2566,13 +2581,13 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
   // results are returned in the same way as what the caller expects.
   if (!CCMatch) {
     SmallVector<CCValAssign, 16> RVLocs1;
-    CCState CCInfo1(CalleeCC, false, getTargetMachine(),
-                    RVLocs1, *DAG.getContext());
+    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
+		    getTargetMachine(), RVLocs1, *DAG.getContext());
     CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
 
     SmallVector<CCValAssign, 16> RVLocs2;
-    CCState CCInfo2(CallerCC, false, getTargetMachine(),
-                    RVLocs2, *DAG.getContext());
+    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
+		    getTargetMachine(), RVLocs2, *DAG.getContext());
     CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
 
     if (RVLocs1.size() != RVLocs2.size())
@@ -2598,8 +2613,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
     // Check if stack adjustment is needed. For now, do not do this if any
     // argument is passed on the stack.
     SmallVector<CCValAssign, 16> ArgLocs;
-    CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(),
-                   ArgLocs, *DAG.getContext());
+    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
+		   getTargetMachine(), ArgLocs, *DAG.getContext());
 
     // Allocate shadow area for Win64
     if (Subtarget->isTargetWin64()) {
@@ -6619,9 +6634,9 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
 }
 
 
-/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and
+/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values and
 /// take a 2 x i32 value to shift plus a shift amount.
-SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const {
   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
   EVT VT = Op.getValueType();
   unsigned VTBits = VT.getSizeInBits();
@@ -6710,12 +6725,18 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
 
   unsigned ByteSize = SrcVT.getSizeInBits()/8;
 
-  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
-  MachineMemOperand *MMO =
-    DAG.getMachineFunction()
-    .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
-                          MachineMemOperand::MOLoad, ByteSize, ByteSize);
-
+  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
+  MachineMemOperand *MMO;
+  if (FI) {
+    int SSFI = FI->getIndex();
+    MMO =
+      DAG.getMachineFunction()
+      .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
+                            MachineMemOperand::MOLoad, ByteSize, ByteSize);
+  } else {
+    MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
+    StackSlot = StackSlot.getOperand(1);
+  }
   SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
   SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
                                            X86ISD::FILD, DL,
@@ -7206,6 +7227,17 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
 }
 
+SDValue X86TargetLowering::LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const {
+  SDValue N0 = Op.getOperand(0);
+  DebugLoc dl = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+
+  // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
+  SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
+                                  DAG.getConstant(1, VT));
+  return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
+}
+
 /// Emit nodes that will be selected as "test Op0,Op0", or something
 /// equivalent.
 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
@@ -8781,16 +8813,71 @@ SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const {
   return Res;
 }
 
-SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
+
   EVT VT = Op.getValueType();
   DebugLoc dl = Op.getDebugLoc();
   SDValue R = Op.getOperand(0);
+  SDValue Amt = Op.getOperand(1);
 
   LLVMContext *Context = DAG.getContext();
 
-  assert(Subtarget->hasSSE41() && "Cannot lower SHL without SSE4.1 or later");
+  // Must have SSE2.
+  if (!Subtarget->hasSSE2()) return SDValue();
+
+  // Optimize shl/srl/sra with constant shift amount.
+  if (isSplatVector(Amt.getNode())) {
+    SDValue SclrAmt = Amt->getOperand(0);
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
+      uint64_t ShiftAmt = C->getZExtValue();
+
+      if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SHL)
+       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                     DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
+                     R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+      if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SHL)
+       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                     DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
+                     R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+      if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SHL)
+       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                     DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
+                     R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+      if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SRL)
+       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                     DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
+                     R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+      if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRL)
+       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                     DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
+                     R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+      if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRL)
+       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                     DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
+                     R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+      if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRA)
+       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                     DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
+                     R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+      if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRA)
+       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                     DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
+                     R, DAG.getConstant(ShiftAmt, MVT::i32));
+    }
+  }
+
+  // Lower SHL with variable shift amount.
+  // Cannot lower SHL without SSE4.1 or later.
+  if (!Subtarget->hasSSE41()) return SDValue();
 
-  if (VT == MVT::v4i32) {
+  if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
     Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                      DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
                      Op.getOperand(1), DAG.getConstant(23, MVT::i32));
@@ -8809,7 +8896,7 @@ SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const {
     Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
     return DAG.getNode(ISD::MUL, dl, VT, Op, R);
   }
-  if (VT == MVT::v16i8) {
+  if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
     // a = a << 5;
     Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                      DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
@@ -9114,7 +9201,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
   case ISD::SHL_PARTS:
   case ISD::SRA_PARTS:
-  case ISD::SRL_PARTS:          return LowerShift(Op, DAG);
+  case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
   case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
@@ -9122,6 +9209,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::FABS:               return LowerFABS(Op, DAG);
   case ISD::FNEG:               return LowerFNEG(Op, DAG);
   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
+  case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
   case ISD::SETCC:              return LowerSETCC(Op, DAG);
   case ISD::VSETCC:             return LowerVSETCC(Op, DAG);
   case ISD::SELECT:             return LowerSELECT(Op, DAG);
@@ -9142,7 +9230,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
   case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
   case ISD::MUL:                return LowerMUL_V2I64(Op, DAG);
-  case ISD::SHL:                return LowerSHL(Op, DAG);
+  case ISD::SRA:
+  case ISD::SRL:
+  case ISD::SHL:                return LowerShift(Op, DAG);
   case ISD::SADDO:
   case ISD::UADDO:
   case ISD::SSUBO:
@@ -9309,6 +9399,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::UCOMI:              return "X86ISD::UCOMI";
   case X86ISD::SETCC:              return "X86ISD::SETCC";
   case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
+  case X86ISD::FSETCCsd:           return "X86ISD::FSETCCsd";
+  case X86ISD::FSETCCss:           return "X86ISD::FSETCCss";
   case X86ISD::CMOV:               return "X86ISD::CMOV";
   case X86ISD::BRCOND:             return "X86ISD::BRCOND";
   case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
@@ -10986,14 +11078,14 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
        UE = Uses.end(); UI != UE; ++UI) {
     SDNode *Extract = *UI;
 
-    // Compute the element's address.
+    // cOMpute the element's address.
     SDValue Idx = Extract->getOperand(1);
     unsigned EltSize =
         InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
     uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
     SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
 
-    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(),
+    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
                                      StackPtr, OffsetVal);
 
     // Load the scalar.
@@ -11266,15 +11358,28 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
   if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
     return SDValue();
 
+  SDValue FalseOp = N->getOperand(0);
+  SDValue TrueOp = N->getOperand(1);
+  X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
+  SDValue Cond = N->getOperand(3);
+  if (CC == X86::COND_E || CC == X86::COND_NE) {
+    switch (Cond.getOpcode()) {
+    default: break;
+    case X86ISD::BSR:
+    case X86ISD::BSF:
+      // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
+      if (DAG.isKnownNeverZero(Cond.getOperand(0)))
+        return (CC == X86::COND_E) ? FalseOp : TrueOp;
+    }
+  }
+
   // If this is a select between two integer constants, try to do some
   // optimizations.  Note that the operands are ordered the opposite of SELECT
   // operands.
-  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
-    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
+  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
+    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
       // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
       // larger than FalseC (the false value).
-      X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
-
       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
         CC = X86::GetOppositeBranchCondition(CC);
         std::swap(TrueC, FalseC);
@@ -11284,7 +11389,6 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
       // This is efficient for any integer data type (including i8/i16) and
       // shift amount.
       if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
-        SDValue Cond = N->getOperand(3);
         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                            DAG.getConstant(CC, MVT::i8), Cond);
 
@@ -11302,7 +11406,6 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
       // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.  This is efficient
       // for any integer data type, including i8/i16.
       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
-        SDValue Cond = N->getOperand(3);
         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                            DAG.getConstant(CC, MVT::i8), Cond);
 
@@ -11341,7 +11444,6 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
 
         if (isFastMultiplier) {
           APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
-          SDValue Cond = N->getOperand(3);
           Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                              DAG.getConstant(CC, MVT::i8), Cond);
           // Zero extend the condition if needed.
@@ -11576,12 +11678,94 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
 }
 
 
+// CMPEQCombine - Recognize the distinctive  (AND (setcc ...) (setcc ..))
+// where both setccs reference the same FP CMP, and rewrite for CMPEQSS
+// and friends.  Likewise for OR -> CMPNEQSS.
+static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
+                            TargetLowering::DAGCombinerInfo &DCI,
+                            const X86Subtarget *Subtarget) {
+  unsigned opcode;
+
+  // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
+  // we're requiring SSE2 for both.
+  if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    SDValue CMP0 = N0->getOperand(1);
+    SDValue CMP1 = N1->getOperand(1);
+    DebugLoc DL = N->getDebugLoc();
+
+    // The SETCCs should both refer to the same CMP.
+    if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
+      return SDValue();
+
+    SDValue CMP00 = CMP0->getOperand(0);
+    SDValue CMP01 = CMP0->getOperand(1);
+    EVT     VT    = CMP00.getValueType();
+
+    if (VT == MVT::f32 || VT == MVT::f64) {
+      bool ExpectingFlags = false;
+      // Check for any users that want flags:
+      for (SDNode::use_iterator UI = N->use_begin(),
+             UE = N->use_end();
+           !ExpectingFlags && UI != UE; ++UI)
+        switch (UI->getOpcode()) {
+        default:
+        case ISD::BR_CC:
+        case ISD::BRCOND:
+        case ISD::SELECT:
+          ExpectingFlags = true;
+          break;
+        case ISD::CopyToReg:
+        case ISD::SIGN_EXTEND:
+        case ISD::ZERO_EXTEND:
+        case ISD::ANY_EXTEND:
+          break;
+        }
+
+      if (!ExpectingFlags) {
+        enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
+        enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
+
+        if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
+          X86::CondCode tmp = cc0;
+          cc0 = cc1;
+          cc1 = tmp;
+        }
+
+        if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
+            (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
+          bool is64BitFP = (CMP00.getValueType() == MVT::f64);
+          X86ISD::NodeType NTOperator = is64BitFP ?
+            X86ISD::FSETCCsd : X86ISD::FSETCCss;
+          // FIXME: need symbolic constants for these magic numbers.
+          // See X86ATTInstPrinter.cpp:printSSECC().
+          unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
+          SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01,
+                                              DAG.getConstant(x86cc, MVT::i8));
+          SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32,
+                                              OnesOrZeroesF);
+          SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI,
+                                      DAG.getConstant(1, MVT::i32));
+          SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
+          return OneBitOfTruth;
+        }
+      }
+    }
+  }
+  return SDValue();
+}
+
 static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const X86Subtarget *Subtarget) {
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
+  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
+  if (R.getNode())
+    return R;
+
   // Want to form PANDN nodes, in the hopes of then easily combining them with
   // OR and AND nodes to form PBLEND/PSIGN.
   EVT VT = N->getValueType(0);
@@ -11611,6 +11795,10 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
+  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
+  if (R.getNode())
+    return R;
+
   EVT VT = N->getValueType(0);
   if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64 && VT != MVT::v2i64)
     return SDValue();
@@ -11978,6 +12166,26 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) {
   return SDValue();
 }
 
+static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, const X86TargetLowering *XTLI) {
+  DebugLoc dl = N->getDebugLoc();
+  SDValue Op0 = N->getOperand(0);
+  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
+  // a 32-bit target where SSE doesn't support i64->FP operations.
+  if (Op0.getOpcode() == ISD::LOAD) {
+    LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
+    EVT VT = Ld->getValueType(0);
+    if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
+        ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
+        !XTLI->getSubtarget()->is64Bit() &&
+        !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+      SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0), Ld->getChain(), Op0, DAG);
+      DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
+      return FILDChain;
+    }
+  }
+  return SDValue();
+}
+
 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
 static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
                                  X86TargetLowering::DAGCombinerInfo &DCI) {
@@ -12062,6 +12270,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
   case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
   case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
+  case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, this);
   case X86ISD::FXOR:
   case X86ISD::FOR:         return PerformFORCombine(N, DAG);
   case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
@@ -12491,12 +12700,16 @@ LowerXConstraint(EVT ConstraintVT) const {
 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
 /// vector.  If it is invalid, don't add anything to Ops.
 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
-                                                     char Constraint,
+                                                     std::string &Constraint,
                                                      std::vector<SDValue>&Ops,
                                                      SelectionDAG &DAG) const {
   SDValue Result(0, 0);
 
-  switch (Constraint) {
+  // Only support length 1 constraints for now.
+  if (Constraint.length() > 1) return;
+
+  char ConstraintLetter = Constraint[0];
+  switch (ConstraintLetter) {
   default: break;
   case 'I':
     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
@@ -12688,7 +12901,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
         return std::make_pair(0U, X86::GR8RegisterClass);
       if (VT == MVT::i16)
         return std::make_pair(0U, X86::GR16RegisterClass);
-      if (VT == MVT::i32 || !Subtarget->is64Bit())
+      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
         return std::make_pair(0U, X86::GR32RegisterClass);
       return std::make_pair(0U, X86::GR64RegisterClass);
     case 'R':   // LEGACY_REGS
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 6301057..d61a125 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -94,6 +94,15 @@ namespace llvm {
       // one's or all zero's.
       SETCC_CARRY,  // R = carry_bit ? ~0 : 0
 
+      /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
+      /// Operands are two FP values to compare; result is a mask of
+      /// 0s or 1s.  Generally DTRT for C/C++ with NaNs.
+      FSETCCss, FSETCCsd,
+
+      /// X86 MOVMSK{pd|ps}, extracts sign bits of two or four FP values,
+      /// result in an integer GPR.  Needs masking for scalar result.
+      FGETSIGNx86,
+
       /// X86 conditional moves. Operand 0 and operand 1 are the two values
       /// to select from. Operand 2 is the condition code, and operand 3 is the
       /// flag operand produced by a CMP or TEST instruction. It also writes a
@@ -592,7 +601,7 @@ namespace llvm {
     /// true it means one of the asm constraint of the inline asm instruction
     /// being processed is 'm'.
     virtual void LowerAsmOperandForConstraint(SDValue Op,
-                                              char ConstraintLetter,
+                                              std::string &Constraint,
                                               std::vector<SDValue> &Ops,
                                               SelectionDAG &DAG) const;
 
@@ -674,15 +683,15 @@ namespace llvm {
     /// or null if the target does not support "fast" ISel.
     virtual FastISel *createFastISel(FunctionLoweringInfo &funcInfo) const;
 
-    /// getFunctionAlignment - Return the Log2 alignment of this function.
-    virtual unsigned getFunctionAlignment(const Function *F) const;
-
     /// getStackCookieLocation - Return true if the target stores stack
     /// protector cookies at a fixed offset in some non-standard address
     /// space, and populates the address space and offset as
     /// appropriate.
     virtual bool getStackCookieLocation(unsigned &AddressSpace, unsigned &Offset) const;
 
+    SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot,
+                      SelectionDAG &DAG) const;
+
   protected:
     std::pair<const TargetRegisterClass*, uint8_t>
     findRepresentativeClass(EVT VT) const;
@@ -773,9 +782,7 @@ namespace llvm {
     SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
-    SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot,
-                      SelectionDAG &DAG) const;
+    SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBITCAST(SDValue op, SelectionDAG &DAG) const;
     SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
@@ -786,6 +793,7 @@ namespace llvm {
     SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerToBT(SDValue And, ISD::CondCode CC,
                       DebugLoc dl, SelectionDAG &DAG) const;
     SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
@@ -808,7 +816,7 @@ namespace llvm {
     SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerSHL(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const;
 
     SDValue LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
@@ -850,9 +858,10 @@ namespace llvm {
                              ISD::NodeType ExtendKind) const;
 
     virtual bool
-      CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
-                     const SmallVectorImpl<ISD::OutputArg> &Outs,
-                     LLVMContext &Context) const;
+    CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+		   bool isVarArg,
+		   const SmallVectorImpl<ISD::OutputArg> &Outs,
+		   LLVMContext &Context) const;
 
     void ReplaceATOMIC_BINARY_64(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                  SelectionDAG &DAG, unsigned NewOp) const;
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 4c915d9..33534cd 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -214,6 +214,30 @@ def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
 def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1),
           (SETBr)>;
 
+// (add OP, SETB) -> (adc OP, 0)
+def : Pat<(add (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR8:$op),
+          (ADC8ri GR8:$op, 0)>;
+def : Pat<(add (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR32:$op),
+          (ADC32ri8 GR32:$op, 0)>;
+def : Pat<(add (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR64:$op),
+          (ADC64ri8 GR64:$op, 0)>;
+
+// (sub OP, SETB) -> (sbb OP, 0)
+def : Pat<(sub GR8:$op, (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
+          (SBB8ri GR8:$op, 0)>;
+def : Pat<(sub GR32:$op, (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
+          (SBB32ri8 GR32:$op, 0)>;
+def : Pat<(sub GR64:$op, (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
+          (SBB64ri8 GR64:$op, 0)>;
+
+// (sub OP, SETCC_CARRY) -> (adc OP, 0)
+def : Pat<(sub GR8:$op, (i8 (X86setcc_c X86_COND_B, EFLAGS))),
+          (ADC8ri GR8:$op, 0)>;
+def : Pat<(sub GR32:$op, (i32 (X86setcc_c X86_COND_B, EFLAGS))),
+          (ADC32ri8 GR32:$op, 0)>;
+def : Pat<(sub GR64:$op, (i64 (X86setcc_c X86_COND_B, EFLAGS))),
+          (ADC64ri8 GR64:$op, 0)>;
+
 //===----------------------------------------------------------------------===//
 // String Pseudo Instructions
 //
@@ -519,85 +543,98 @@ def Int_MemBarrierNoSSE64  : RI<0x09, MRM1r, (outs), (ins GR64:$zero),
                            Requires<[In64BitMode]>, LOCK;
 
 
-// Optimized codegen when the non-memory output is not used.
+// RegOpc corresponds to the mr version of the instruction
+// ImmOpc corresponds to the mi version of the instruction
+// ImmOpc8 corresponds to the mi8 version of the instruction
+// ImmMod corresponds to the instruction format of the mi and mi8 versions
+multiclass LOCK_ArithBinOp<bits<8> RegOpc, bits<8> ImmOpc, bits<8> ImmOpc8,
+                           Format ImmMod, string mnemonic> {
 let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1 in {
-def LOCK_ADD8mr  : I<0x00, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
-                    "lock\n\t"
-                    "add{b}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
-def LOCK_ADD16mr  : I<0x01, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
-                    "lock\n\t"
-                    "add{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK;
-def LOCK_ADD32mr  : I<0x01, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
-                    "lock\n\t"
-                    "add{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
-def LOCK_ADD64mr : RI<0x01, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
-                      "lock\n\t"
-                      "add{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
 
-def LOCK_ADD8mi   : Ii8<0x80, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src2),
-                    "lock\n\t"
-                    "add{b}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
-def LOCK_ADD16mi  : Ii16<0x81, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src2),
-                    "lock\n\t"
-                     "add{w}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
-def LOCK_ADD32mi  : Ii32<0x81, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src2),
-                    "lock\n\t"
-                    "add{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
-def LOCK_ADD64mi32 : RIi32<0x81, MRM0m, (outs),
-                                        (ins i64mem:$dst, i64i32imm :$src2),
-                      "lock\n\t"
-                      "add{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
-
-def LOCK_ADD16mi8 : Ii8<0x83, MRM0m, (outs), (ins i16mem:$dst, i16i8imm :$src2),
-                    "lock\n\t"
-                    "add{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK;
-def LOCK_ADD32mi8 : Ii8<0x83, MRM0m, (outs), (ins i32mem:$dst, i32i8imm :$src2),
-                    "lock\n\t"
-                    "add{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
-def LOCK_ADD64mi8 : RIi8<0x83, MRM0m, (outs),
-                                      (ins i64mem:$dst, i64i8imm :$src2),
-                    "lock\n\t"
-                    "add{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
-
-def LOCK_SUB8mr   : I<0x28, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src2),
-                    "lock\n\t"
-                    "sub{b}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
-def LOCK_SUB16mr  : I<0x29, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
-                    "lock\n\t"
-                    "sub{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK;
-def LOCK_SUB32mr  : I<0x29, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
-                    "lock\n\t"
-                    "sub{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
-def LOCK_SUB64mr : RI<0x29, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
-                      "lock\n\t"
-                      "sub{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def #NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+                   RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 },
+                   MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
+                   !strconcat("lock\n\t", mnemonic, "{b}\t",
+                              "{$src2, $dst|$dst, $src2}"),
+                   []>, LOCK;
+def #NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+                    RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
+                    MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+                    !strconcat("lock\n\t", mnemonic, "{w}\t",
+                               "{$src2, $dst|$dst, $src2}"),
+                    []>, OpSize, LOCK;
+def #NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+                    RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
+                    MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+                    !strconcat("lock\n\t", mnemonic, "{l}\t",
+                               "{$src2, $dst|$dst, $src2}"),
+                    []>, LOCK;
+def #NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+                     RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
+                     MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+                     !strconcat("lock\n\t", mnemonic, "{q}\t",
+                                "{$src2, $dst|$dst, $src2}"),
+                     []>, LOCK;
+
+def #NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+                     ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 },
+                     ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2),
+                     !strconcat("lock\n\t", mnemonic, "{b}\t",
+                                "{$src2, $dst|$dst, $src2}"),
+                     []>, LOCK;
+
+def #NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+                       ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
+                       ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2),
+                       !strconcat("lock\n\t", mnemonic, "{w}\t",
+                                  "{$src2, $dst|$dst, $src2}"),
+                       []>, LOCK;
+
+def #NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+                       ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
+                       ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2),
+                       !strconcat("lock\n\t", mnemonic, "{l}\t",
+                                  "{$src2, $dst|$dst, $src2}"),
+                       []>, LOCK;
+
+def #NAME#64mi32 : RIi32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+                          ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
+                          ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2),
+                          !strconcat("lock\n\t", mnemonic, "{q}\t",
+                                     "{$src2, $dst|$dst, $src2}"),
+                          []>, LOCK;
+
+def #NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+                       ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+                       ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2),
+                       !strconcat("lock\n\t", mnemonic, "{w}\t",
+                                  "{$src2, $dst|$dst, $src2}"),
+                       []>, LOCK;
+def #NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+                       ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+                       ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2),
+                       !strconcat("lock\n\t", mnemonic, "{l}\t",
+                                  "{$src2, $dst|$dst, $src2}"),
+                       []>, LOCK;
+def #NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+                        ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+                        ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2),
+                        !strconcat("lock\n\t", mnemonic, "{q}\t",
+                                   "{$src2, $dst|$dst, $src2}"),
+                        []>, LOCK;
 
+}
 
-def LOCK_SUB8mi   : Ii8<0x80, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src2),
-                    "lock\n\t"
-                    "sub{b}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
-def LOCK_SUB16mi  : Ii16<0x81, MRM5m, (outs), (ins i16mem:$dst, i16imm:$src2),
-                    "lock\n\t"
-                    "sub{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK;
-def LOCK_SUB32mi  : Ii32<0x81, MRM5m, (outs), (ins i32mem:$dst, i32imm:$src2),
-                    "lock\n\t"
-                     "sub{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
-def LOCK_SUB64mi32 : RIi32<0x81, MRM5m, (outs),
-                                        (ins i64mem:$dst, i64i32imm:$src2),
-                      "lock\n\t"
-                      "sub{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+}
 
+defm LOCK_ADD : LOCK_ArithBinOp<0x00, 0x80, 0x83, MRM0m, "add">;
+defm LOCK_SUB : LOCK_ArithBinOp<0x28, 0x80, 0x83, MRM5m, "sub">;
+defm LOCK_OR  : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM1m, "or">;
+defm LOCK_AND : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM4m, "and">;
+defm LOCK_XOR : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM6m, "xor">;
 
-def LOCK_SUB16mi8 : Ii8<0x83, MRM5m, (outs), (ins i16mem:$dst, i16i8imm :$src2),
-                    "lock\n\t"
-                     "sub{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK;
-def LOCK_SUB32mi8 : Ii8<0x83, MRM5m, (outs), (ins i32mem:$dst, i32i8imm :$src2),
-                    "lock\n\t"
-                     "sub{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
-def LOCK_SUB64mi8 : RIi8<0x83, MRM5m, (outs),
-                                      (ins i64mem:$dst, i64i8imm :$src2),
-                      "lock\n\t"
-                      "sub{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+// Optimized codegen when the non-memory output is not used.
+let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1 in {
 
 def LOCK_INC8m  : I<0xFE, MRM0m, (outs), (ins i8mem :$dst),
                     "lock\n\t"
@@ -960,7 +997,8 @@ def : Pat<(extloadi64i32 addr:$src),
 
 // anyext. Define these to do an explicit zero-extend to
 // avoid partial-register updates.
-def : Pat<(i16 (anyext GR8 :$src)), (MOVZX16rr8  GR8 :$src)>;
+def : Pat<(i16 (anyext GR8 :$src)), (EXTRACT_SUBREG
+                                     (MOVZX32rr8 GR8 :$src), sub_16bit)>;
 def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8  GR8 :$src)>;
 
 // Except for i16 -> i32 since isel expect i16 ops to be promoted to i32.
@@ -1127,9 +1165,9 @@ def : Pat<(and GR32:$src1, 0xff),
       Requires<[In32BitMode]>;
 // r & (2^8-1) ==> movz
 def : Pat<(and GR16:$src1, 0xff),
-          (MOVZX16rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src1,
-                                                             GR16_ABCD)),
-                                      sub_8bit))>,
+           (EXTRACT_SUBREG (MOVZX32rr8 (EXTRACT_SUBREG
+            (i16 (COPY_TO_REGCLASS GR16:$src1, GR16_ABCD)), sub_8bit)),
+             sub_16bit)>,
       Requires<[In32BitMode]>;
 
 // r & (2^32-1) ==> movz
@@ -1147,7 +1185,8 @@ def : Pat<(and GR32:$src1, 0xff),
       Requires<[In64BitMode]>;
 // r & (2^8-1) ==> movz
 def : Pat<(and GR16:$src1, 0xff),
-           (MOVZX16rr8 (i8 (EXTRACT_SUBREG GR16:$src1, sub_8bit)))>,
+           (EXTRACT_SUBREG (MOVZX32rr8 (i8
+            (EXTRACT_SUBREG GR16:$src1, sub_8bit))), sub_16bit)>,
       Requires<[In64BitMode]>;
 
 
@@ -1159,10 +1198,11 @@ def : Pat<(sext_inreg GR32:$src, i8),
                                                              GR32_ABCD)),
                                       sub_8bit))>,
       Requires<[In32BitMode]>;
+
 def : Pat<(sext_inreg GR16:$src, i8),
-          (MOVSX16rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src,
-                                                             GR16_ABCD)),
-                                      sub_8bit))>,
+           (EXTRACT_SUBREG (i32 (MOVSX32rr8 (EXTRACT_SUBREG
+            (i32 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), sub_8bit))),
+             sub_16bit)>,
       Requires<[In32BitMode]>;
 
 def : Pat<(sext_inreg GR64:$src, i32),
@@ -1175,9 +1215,19 @@ def : Pat<(sext_inreg GR32:$src, i8),
           (MOVSX32rr8 (EXTRACT_SUBREG GR32:$src, sub_8bit))>,
       Requires<[In64BitMode]>;
 def : Pat<(sext_inreg GR16:$src, i8),
-          (MOVSX16rr8 (i8 (EXTRACT_SUBREG GR16:$src, sub_8bit)))>,
+           (EXTRACT_SUBREG (MOVSX32rr8
+            (EXTRACT_SUBREG GR16:$src, sub_8bit)), sub_16bit)>,
       Requires<[In64BitMode]>;
 
+// sext, sext_load, zext, zext_load
+def: Pat<(i16 (sext GR8:$src)),
+          (EXTRACT_SUBREG (MOVSX32rr8 GR8:$src), sub_16bit)>;
+def: Pat<(sextloadi16i8 addr:$src),
+          (EXTRACT_SUBREG (MOVSX32rm8 addr:$src), sub_16bit)>;
+def: Pat<(i16 (zext GR8:$src)),
+          (EXTRACT_SUBREG (MOVZX32rr8 GR8:$src), sub_16bit)>;
+def: Pat<(zextloadi16i8 addr:$src),
+          (EXTRACT_SUBREG (MOVZX32rm8 addr:$src), sub_16bit)>;
 
 // trunc patterns
 def : Pat<(i16 (trunc GR32:$src)),
@@ -1474,12 +1524,6 @@ def : Pat<(mul (loadi16 addr:$src1), i16immSExt8:$src2),
 def : Pat<(mul (loadi32 addr:$src1), i32immSExt8:$src2),
           (IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>;
 
-// Optimize multiply by 2 with EFLAGS result.
-let AddedComplexity = 2 in {
-def : Pat<(X86smul_flag GR16:$src1, 2), (ADD16rr GR16:$src1, GR16:$src1)>;
-def : Pat<(X86smul_flag GR32:$src1, 2), (ADD32rr GR32:$src1, GR32:$src1)>;
-}
-
 // Patterns for nodes that do not produce flags, for instructions that do.
 
 // addition
diff --git a/lib/Target/X86/X86InstrExtension.td b/lib/Target/X86/X86InstrExtension.td
index 867c0f8..2e1d523 100644
--- a/lib/Target/X86/X86InstrExtension.td
+++ b/lib/Target/X86/X86InstrExtension.td
@@ -38,22 +38,11 @@ let neverHasSideEffects = 1 in {
 
 
 // Sign/Zero extenders
-// Use movsbl intead of movsbw; we don't care about the high 16 bits
-// of the register here. This has a smaller encoding and avoids a
-// partial-register update.  Actual movsbw included for the disassembler.
-def MOVSX16rr8W : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
-                    "movs{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
-def MOVSX16rm8W : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
-                    "movs{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
-
-// FIXME: Use a pat pattern or define a syntax here.                    
-let isCodeGenOnly=1 in {
-def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8 :$src),
-                   "", [(set GR16:$dst, (sext GR8:$src))]>, TB;
-def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem :$src),
-                   "", [(set GR16:$dst, (sextloadi16i8 addr:$src))]>, TB;
-}
-def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src),
+def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
+                   "movs{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
+                   "movs{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8:$src),
                    "movs{bl|x}\t{$src, $dst|$dst, $src}",
                    [(set GR32:$dst, (sext GR8:$src))]>, TB;
 def MOVSX32rm8 : I<0xBE, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
@@ -66,20 +55,10 @@ def MOVSX32rm16: I<0xBF, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
                    "movs{wl|x}\t{$src, $dst|$dst, $src}",
                    [(set GR32:$dst, (sextloadi32i16 addr:$src))]>, TB;
 
-// Use movzbl intead of movzbw; we don't care about the high 16 bits
-// of the register here. This has a smaller encoding and avoids a
-// partial-register update.  Actual movzbw included for the disassembler.
-def MOVZX16rr8W : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
-                    "movz{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
-def MOVZX16rm8W : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
-                    "movz{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;  
-// FIXME: Use a pat pattern or define a syntax here.                    
-let isCodeGenOnly=1 in {
-def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8 :$src),
-                   "", [(set GR16:$dst, (zext GR8:$src))]>, TB;
-def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem :$src),
-                   "", [(set GR16:$dst, (zextloadi16i8 addr:$src))]>, TB;
-}
+def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
+                   "movz{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
+                   "movz{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
 def MOVZX32rr8 : I<0xB6, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src),
                    "movz{bl|x}\t{$src, $dst|$dst, $src}",
                    [(set GR32:$dst, (zext GR8:$src))]>, TB;
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 3cbfac1..7c9a9f7 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -38,8 +38,11 @@ def X86fxor    : SDNode<"X86ISD::FXOR",      SDTFPBinOp,
 def X86frsqrt  : SDNode<"X86ISD::FRSQRT",    SDTFPUnaryOp>;
 def X86frcp    : SDNode<"X86ISD::FRCP",      SDTFPUnaryOp>;
 def X86fsrl    : SDNode<"X86ISD::FSRL",      SDTX86FPShiftOp>;
+def X86fgetsign: SDNode<"X86ISD::FGETSIGNx86",SDTFPToIntOp>;
 def X86comi    : SDNode<"X86ISD::COMI",      SDTX86CmpTest>;
 def X86ucomi   : SDNode<"X86ISD::UCOMI",     SDTX86CmpTest>;
+def X86cmpss   : SDNode<"X86ISD::FSETCCss",    SDTX86Cmpss>;
+def X86cmpsd   : SDNode<"X86ISD::FSETCCsd",    SDTX86Cmpsd>;
 def X86pshufb  : SDNode<"X86ISD::PSHUFB",
                  SDTypeProfile<1, 2, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
                                       SDTCisSameAs<0,2>]>>;
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 83f0260..e2016eb 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -2015,62 +2015,48 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
                                       bool isStackAligned,
                                       const TargetMachine &TM,
                                       bool load) {
-  switch (RC->getID()) {
+  switch (RC->getSize()) {
   default:
-    llvm_unreachable("Unknown regclass");
-  case X86::GR64RegClassID:
-  case X86::GR64_ABCDRegClassID:
-  case X86::GR64_NOREXRegClassID:
-  case X86::GR64_NOREX_NOSPRegClassID:
-  case X86::GR64_NOSPRegClassID:
-  case X86::GR64_TCRegClassID:
-  case X86::GR64_TCW64RegClassID:
-    return load ? X86::MOV64rm : X86::MOV64mr;
-  case X86::GR32RegClassID:
-  case X86::GR32_ABCDRegClassID:
-  case X86::GR32_ADRegClassID:
-  case X86::GR32_NOREXRegClassID:
-  case X86::GR32_NOSPRegClassID:
-  case X86::GR32_TCRegClassID:
-    return load ? X86::MOV32rm : X86::MOV32mr;
-  case X86::GR16RegClassID:
-  case X86::GR16_ABCDRegClassID:
-  case X86::GR16_NOREXRegClassID:
-    return load ? X86::MOV16rm : X86::MOV16mr;
-  case X86::GR8RegClassID:
-    // Copying to or from a physical H register on x86-64 requires a NOREX
-    // move.  Otherwise use a normal move.
-    if (isHReg(Reg) &&
-        TM.getSubtarget<X86Subtarget>().is64Bit())
-      return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
-    else
-      return load ? X86::MOV8rm : X86::MOV8mr;
-  case X86::GR8_ABCD_LRegClassID:
-  case X86::GR8_NOREXRegClassID:
-    return load ? X86::MOV8rm :X86::MOV8mr;
-  case X86::GR8_ABCD_HRegClassID:
+    llvm_unreachable("Unknown spill size");
+  case 1:
+    assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass");
     if (TM.getSubtarget<X86Subtarget>().is64Bit())
-      return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
-    else
-      return load ? X86::MOV8rm : X86::MOV8mr;
-  case X86::RFP80RegClassID:
+      // Copying to or from a physical H register on x86-64 requires a NOREX
+      // move.  Otherwise use a normal move.
+      if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC))
+        return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
+    return load ? X86::MOV8rm : X86::MOV8mr;
+  case 2:
+    assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
+    return load ? X86::MOV16rm : X86::MOV16mr;
+  case 4:
+    if (X86::GR32RegClass.hasSubClassEq(RC))
+      return load ? X86::MOV32rm : X86::MOV32mr;
+    if (X86::FR32RegClass.hasSubClassEq(RC))
+      return load ? X86::MOVSSrm : X86::MOVSSmr;
+    if (X86::RFP32RegClass.hasSubClassEq(RC))
+      return load ? X86::LD_Fp32m : X86::ST_Fp32m;
+    llvm_unreachable("Unknown 4-byte regclass");
+  case 8:
+    if (X86::GR64RegClass.hasSubClassEq(RC))
+      return load ? X86::MOV64rm : X86::MOV64mr;
+    if (X86::FR64RegClass.hasSubClassEq(RC))
+      return load ? X86::MOVSDrm : X86::MOVSDmr;
+    if (X86::VR64RegClass.hasSubClassEq(RC))
+      return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
+    if (X86::RFP64RegClass.hasSubClassEq(RC))
+      return load ? X86::LD_Fp64m : X86::ST_Fp64m;
+    llvm_unreachable("Unknown 8-byte regclass");
+  case 10:
+    assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
     return load ? X86::LD_Fp80m : X86::ST_FpP80m;
-  case X86::RFP64RegClassID:
-    return load ? X86::LD_Fp64m : X86::ST_Fp64m;
-  case X86::RFP32RegClassID:
-    return load ? X86::LD_Fp32m : X86::ST_Fp32m;
-  case X86::FR32RegClassID:
-    return load ? X86::MOVSSrm : X86::MOVSSmr;
-  case X86::FR64RegClassID:
-    return load ? X86::MOVSDrm : X86::MOVSDmr;
-  case X86::VR128RegClassID:
+  case 16:
+    assert(X86::VR128RegClass.hasSubClassEq(RC) && "Unknown 16-byte regclass");
     // If stack is realigned we can use aligned stores.
     if (isStackAligned)
       return load ? X86::MOVAPSrm : X86::MOVAPSmr;
     else
       return load ? X86::MOVUPSrm : X86::MOVUPSmr;
-  case X86::VR64RegClassID:
-    return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
   }
 }
 
@@ -2434,7 +2420,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
       Alignment = 4;
       break;
     default:
-      llvm_unreachable("Don't know how to fold this instruction!");
+      return 0;
     }
   if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
     unsigned NewOpc = 0;
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 8da68b5..d895023 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -449,7 +449,6 @@ namespace X86II {
     SSEDomainShift = SegOvrShift + 2,
 
     OpcodeShift   = SSEDomainShift + 2,
-    OpcodeMask    = 0xFFULL << OpcodeShift,
 
     //===------------------------------------------------------------------===//
     /// VEX - The opcode prefix used by AVX instructions
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 03a0b0c..8cab808 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -23,6 +23,9 @@ def SDTIntShiftDOp: SDTypeProfile<1, 3,
 
 def SDTX86CmpTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<1, 2>]>;
 
+def SDTX86Cmpsd : SDTypeProfile<1, 3, [SDTCisVT<0, f64>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+def SDTX86Cmpss : SDTypeProfile<1, 3, [SDTCisVT<0, f32>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+
 def SDTX86Cmov    : SDTypeProfile<1, 4,
                                   [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
                                    SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
@@ -1485,6 +1488,7 @@ defm : FpUnaryAlias<"fucompi",  UCOM_FIPr>;
 // solely because gas supports it.
 def : InstAlias<"faddp %st(0), $op", (ADD_FPrST0 RST:$op), 0>;
 def : InstAlias<"fmulp %st(0), $op", (MUL_FPrST0 RST:$op)>;
+def : InstAlias<"fsubp %st(0), $op", (SUBR_FPrST0 RST:$op)>;
 def : InstAlias<"fsubrp %st(0), $op", (SUB_FPrST0 RST:$op)>;
 def : InstAlias<"fdivp %st(0), $op", (DIVR_FPrST0 RST:$op)>;
 def : InstAlias<"fdivrp %st(0), $op", (DIV_FPrST0 RST:$op)>;
@@ -1546,8 +1550,8 @@ def : InstAlias<"movq $src, $dst",
 def : InstAlias<"movsd", (MOVSD)>;
 
 // movsx aliases
-def : InstAlias<"movsx $src, $dst", (MOVSX16rr8W GR16:$dst, GR8:$src), 0>;
-def : InstAlias<"movsx $src, $dst", (MOVSX16rm8W GR16:$dst, i8mem:$src), 0>;
+def : InstAlias<"movsx $src, $dst", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>;
+def : InstAlias<"movsx $src, $dst", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>;
 def : InstAlias<"movsx $src, $dst", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>;
 def : InstAlias<"movsx $src, $dst", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>;
 def : InstAlias<"movsx $src, $dst", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>;
@@ -1555,8 +1559,8 @@ def : InstAlias<"movsx $src, $dst", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>;
 def : InstAlias<"movsx $src, $dst", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>;
 
 // movzx aliases
-def : InstAlias<"movzx $src, $dst", (MOVZX16rr8W GR16:$dst, GR8:$src), 0>;
-def : InstAlias<"movzx $src, $dst", (MOVZX16rm8W GR16:$dst, i8mem:$src), 0>;
+def : InstAlias<"movzx $src, $dst", (MOVZX16rr8 GR16:$dst, GR8:$src), 0>;
+def : InstAlias<"movzx $src, $dst", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0>;
 def : InstAlias<"movzx $src, $dst", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>;
 def : InstAlias<"movzx $src, $dst", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>;
 def : InstAlias<"movzx $src, $dst", (MOVZX64rr8_Q GR64:$dst, GR8:$src), 0>;
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index bb2165a..b2d9fca 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -285,7 +285,7 @@ let Constraints = "$src1 = $dst" in
 defm MMX_PAND : MMXI_binop_rm_int<0xDB, "pand", int_x86_mmx_pand, 1>;
 defm MMX_POR  : MMXI_binop_rm_int<0xEB, "por" , int_x86_mmx_por,  1>;
 defm MMX_PXOR : MMXI_binop_rm_int<0xEF, "pxor", int_x86_mmx_pxor, 1>;
-defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn, 1>;
+defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn>;
 
 // Shift Instructions
 defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw",
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index cde3f6b..b64c03a 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -1056,13 +1056,37 @@ let neverHasSideEffects = 1 in {
                   XD, VEX_4V;
 }
 
+let Constraints = "$src1 = $dst" in {
+def CMPSSrr : SIi8<0xC2, MRMSrcReg,
+                  (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, SSECC:$cc),
+                  "cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
+                  [(set FR32:$dst, (X86cmpss (f32 FR32:$src1), FR32:$src2, imm:$cc))]>, XS;
+def CMPSSrm : SIi8<0xC2, MRMSrcMem,
+                  (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, SSECC:$cc),
+                  "cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
+                  [(set FR32:$dst, (X86cmpss (f32 FR32:$src1), (loadf32 addr:$src2), imm:$cc))]>, XS;
+def CMPSDrr : SIi8<0xC2, MRMSrcReg,
+                  (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, SSECC:$cc),
+                  "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
+                  [(set FR64:$dst, (X86cmpsd (f64 FR64:$src1), FR64:$src2, imm:$cc))]>, XD;
+def CMPSDrm : SIi8<0xC2, MRMSrcMem,
+                  (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, SSECC:$cc),
+                  "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
+                  [(set FR64:$dst, (X86cmpsd (f64 FR64:$src1), (loadf64 addr:$src2), imm:$cc))]>, XD;
+}
 let Constraints = "$src1 = $dst", neverHasSideEffects = 1 in {
-  defm CMPSS  : sse12_cmp_scalar<FR32, f32mem,
-                    "cmp${cc}ss\t{$src, $dst|$dst, $src}",
-                    "cmpss\t{$src2, $src, $dst|$dst, $src, $src2}">, XS;
-  defm CMPSD  : sse12_cmp_scalar<FR64, f64mem,
-                    "cmp${cc}sd\t{$src, $dst|$dst, $src}",
-                    "cmpsd\t{$src2, $src, $dst|$dst, $src, $src2}">, XD;
+def CMPSSrr_alt : SIi8<0xC2, MRMSrcReg,
+                  (outs FR32:$dst), (ins FR32:$src1, FR32:$src, i8imm:$src2),
+                  "cmpss\t{$src2, $src, $dst|$dst, $src, $src2}", []>, XS;
+def CMPSSrm_alt : SIi8<0xC2, MRMSrcMem,
+                  (outs FR32:$dst), (ins FR32:$src1, f32mem:$src, i8imm:$src2),
+                  "cmpss\t{$src2, $src, $dst|$dst, $src, $src2}", []>, XS;
+def CMPSDrr_alt : SIi8<0xC2, MRMSrcReg,
+                  (outs FR64:$dst), (ins FR64:$src1, FR64:$src, i8imm:$src2),
+                  "cmpsd\t{$src2, $src, $dst|$dst, $src, $src2}", []>, XD;
+def CMPSDrm_alt : SIi8<0xC2, MRMSrcMem,
+                  (outs FR64:$dst), (ins FR64:$src1, f64mem:$src, i8imm:$src2),
+                  "cmpsd\t{$src2, $src, $dst|$dst, $src, $src2}", []>, XD;
 }
 
 multiclass sse12_cmp_scalar_int<RegisterClass RC, X86MemOperand x86memop,
@@ -1327,11 +1351,6 @@ multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm,
 }
 
 // Mask creation
-defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps",
-                                     SSEPackedSingle>, TB;
-defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd",
-                                     SSEPackedDouble>, TB, OpSize;
-
 defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps,
                                       "movmskps", SSEPackedSingle>, VEX;
 defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd,
@@ -1342,6 +1361,24 @@ defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256,
 defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256,
                                       "movmskpd", SSEPackedDouble>, OpSize,
                                       VEX;
+defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps",
+                                     SSEPackedSingle>, TB;
+defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd",
+                                     SSEPackedDouble>, TB, OpSize;
+
+// X86fgetsign
+def MOVMSKPDrr32_alt : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins FR64:$src),
+                    "movmskpd\t{$src, $dst|$dst, $src}",
+                    [(set GR32:$dst, (X86fgetsign FR64:$src))], SSEPackedDouble>, TB, OpSize;
+def MOVMSKPDrr64_alt : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins FR64:$src),
+                    "movmskpd\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (X86fgetsign FR64:$src))], SSEPackedDouble>, TB, OpSize;
+def MOVMSKPSrr32_alt : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins FR32:$src),
+                    "movmskps\t{$src, $dst|$dst, $src}",
+                    [(set GR32:$dst, (X86fgetsign FR32:$src))], SSEPackedSingle>, TB;
+def MOVMSKPSrr64_alt : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins FR32:$src),
+                    "movmskps\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (X86fgetsign FR32:$src))], SSEPackedSingle>, TB;
 
 // Assembler Only
 def VMOVMSKPSr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
@@ -1875,21 +1912,6 @@ defm RCP   : sse1_fp_unop_s<0x53, "rcp", X86frcp, int_x86_sse_rcp_ss>,
 // SSE 1 & 2 - Non-temporal stores
 //===----------------------------------------------------------------------===//
 
-def VMOVNTPSmr_Int : VPSI<0x2B, MRMDestMem, (outs),
-                       (ins i128mem:$dst, VR128:$src),
-                       "movntps\t{$src, $dst|$dst, $src}",
-                       [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>, VEX;
-def VMOVNTPDmr_Int : VPDI<0x2B, MRMDestMem, (outs),
-                       (ins i128mem:$dst, VR128:$src),
-                       "movntpd\t{$src, $dst|$dst, $src}",
-                       [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>, VEX;
-
-let ExeDomain = SSEPackedInt in
-  def VMOVNTDQmr_Int : VPDI<0xE7, MRMDestMem, (outs),
-                     (ins f128mem:$dst, VR128:$src),
-                     "movntdq\t{$src, $dst|$dst, $src}",
-                     [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>, VEX;
-
 let AddedComplexity = 400 in { // Prefer non-temporal versions
   def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
                        (ins f128mem:$dst, VR128:$src),
@@ -1906,12 +1928,16 @@ let AddedComplexity = 400 in { // Prefer non-temporal versions
                         "movntdq\t{$src, $dst|$dst, $src}",
                         [(alignednontemporalstore (v2f64 VR128:$src),
                                                   addr:$dst)]>, VEX;
+
   let ExeDomain = SSEPackedInt in
-  def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs),
-                      (ins f128mem:$dst, VR128:$src),
-                      "movntdq\t{$src, $dst|$dst, $src}",
-                      [(alignednontemporalstore (v4f32 VR128:$src),
-                                                addr:$dst)]>, VEX;
+  def VMOVNTDQmr    : VPDI<0xE7, MRMDestMem, (outs),
+                           (ins f128mem:$dst, VR128:$src),
+                           "movntdq\t{$src, $dst|$dst, $src}",
+                           [(alignednontemporalstore (v4f32 VR128:$src),
+                                                     addr:$dst)]>, VEX;
+
+  def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
+            (VMOVNTDQmr addr:$dst, VR128:$src)>, Requires<[HasAVX]>;
 
   def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
                        (ins f256mem:$dst, VR256:$src),
@@ -1943,18 +1969,6 @@ def : Pat<(int_x86_avx_movnt_pd_256 addr:$dst, VR256:$src),
 def : Pat<(int_x86_avx_movnt_ps_256 addr:$dst, VR256:$src),
           (VMOVNTPSYmr addr:$dst, VR256:$src)>;
 
-def MOVNTPSmr_Int : PSI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
-                    "movntps\t{$src, $dst|$dst, $src}",
-                    [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>;
-def MOVNTPDmr_Int : PDI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
-                        "movntpd\t{$src, $dst|$dst, $src}",
-                        [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>;
-
-let ExeDomain = SSEPackedInt in
-def MOVNTDQmr_Int : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
-                        "movntdq\t{$src, $dst|$dst, $src}",
-                        [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>;
-
 let AddedComplexity = 400 in { // Prefer non-temporal versions
 def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movntps\t{$src, $dst|$dst, $src}",
@@ -1972,22 +1986,19 @@ def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movntdq\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
 
+def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
+          (MOVNTDQmr addr:$dst, VR128:$src)>;
+
 // There is no AVX form for instructions below this point
 def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
                  "movnti\t{$src, $dst|$dst, $src}",
                  [(nontemporalstore (i32 GR32:$src), addr:$dst)]>,
                TB, Requires<[HasSSE2]>;
-
 def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
                      "movnti\t{$src, $dst|$dst, $src}",
                      [(nontemporalstore (i64 GR64:$src), addr:$dst)]>,
                   TB, Requires<[HasSSE2]>;
-
 }
-def MOVNTImr_Int  :   I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
-                    "movnti\t{$src, $dst|$dst, $src}",
-                    [(int_x86_sse2_movnt_i addr:$dst, GR32:$src)]>,
-                  TB, Requires<[HasSSE2]>;
 
 //===----------------------------------------------------------------------===//
 // SSE 1 & 2 - Misc Instructions (No AVX form)
@@ -4733,14 +4744,14 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in {
     def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                     (ins VR128:$src1, VR128:$src2),
                     !strconcat(OpcodeStr,
-                     "\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}"),
+                     "\t{$src2, $dst|$dst, $src2}"),
                     [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>,
                     OpSize;
 
     def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                     (ins VR128:$src1, i128mem:$src2),
                     !strconcat(OpcodeStr,
-                     "\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}"),
+                     "\t{$src2, $dst|$dst, $src2}"),
                     [(set VR128:$dst,
                       (IntId VR128:$src1,
                        (bitconvert (memopv16i8 addr:$src2)), XMM0))]>, OpSize;
@@ -4961,66 +4972,66 @@ defm PCMPESTRIZ : SS42AI_pcmpestri<int_x86_sse42_pcmpestriz128>;
 // This set of instructions are only rm, the only difference is the size
 // of r and m.
 let Constraints = "$src1 = $dst" in {
-  def CRC32m8  : SS42FI<0xF0, MRMSrcMem, (outs GR32:$dst),
+  def CRC32r32m8  : SS42FI<0xF0, MRMSrcMem, (outs GR32:$dst),
                       (ins GR32:$src1, i8mem:$src2),
                       "crc32{b} \t{$src2, $src1|$src1, $src2}",
                        [(set GR32:$dst,
-                         (int_x86_sse42_crc32_8 GR32:$src1,
+                         (int_x86_sse42_crc32_32_8 GR32:$src1,
                          (load addr:$src2)))]>;
-  def CRC32r8  : SS42FI<0xF0, MRMSrcReg, (outs GR32:$dst),
+  def CRC32r32r8  : SS42FI<0xF0, MRMSrcReg, (outs GR32:$dst),
                       (ins GR32:$src1, GR8:$src2),
                       "crc32{b} \t{$src2, $src1|$src1, $src2}",
                        [(set GR32:$dst,
-                         (int_x86_sse42_crc32_8 GR32:$src1, GR8:$src2))]>;
-  def CRC32m16  : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
+                         (int_x86_sse42_crc32_32_8 GR32:$src1, GR8:$src2))]>;
+  def CRC32r32m16  : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
                       (ins GR32:$src1, i16mem:$src2),
                       "crc32{w} \t{$src2, $src1|$src1, $src2}",
                        [(set GR32:$dst,
-                         (int_x86_sse42_crc32_16 GR32:$src1,
+                         (int_x86_sse42_crc32_32_16 GR32:$src1,
                          (load addr:$src2)))]>,
                          OpSize;
-  def CRC32r16  : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
+  def CRC32r32r16  : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
                       (ins GR32:$src1, GR16:$src2),
                       "crc32{w} \t{$src2, $src1|$src1, $src2}",
                        [(set GR32:$dst,
-                         (int_x86_sse42_crc32_16 GR32:$src1, GR16:$src2))]>,
+                         (int_x86_sse42_crc32_32_16 GR32:$src1, GR16:$src2))]>,
                          OpSize;
-  def CRC32m32  : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
+  def CRC32r32m32  : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
                       (ins GR32:$src1, i32mem:$src2),
                       "crc32{l} \t{$src2, $src1|$src1, $src2}",
                        [(set GR32:$dst,
-                         (int_x86_sse42_crc32_32 GR32:$src1,
+                         (int_x86_sse42_crc32_32_32 GR32:$src1,
                          (load addr:$src2)))]>;
-  def CRC32r32  : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
+  def CRC32r32r32  : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
                       (ins GR32:$src1, GR32:$src2),
                       "crc32{l} \t{$src2, $src1|$src1, $src2}",
                        [(set GR32:$dst,
-                         (int_x86_sse42_crc32_32 GR32:$src1, GR32:$src2))]>;
-  def CRC64m8  : SS42FI<0xF0, MRMSrcMem, (outs GR64:$dst),
+                         (int_x86_sse42_crc32_32_32 GR32:$src1, GR32:$src2))]>;
+  def CRC32r64m8  : SS42FI<0xF0, MRMSrcMem, (outs GR64:$dst),
                       (ins GR64:$src1, i8mem:$src2),
                       "crc32{b} \t{$src2, $src1|$src1, $src2}",
                        [(set GR64:$dst,
-                         (int_x86_sse42_crc64_8 GR64:$src1,
+                         (int_x86_sse42_crc32_64_8 GR64:$src1,
                          (load addr:$src2)))]>,
                          REX_W;
-  def CRC64r8  : SS42FI<0xF0, MRMSrcReg, (outs GR64:$dst),
+  def CRC32r64r8  : SS42FI<0xF0, MRMSrcReg, (outs GR64:$dst),
                       (ins GR64:$src1, GR8:$src2),
                       "crc32{b} \t{$src2, $src1|$src1, $src2}",
                        [(set GR64:$dst,
-                         (int_x86_sse42_crc64_8 GR64:$src1, GR8:$src2))]>,
+                         (int_x86_sse42_crc32_64_8 GR64:$src1, GR8:$src2))]>,
                          REX_W;
-  def CRC64m64  : SS42FI<0xF1, MRMSrcMem, (outs GR64:$dst),
+  def CRC32r64m64  : SS42FI<0xF1, MRMSrcMem, (outs GR64:$dst),
                       (ins GR64:$src1, i64mem:$src2),
                       "crc32{q} \t{$src2, $src1|$src1, $src2}",
                        [(set GR64:$dst,
-                         (int_x86_sse42_crc64_64 GR64:$src1,
+                         (int_x86_sse42_crc32_64_64 GR64:$src1,
                          (load addr:$src2)))]>,
                          REX_W;
-  def CRC64r64  : SS42FI<0xF1, MRMSrcReg, (outs GR64:$dst),
+  def CRC32r64r64  : SS42FI<0xF1, MRMSrcReg, (outs GR64:$dst),
                       (ins GR64:$src1, GR64:$src2),
                       "crc32{q} \t{$src2, $src1|$src1, $src2}",
                        [(set GR64:$dst,
-                         (int_x86_sse42_crc64_64 GR64:$src1, GR64:$src2))]>,
+                         (int_x86_sse42_crc32_64_64 GR64:$src1, GR64:$src2))]>,
                          REX_W;
 }
 
diff --git a/lib/Target/X86/X86MCAsmInfo.cpp b/lib/Target/X86/X86MCAsmInfo.cpp
index 83bba52..2e1ec63 100644
--- a/lib/Target/X86/X86MCAsmInfo.cpp
+++ b/lib/Target/X86/X86MCAsmInfo.cpp
@@ -108,8 +108,6 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) {
   // Exceptions handling
   ExceptionsType = ExceptionHandling::DwarfCFI;
 
-  DwarfRequiresFrameSection = false;
-
   // OpenBSD has buggy support for .quad in 32-bit mode, just split into two
   // .words.
   if (T.getOS() == Triple::OpenBSD && T.getArch() == Triple::x86)
diff --git a/lib/Target/X86/X86MCCodeEmitter.cpp b/lib/Target/X86/X86MCCodeEmitter.cpp
index f195a67..55aceba 100644
--- a/lib/Target/X86/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/X86MCCodeEmitter.cpp
@@ -1015,7 +1015,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
     } else {
       unsigned FixupKind;
       // FIXME: Is there a better way to know that we need a signed relocation?
-      if (MI.getOpcode() == X86::MOV64ri32 ||
+      if (MI.getOpcode() == X86::ADD64ri32 ||
+          MI.getOpcode() == X86::MOV64ri32 ||
           MI.getOpcode() == X86::MOV64mi32 ||
           MI.getOpcode() == X86::PUSH64i32)
         FixupKind = X86::reloc_signed_4byte;
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index cbe6db2..793156f 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -355,10 +355,6 @@ ReSimplify:
     assert(OutMI.getOperand(1+X86::AddrSegmentReg).getReg() == 0 &&
            "LEA has segment specified!");
     break;
-  case X86::MOVZX16rr8:   LowerSubReg32_Op0(OutMI, X86::MOVZX32rr8); break;
-  case X86::MOVZX16rm8:   LowerSubReg32_Op0(OutMI, X86::MOVZX32rm8); break;
-  case X86::MOVSX16rr8:   LowerSubReg32_Op0(OutMI, X86::MOVSX32rr8); break;
-  case X86::MOVSX16rm8:   LowerSubReg32_Op0(OutMI, X86::MOVSX32rm8); break;
   case X86::MOVZX64rr32:  LowerSubReg32_Op0(OutMI, X86::MOV32rr); break;
   case X86::MOVZX64rm32:  LowerSubReg32_Op0(OutMI, X86::MOV32rm); break;
   case X86::MOV64ri64i32: LowerSubReg32_Op0(OutMI, X86::MOV32ri); break;
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 37fb0fe..1ad6203 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -73,29 +73,61 @@ X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm,
   }
 }
 
-/// getDwarfRegNum - This function maps LLVM register identifiers to the DWARF
-/// specific numbering, used in debug info and exception tables.
-int X86RegisterInfo::getDwarfRegNum(unsigned RegNo, bool isEH) const {
-  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
-  unsigned Flavour = DWARFFlavour::X86_64;
-
+static unsigned getFlavour(const X86Subtarget *Subtarget, bool isEH) {
   if (!Subtarget->is64Bit()) {
     if (Subtarget->isTargetDarwin()) {
       if (isEH)
-        Flavour = DWARFFlavour::X86_32_DarwinEH;
+        return DWARFFlavour::X86_32_DarwinEH;
       else
-        Flavour = DWARFFlavour::X86_32_Generic;
+        return DWARFFlavour::X86_32_Generic;
     } else if (Subtarget->isTargetCygMing()) {
       // Unsupported by now, just quick fallback
-      Flavour = DWARFFlavour::X86_32_Generic;
+      return DWARFFlavour::X86_32_Generic;
     } else {
-      Flavour = DWARFFlavour::X86_32_Generic;
+      return DWARFFlavour::X86_32_Generic;
     }
   }
+  return DWARFFlavour::X86_64;
+}
+
+/// getDwarfRegNum - This function maps LLVM register identifiers to the DWARF
+/// specific numbering, used in debug info and exception tables.
+int X86RegisterInfo::getDwarfRegNum(unsigned RegNo, bool isEH) const {
+  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
+  unsigned Flavour = getFlavour(Subtarget, isEH);
 
   return X86GenRegisterInfo::getDwarfRegNumFull(RegNo, Flavour);
 }
 
+/// getLLVMRegNum - This function maps DWARF register numbers to LLVM register.
+int X86RegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const {
+  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
+  unsigned Flavour = getFlavour(Subtarget, isEH);
+
+  return X86GenRegisterInfo::getLLVMRegNumFull(DwarfRegNo, Flavour);
+}
+
+int
+X86RegisterInfo::getSEHRegNum(unsigned i) const {
+  int reg = getX86RegNum(i);
+  switch (i) {
+  case X86::R8:  case X86::R8D:  case X86::R8W:  case X86::R8B:
+  case X86::R9:  case X86::R9D:  case X86::R9W:  case X86::R9B:
+  case X86::R10: case X86::R10D: case X86::R10W: case X86::R10B:
+  case X86::R11: case X86::R11D: case X86::R11W: case X86::R11B:
+  case X86::R12: case X86::R12D: case X86::R12W: case X86::R12B:
+  case X86::R13: case X86::R13D: case X86::R13W: case X86::R13B:
+  case X86::R14: case X86::R14D: case X86::R14W: case X86::R14B:
+  case X86::R15: case X86::R15D: case X86::R15W: case X86::R15B:
+  case X86::XMM8: case X86::XMM9: case X86::XMM10: case X86::XMM11:
+  case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15:
+  case X86::YMM8: case X86::YMM9: case X86::YMM10: case X86::YMM11:
+  case X86::YMM12: case X86::YMM13: case X86::YMM14: case X86::YMM15:
+    reg += 8;
+  }
+  return reg;
+}
+
 /// getX86RegNum - This function maps LLVM register identifiers to their X86
 /// specific numbering, which is used in various places encoding instructions.
 unsigned X86RegisterInfo::getX86RegNum(unsigned RegNo) {
@@ -229,19 +261,13 @@ X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
     }
     break;
   case X86::sub_8bit_hi:
-    if (B == &X86::GR8_ABCD_HRegClass) {
-      if (A == &X86::GR64RegClass || A == &X86::GR64_ABCDRegClass ||
-          A == &X86::GR64_NOREXRegClass ||
-          A == &X86::GR64_NOSPRegClass ||
-          A == &X86::GR64_NOREX_NOSPRegClass)
-        return &X86::GR64_ABCDRegClass;
-      else if (A == &X86::GR32RegClass || A == &X86::GR32_ABCDRegClass ||
-               A == &X86::GR32_NOREXRegClass || A == &X86::GR32_NOSPRegClass)
-        return &X86::GR32_ABCDRegClass;
-      else if (A == &X86::GR16RegClass || A == &X86::GR16_ABCDRegClass ||
-               A == &X86::GR16_NOREXRegClass)
-        return &X86::GR16_ABCDRegClass;
-    }
+    if (B->hasSubClassEq(&X86::GR8_ABCD_HRegClass))
+      switch (A->getSize()) {
+        case 2: return getCommonSubClass(A, &X86::GR16_ABCDRegClass);
+        case 4: return getCommonSubClass(A, &X86::GR32_ABCDRegClass);
+        case 8: return getCommonSubClass(A, &X86::GR64_ABCDRegClass);
+        default: return 0;
+      }
     break;
   case X86::sub_16bit:
     if (B == &X86::GR16RegClass) {
@@ -285,9 +311,16 @@ X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
           A == &X86::GR64_NOREX_NOSPRegClass)
         return &X86::GR64_ABCDRegClass;
     } else if (B == &X86::GR32_NOREXRegClass) {
+      if (A == &X86::GR64RegClass || A == &X86::GR64_NOREXRegClass)
+        return &X86::GR64_NOREXRegClass;
+      else if (A == &X86::GR64_NOSPRegClass || A == &X86::GR64_NOREX_NOSPRegClass)
+        return &X86::GR64_NOREX_NOSPRegClass;
+      else if (A == &X86::GR64_ABCDRegClass)
+        return &X86::GR64_ABCDRegClass;
+    } else if (B == &X86::GR32_NOREX_NOSPRegClass) {
       if (A == &X86::GR64RegClass || A == &X86::GR64_NOREXRegClass ||
           A == &X86::GR64_NOSPRegClass || A == &X86::GR64_NOREX_NOSPRegClass)
-        return &X86::GR64_NOREXRegClass;
+        return &X86::GR64_NOREX_NOSPRegClass;
       else if (A == &X86::GR64_ABCDRegClass)
         return &X86::GR64_ABCDRegClass;
     }
@@ -473,6 +506,34 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   Reserved.set(X86::ST5);
   Reserved.set(X86::ST6);
   Reserved.set(X86::ST7);
+
+  // Mark the segment registers as reserved.
+  Reserved.set(X86::CS);
+  Reserved.set(X86::SS);
+  Reserved.set(X86::DS);
+  Reserved.set(X86::ES);
+  Reserved.set(X86::FS);
+  Reserved.set(X86::GS);
+
+  // Reserve the registers that only exist in 64-bit mode.
+  if (!Is64Bit) {
+    for (unsigned n = 0; n != 8; ++n) {
+      const unsigned GPR64[] = {
+        X86::R8,  X86::R9,  X86::R10, X86::R11,
+        X86::R12, X86::R13, X86::R14, X86::R15
+      };
+      for (const unsigned *AI = getOverlaps(GPR64[n]); unsigned Reg = *AI;
+           ++AI)
+        Reserved.set(Reg);
+
+      // XMM8, XMM9, ...
+      assert(X86::XMM15 == X86::XMM8+7);
+      for (const unsigned *AI = getOverlaps(X86::XMM8 + n); unsigned Reg = *AI;
+           ++AI)
+        Reserved.set(Reg);
+    }
+  }
+
   return Reserved;
 }
 
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index 9970c52..dd3d3dc 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -80,6 +80,10 @@ public:
   /// getDwarfRegNum - allows modification of X86GenRegisterInfo::getDwarfRegNum
   /// (created by TableGen) for target dependencies.
   int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+  int getLLVMRegNum(unsigned RegNum, bool isEH) const;
+
+  // FIXME: This should be tablegen'd like getDwarfRegNum is
+  int getSEHRegNum(unsigned i) const;
 
   /// Code Generation virtual methods...
   /// 
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index fd7a247..f1d149c 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -41,82 +41,82 @@ let Namespace = "X86" in {
 
   // 8-bit registers
   // Low registers
-  def AL : Register<"al">, DwarfRegNum<[0, 0, 0]>;
-  def DL : Register<"dl">, DwarfRegNum<[1, 2, 2]>;
-  def CL : Register<"cl">, DwarfRegNum<[2, 1, 1]>;
-  def BL : Register<"bl">, DwarfRegNum<[3, 3, 3]>;
+  def AL : Register<"al">;
+  def DL : Register<"dl">;
+  def CL : Register<"cl">;
+  def BL : Register<"bl">;
 
   // X86-64 only, requires REX.
   let CostPerUse = 1 in {
-  def SIL : Register<"sil">, DwarfRegNum<[4, 6, 6]>;
-  def DIL : Register<"dil">, DwarfRegNum<[5, 7, 7]>;
-  def BPL : Register<"bpl">, DwarfRegNum<[6, 4, 5]>;
-  def SPL : Register<"spl">, DwarfRegNum<[7, 5, 4]>;
-  def R8B  : Register<"r8b">,  DwarfRegNum<[8, -2, -2]>;
-  def R9B  : Register<"r9b">,  DwarfRegNum<[9, -2, -2]>;
-  def R10B : Register<"r10b">, DwarfRegNum<[10, -2, -2]>;
-  def R11B : Register<"r11b">, DwarfRegNum<[11, -2, -2]>;
-  def R12B : Register<"r12b">, DwarfRegNum<[12, -2, -2]>;
-  def R13B : Register<"r13b">, DwarfRegNum<[13, -2, -2]>;
-  def R14B : Register<"r14b">, DwarfRegNum<[14, -2, -2]>;
-  def R15B : Register<"r15b">, DwarfRegNum<[15, -2, -2]>;
+  def SIL : Register<"sil">;
+  def DIL : Register<"dil">;
+  def BPL : Register<"bpl">;
+  def SPL : Register<"spl">;
+  def R8B  : Register<"r8b">;
+  def R9B  : Register<"r9b">;
+  def R10B : Register<"r10b">;
+  def R11B : Register<"r11b">;
+  def R12B : Register<"r12b">;
+  def R13B : Register<"r13b">;
+  def R14B : Register<"r14b">;
+  def R15B : Register<"r15b">;
   }
 
   // High registers. On x86-64, these cannot be used in any instruction
   // with a REX prefix.
-  def AH : Register<"ah">, DwarfRegNum<[0, 0, 0]>;
-  def DH : Register<"dh">, DwarfRegNum<[1, 2, 2]>;
-  def CH : Register<"ch">, DwarfRegNum<[2, 1, 1]>;
-  def BH : Register<"bh">, DwarfRegNum<[3, 3, 3]>;
+  def AH : Register<"ah">;
+  def DH : Register<"dh">;
+  def CH : Register<"ch">;
+  def BH : Register<"bh">;
 
   // 16-bit registers
   let SubRegIndices = [sub_8bit, sub_8bit_hi] in {
-  def AX : RegisterWithSubRegs<"ax", [AL,AH]>, DwarfRegNum<[0, 0, 0]>;
-  def DX : RegisterWithSubRegs<"dx", [DL,DH]>, DwarfRegNum<[1, 2, 2]>;
-  def CX : RegisterWithSubRegs<"cx", [CL,CH]>, DwarfRegNum<[2, 1, 1]>;
-  def BX : RegisterWithSubRegs<"bx", [BL,BH]>, DwarfRegNum<[3, 3, 3]>;
+  def AX : RegisterWithSubRegs<"ax", [AL,AH]>;
+  def DX : RegisterWithSubRegs<"dx", [DL,DH]>;
+  def CX : RegisterWithSubRegs<"cx", [CL,CH]>;
+  def BX : RegisterWithSubRegs<"bx", [BL,BH]>;
   }
   let SubRegIndices = [sub_8bit] in {
-  def SI : RegisterWithSubRegs<"si", [SIL]>, DwarfRegNum<[4, 6, 6]>;
-  def DI : RegisterWithSubRegs<"di", [DIL]>, DwarfRegNum<[5, 7, 7]>;
-  def BP : RegisterWithSubRegs<"bp", [BPL]>, DwarfRegNum<[6, 4, 5]>;
-  def SP : RegisterWithSubRegs<"sp", [SPL]>, DwarfRegNum<[7, 5, 4]>;
+  def SI : RegisterWithSubRegs<"si", [SIL]>;
+  def DI : RegisterWithSubRegs<"di", [DIL]>;
+  def BP : RegisterWithSubRegs<"bp", [BPL]>;
+  def SP : RegisterWithSubRegs<"sp", [SPL]>;
   }
-  def IP : Register<"ip">, DwarfRegNum<[16]>;
+  def IP : Register<"ip">;
 
   // X86-64 only, requires REX.
   let SubRegIndices = [sub_8bit], CostPerUse = 1 in {
-  def R8W  : RegisterWithSubRegs<"r8w", [R8B]>, DwarfRegNum<[8, -2, -2]>;
-  def R9W  : RegisterWithSubRegs<"r9w", [R9B]>, DwarfRegNum<[9, -2, -2]>;
-  def R10W : RegisterWithSubRegs<"r10w", [R10B]>, DwarfRegNum<[10, -2, -2]>;
-  def R11W : RegisterWithSubRegs<"r11w", [R11B]>, DwarfRegNum<[11, -2, -2]>;
-  def R12W : RegisterWithSubRegs<"r12w", [R12B]>, DwarfRegNum<[12, -2, -2]>;
-  def R13W : RegisterWithSubRegs<"r13w", [R13B]>, DwarfRegNum<[13, -2, -2]>;
-  def R14W : RegisterWithSubRegs<"r14w", [R14B]>, DwarfRegNum<[14, -2, -2]>;
-  def R15W : RegisterWithSubRegs<"r15w", [R15B]>, DwarfRegNum<[15, -2, -2]>;
+  def R8W  : RegisterWithSubRegs<"r8w", [R8B]>;
+  def R9W  : RegisterWithSubRegs<"r9w", [R9B]>;
+  def R10W : RegisterWithSubRegs<"r10w", [R10B]>;
+  def R11W : RegisterWithSubRegs<"r11w", [R11B]>;
+  def R12W : RegisterWithSubRegs<"r12w", [R12B]>;
+  def R13W : RegisterWithSubRegs<"r13w", [R13B]>;
+  def R14W : RegisterWithSubRegs<"r14w", [R14B]>;
+  def R15W : RegisterWithSubRegs<"r15w", [R15B]>;
   }
   // 32-bit registers
   let SubRegIndices = [sub_16bit] in {
-  def EAX : RegisterWithSubRegs<"eax", [AX]>, DwarfRegNum<[0, 0, 0]>;
-  def EDX : RegisterWithSubRegs<"edx", [DX]>, DwarfRegNum<[1, 2, 2]>;
-  def ECX : RegisterWithSubRegs<"ecx", [CX]>, DwarfRegNum<[2, 1, 1]>;
-  def EBX : RegisterWithSubRegs<"ebx", [BX]>, DwarfRegNum<[3, 3, 3]>;
-  def ESI : RegisterWithSubRegs<"esi", [SI]>, DwarfRegNum<[4, 6, 6]>;
-  def EDI : RegisterWithSubRegs<"edi", [DI]>, DwarfRegNum<[5, 7, 7]>;
-  def EBP : RegisterWithSubRegs<"ebp", [BP]>, DwarfRegNum<[6, 4, 5]>;
-  def ESP : RegisterWithSubRegs<"esp", [SP]>, DwarfRegNum<[7, 5, 4]>;
-  def EIP : RegisterWithSubRegs<"eip", [IP]>, DwarfRegNum<[16, 8, 8]>;
+  def EAX : RegisterWithSubRegs<"eax", [AX]>, DwarfRegNum<[-2, 0, 0]>;
+  def EDX : RegisterWithSubRegs<"edx", [DX]>, DwarfRegNum<[-2, 2, 2]>;
+  def ECX : RegisterWithSubRegs<"ecx", [CX]>, DwarfRegNum<[-2, 1, 1]>;
+  def EBX : RegisterWithSubRegs<"ebx", [BX]>, DwarfRegNum<[-2, 3, 3]>;
+  def ESI : RegisterWithSubRegs<"esi", [SI]>, DwarfRegNum<[-2, 6, 6]>;
+  def EDI : RegisterWithSubRegs<"edi", [DI]>, DwarfRegNum<[-2, 7, 7]>;
+  def EBP : RegisterWithSubRegs<"ebp", [BP]>, DwarfRegNum<[-2, 4, 5]>;
+  def ESP : RegisterWithSubRegs<"esp", [SP]>, DwarfRegNum<[-2, 5, 4]>;
+  def EIP : RegisterWithSubRegs<"eip", [IP]>, DwarfRegNum<[-2, 8, 8]>;
 
   // X86-64 only, requires REX
   let CostPerUse = 1 in {
-  def R8D  : RegisterWithSubRegs<"r8d", [R8W]>, DwarfRegNum<[8, -2, -2]>;
-  def R9D  : RegisterWithSubRegs<"r9d", [R9W]>, DwarfRegNum<[9, -2, -2]>;
-  def R10D : RegisterWithSubRegs<"r10d", [R10W]>, DwarfRegNum<[10, -2, -2]>;
-  def R11D : RegisterWithSubRegs<"r11d", [R11W]>, DwarfRegNum<[11, -2, -2]>;
-  def R12D : RegisterWithSubRegs<"r12d", [R12W]>, DwarfRegNum<[12, -2, -2]>;
-  def R13D : RegisterWithSubRegs<"r13d", [R13W]>, DwarfRegNum<[13, -2, -2]>;
-  def R14D : RegisterWithSubRegs<"r14d", [R14W]>, DwarfRegNum<[14, -2, -2]>;
-  def R15D : RegisterWithSubRegs<"r15d", [R15W]>, DwarfRegNum<[15, -2, -2]>;
+  def R8D  : RegisterWithSubRegs<"r8d", [R8W]>;
+  def R9D  : RegisterWithSubRegs<"r9d", [R9W]>;
+  def R10D : RegisterWithSubRegs<"r10d", [R10W]>;
+  def R11D : RegisterWithSubRegs<"r11d", [R11W]>;
+  def R12D : RegisterWithSubRegs<"r12d", [R12W]>;
+  def R13D : RegisterWithSubRegs<"r13d", [R13W]>;
+  def R14D : RegisterWithSubRegs<"r14d", [R14W]>;
+  def R15D : RegisterWithSubRegs<"r15d", [R15W]>;
   }}
 
   // 64-bit registers, X86-64 only
@@ -188,22 +188,22 @@ let Namespace = "X86" in {
 
   // YMM Registers, used by AVX instructions
   let SubRegIndices = [sub_xmm] in {
-  def YMM0: RegisterWithSubRegs<"ymm0", [XMM0]>, DwarfRegNum<[17, 21, 21]>;
-  def YMM1: RegisterWithSubRegs<"ymm1", [XMM1]>, DwarfRegNum<[18, 22, 22]>;
-  def YMM2: RegisterWithSubRegs<"ymm2", [XMM2]>, DwarfRegNum<[19, 23, 23]>;
-  def YMM3: RegisterWithSubRegs<"ymm3", [XMM3]>, DwarfRegNum<[20, 24, 24]>;
-  def YMM4: RegisterWithSubRegs<"ymm4", [XMM4]>, DwarfRegNum<[21, 25, 25]>;
-  def YMM5: RegisterWithSubRegs<"ymm5", [XMM5]>, DwarfRegNum<[22, 26, 26]>;
-  def YMM6: RegisterWithSubRegs<"ymm6", [XMM6]>, DwarfRegNum<[23, 27, 27]>;
-  def YMM7: RegisterWithSubRegs<"ymm7", [XMM7]>, DwarfRegNum<[24, 28, 28]>;
-  def YMM8:  RegisterWithSubRegs<"ymm8", [XMM8]>,  DwarfRegNum<[25, -2, -2]>;
-  def YMM9:  RegisterWithSubRegs<"ymm9", [XMM9]>,  DwarfRegNum<[26, -2, -2]>;
-  def YMM10: RegisterWithSubRegs<"ymm10", [XMM10]>, DwarfRegNum<[27, -2, -2]>;
-  def YMM11: RegisterWithSubRegs<"ymm11", [XMM11]>, DwarfRegNum<[28, -2, -2]>;
-  def YMM12: RegisterWithSubRegs<"ymm12", [XMM12]>, DwarfRegNum<[29, -2, -2]>;
-  def YMM13: RegisterWithSubRegs<"ymm13", [XMM13]>, DwarfRegNum<[30, -2, -2]>;
-  def YMM14: RegisterWithSubRegs<"ymm14", [XMM14]>, DwarfRegNum<[31, -2, -2]>;
-  def YMM15: RegisterWithSubRegs<"ymm15", [XMM15]>, DwarfRegNum<[32, -2, -2]>;
+  def YMM0: RegisterWithSubRegs<"ymm0", [XMM0]>, DwarfRegAlias<XMM0>;
+  def YMM1: RegisterWithSubRegs<"ymm1", [XMM1]>, DwarfRegAlias<XMM1>;
+  def YMM2: RegisterWithSubRegs<"ymm2", [XMM2]>, DwarfRegAlias<XMM2>;
+  def YMM3: RegisterWithSubRegs<"ymm3", [XMM3]>, DwarfRegAlias<XMM3>;
+  def YMM4: RegisterWithSubRegs<"ymm4", [XMM4]>, DwarfRegAlias<XMM4>;
+  def YMM5: RegisterWithSubRegs<"ymm5", [XMM5]>, DwarfRegAlias<XMM5>;
+  def YMM6: RegisterWithSubRegs<"ymm6", [XMM6]>, DwarfRegAlias<XMM6>;
+  def YMM7: RegisterWithSubRegs<"ymm7", [XMM7]>, DwarfRegAlias<XMM7>;
+  def YMM8:  RegisterWithSubRegs<"ymm8", [XMM8]>, DwarfRegAlias<XMM8>;
+  def YMM9:  RegisterWithSubRegs<"ymm9", [XMM9]>, DwarfRegAlias<XMM9>;
+  def YMM10: RegisterWithSubRegs<"ymm10", [XMM10]>, DwarfRegAlias<XMM10>;
+  def YMM11: RegisterWithSubRegs<"ymm11", [XMM11]>, DwarfRegAlias<XMM11>;
+  def YMM12: RegisterWithSubRegs<"ymm12", [XMM12]>, DwarfRegAlias<XMM12>;
+  def YMM13: RegisterWithSubRegs<"ymm13", [XMM13]>, DwarfRegAlias<XMM13>;
+  def YMM14: RegisterWithSubRegs<"ymm14", [XMM14]>, DwarfRegAlias<XMM14>;
+  def YMM15: RegisterWithSubRegs<"ymm15", [XMM15]>, DwarfRegAlias<XMM15>;
   }
 
   // Floating point stack registers
@@ -326,104 +326,12 @@ def GR16 : RegisterClass<"X86", [i16], 16,
                          [AX, CX, DX, SI, DI, BX, BP, SP,
                           R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W]> {
   let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi)];
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    static const unsigned X86_GR16_AO_64[] = {
-      X86::AX,  X86::CX,   X86::DX,   X86::SI,   X86::DI,
-      X86::R8W, X86::R9W,  X86::R10W, X86::R11W,
-      X86::BX, X86::R14W, X86::R15W,  X86::R12W, X86::R13W, X86::BP
-    };
-
-    GR16Class::iterator
-    GR16Class::allocation_order_begin(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
-      if (Subtarget.is64Bit())
-        return X86_GR16_AO_64;
-      else
-        return begin();
-    }
-
-    GR16Class::iterator
-    GR16Class::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
-      const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
-      if (Subtarget.is64Bit()) {
-        // Does the function dedicate RBP to being a frame ptr?
-        if (TFI->hasFP(MF) || MFI->getReserveFP())
-          // If so, don't allocate SP or BP.
-          return array_endof(X86_GR16_AO_64) - 1;
-        else
-          // If not, just don't allocate SP.
-          return array_endof(X86_GR16_AO_64);
-      } else {
-        // Does the function dedicate EBP to being a frame ptr?
-        if (TFI->hasFP(MF) || MFI->getReserveFP())
-          // If so, don't allocate SP or BP.
-          return begin() + 6;
-        else
-          // If not, just don't allocate SP.
-          return begin() + 7;
-      }
-    }
-  }];
 }
 
 def GR32 : RegisterClass<"X86", [i32], 32,
                          [EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP,
                           R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D]> {
   let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)];
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    static const unsigned X86_GR32_AO_64[] = {
-      X86::EAX, X86::ECX,  X86::EDX,  X86::ESI,  X86::EDI,
-      X86::R8D, X86::R9D,  X86::R10D, X86::R11D,
-      X86::EBX, X86::R14D, X86::R15D, X86::R12D, X86::R13D, X86::EBP
-    };
-
-    GR32Class::iterator
-    GR32Class::allocation_order_begin(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
-      if (Subtarget.is64Bit())
-        return X86_GR32_AO_64;
-      else
-        return begin();
-    }
-
-    GR32Class::iterator
-    GR32Class::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
-      const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
-      if (Subtarget.is64Bit()) {
-        // Does the function dedicate RBP to being a frame ptr?
-        if (TFI->hasFP(MF) || MFI->getReserveFP())
-          // If so, don't allocate ESP or EBP.
-          return array_endof(X86_GR32_AO_64) - 1;
-        else
-          // If not, just don't allocate ESP.
-          return array_endof(X86_GR32_AO_64);
-      } else {
-        // Does the function dedicate EBP to being a frame ptr?
-        if (TFI->hasFP(MF) || MFI->getReserveFP())
-          // If so, don't allocate ESP or EBP.
-          return begin() + 6;
-        else
-          // If not, just don't allocate ESP.
-          return begin() + 7;
-      }
-    }
-  }];
 }
 
 // GR64 - 64-bit GPRs. This oddly includes RIP, which isn't accurate, since
@@ -435,25 +343,6 @@ def GR64 : RegisterClass<"X86", [i64], 64,
   let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi),
                        (GR16 sub_16bit),
                        (GR32 sub_32bit)];
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    GR64Class::iterator
-    GR64Class::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
-      const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
-      if (!Subtarget.is64Bit())
-        return begin();  // None of these are allocatable in 32-bit.
-      // Does the function dedicate RBP to being a frame ptr?
-      if (TFI->hasFP(MF) || MFI->getReserveFP())
-        return end()-3;  // If so, don't allocate RIP, RSP or RBP
-      else
-        return end()-2;  // If not, just don't allocate RIP or RSP
-    }
-  }];
 }
 
 // Segment registers for use by MOV instructions (and others) that have a
@@ -496,7 +385,7 @@ def GR32_TC   : RegisterClass<"X86", [i32], 32, [EAX, ECX, EDX]> {
   let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)];
 }
 def GR64_TC   : RegisterClass<"X86", [i64], 64, [RAX, RCX, RDX, RSI, RDI,
-                                                 R8, R9, R11]> {
+                                                 R8, R9, R11, RIP]> {
   let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi),
                        (GR16 sub_16bit),
                        (GR32_TC sub_32bit)];
@@ -543,48 +432,12 @@ def GR8_NOREX : RegisterClass<"X86", [i8], 8,
 def GR16_NOREX : RegisterClass<"X86", [i16], 16,
                                [AX, CX, DX, SI, DI, BX, BP, SP]> {
   let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi)];
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    GR16_NOREXClass::iterator
-    GR16_NOREXClass::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
-      // Does the function dedicate RBP / EBP to being a frame ptr?
-      if (TFI->hasFP(MF) || MFI->getReserveFP())
-        // If so, don't allocate SP or BP.
-        return end() - 2;
-      else
-        // If not, just don't allocate SP.
-        return end() - 1;
-    }
-  }];
 }
 // GR32_NOREX - GR32 registers which do not require a REX prefix.
 def GR32_NOREX : RegisterClass<"X86", [i32], 32,
                                [EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP]> {
   let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi),
                        (GR16_NOREX sub_16bit)];
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    GR32_NOREXClass::iterator
-    GR32_NOREXClass::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
-      // Does the function dedicate RBP / EBP to being a frame ptr?
-      if (TFI->hasFP(MF) || MFI->getReserveFP())
-        // If so, don't allocate ESP or EBP.
-        return end() - 2;
-      else
-        // If not, just don't allocate ESP.
-        return end() - 1;
-    }
-  }];
 }
 // GR64_NOREX - GR64 registers which do not require a REX prefix.
 def GR64_NOREX : RegisterClass<"X86", [i64], 64,
@@ -592,24 +445,6 @@ def GR64_NOREX : RegisterClass<"X86", [i64], 64,
   let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi),
                        (GR16_NOREX sub_16bit),
                        (GR32_NOREX sub_32bit)];
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    GR64_NOREXClass::iterator
-    GR64_NOREXClass::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
-      // Does the function dedicate RBP to being a frame ptr?
-      if (TFI->hasFP(MF) || MFI->getReserveFP())
-        // If so, don't allocate RIP, RSP or RBP.
-        return end() - 3;
-      else
-        // If not, just don't allocate RIP or RSP.
-        return end() - 2;
-    }
-  }];
 }
 
 // GR32_NOSP - GR32 registers except ESP.
@@ -617,52 +452,6 @@ def GR32_NOSP : RegisterClass<"X86", [i32], 32,
                               [EAX, ECX, EDX, ESI, EDI, EBX, EBP,
                                R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D]> {
   let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)];
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    static const unsigned X86_GR32_NOSP_AO_64[] = {
-      X86::EAX, X86::ECX,  X86::EDX,  X86::ESI,  X86::EDI,
-      X86::R8D, X86::R9D,  X86::R10D, X86::R11D,
-      X86::EBX, X86::R14D, X86::R15D, X86::R12D, X86::R13D, X86::EBP
-    };
-
-    GR32_NOSPClass::iterator
-    GR32_NOSPClass::allocation_order_begin(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
-      if (Subtarget.is64Bit())
-        return X86_GR32_NOSP_AO_64;
-      else
-        return begin();
-    }
-
-    GR32_NOSPClass::iterator
-    GR32_NOSPClass::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
-      const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
-      if (Subtarget.is64Bit()) {
-        // Does the function dedicate RBP to being a frame ptr?
-        if (TFI->hasFP(MF) || MFI->getReserveFP())
-          // If so, don't allocate EBP.
-          return array_endof(X86_GR32_NOSP_AO_64) - 1;
-        else
-          // If not, any reg in this class is ok.
-          return array_endof(X86_GR32_NOSP_AO_64);
-      } else {
-        // Does the function dedicate EBP to being a frame ptr?
-        if (TFI->hasFP(MF) || MFI->getReserveFP())
-          // If so, don't allocate EBP.
-          return begin() + 6;
-        else
-          // If not, any reg in this class is ok.
-          return begin() + 7;
-      }
-    }
-  }];
 }
 
 // GR64_NOSP - GR64 registers except RSP (and RIP).
@@ -672,25 +461,14 @@ def GR64_NOSP : RegisterClass<"X86", [i64], 64,
   let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi),
                        (GR16 sub_16bit),
                        (GR32_NOSP sub_32bit)];
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    GR64_NOSPClass::iterator
-    GR64_NOSPClass::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
-      const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
-      if (!Subtarget.is64Bit())
-        return begin();  // None of these are allocatable in 32-bit.
-      // Does the function dedicate RBP to being a frame ptr?
-      if (TFI->hasFP(MF) || MFI->getReserveFP())
-        return end()-1;  // If so, don't allocate RBP
-      else
-        return end();  // If not, any reg in this class is ok.
-    }
-  }];
+}
+
+// GR32_NOREX_NOSP - GR32 registers which do not require a REX prefix except
+// ESP.
+def GR32_NOREX_NOSP : RegisterClass<"X86", [i32], 32,
+                               [EAX, ECX, EDX, ESI, EDI, EBX, EBP]> {
+  let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi),
+                       (GR16_NOREX sub_16bit)];
 }
 
 // GR64_NOREX_NOSP - GR64_NOREX registers except RSP.
@@ -698,26 +476,7 @@ def GR64_NOREX_NOSP : RegisterClass<"X86", [i64], 64,
                                     [RAX, RCX, RDX, RSI, RDI, RBX, RBP]> {
   let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi),
                        (GR16_NOREX sub_16bit),
-                       (GR32_NOREX sub_32bit)];
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    GR64_NOREX_NOSPClass::iterator
-    GR64_NOREX_NOSPClass::allocation_order_end(const MachineFunction &MF) const
-  {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
-      // Does the function dedicate RBP to being a frame ptr?
-      if (TFI->hasFP(MF) || MFI->getReserveFP())
-        // If so, don't allocate RBP.
-        return end() - 1;
-      else
-        // If not, any reg in this class is ok.
-        return end();
-    }
-  }];
+                       (GR32_NOREX_NOSP sub_32bit)];
 }
 
 // A class to support the 'A' assembler constraint: EAX then EDX.
@@ -731,42 +490,12 @@ def GR32_AD : RegisterClass<"X86", [i32], 32, [EAX, EDX]> {
 def FR32 : RegisterClass<"X86", [f32], 32,
                          [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
                           XMM8, XMM9, XMM10, XMM11,
-                          XMM12, XMM13, XMM14, XMM15]> {
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    FR32Class::iterator
-    FR32Class::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
-      if (!Subtarget.is64Bit())
-        return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode.
-      else
-        return end();
-    }
-  }];
-}
+                          XMM12, XMM13, XMM14, XMM15]>;
 
 def FR64 : RegisterClass<"X86", [f64], 64,
                          [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
                           XMM8, XMM9, XMM10, XMM11,
-                          XMM12, XMM13, XMM14, XMM15]> {
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    FR64Class::iterator
-    FR64Class::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
-      if (!Subtarget.is64Bit())
-        return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode.
-      else
-        return end();
-    }
-  }];
-}
+                          XMM12, XMM13, XMM14, XMM15]>;
 
 
 // FIXME: This sets up the floating point register files as though they are f64
@@ -784,15 +513,7 @@ def RFP80 : RegisterClass<"X86",[f80], 32, [FP0, FP1, FP2, FP3, FP4, FP5, FP6]>;
 // for transforming FPn allocations to STn registers)
 def RST : RegisterClass<"X86", [f80, f64, f32], 32,
                         [ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7]> {
-    let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    RSTClass::iterator
-    RSTClass::allocation_order_end(const MachineFunction &MF) const {
-      return begin();
-    }
-  }];
+  let isAllocatable = 0;
 }
 
 // Generic vector registers: VR64 and VR128.
@@ -803,21 +524,6 @@ def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],128,
                            XMM8, XMM9, XMM10, XMM11,
                            XMM12, XMM13, XMM14, XMM15]> {
   let SubRegClasses = [(FR32 sub_ss), (FR64 sub_sd)];
-
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    VR128Class::iterator
-    VR128Class::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
-      if (!Subtarget.is64Bit())
-        return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode.
-      else
-        return end();
-    }
-  }];
 }
 
 def VR256 : RegisterClass<"X86", [v32i8, v8i32, v4i64, v8f32, v4f64], 256,
@@ -825,35 +531,10 @@ def VR256 : RegisterClass<"X86", [v32i8, v8i32, v4i64, v8f32, v4f64], 256,
                            YMM8, YMM9, YMM10, YMM11,
                            YMM12, YMM13, YMM14, YMM15]> {
   let SubRegClasses = [(FR32 sub_ss), (FR64 sub_sd), (VR128 sub_xmm)];
-
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    VR256Class::iterator
-    VR256Class::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
-      if (!Subtarget.is64Bit())
-        return end()-8; // Only YMM0 to YMM7 are available in 32-bit mode.
-      else
-        return end();
-    }
-  }];
 }
 
 // Status flags registers.
 def CCR : RegisterClass<"X86", [i32], 32, [EFLAGS]> {
   let CopyCost = -1;  // Don't allow copying of status registers.
-
-  // EFLAGS is not allocatable.
-  let MethodProtos = [{
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    CCRClass::iterator
-    CCRClass::allocation_order_end(const MachineFunction &MF) const {
-      return allocation_order_begin(MF);
-    }
-  }];
+  let isAllocatable = 0;
 }
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index ba5864e..481e821 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -265,6 +265,7 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
 
   HasCLMUL = IsIntel && ((ECX >> 1) & 0x1);
   HasFMA3  = IsIntel && ((ECX >> 12) & 0x1);
+  HasPOPCNT = IsIntel && ((ECX >> 23) & 0x1);
   HasAES   = IsIntel && ((ECX >> 25) & 0x1);
 
   if (IsIntel || IsAMD) {
diff --git a/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
index 6bec9f9..a8dd847 100644
--- a/lib/Target/XCore/XCoreISelDAGToDAG.cpp
+++ b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
@@ -205,6 +205,16 @@ SDNode *XCoreDAGToDAGISel::Select(SDNode *N) {
     return CurDAG->getMachineNode(XCore::LMUL_l6r, dl, MVT::i32, MVT::i32,
                                   Ops, 4);
   }
+  case ISD::INTRINSIC_WO_CHAIN: {
+    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+    switch (IntNo) {
+    case Intrinsic::xcore_crc8:
+      SDValue Ops[] = { N->getOperand(1), N->getOperand(2), N->getOperand(3) };
+      return CurDAG->getMachineNode(XCore::CRC8_l4r, dl, MVT::i32, MVT::i32,
+                                    Ops, 3);
+    }
+    break;
+  }
   case ISD::BRIND:
     if (SDNode *ResNode = SelectBRIND(N))
       return ResNode;
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index 5987e8b..8cabbbf 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -156,6 +156,8 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM)
   // We have target-specific dag combine patterns for the following nodes:
   setTargetDAGCombine(ISD::STORE);
   setTargetDAGCombine(ISD::ADD);
+
+  setMinFunctionAlignment(1);
 }
 
 SDValue XCoreTargetLowering::
@@ -201,12 +203,6 @@ void XCoreTargetLowering::ReplaceNodeResults(SDNode *N,
   }
 }
 
-/// getFunctionAlignment - Return the Log2 alignment of this function.
-unsigned XCoreTargetLowering::
-getFunctionAlignment(const Function *) const {
-  return 1;
-}
-
 //===----------------------------------------------------------------------===//
 //  Misc Lower Operation implementation
 //===----------------------------------------------------------------------===//
@@ -248,9 +244,6 @@ LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const
 {
   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
   SDValue GA = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(), MVT::i32);
-  // If it's a debug information descriptor, don't mess with it.
-  if (DAG.isVerifiedDebugInfoDesc(Op))
-    return GA;
   return getGlobalAddressWrapper(GA, GV, DAG);
 }
 
@@ -904,8 +897,8 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
 
   // The ABI dictates there should be one stack slot available to the callee
   // on function entry (for saving lr).
@@ -1027,8 +1020,8 @@ XCoreTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
 
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), RVLocs, *DAG.getContext());
 
   CCInfo.AnalyzeCallResult(Ins, RetCC_XCore);
 
@@ -1087,8 +1080,8 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 ArgLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), ArgLocs, *DAG.getContext());
 
   CCInfo.AnalyzeFormalArguments(Ins, CC_XCore);
 
@@ -1192,12 +1185,12 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
 //===----------------------------------------------------------------------===//
 
 bool XCoreTargetLowering::
-CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
+CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+	       bool isVarArg,
                const SmallVectorImpl<ISD::OutputArg> &Outs,
                LLVMContext &Context) const {
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 RVLocs, Context);
+  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context);
   return CCInfo.CheckReturn(Outs, RetCC_XCore);
 }
 
@@ -1213,10 +1206,10 @@ XCoreTargetLowering::LowerReturn(SDValue Chain,
   SmallVector<CCValAssign, 16> RVLocs;
 
   // CCState - Info about the registers and stack slot.
-  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 RVLocs, *DAG.getContext());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+		 getTargetMachine(), RVLocs, *DAG.getContext());
 
-  // Analize return values.
+  // Analyze return values.
   CCInfo.AnalyzeReturn(Outs, RetCC_XCore);
 
   // If this is the first return lowered for this function, add
diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h
index bb3f2cc..a8d67d4 100644
--- a/lib/Target/XCore/XCoreISelLowering.h
+++ b/lib/Target/XCore/XCoreISelLowering.h
@@ -103,9 +103,6 @@ namespace llvm {
     virtual bool isLegalAddressingMode(const AddrMode &AM,
                                        const Type *Ty) const;
 
-    /// getFunctionAlignment - Return the Log2 alignment of this function.
-    virtual unsigned getFunctionAlignment(const Function *F) const;
-
   private:
     const XCoreTargetMachine &TM;
     const XCoreSubtarget &Subtarget;
@@ -194,7 +191,8 @@ namespace llvm {
                   DebugLoc dl, SelectionDAG &DAG) const;
 
     virtual bool
-      CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
+      CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+		     bool isVarArg,
                      const SmallVectorImpl<ISD::OutputArg> &ArgsFlags,
                      LLVMContext &Context) const;
   };
diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td
index 789546e..55c7527 100644
--- a/lib/Target/XCore/XCoreInstrInfo.td
+++ b/lib/Target/XCore/XCoreInstrInfo.td
@@ -472,7 +472,16 @@ def REMU_l3r : FL3R<"remu", urem>;
 }
 def XOR_l3r : FL3R<"xor", xor>;
 defm ASHR : FL3R_L2RBITP<"ashr", sra>;
-// TODO crc32, crc8, inpw, outpw
+
+let Constraints = "$src1 = $dst" in
+def CRC_l3r : _FL3R<(outs GRRegs:$dst),
+                     (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3),
+                     "crc32 $dst, $src2, $src3",
+                     [(set GRRegs:$dst,
+                        (int_xcore_crc32 GRRegs:$src1, GRRegs:$src2,
+                                         GRRegs:$src3))]>;
+
+// TODO inpw, outpw
 let mayStore=1 in {
 def ST16_l3r : _FL3R<(outs), (ins GRRegs:$val, GRRegs:$addr, GRRegs:$offset),
                 "st16 $val, $addr[$offset]",
@@ -498,6 +507,12 @@ def MACCS_l4r : _L4R<(outs GRRegs:$dst1, GRRegs:$dst2),
                     []>;
 }
 
+let Constraints = "$src1 = $dst1" in
+def CRC8_l4r : _L4R<(outs GRRegs:$dst1, GRRegs:$dst2),
+                    (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3),
+                    "crc8 $dst1, $dst2, $src2, $src3",
+                    []>;
+
 // Five operand long
 
 def LADD_l5r : _L5R<(outs GRRegs:$dst1, GRRegs:$dst2),
diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp
index 0287a51..46c9e57 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.cpp
+++ b/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -68,8 +68,8 @@ unsigned XCoreRegisterInfo::getNumArgRegs(const MachineFunction *MF)
 }
 
 bool XCoreRegisterInfo::needsFrameMoves(const MachineFunction &MF) {
-  return MF.getMMI().hasDebugInfo() || !MF.getFunction()->doesNotThrow() ||
-          UnwindTablesMandatory;
+  return MF.getMMI().hasDebugInfo() ||
+    MF.getFunction()->needsUnwindTableEntry();
 }
 
 const unsigned* XCoreRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
@@ -315,6 +315,10 @@ int XCoreRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
   return XCoreGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
 }
 
+int XCoreRegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const {
+  return XCoreGenRegisterInfo::getLLVMRegNumFull(DwarfRegNo,0);
+}
+
 unsigned XCoreRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
 
diff --git a/lib/Target/XCore/XCoreRegisterInfo.h b/lib/Target/XCore/XCoreRegisterInfo.h
index 770483b..7a9bc9f 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.h
+++ b/lib/Target/XCore/XCoreRegisterInfo.h
@@ -75,6 +75,7 @@ public:
 
   //! Get DWARF debugging register number
   int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+  int getLLVMRegNum(unsigned RegNum, bool isEH) const;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/XCore/XCoreRegisterInfo.td b/lib/Target/XCore/XCoreRegisterInfo.td
index 765f717..0951097 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.td
+++ b/lib/Target/XCore/XCoreRegisterInfo.td
@@ -48,44 +48,9 @@ def GRRegs : RegisterClass<"XCore", [i32], 32,
   // Not preserved across procedure calls
   R11,
   // Callee save
-  R4, R5, R6, R7, R8, R9, R10]> {
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    GRRegsClass::iterator
-    GRRegsClass::allocation_order_begin(const MachineFunction &MF) const {
-      return begin();
-    }
-    GRRegsClass::iterator
-    GRRegsClass::allocation_order_end(const MachineFunction &MF) const {
-      const TargetMachine &TM = MF.getTarget();
-      const TargetFrameLowering *TFI = TM.getFrameLowering();
-      if (TFI->hasFP(MF))
-        return end()-1;  // don't allocate R10
-      else
-        return end();
-    }
-  }];
-}
+  R4, R5, R6, R7, R8, R9, R10]>;
 
-def RRegs : RegisterClass<"XCore", [i32], 32,
-  // Reserved
-  [CP, DP, SP, LR]> {
-  let MethodProtos = [{
-    iterator allocation_order_begin(const MachineFunction &MF) const;
-    iterator allocation_order_end(const MachineFunction &MF) const;
-  }];
-  let MethodBodies = [{
-    RRegsClass::iterator
-    RRegsClass::allocation_order_begin(const MachineFunction &MF) const {
-      return begin();
-    }
-    RRegsClass::iterator
-    RRegsClass::allocation_order_end(const MachineFunction &MF) const {
-      // No allocatable registers
-      return begin();
-    }
-  }];
+// Reserved
+def RRegs : RegisterClass<"XCore", [i32], 32, [CP, DP, SP, LR]> {
+  let isAllocatable = 0;
 }
diff --git a/lib/Transforms/IPO/DeadTypeElimination.cpp b/lib/Transforms/IPO/DeadTypeElimination.cpp
index a509931..d3d4963 100644
--- a/lib/Transforms/IPO/DeadTypeElimination.cpp
+++ b/lib/Transforms/IPO/DeadTypeElimination.cpp
@@ -83,7 +83,8 @@ bool DTE::runOnModule(Module &M) {
   bool Changed = false;
 
   TypeSymbolTable &ST = M.getTypeSymbolTable();
-  std::set<const Type *> UsedTypes = getAnalysis<FindUsedTypes>().getTypes();
+  const SetVector<const Type*> &T = getAnalysis<FindUsedTypes>().getTypes();
+  std::set<const Type*> UsedTypes(T.begin(), T.end());
 
   // Check the symbol table for superfluous type entries...
   //
diff --git a/lib/Transforms/IPO/ExtractGV.cpp b/lib/Transforms/IPO/ExtractGV.cpp
index 9d432de..d9911bf 100644
--- a/lib/Transforms/IPO/ExtractGV.cpp
+++ b/lib/Transforms/IPO/ExtractGV.cpp
@@ -51,20 +51,32 @@ namespace {
       // Visit the GlobalVariables.
       for (Module::global_iterator I = M.global_begin(), E = M.global_end();
            I != E; ++I) {
+        if (deleteStuff == (bool)Named.count(I) && !I->isDeclaration()) {
+          I->setInitializer(0);
+	} else {
+	  if (I->hasAvailableExternallyLinkage())
+	    continue;
+	  if (I->getName() == "llvm.global_ctors")
+	    continue;
+	}
+
         if (I->hasLocalLinkage())
           I->setVisibility(GlobalValue::HiddenVisibility);
         I->setLinkage(GlobalValue::ExternalLinkage);
-        if (deleteStuff == (bool)Named.count(I) && !I->isDeclaration())
-          I->setInitializer(0);
       }
 
       // Visit the Functions.
       for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+        if (deleteStuff == (bool)Named.count(I) && !I->isDeclaration()) {
+          I->deleteBody();
+	} else {
+	  if (I->hasAvailableExternallyLinkage())
+	    continue;
+	}
+
         if (I->hasLocalLinkage())
           I->setVisibility(GlobalValue::HiddenVisibility);
         I->setLinkage(GlobalValue::ExternalLinkage);
-        if (deleteStuff == (bool)Named.count(I) && !I->isDeclaration())
-          I->deleteBody();
       }
 
       return true;
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
index ded58ac..cdf7b76 100644
--- a/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/lib/Transforms/IPO/GlobalOpt.cpp
@@ -241,15 +241,15 @@ static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS,
         GS.HasPHIUser = true;
       } else if (isa<CmpInst>(I)) {
         GS.isCompared = true;
-      } else if (isa<MemTransferInst>(I)) {
-        const MemTransferInst *MTI = cast<MemTransferInst>(I);
+      } else if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(I)) {
+        if (MTI->isVolatile()) return true;
         if (MTI->getArgOperand(0) == V)
           GS.StoredType = GlobalStatus::isStored;
         if (MTI->getArgOperand(1) == V)
           GS.isLoaded = true;
-      } else if (isa<MemSetInst>(I)) {
-        assert(cast<MemSetInst>(I)->getArgOperand(0) == V &&
-               "Memset only takes one pointer!");
+      } else if (const MemSetInst *MSI = dyn_cast<MemSetInst>(I)) {
+        assert(MSI->getArgOperand(0) == V && "Memset only takes one pointer!");
+        if (MSI->isVolatile()) return true;
         GS.StoredType = GlobalStatus::isStored;
       } else {
         return true;  // Any other non-load instruction might take address!
@@ -799,7 +799,8 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV) {
       // If we get here we could have other crazy uses that are transitively
       // loaded.
       assert((isa<PHINode>(GlobalUser) || isa<SelectInst>(GlobalUser) ||
-              isa<ConstantExpr>(GlobalUser)) && "Only expect load and stores!");
+              isa<ConstantExpr>(GlobalUser) || isa<CmpInst>(GlobalUser)) &&
+             "Only expect load and stores!");
     }
   }
 
@@ -1589,8 +1590,7 @@ static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,
       GV->getInitializer()->isNullValue()) {
     if (Constant *SOVC = dyn_cast<Constant>(StoredOnceVal)) {
       if (GV->getInitializer()->getType() != SOVC->getType())
-        SOVC =
-         ConstantExpr::getBitCast(SOVC, GV->getInitializer()->getType());
+        SOVC = ConstantExpr::getBitCast(SOVC, GV->getInitializer()->getType());
 
       // Optimize away any trapping uses of the loaded value.
       if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC))
@@ -2438,6 +2438,20 @@ static bool EvaluateFunction(Function *F, Constant *&RetVal,
       // Cannot handle inline asm.
       if (isa<InlineAsm>(CI->getCalledValue())) return false;
 
+      if (MemSetInst *MSI = dyn_cast<MemSetInst>(CI)) {
+        if (MSI->isVolatile()) return false;
+        Constant *Ptr = getVal(Values, MSI->getDest());
+        Constant *Val = getVal(Values, MSI->getValue());
+        Constant *DestVal = ComputeLoadResult(getVal(Values, Ptr),
+                                              MutatedMemory);
+        if (Val->isNullValue() && DestVal && DestVal->isNullValue()) {
+          // This memset is a no-op.
+          ++CurInst;
+          continue;
+        }
+        return false;
+      }
+
       // Resolve function pointers.
       Function *Callee = dyn_cast<Function>(getVal(Values,
                                                    CI->getCalledValue()));
diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp
index 9470180..2f3baeb 100644
--- a/lib/Transforms/IPO/PruneEH.cpp
+++ b/lib/Transforms/IPO/PruneEH.cpp
@@ -180,6 +180,7 @@ bool PruneEH::SimplifyFunction(Function *F) {
         Call->takeName(II);
         Call->setCallingConv(II->getCallingConv());
         Call->setAttributes(II->getAttributes());
+        Call->setDebugLoc(II->getDebugLoc());
 
         // Anything that used the value produced by the invoke instruction
         // now uses the value produced by the call instruction.  Note that we
@@ -238,7 +239,7 @@ void PruneEH::DeleteBasicBlock(BasicBlock *BB) {
   for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; ) {
     --I;
     if (CallInst *CI = dyn_cast<CallInst>(I)) {
-      if (!isa<DbgInfoIntrinsic>(I))
+      if (!isa<IntrinsicInst>(I))
         CGN->removeCallEdgeFor(CI);
     } else if (InvokeInst *II = dyn_cast<InvokeInst>(I))
       CGN->removeCallEdgeFor(II);
diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombine.h
index 9c70cf8..8257d6b 100644
--- a/lib/Transforms/InstCombine/InstCombine.h
+++ b/lib/Transforms/InstCombine/InstCombine.h
@@ -233,7 +233,15 @@ public:
     Worklist.Add(New);
     return New;
   }
-      
+
+  // InsertNewInstWith - same as InsertNewInstBefore, but also sets the 
+  // debug loc.
+  //
+  Instruction *InsertNewInstWith(Instruction *New, Instruction &Old) {
+    New->setDebugLoc(Old.getDebugLoc());
+    return InsertNewInstBefore(New, Old);
+  }
+
   // ReplaceInstUsesWith - This method is to be used when an instruction is
   // found to be dead, replacable with another preexisting expression.  Here
   // we add all uses of I to the worklist, replace all uses of I with the new
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 726105f..ef67701 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -111,10 +111,10 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
   
   Value *Src = Builder->CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy);
   Value *Dest = Builder->CreateBitCast(MI->getArgOperand(0), NewDstPtrTy);
-  Instruction *L = new LoadInst(Src, "tmp", MI->isVolatile(), SrcAlign);
-  InsertNewInstBefore(L, *MI);
-  InsertNewInstBefore(new StoreInst(L, Dest, MI->isVolatile(), DstAlign),
-                      *MI);
+  LoadInst *L = Builder->CreateLoad(Src, MI->isVolatile());
+  L->setAlignment(SrcAlign);
+  StoreInst *S = Builder->CreateStore(L, Dest, MI->isVolatile());
+  S->setAlignment(DstAlign);
 
   // Set the size of the copy to 0, it will be deleted on the next iteration.
   MI->setArgOperand(2, Constant::getNullValue(MemOpLength->getType()));
@@ -154,8 +154,9 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
     
     // Extract the fill value and store.
     uint64_t Fill = FillC->getZExtValue()*0x0101010101010101ULL;
-    InsertNewInstBefore(new StoreInst(ConstantInt::get(ITy, Fill),
-                                      Dest, false, Alignment), *MI);
+    StoreInst *S = Builder->CreateStore(ConstantInt::get(ITy, Fill), Dest,
+                                        MI->isVolatile());
+    S->setAlignment(Alignment);
     
     // Set the size of the copy to 0, it will be deleted on the next iteration.
     MI->setLength(Constant::getNullValue(LenC->getType()));
@@ -405,20 +406,21 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
       if (LHSKnownNegative && RHSKnownNegative) {
         // The sign bit is set in both cases: this MUST overflow.
         // Create a simple add instruction, and insert it into the struct.
-        Instruction *Add = BinaryOperator::CreateAdd(LHS, RHS, "", &CI);
-        Worklist.Add(Add);
+        Value *Add = Builder->CreateAdd(LHS, RHS);
+        Add->takeName(&CI);
         Constant *V[] = {
-          UndefValue::get(LHS->getType()),ConstantInt::getTrue(II->getContext())
+          UndefValue::get(LHS->getType()),
+          ConstantInt::getTrue(II->getContext())
         };
         Constant *Struct = ConstantStruct::get(II->getContext(), V, 2, false);
         return InsertValueInst::Create(Struct, Add, 0);
       }
-      
+
       if (LHSKnownPositive && RHSKnownPositive) {
         // The sign bit is clear in both cases: this CANNOT overflow.
         // Create a simple add instruction, and insert it into the struct.
-        Instruction *Add = BinaryOperator::CreateNUWAdd(LHS, RHS, "", &CI);
-        Worklist.Add(Add);
+        Value *Add = Builder->CreateNUWAdd(LHS, RHS);
+        Add->takeName(&CI);
         Constant *V[] = {
           UndefValue::get(LHS->getType()),
           ConstantInt::getFalse(II->getContext())
@@ -588,6 +590,28 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
     break;
   }
 
+
+  case Intrinsic::x86_sse41_pmovsxbw:
+  case Intrinsic::x86_sse41_pmovsxwd:
+  case Intrinsic::x86_sse41_pmovsxdq:
+  case Intrinsic::x86_sse41_pmovzxbw:
+  case Intrinsic::x86_sse41_pmovzxwd:
+  case Intrinsic::x86_sse41_pmovzxdq: {
+    // pmov{s|z}x ignores the upper half of their input vectors.
+    unsigned VWidth =
+      cast<VectorType>(II->getArgOperand(0)->getType())->getNumElements();
+    unsigned LowHalfElts = VWidth / 2;
+    APInt InputDemandedElts(APInt::getBitsSet(VWidth, 0, LowHalfElts));
+    APInt UndefElts(VWidth, 0);
+    if (Value *TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0),
+                                                 InputDemandedElts,
+                                                 UndefElts)) {
+      II->setArgOperand(0, TmpV);
+      return II;
+    }
+    break;
+  }
+
   case Intrinsic::ppc_altivec_vperm:
     // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
     if (ConstantVector *Mask = dyn_cast<ConstantVector>(II->getArgOperand(2))) {
@@ -813,7 +837,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
       // If OldCall dues not return void then replaceAllUsesWith undef.
       // This allows ValueHandlers and custom metadata to adjust itself.
       if (!OldCall->getType()->isVoidTy())
-        OldCall->replaceAllUsesWith(UndefValue::get(OldCall->getType()));
+        ReplaceInstUsesWith(*OldCall, UndefValue::get(OldCall->getType()));
       if (isa<CallInst>(OldCall))
         return EraseInstFromFunction(*OldCall);
       
@@ -835,8 +859,8 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
     // If CS does not return void then replaceAllUsesWith undef.
     // This allows ValueHandlers and custom metadata to adjust itself.
     if (!CS.getInstruction()->getType()->isVoidTy())
-      CS.getInstruction()->
-        replaceAllUsesWith(UndefValue::get(CS.getInstruction()->getType()));
+      ReplaceInstUsesWith(*CS.getInstruction(),
+                          UndefValue::get(CS.getInstruction()->getType()));
 
     if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) {
       // Don't break the CFG, insert a dummy cond branch.
@@ -1084,15 +1108,15 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
 
   Instruction *NC;
   if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
-    NC = InvokeInst::Create(Callee, II->getNormalDest(), II->getUnwindDest(),
-                            Args.begin(), Args.end(),
-                            Caller->getName(), Caller);
+    NC = Builder->CreateInvoke(Callee, II->getNormalDest(),
+                               II->getUnwindDest(), Args.begin(), Args.end());
+    NC->takeName(II);
     cast<InvokeInst>(NC)->setCallingConv(II->getCallingConv());
     cast<InvokeInst>(NC)->setAttributes(NewCallerPAL);
   } else {
-    NC = CallInst::Create(Callee, Args.begin(), Args.end(),
-                          Caller->getName(), Caller);
     CallInst *CI = cast<CallInst>(Caller);
+    NC = Builder->CreateCall(Callee, Args.begin(), Args.end());
+    NC->takeName(CI);
     if (CI->isTailCall())
       cast<CallInst>(NC)->setTailCall();
     cast<CallInst>(NC)->setCallingConv(CI->getCallingConv());
@@ -1106,6 +1130,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
       Instruction::CastOps opcode =
         CastInst::getCastOpcode(NC, false, OldRetTy, false);
       NV = NC = CastInst::Create(opcode, NC, OldRetTy, "tmp");
+      NC->setDebugLoc(Caller->getDebugLoc());
 
       // If this is an invoke instruction, we should insert it after the first
       // non-phi, instruction in the normal successor block.
@@ -1123,8 +1148,8 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
   }
 
   if (!Caller->use_empty())
-    Caller->replaceAllUsesWith(NV);
-  
+    ReplaceInstUsesWith(*Caller, NV);
+
   EraseInstFromFunction(*Caller);
   return true;
 }
@@ -1189,7 +1214,7 @@ Instruction *InstCombiner::transformCallThroughTrampoline(CallSite CS) {
             // Add the chain argument and attributes.
             Value *NestVal = Tramp->getArgOperand(2);
             if (NestVal->getType() != NestTy)
-              NestVal = new BitCastInst(NestVal, NestTy, "nest", Caller);
+              NestVal = Builder->CreateBitCast(NestVal, NestTy, "nest");
             NewArgs.push_back(NestVal);
             NewAttrs.push_back(AttributeWithIndex::get(NestIdx, NestAttr));
           }
@@ -1255,24 +1280,19 @@ Instruction *InstCombiner::transformCallThroughTrampoline(CallSite CS) {
       if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
         NewCaller = InvokeInst::Create(NewCallee,
                                        II->getNormalDest(), II->getUnwindDest(),
-                                       NewArgs.begin(), NewArgs.end(),
-                                       Caller->getName(), Caller);
+                                       NewArgs.begin(), NewArgs.end());
         cast<InvokeInst>(NewCaller)->setCallingConv(II->getCallingConv());
         cast<InvokeInst>(NewCaller)->setAttributes(NewPAL);
       } else {
-        NewCaller = CallInst::Create(NewCallee, NewArgs.begin(), NewArgs.end(),
-                                     Caller->getName(), Caller);
+        NewCaller = CallInst::Create(NewCallee, NewArgs.begin(), NewArgs.end());
         if (cast<CallInst>(Caller)->isTailCall())
           cast<CallInst>(NewCaller)->setTailCall();
         cast<CallInst>(NewCaller)->
           setCallingConv(cast<CallInst>(Caller)->getCallingConv());
         cast<CallInst>(NewCaller)->setAttributes(NewPAL);
       }
-      if (!Caller->getType()->isVoidTy())
-        Caller->replaceAllUsesWith(NewCaller);
-      Caller->eraseFromParent();
-      Worklist.Remove(Caller);
-      return 0;
+
+      return NewCaller;
     }
   }
 
diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 6f70de8..199902a 100644
--- a/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -133,7 +133,7 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
     // New is the allocation instruction, pointer typed. AI is the original
     // allocation instruction, also pointer typed. Thus, cast to use is BitCast.
     Value *NewCast = AllocaBuilder.CreateBitCast(New, AI.getType(), "tmpcast");
-    AI.replaceAllUsesWith(NewCast);
+    ReplaceInstUsesWith(AI, NewCast);
   }
   return ReplaceInstUsesWith(CI, New);
 }
@@ -211,7 +211,7 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, const Type *Ty,
   }
   
   Res->takeName(I);
-  return InsertNewInstBefore(Res, *I);
+  return InsertNewInstWith(Res, *I);
 }
 
 
@@ -1228,7 +1228,7 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
       
       
       // Remove the old Call.  With -fmath-errno, it won't get marked readnone.
-      Call->replaceAllUsesWith(UndefValue::get(Call->getType()));
+      ReplaceInstUsesWith(*Call, UndefValue::get(Call->getType()));
       EraseInstFromFunction(*Call);
       return ret;
     }
@@ -1684,8 +1684,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
     // If we found a path from the src to dest, create the getelementptr now.
     if (SrcElTy == DstElTy) {
       SmallVector<Value*, 8> Idxs(NumZeros+1, ZeroUInt);
-      return GetElementPtrInst::CreateInBounds(Src, Idxs.begin(), Idxs.end(),"",
-                                               ((Instruction*)NULL));
+      return GetElementPtrInst::CreateInBounds(Src, Idxs.begin(), Idxs.end());
     }
   }
   
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index bb9b88b..c7ed098 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -469,8 +469,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
 ///
 /// If we can't emit an optimized form for this expression, this returns null.
 /// 
-static Value *EvaluateGEPOffsetExpression(User *GEP, Instruction &I,
-                                          InstCombiner &IC) {
+static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) {
   TargetData &TD = *IC.getTargetData();
   gep_type_iterator GTI = gep_type_begin(GEP);
   
@@ -533,10 +532,10 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, Instruction &I,
     // Cast to intptrty in case a truncation occurs.  If an extension is needed,
     // we don't need to bother extending: the extension won't affect where the
     // computation crosses zero.
-    if (VariableIdx->getType()->getPrimitiveSizeInBits() > IntPtrWidth)
-      VariableIdx = new TruncInst(VariableIdx, 
-                                  TD.getIntPtrType(VariableIdx->getContext()),
-                                  VariableIdx->getName(), &I);
+    if (VariableIdx->getType()->getPrimitiveSizeInBits() > IntPtrWidth) {
+      const Type *IntPtrTy = TD.getIntPtrType(VariableIdx->getContext());
+      VariableIdx = IC.Builder->CreateTrunc(VariableIdx, IntPtrTy);
+    }
     return VariableIdx;
   }
   
@@ -558,11 +557,10 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, Instruction &I,
   // Okay, we can do this evaluation.  Start by converting the index to intptr.
   const Type *IntPtrTy = TD.getIntPtrType(VariableIdx->getContext());
   if (VariableIdx->getType() != IntPtrTy)
-    VariableIdx = CastInst::CreateIntegerCast(VariableIdx, IntPtrTy,
-                                              true /*SExt*/, 
-                                              VariableIdx->getName(), &I);
+    VariableIdx = IC.Builder->CreateIntCast(VariableIdx, IntPtrTy,
+                                            true /*Signed*/);
   Constant *OffsetVal = ConstantInt::get(IntPtrTy, NewOffs);
-  return BinaryOperator::CreateAdd(VariableIdx, OffsetVal, "offset", &I);
+  return IC.Builder->CreateAdd(VariableIdx, OffsetVal, "offset");
 }
 
 /// FoldGEPICmp - Fold comparisons between a GEP instruction and something
@@ -580,7 +578,7 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
     // This transformation (ignoring the base and scales) is valid because we
     // know pointers can't overflow since the gep is inbounds.  See if we can
     // output an optimized form.
-    Value *Offset = EvaluateGEPOffsetExpression(GEPLHS, I, *this);
+    Value *Offset = EvaluateGEPOffsetExpression(GEPLHS, *this);
     
     // If not, synthesize the offset the hard way.
     if (Offset == 0)
@@ -634,6 +632,7 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
     if (AllZeros)
       return FoldGEPICmp(GEPLHS, GEPRHS->getOperand(0), Cond, I);
 
+    bool GEPsInBounds = GEPLHS->isInBounds() && GEPRHS->isInBounds();
     if (GEPLHS->getNumOperands() == GEPRHS->getNumOperands()) {
       // If the GEPs only differ by one index, compare it.
       unsigned NumDifferences = 0;  // Keep track of # differences.
@@ -656,7 +655,7 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
                                ConstantInt::get(Type::getInt1Ty(I.getContext()),
                                              ICmpInst::isTrueWhenEqual(Cond)));
 
-      else if (NumDifferences == 1) {
+      else if (NumDifferences == 1 && GEPsInBounds) {
         Value *LHSV = GEPLHS->getOperand(DiffOperand);
         Value *RHSV = GEPRHS->getOperand(DiffOperand);
         // Make sure we do a signed comparison here.
@@ -667,6 +666,7 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
     // Only lower this if the icmp is the only user of the GEP or if we expect
     // the result to fold to a constant!
     if (TD &&
+        GEPsInBounds &&
         (isa<ConstantExpr>(GEPLHS) || GEPLHS->hasOneUse()) &&
         (isa<ConstantExpr>(GEPRHS) || GEPRHS->hasOneUse())) {
       // ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2)  --->  (OFFSET1 cmp OFFSET2)
@@ -919,11 +919,11 @@ Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr,
     if (ICI.isSigned() != (Shr->getOpcode() == Instruction::AShr))
       return 0;
     
-    // Otherwise, all lshr and all exact ashr's are equivalent to a udiv/sdiv by
-    // a power of 2.  Since we already have logic to simplify these, transform
-    // to div and then simplify the resultant comparison.
+    // Otherwise, all lshr and most exact ashr's are equivalent to a udiv/sdiv
+    // by a power of 2.  Since we already have logic to simplify these,
+    // transform to div and then simplify the resultant comparison.
     if (Shr->getOpcode() == Instruction::AShr &&
-        !Shr->isExact())
+        (!Shr->isExact() || ShAmtVal == TypeBits - 1))
       return 0;
     
     // Revisit the shift (to delete it).
@@ -2400,7 +2400,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
         // fall-through
       case Instruction::SDiv:
       case Instruction::AShr:
-        if (!BO0->isExact() && !BO1->isExact())
+        if (!BO0->isExact() || !BO1->isExact())
           break;
         return new ICmpInst(I.getPredicate(), BO0->getOperand(0),
                             BO1->getOperand(0));
diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 432adc9..f499290 100644
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -57,12 +57,14 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
       Value *Idx[2];
       Idx[0] = NullIdx;
       Idx[1] = NullIdx;
-      Value *V = GetElementPtrInst::CreateInBounds(New, Idx, Idx + 2,
-                                                   New->getName()+".sub", It);
+      Instruction *GEP =
+           GetElementPtrInst::CreateInBounds(New, Idx, Idx + 2,
+                                             New->getName()+".sub");
+      InsertNewInstBefore(GEP, *It);
 
       // Now make everything use the getelementptr instead of the original
       // allocation.
-      return ReplaceInstUsesWith(AI, V);
+      return ReplaceInstUsesWith(AI, GEP);
     } else if (isa<UndefValue>(AI.getArraySize())) {
       return ReplaceInstUsesWith(AI, Constant::getNullValue(AI.getType()));
     }
@@ -600,10 +602,12 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
   // Advance to a place where it is safe to insert the new store and
   // insert it.
   BBI = DestBB->getFirstNonPHI();
-  InsertNewInstBefore(new StoreInst(MergedVal, SI.getOperand(1),
-                                    OtherStore->isVolatile(),
-                                    SI.getAlignment()), *BBI);
-  
+  StoreInst *NewSI = new StoreInst(MergedVal, SI.getOperand(1),
+                                   OtherStore->isVolatile(),
+                                   SI.getAlignment());
+  InsertNewInstBefore(NewSI, *BBI);
+  NewSI->setDebugLoc(OtherStore->getDebugLoc()); 
+
   // Nuke the old stores.
   EraseInstFromFunction(SI);
   EraseInstFromFunction(*OtherStore);
diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 57fb08a..2d29403 100644
--- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -19,6 +19,60 @@
 using namespace llvm;
 using namespace PatternMatch;
 
+
+/// simplifyValueKnownNonZero - The specific integer value is used in a context
+/// where it is known to be non-zero.  If this allows us to simplify the
+/// computation, do so and return the new operand, otherwise return null.
+static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC) {
+  // If V has multiple uses, then we would have to do more analysis to determine
+  // if this is safe.  For example, the use could be in dynamically unreached
+  // code.
+  if (!V->hasOneUse()) return 0;
+  
+  bool MadeChange = false;
+
+  // ((1 << A) >>u B) --> (1 << (A-B))
+  // Because V cannot be zero, we know that B is less than A.
+  Value *A = 0, *B = 0, *PowerOf2 = 0;
+  if (match(V, m_LShr(m_OneUse(m_Shl(m_Value(PowerOf2), m_Value(A))),
+                      m_Value(B))) &&
+      // The "1" can be any value known to be a power of 2.
+      isPowerOfTwo(PowerOf2, IC.getTargetData())) {
+    A = IC.Builder->CreateSub(A, B, "tmp");
+    return IC.Builder->CreateShl(PowerOf2, A);
+  }
+  
+  // (PowerOfTwo >>u B) --> isExact since shifting out the result would make it
+  // inexact.  Similarly for <<.
+  if (BinaryOperator *I = dyn_cast<BinaryOperator>(V))
+    if (I->isLogicalShift() &&
+        isPowerOfTwo(I->getOperand(0), IC.getTargetData())) {
+      // We know that this is an exact/nuw shift and that the input is a
+      // non-zero context as well.
+      if (Value *V2 = simplifyValueKnownNonZero(I->getOperand(0), IC)) {
+        I->setOperand(0, V2);
+        MadeChange = true;
+      }
+      
+      if (I->getOpcode() == Instruction::LShr && !I->isExact()) {
+        I->setIsExact();
+        MadeChange = true;
+      }
+      
+      if (I->getOpcode() == Instruction::Shl && !I->hasNoUnsignedWrap()) {
+        I->setHasNoUnsignedWrap();
+        MadeChange = true;
+      }
+    }
+
+  // TODO: Lots more we could do here:
+  //    If V is a phi node, we can call this on each of its operands.
+  //    "select cond, X, 0" can simplify to "X".
+  
+  return MadeChange ? V : 0;
+}
+
+
 /// MultiplyOverflows - True if the multiply can not be expressed in an int
 /// this size.
 static bool MultiplyOverflows(ConstantInt *C1, ConstantInt *C2, bool sign) {
@@ -81,6 +135,29 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
         return BinaryOperator::CreateAdd(Add, Builder->CreateMul(C1, CI));
       }
     }
+
+    // (Y - X) * (-(2**n)) -> (X - Y) * (2**n), for positive nonzero n
+    // (Y + const) * (-(2**n)) -> (-constY) * (2**n), for positive nonzero n
+    // The "* (2**n)" thus becomes a potential shifting opportunity.
+    {
+      const APInt &   Val = CI->getValue();
+      const APInt &PosVal = Val.abs();
+      if (Val.isNegative() && PosVal.isPowerOf2()) {
+        Value *X = 0, *Y = 0;
+        if (Op0->hasOneUse()) {
+          ConstantInt *C1;
+          Value *Sub = 0;
+          if (match(Op0, m_Sub(m_Value(Y), m_Value(X))))
+            Sub = Builder->CreateSub(X, Y, "suba");
+          else if (match(Op0, m_Add(m_Value(Y), m_ConstantInt(C1))))
+            Sub = Builder->CreateSub(Builder->CreateNeg(C1), Y, "subc");
+          if (Sub)
+            return
+              BinaryOperator::CreateMul(Sub,
+                                        ConstantInt::get(Y->getType(), PosVal));
+        }
+      }
+    }
   }
   
   // Simplify mul instructions with a constant RHS.
@@ -293,6 +370,12 @@ bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) {
 Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) {
   Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
 
+  // The RHS is known non-zero.
+  if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this)) {
+    I.setOperand(1, V);
+    return &I;
+  }
+  
   // Handle cases involving: [su]div X, (select Cond, Y, Z)
   // This does not apply for fdiv.
   if (isa<SelectInst>(Op1) && SimplifyDivRemOfSelect(I))
@@ -499,11 +582,17 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
 Instruction *InstCombiner::commonIRemTransforms(BinaryOperator &I) {
   Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
 
+  // The RHS is known non-zero.
+  if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this)) {
+    I.setOperand(1, V);
+    return &I;
+  }
+
   // Handle cases involving: rem X, (select Cond, Y, Z)
   if (isa<SelectInst>(Op1) && SimplifyDivRemOfSelect(I))
     return &I;
 
-  if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
+  if (isa<ConstantInt>(Op1)) {
     if (Instruction *Op0I = dyn_cast<Instruction>(Op0)) {
       if (SelectInst *SI = dyn_cast<SelectInst>(Op0I)) {
         if (Instruction *R = FoldOpIntoSelect(I, SI))
diff --git a/lib/Transforms/InstCombine/InstCombinePHI.cpp b/lib/Transforms/InstCombine/InstCombinePHI.cpp
index abf61bb..3777340 100644
--- a/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -110,16 +110,20 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) {
     }
   }
     
-  if (CmpInst *CIOp = dyn_cast<CmpInst>(FirstInst))
-    return CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(),
-                           LHSVal, RHSVal);
-  
+  if (CmpInst *CIOp = dyn_cast<CmpInst>(FirstInst)) {
+    CmpInst *NewCI = CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(),
+                                     LHSVal, RHSVal);
+    NewCI->setDebugLoc(FirstInst->getDebugLoc());
+    return NewCI;
+  }
+
   BinaryOperator *BinOp = cast<BinaryOperator>(FirstInst);
   BinaryOperator *NewBinOp =
     BinaryOperator::Create(BinOp->getOpcode(), LHSVal, RHSVal);
   if (isNUW) NewBinOp->setHasNoUnsignedWrap();
   if (isNSW) NewBinOp->setHasNoSignedWrap();
   if (isExact) NewBinOp->setIsExact();
+  NewBinOp->setDebugLoc(FirstInst->getDebugLoc());
   return NewBinOp;
 }
 
@@ -228,6 +232,7 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) {
     GetElementPtrInst::Create(Base, FixedOperands.begin()+1,
                               FixedOperands.end());
   if (AllInBounds) NewGEP->setIsInBounds();
+  NewGEP->setDebugLoc(FirstInst->getDebugLoc());
   return NewGEP;
 }
 
@@ -369,7 +374,9 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) {
     for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
       cast<LoadInst>(PN.getIncomingValue(i))->setVolatile(false);
   
-  return new LoadInst(PhiVal, "", isVolatile, LoadAlignment);
+  LoadInst *NewLI = new LoadInst(PhiVal, "", isVolatile, LoadAlignment);
+  NewLI->setDebugLoc(FirstLI->getDebugLoc());
+  return NewLI;
 }
 
 
@@ -469,20 +476,27 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) {
   }
 
   // Insert and return the new operation.
-  if (CastInst *FirstCI = dyn_cast<CastInst>(FirstInst))
-    return CastInst::Create(FirstCI->getOpcode(), PhiVal, PN.getType());
+  if (CastInst *FirstCI = dyn_cast<CastInst>(FirstInst)) {
+    CastInst *NewCI = CastInst::Create(FirstCI->getOpcode(), PhiVal,
+                                       PN.getType());
+    NewCI->setDebugLoc(FirstInst->getDebugLoc());
+    return NewCI;
+  }
   
   if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(FirstInst)) {
     BinOp = BinaryOperator::Create(BinOp->getOpcode(), PhiVal, ConstantOp);
     if (isNUW) BinOp->setHasNoUnsignedWrap();
     if (isNSW) BinOp->setHasNoSignedWrap();
     if (isExact) BinOp->setIsExact();
+    BinOp->setDebugLoc(FirstInst->getDebugLoc());
     return BinOp;
   }
   
   CmpInst *CIOp = cast<CmpInst>(FirstInst);
-  return CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(),
-                         PhiVal, ConstantOp);
+  CmpInst *NewCI = CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(),
+                                   PhiVal, ConstantOp);
+  NewCI->setDebugLoc(FirstInst->getDebugLoc());
+  return NewCI;
 }
 
 /// DeadPHICycle - Return true if this PHI node is only used by a PHI node cycle
diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 61a433a..aeb3c3e 100644
--- a/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -133,9 +133,8 @@ Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI,
     }
 
     // Fold this by inserting a select from the input values.
-    SelectInst *NewSI = SelectInst::Create(SI.getCondition(), TI->getOperand(0),
-                                          FI->getOperand(0), SI.getName()+".v");
-    InsertNewInstBefore(NewSI, SI);
+    Value *NewSI = Builder->CreateSelect(SI.getCondition(), TI->getOperand(0),
+                                         FI->getOperand(0), SI.getName()+".v");
     return CastInst::Create(Instruction::CastOps(TI->getOpcode()), NewSI,
                             TI->getType());
   }
@@ -174,9 +173,8 @@ Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI,
   }
 
   // If we reach here, they do have operations in common.
-  SelectInst *NewSI = SelectInst::Create(SI.getCondition(), OtherOpT,
-                                         OtherOpF, SI.getName()+".v");
-  InsertNewInstBefore(NewSI, SI);
+  Value *NewSI = Builder->CreateSelect(SI.getCondition(), OtherOpT,
+                                       OtherOpF, SI.getName()+".v");
 
   if (BinaryOperator *BO = dyn_cast<BinaryOperator>(TI)) {
     if (MatchIsOpZero)
@@ -224,8 +222,7 @@ Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal,
           // Avoid creating select between 2 constants unless it's selecting
           // between 0, 1 and -1.
           if (!isa<Constant>(OOp) || isSelect01(C, cast<Constant>(OOp))) {
-            Instruction *NewSel = SelectInst::Create(SI.getCondition(), OOp, C);
-            InsertNewInstBefore(NewSel, SI);
+            Value *NewSel = Builder->CreateSelect(SI.getCondition(), OOp, C);
             NewSel->takeName(TVI);
             BinaryOperator *TVI_BO = cast<BinaryOperator>(TVI);
             BinaryOperator *BO = BinaryOperator::Create(TVI_BO->getOpcode(),
@@ -260,8 +257,7 @@ Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal,
           // Avoid creating select between 2 constants unless it's selecting
           // between 0, 1 and -1.
           if (!isa<Constant>(OOp) || isSelect01(C, cast<Constant>(OOp))) {
-            Instruction *NewSel = SelectInst::Create(SI.getCondition(), C, OOp);
-            InsertNewInstBefore(NewSel, SI);
+            Value *NewSel = Builder->CreateSelect(SI.getCondition(), C, OOp);
             NewSel->takeName(FVI);
             BinaryOperator *FVI_BO = cast<BinaryOperator>(FVI);
             BinaryOperator *BO = BinaryOperator::Create(FVI_BO->getOpcode(),
@@ -282,6 +278,59 @@ Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal,
   return 0;
 }
 
+/// SimplifyWithOpReplaced - See if V simplifies when its operand Op is
+/// replaced with RepOp.
+static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
+                                     const TargetData *TD) {
+  // Trivial replacement.
+  if (V == Op)
+    return RepOp;
+
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I)
+    return 0;
+
+  // If this is a binary operator, try to simplify it with the replaced op.
+  if (BinaryOperator *B = dyn_cast<BinaryOperator>(I)) {
+    if (B->getOperand(0) == Op)
+      return SimplifyBinOp(B->getOpcode(), RepOp, B->getOperand(1), TD);
+    if (B->getOperand(1) == Op)
+      return SimplifyBinOp(B->getOpcode(), B->getOperand(0), RepOp, TD);
+  }
+
+  // Same for CmpInsts.
+  if (CmpInst *C = dyn_cast<CmpInst>(I)) {
+    if (C->getOperand(0) == Op)
+      return SimplifyCmpInst(C->getPredicate(), RepOp, C->getOperand(1), TD);
+    if (C->getOperand(1) == Op)
+      return SimplifyCmpInst(C->getPredicate(), C->getOperand(0), RepOp, TD);
+  }
+
+  // TODO: We could hand off more cases to instsimplify here.
+
+  // If all operands are constant after substituting Op for RepOp then we can
+  // constant fold the instruction.
+  if (Constant *CRepOp = dyn_cast<Constant>(RepOp)) {
+    // Build a list of all constant operands.
+    SmallVector<Constant*, 8> ConstOps;
+    for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+      if (I->getOperand(i) == Op)
+        ConstOps.push_back(CRepOp);
+      else if (Constant *COp = dyn_cast<Constant>(I->getOperand(i)))
+        ConstOps.push_back(COp);
+      else
+        break;
+    }
+
+    // All operands were constants, fold it.
+    if (ConstOps.size() == I->getNumOperands())
+      return ConstantFoldInstOperands(I->getOpcode(), I->getType(),
+                                      ConstOps.data(), ConstOps.size(), TD);
+  }
+
+  return 0;
+}
+
 /// visitSelectInstWithICmp - Visit a SelectInst that has an
 /// ICmpInst as its first operand.
 ///
@@ -420,25 +469,21 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI,
     }
   }
 
-  if (CmpLHS == TrueVal && CmpRHS == FalseVal) {
-    // Transform (X == Y) ? X : Y  -> Y
-    if (Pred == ICmpInst::ICMP_EQ)
+  // If we have an equality comparison then we know the value in one of the
+  // arms of the select. See if substituting this value into the arm and
+  // simplifying the result yields the same value as the other arm.
+  if (Pred == ICmpInst::ICMP_EQ) {
+    if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, TD) == TrueVal ||
+        SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, TD) == TrueVal)
       return ReplaceInstUsesWith(SI, FalseVal);
-    // Transform (X != Y) ? X : Y  -> X
-    if (Pred == ICmpInst::ICMP_NE)
+  } else if (Pred == ICmpInst::ICMP_NE) {
+    if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, TD) == FalseVal ||
+        SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, TD) == FalseVal)
       return ReplaceInstUsesWith(SI, TrueVal);
-    /// NOTE: if we wanted to, this is where to detect integer MIN/MAX
-
-  } else if (CmpLHS == FalseVal && CmpRHS == TrueVal) {
-    // Transform (X == Y) ? Y : X  -> X
-    if (Pred == ICmpInst::ICMP_EQ)
-      return ReplaceInstUsesWith(SI, FalseVal);
-    // Transform (X != Y) ? Y : X  -> Y
-    if (Pred == ICmpInst::ICMP_NE)
-      return ReplaceInstUsesWith(SI, TrueVal);
-    /// NOTE: if we wanted to, this is where to detect integer MIN/MAX
   }
 
+  // NOTE: if we wanted to, this is where to detect integer MIN/MAX
+
   if (isa<Constant>(CmpRHS)) {
     if (CmpLHS == TrueVal && Pred == ICmpInst::ICMP_EQ) {
       // Transform (X == C) ? X : Y -> (X == C) ? C : Y
@@ -604,9 +649,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
         return BinaryOperator::CreateOr(CondVal, FalseVal);
       }
       // Change: A = select B, false, C --> A = and !B, C
-      Value *NotCond =
-        InsertNewInstBefore(BinaryOperator::CreateNot(CondVal,
-                                           "not."+CondVal->getName()), SI);
+      Value *NotCond = Builder->CreateNot(CondVal, "not."+CondVal->getName());
       return BinaryOperator::CreateAnd(NotCond, FalseVal);
     } else if (ConstantInt *C = dyn_cast<ConstantInt>(FalseVal)) {
       if (C->getZExtValue() == false) {
@@ -614,9 +657,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
         return BinaryOperator::CreateAnd(CondVal, TrueVal);
       }
       // Change: A = select B, C, true --> A = or !B, C
-      Value *NotCond =
-        InsertNewInstBefore(BinaryOperator::CreateNot(CondVal,
-                                           "not."+CondVal->getName()), SI);
+      Value *NotCond = Builder->CreateNot(CondVal, "not."+CondVal->getName());
       return BinaryOperator::CreateOr(NotCond, TrueVal);
     }
 
@@ -755,27 +796,20 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
             // So at this point we know we have (Y -> OtherAddOp):
             //        select C, (add X, Y), (sub X, Z)
             Value *NegVal;  // Compute -Z
-            if (Constant *C = dyn_cast<Constant>(SubOp->getOperand(1))) {
-              NegVal = ConstantExpr::getNeg(C);
-            } else if (SI.getType()->isFloatingPointTy()) {
-              NegVal = InsertNewInstBefore(
-                    BinaryOperator::CreateFNeg(SubOp->getOperand(1),
-                                              "tmp"), SI);
+            if (SI.getType()->isFloatingPointTy()) {
+              NegVal = Builder->CreateFNeg(SubOp->getOperand(1));
             } else {
-              NegVal = InsertNewInstBefore(
-                    BinaryOperator::CreateNeg(SubOp->getOperand(1),
-                                              "tmp"), SI);
+              NegVal = Builder->CreateNeg(SubOp->getOperand(1));
             }
 
             Value *NewTrueOp = OtherAddOp;
             Value *NewFalseOp = NegVal;
             if (AddOp != TI)
               std::swap(NewTrueOp, NewFalseOp);
-            Instruction *NewSel =
-              SelectInst::Create(CondVal, NewTrueOp,
-                                 NewFalseOp, SI.getName() + ".p");
+            Value *NewSel = 
+              Builder->CreateSelect(CondVal, NewTrueOp,
+                                    NewFalseOp, SI.getName() + ".p");
 
-            NewSel = InsertNewInstBefore(NewSel, SI);
             if (SI.getType()->isFloatingPointTy())
               return BinaryOperator::CreateFAdd(SubOp->getOperand(0), NewSel);
             else
diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 6e727ce..8fea8eb 100644
--- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -313,7 +313,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
       Instruction *Or = 
         BinaryOperator::CreateOr(I->getOperand(0), I->getOperand(1),
                                  I->getName());
-      return InsertNewInstBefore(Or, *I);
+      return InsertNewInstWith(Or, *I);
     }
     
     // If all of the demanded bits on one side are known, and all of the set
@@ -327,7 +327,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
                                                    ~RHSKnownOne & DemandedMask);
         Instruction *And = 
           BinaryOperator::CreateAnd(I->getOperand(0), AndC, "tmp");
-        return InsertNewInstBefore(And, *I);
+        return InsertNewInstWith(And, *I);
       }
     }
     
@@ -353,13 +353,13 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
           ConstantInt::get(I->getType(), NewMask & AndRHS->getValue());
         Instruction *NewAnd = 
           BinaryOperator::CreateAnd(I->getOperand(0), AndC, "tmp");
-        InsertNewInstBefore(NewAnd, *I);
+        InsertNewInstWith(NewAnd, *I);
         
         Constant *XorC =
           ConstantInt::get(I->getType(), NewMask & XorRHS->getValue());
         Instruction *NewXor =
           BinaryOperator::CreateXor(NewAnd, XorC, "tmp");
-        return InsertNewInstBefore(NewXor, *I);
+        return InsertNewInstWith(NewXor, *I);
       }
 
     // Output known-0 bits are known if clear or set in both the LHS & RHS.
@@ -472,7 +472,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
     if (KnownZero[SrcBitWidth-1] || (NewBits & ~DemandedMask) == NewBits) {
       // Convert to ZExt cast
       CastInst *NewCast = new ZExtInst(I->getOperand(0), VTy, I->getName());
-      return InsertNewInstBefore(NewCast, *I);
+      return InsertNewInstWith(NewCast, *I);
     } else if (KnownOne[SrcBitWidth-1]) {    // Input sign bit known set
       KnownOne |= NewBits;
     }
@@ -515,7 +515,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
         Instruction *Or =
           BinaryOperator::CreateOr(I->getOperand(0), I->getOperand(1),
                                    I->getName());
-        return InsertNewInstBefore(Or, *I);
+        return InsertNewInstWith(Or, *I);
       }
       
       // We can say something about the output known-zero and known-one bits,
@@ -632,7 +632,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
       // Perform the logical shift right.
       Instruction *NewVal = BinaryOperator::CreateLShr(
                         I->getOperand(0), I->getOperand(1), I->getName());
-      return InsertNewInstBefore(NewVal, *I);
+      return InsertNewInstWith(NewVal, *I);
     }    
 
     // If the sign bit is the only bit demanded by this ashr, then there is no
@@ -676,7 +676,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
         // Perform the logical shift right.
         Instruction *NewVal = BinaryOperator::CreateLShr(
                           I->getOperand(0), SA, I->getName());
-        return InsertNewInstBefore(NewVal, *I);
+        return InsertNewInstWith(NewVal, *I);
       } else if ((KnownOne & SignBit) != 0) { // New bits are known one.
         KnownOne |= HighBits;
       }
@@ -774,12 +774,16 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
             NewVal = BinaryOperator::CreateShl(II->getArgOperand(0),
                     ConstantInt::get(I->getType(), ResultBit-InputBit));
           NewVal->takeName(I);
-          return InsertNewInstBefore(NewVal, *I);
+          return InsertNewInstWith(NewVal, *I);
         }
           
         // TODO: Could compute known zero/one bits based on the input.
         break;
       }
+      case Intrinsic::x86_sse42_crc32_64_8:
+      case Intrinsic::x86_sse42_crc32_64_64:
+        KnownZero = APInt::getHighBitsSet(64, 32);
+        return 0;
       }
     }
     ComputeMaskedBits(V, DemandedMask, KnownZero, KnownOne, Depth);
@@ -867,7 +871,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
   if (Depth == 10)
     return 0;
 
-  // If multiple users are using the root value, procede with
+  // If multiple users are using the root value, proceed with
   // simplification conservatively assuming that all elements
   // are needed.
   if (!V->hasOneUse()) {
@@ -1108,21 +1112,21 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
           Value *LHS = II->getArgOperand(0);
           Value *RHS = II->getArgOperand(1);
           // Extract the element as scalars.
-          LHS = InsertNewInstBefore(ExtractElementInst::Create(LHS, 
+          LHS = InsertNewInstWith(ExtractElementInst::Create(LHS, 
             ConstantInt::get(Type::getInt32Ty(I->getContext()), 0U)), *II);
-          RHS = InsertNewInstBefore(ExtractElementInst::Create(RHS,
+          RHS = InsertNewInstWith(ExtractElementInst::Create(RHS,
             ConstantInt::get(Type::getInt32Ty(I->getContext()), 0U)), *II);
           
           switch (II->getIntrinsicID()) {
           default: llvm_unreachable("Case stmts out of sync!");
           case Intrinsic::x86_sse_sub_ss:
           case Intrinsic::x86_sse2_sub_sd:
-            TmpV = InsertNewInstBefore(BinaryOperator::CreateFSub(LHS, RHS,
+            TmpV = InsertNewInstWith(BinaryOperator::CreateFSub(LHS, RHS,
                                                         II->getName()), *II);
             break;
           case Intrinsic::x86_sse_mul_ss:
           case Intrinsic::x86_sse2_mul_sd:
-            TmpV = InsertNewInstBefore(BinaryOperator::CreateFMul(LHS, RHS,
+            TmpV = InsertNewInstWith(BinaryOperator::CreateFMul(LHS, RHS,
                                                          II->getName()), *II);
             break;
           }
@@ -1132,7 +1136,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
               UndefValue::get(II->getType()), TmpV,
               ConstantInt::get(Type::getInt32Ty(I->getContext()), 0U, false),
                                       II->getName());
-          InsertNewInstBefore(New, *II);
+          InsertNewInstWith(New, *II);
           return New;
         }            
       }
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index 7a84598..92c10f5 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -240,9 +240,9 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
         Constant *C2 = cast<Constant>(Op1->getOperand(1));
 
         Constant *Folded = ConstantExpr::get(Opcode, C1, C2);
-        Instruction *New = BinaryOperator::Create(Opcode, A, B, Op1->getName(),
-                                                  &I);
-        Worklist.Add(New);
+        Instruction *New = BinaryOperator::Create(Opcode, A, B);
+        InsertNewInstWith(New, I);
+        New->takeName(Op1);
         I.setOperand(0, New);
         I.setOperand(1, Folded);
         // Conservatively clear the optional flags, since they may not be
@@ -599,7 +599,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
   }
 
   // Okay, we can do the transformation: create the new PHI node.
-  PHINode *NewPN = PHINode::Create(I.getType(), PN->getNumIncomingValues(), "");
+  PHINode *NewPN = PHINode::Create(I.getType(), PN->getNumIncomingValues());
   InsertNewInstBefore(NewPN, *PN);
   NewPN->takeName(PN);
   
@@ -1088,8 +1088,8 @@ Instruction *InstCombiner::visitFree(CallInst &FI) {
   // free undef -> unreachable.
   if (isa<UndefValue>(Op)) {
     // Insert a new store to null because we cannot modify the CFG here.
-    new StoreInst(ConstantInt::getTrue(FI.getContext()),
-           UndefValue::get(Type::getInt1PtrTy(FI.getContext())), &FI);
+    Builder->CreateStore(ConstantInt::getTrue(FI.getContext()),
+                         UndefValue::get(Type::getInt1PtrTy(FI.getContext())));
     return EraseInstFromFunction(FI);
   }
   
@@ -1261,7 +1261,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
       case Intrinsic::sadd_with_overflow:
         if (*EV.idx_begin() == 0) {  // Normal result.
           Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1);
-          II->replaceAllUsesWith(UndefValue::get(II->getType()));
+          ReplaceInstUsesWith(*II, UndefValue::get(II->getType()));
           EraseInstFromFunction(*II);
           return BinaryOperator::CreateAdd(LHS, RHS);
         }
@@ -1278,7 +1278,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
       case Intrinsic::ssub_with_overflow:
         if (*EV.idx_begin() == 0) {  // Normal result.
           Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1);
-          II->replaceAllUsesWith(UndefValue::get(II->getType()));
+          ReplaceInstUsesWith(*II, UndefValue::get(II->getType()));
           EraseInstFromFunction(*II);
           return BinaryOperator::CreateSub(LHS, RHS);
         }
@@ -1287,7 +1287,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
       case Intrinsic::smul_with_overflow:
         if (*EV.idx_begin() == 0) {  // Normal result.
           Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1);
-          II->replaceAllUsesWith(UndefValue::get(II->getType()));
+          ReplaceInstUsesWith(*II, UndefValue::get(II->getType()));
           EraseInstFromFunction(*II);
           return BinaryOperator::CreateMul(LHS, RHS);
         }
@@ -1385,8 +1385,8 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB,
   Worklist.push_back(BB);
 
   SmallVector<Instruction*, 128> InstrsForInstCombineWorklist;
-  SmallPtrSet<ConstantExpr*, 64> FoldedConstants;
-  
+  DenseMap<ConstantExpr*, Constant*> FoldedConstants;
+
   do {
     BB = Worklist.pop_back_val();
     
@@ -1421,14 +1421,15 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB,
              i != e; ++i) {
           ConstantExpr *CE = dyn_cast<ConstantExpr>(i);
           if (CE == 0) continue;
-          
-          // If we already folded this constant, don't try again.
-          if (!FoldedConstants.insert(CE))
-            continue;
-          
-          Constant *NewC = ConstantFoldConstantExpression(CE, TD);
-          if (NewC && NewC != CE) {
-            *i = NewC;
+
+          Constant*& FoldRes = FoldedConstants[CE];
+          if (!FoldRes)
+            FoldRes = ConstantFoldConstantExpression(CE, TD);
+          if (!FoldRes)
+            FoldRes = CE;
+
+          if (FoldRes != CE) {
+            *i = FoldRes;
             MadeIRChange = true;
           }
         }
@@ -1575,6 +1576,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
 
     // Now that we have an instruction, try combining it to simplify it.
     Builder->SetInsertPoint(I->getParent(), I);
+    Builder->SetCurrentDebugLocation(I->getDebugLoc());
     
 #ifndef NDEBUG
     std::string OrigI;
@@ -1589,7 +1591,8 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
         DEBUG(errs() << "IC: Old = " << *I << '\n'
                      << "    New = " << *Result << '\n');
 
-        Result->setDebugLoc(I->getDebugLoc());
+        if (!I->getDebugLoc().isUnknown())
+          Result->setDebugLoc(I->getDebugLoc());
         // Everything uses the new instruction now.
         I->replaceAllUsesWith(Result);
 
diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index 2425342..b902213 100644
--- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -40,15 +40,15 @@ using namespace llvm;
 
 namespace {
   class GCOVProfiler : public ModulePass {
-    bool runOnModule(Module &M);
   public:
     static char ID;
     GCOVProfiler()
-        : ModulePass(ID), EmitNotes(true), EmitData(true) {
+        : ModulePass(ID), EmitNotes(true), EmitData(true), Use402Format(false) {
       initializeGCOVProfilerPass(*PassRegistry::getPassRegistry());
     }
-    GCOVProfiler(bool EmitNotes, bool EmitData)
-        : ModulePass(ID), EmitNotes(EmitNotes), EmitData(EmitData) {
+    GCOVProfiler(bool EmitNotes, bool EmitData, bool use402Format = false)
+        : ModulePass(ID), EmitNotes(EmitNotes), EmitData(EmitData),
+          Use402Format(use402Format) {
       assert((EmitNotes || EmitData) && "GCOVProfiler asked to do nothing?");
       initializeGCOVProfilerPass(*PassRegistry::getPassRegistry());
     }
@@ -57,6 +57,8 @@ namespace {
     }
 
   private:
+    bool runOnModule(Module &M);
+
     // Create the GCNO files for the Module based on DebugInfo.
     void emitGCNO(DebugInfoFinder &DIF);
 
@@ -86,10 +88,13 @@ namespace {
     // list.
     void insertCounterWriteout(DebugInfoFinder &,
                                SmallVector<std::pair<GlobalVariable *,
-                                                     uint32_t>, 8> &);
+                                                     MDNode *>, 8> &);
+
+    std::string mangleName(DICompileUnit CU, std::string NewStem);
 
     bool EmitNotes;
     bool EmitData;
+    bool Use402Format;
 
     Module *M;
     LLVMContext *Ctx;
@@ -100,8 +105,9 @@ char GCOVProfiler::ID = 0;
 INITIALIZE_PASS(GCOVProfiler, "insert-gcov-profiling",
                 "Insert instrumentation for GCOV profiling", false, false)
 
-ModulePass *llvm::createGCOVProfilerPass(bool EmitNotes, bool EmitData) {
-  return new GCOVProfiler(EmitNotes, EmitData);
+ModulePass *llvm::createGCOVProfilerPass(bool EmitNotes, bool EmitData,
+                                         bool Use402Format) {
+  return new GCOVProfiler(EmitNotes, EmitData, Use402Format);
 }
 
 static DISubprogram findSubprogram(DIScope Scope) {
@@ -137,7 +143,7 @@ namespace {
       // A GCOV string is a length, followed by a NUL, then between 0 and 3 NULs
       // padding out to the next 4-byte word. The length is measured in 4-byte
       // words including padding, not bytes of actual string.
-      return (s.size() + 5) / 4;
+      return (s.size() / 4) + 1;
     }
 
     void writeGCOVString(StringRef s) {
@@ -247,7 +253,7 @@ namespace {
   // object users can construct, the blocks and lines will be rooted here.
   class GCOVFunction : public GCOVRecord {
    public:
-    GCOVFunction(DISubprogram SP, raw_ostream *os) {
+    GCOVFunction(DISubprogram SP, raw_ostream *os, bool Use402Format) {
       this->os = os;
 
       Function *F = SP.getFunction();
@@ -260,10 +266,14 @@ namespace {
       writeBytes(FunctionTag, 4);
       uint32_t BlockLen = 1 + 1 + 1 + lengthOfGCOVString(SP.getName()) +
           1 + lengthOfGCOVString(SP.getFilename()) + 1;
+      if (!Use402Format)
+        ++BlockLen; // For second checksum.
       write(BlockLen);
       uint32_t Ident = reinterpret_cast<intptr_t>((MDNode*)SP);
       write(Ident);
-      write(0);  // checksum
+      write(0);  // checksum #1
+      if (!Use402Format)
+        write(0);  // checksum #2
       writeGCOVString(SP.getName());
       writeGCOVString(SP.getFilename());
       write(SP.getLineNumber());
@@ -318,9 +328,25 @@ namespace {
   };
 }
 
-// Replace the stem of a file, or add one if missing.
-static std::string replaceStem(std::string OrigFilename, std::string NewStem) {
-  return (sys::path::stem(OrigFilename) + "." + NewStem).str();
+std::string GCOVProfiler::mangleName(DICompileUnit CU, std::string NewStem) {
+  if (NamedMDNode *GCov = M->getNamedMetadata("llvm.gcov")) {
+    for (int i = 0, e = GCov->getNumOperands(); i != e; ++i) {
+      MDNode *N = GCov->getOperand(i);
+      if (N->getNumOperands() != 2) continue;
+      MDString *GCovFile = dyn_cast<MDString>(N->getOperand(0));
+      MDNode *CompileUnit = dyn_cast<MDNode>(N->getOperand(1));
+      if (!GCovFile || !CompileUnit) continue;
+      if (CompileUnit == CU) {
+        SmallString<128> Filename = GCovFile->getString();
+        sys::path::replace_extension(Filename, NewStem);
+        return Filename.str();
+      }
+    }
+  }
+
+  SmallString<128> Filename = CU.getFilename();
+  sys::path::replace_extension(Filename, NewStem);
+  return sys::path::filename(Filename.str());
 }
 
 bool GCOVProfiler::runOnModule(Module &M) {
@@ -346,9 +372,12 @@ void GCOVProfiler::emitGCNO(DebugInfoFinder &DIF) {
     DICompileUnit CU(*I);
     raw_fd_ostream *&out = GcnoFiles[CU];
     std::string ErrorInfo;
-    out = new raw_fd_ostream(replaceStem(CU.getFilename(), "gcno").c_str(),
-                             ErrorInfo, raw_fd_ostream::F_Binary);
-    out->write("oncg*404MVLL", 12);
+    out = new raw_fd_ostream(mangleName(CU, "gcno").c_str(), ErrorInfo,
+                             raw_fd_ostream::F_Binary);
+    if (!Use402Format)
+      out->write("oncg*404MVLL", 12);
+    else
+      out->write("oncg*402MVLL", 12);
   }
 
   for (DebugInfoFinder::iterator SPI = DIF.subprogram_begin(),
@@ -358,7 +387,7 @@ void GCOVProfiler::emitGCNO(DebugInfoFinder &DIF) {
 
     Function *F = SP.getFunction();
     if (!F) continue;
-    GCOVFunction Func(SP, os);
+    GCOVFunction Func(SP, os, Use402Format);
 
     for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
       GCOVBlock &Block = Func.getBlock(BB);
@@ -399,7 +428,7 @@ bool GCOVProfiler::emitProfileArcs(DebugInfoFinder &DIF) {
   if (DIF.subprogram_begin() == DIF.subprogram_end())
     return false;
 
-  SmallVector<std::pair<GlobalVariable *, uint32_t>, 8> CountersByIdent;
+  SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> CountersBySP;
   for (DebugInfoFinder::iterator SPI = DIF.subprogram_begin(),
            SPE = DIF.subprogram_end(); SPI != SPE; ++SPI) {
     DISubprogram SP(*SPI);
@@ -422,8 +451,7 @@ bool GCOVProfiler::emitProfileArcs(DebugInfoFinder &DIF) {
                            GlobalValue::InternalLinkage,
                            Constant::getNullValue(CounterTy),
                            "__llvm_gcov_ctr", 0, false, 0);
-    CountersByIdent.push_back(
-        std::make_pair(Counters, reinterpret_cast<intptr_t>((MDNode*)SP)));
+    CountersBySP.push_back(std::make_pair(Counters, (MDNode*)SP));
 
     UniqueVector<BasicBlock *> ComplexEdgePreds;
     UniqueVector<BasicBlock *> ComplexEdgeSuccs;
@@ -490,7 +518,7 @@ bool GCOVProfiler::emitProfileArcs(DebugInfoFinder &DIF) {
     }
   }
 
-  insertCounterWriteout(DIF, CountersByIdent);
+  insertCounterWriteout(DIF, CountersBySP);
 
   return true;
 }
@@ -561,7 +589,10 @@ Constant *GCOVProfiler::getIncrementIndirectCounterFunc() {
 }
 
 Constant *GCOVProfiler::getEmitFunctionFunc() {
-  const Type *Args[] = { Type::getInt32Ty(*Ctx) };
+  const Type *Args[2] = {
+    Type::getInt32Ty(*Ctx),    // uint32_t ident
+    Type::getInt8PtrTy(*Ctx),  // const char *function_name
+  };
   const FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx),
                                               Args, false);
   return M->getOrInsertFunction("llvm_gcda_emit_function", FTy);
@@ -597,7 +628,7 @@ GlobalVariable *GCOVProfiler::getEdgeStateValue() {
 
 void GCOVProfiler::insertCounterWriteout(
     DebugInfoFinder &DIF,
-    SmallVector<std::pair<GlobalVariable *, uint32_t>, 8> &CountersByIdent) {
+    SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> &CountersBySP) {
   const FunctionType *WriteoutFTy =
       FunctionType::get(Type::getVoidTy(*Ctx), false);
   Function *WriteoutF = Function::Create(WriteoutFTy,
@@ -615,14 +646,18 @@ void GCOVProfiler::insertCounterWriteout(
   for (DebugInfoFinder::iterator CUI = DIF.compile_unit_begin(),
            CUE = DIF.compile_unit_end(); CUI != CUE; ++CUI) {
     DICompileUnit compile_unit(*CUI);
-    std::string FilenameGcda = replaceStem(compile_unit.getFilename(), "gcda");
+    std::string FilenameGcda = mangleName(compile_unit, "gcda");
     Builder.CreateCall(StartFile,
                        Builder.CreateGlobalStringPtr(FilenameGcda));
-    for (SmallVector<std::pair<GlobalVariable *, uint32_t>, 8>::iterator
-             I = CountersByIdent.begin(), E = CountersByIdent.end();
+    for (SmallVector<std::pair<GlobalVariable *, MDNode *>, 8>::iterator
+             I = CountersBySP.begin(), E = CountersBySP.end();
          I != E; ++I) {
-      Builder.CreateCall(EmitFunction, ConstantInt::get(Type::getInt32Ty(*Ctx),
-                                                        I->second));
+      DISubprogram SP(I->second);
+      intptr_t ident = reinterpret_cast<intptr_t>(I->second);
+      Builder.CreateCall2(EmitFunction,
+                          ConstantInt::get(Type::getInt32Ty(*Ctx), ident),
+                          Builder.CreateGlobalStringPtr(SP.getName()));
+                                                        
       GlobalVariable *GV = I->first;
       unsigned Arcs =
           cast<ArrayType>(GV->getType()->getElementType())->getNumElements();
diff --git a/lib/Transforms/Instrumentation/PathProfiling.cpp b/lib/Transforms/Instrumentation/PathProfiling.cpp
index 6b3f12d..182a43d 100644
--- a/lib/Transforms/Instrumentation/PathProfiling.cpp
+++ b/lib/Transforms/Instrumentation/PathProfiling.cpp
@@ -1351,8 +1351,6 @@ bool PathProfiler::runOnModule(Module &M) {
     return false;
   }
 
-  BasicBlock::iterator insertPoint = Main->getEntryBlock().getFirstNonPHI();
-
   llvmIncrementHashFunction = M.getOrInsertFunction(
     "llvm_increment_path_count",
     Type::getVoidTy(*Context), // return type
diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp
index 0184390..0af14ed 100644
--- a/lib/Transforms/Scalar/CodeGenPrepare.cpp
+++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp
@@ -147,7 +147,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
   if (!DisableBranchOpts) {
     MadeChange = false;
     for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
-      MadeChange |= ConstantFoldTerminator(BB);
+      MadeChange |= ConstantFoldTerminator(BB, true);
 
     if (MadeChange)
       ModifiedDT = true;
@@ -371,9 +371,11 @@ static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI){
   // If these values will be promoted, find out what they will be promoted
   // to.  This helps us consider truncates on PPC as noop copies when they
   // are.
-  if (TLI.getTypeAction(SrcVT) == TargetLowering::Promote)
+  if (TLI.getTypeAction(CI->getContext(), SrcVT) ==
+      TargetLowering::TypePromoteInteger)
     SrcVT = TLI.getTypeToTransformTo(CI->getContext(), SrcVT);
-  if (TLI.getTypeAction(DstVT) == TargetLowering::Promote)
+  if (TLI.getTypeAction(CI->getContext(), DstVT) ==
+      TargetLowering::TypePromoteInteger)
     DstVT = TLI.getTypeToTransformTo(CI->getContext(), DstVT);
 
   // If, after promotion, these are the same types, this is a noop copy.
@@ -548,7 +550,23 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) {
 
   // From here on out we're working with named functions.
   if (CI->getCalledFunction() == 0) return false;
-  
+
+  // llvm.dbg.value is far away from the value then iSel may not be able
+  // handle it properly. iSel will drop llvm.dbg.value if it can not 
+  // find a node corresponding to the value.
+  if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(CI))
+    if (Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue()))
+      if (!VI->isTerminator() &&
+          (DVI->getParent() != VI->getParent() || DT->dominates(DVI, VI))) {
+        DEBUG(dbgs() << "Moving Debug Value before :\n" << *DVI << ' ' << *VI);
+        DVI->removeFromParent();
+        if (isa<PHINode>(VI))
+          DVI->insertBefore(VI->getParent()->getFirstNonPHI());
+        else
+          DVI->insertAfter(VI);
+        return true;
+      }
+
   // We'll need TargetData from here on out.
   const TargetData *TD = TLI ? TLI->getTargetData() : 0;
   if (!TD) return false;
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index efecb97..2515fd1 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -952,12 +952,12 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
       IntegerType::get(LoadTy->getContext(), NewLoadSize*8);
     DestPTy = PointerType::get(DestPTy, 
                        cast<PointerType>(PtrVal->getType())->getAddressSpace());
-    
+    Builder.SetCurrentDebugLocation(SrcVal->getDebugLoc());
     PtrVal = Builder.CreateBitCast(PtrVal, DestPTy);
     LoadInst *NewLoad = Builder.CreateLoad(PtrVal);
     NewLoad->takeName(SrcVal);
     NewLoad->setAlignment(SrcVal->getAlignment());
-    
+
     DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n");
     DEBUG(dbgs() << "TO: " << *NewLoad << "\n");
     
@@ -1576,6 +1576,9 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
     if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa))
       NewLoad->setMetadata(LLVMContext::MD_tbaa, Tag);
 
+    // Transfer DebugLoc.
+    NewLoad->setDebugLoc(LI->getDebugLoc());
+
     // Add the newly created load.
     ValuesPerBlock.push_back(AvailableValueInBlock::get(UnavailablePred,
                                                         NewLoad));
@@ -1604,6 +1607,11 @@ bool GVN::processLoad(LoadInst *L) {
   if (L->isVolatile())
     return false;
 
+  if (L->use_empty()) {
+    markInstructionForDeletion(L);
+    return true;
+  }
+  
   // ... to a pointer that has been loaded from before...
   MemDepResult Dep = MD->getDependency(L);
 
@@ -2099,6 +2107,7 @@ bool GVN::performPRE(Function &F) {
 
       PREInstr->insertBefore(PREPred->getTerminator());
       PREInstr->setName(CurInst->getName() + ".pre");
+      PREInstr->setDebugLoc(CurInst->getDebugLoc());
       predMap[PREPred] = PREInstr;
       VN.add(PREInstr, ValNo);
       ++NumGVNPRE;
@@ -2118,7 +2127,7 @@ bool GVN::performPRE(Function &F) {
 
       VN.add(Phi, ValNo);
       addToLeaderTable(ValNo, Phi, CurrentBlock);
-
+      Phi->setDebugLoc(CurInst->getDebugLoc());
       CurInst->replaceAllUsesWith(Phi);
       if (Phi->getType()->isPointerTy()) {
         // Because we have added a PHI-use of the pointer value, it has now
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index 09d569a..04ee7c8 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -52,20 +52,30 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Support/CFG.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Target/TargetData.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/STLExtras.h"
 using namespace llvm;
 
 STATISTIC(NumRemoved , "Number of aux indvars removed");
+STATISTIC(NumWidened , "Number of indvars widened");
 STATISTIC(NumInserted, "Number of canonical indvars added");
 STATISTIC(NumReplaced, "Number of exit values replaced");
 STATISTIC(NumLFTR    , "Number of loop exit tests replaced");
+STATISTIC(NumElimExt , "Number of IV sign/zero extends eliminated");
+STATISTIC(NumElimRem , "Number of IV remainder operations eliminated");
+STATISTIC(NumElimCmp , "Number of IV comparisons eliminated");
+
+// DisableIVRewrite mode currently affects IVUsers, so is defined in libAnalysis
+// and referenced here.
+namespace llvm {
+  extern bool DisableIVRewrite;
+}
 
 namespace {
   class IndVarSimplify : public LoopPass {
@@ -73,12 +83,13 @@ namespace {
     LoopInfo        *LI;
     ScalarEvolution *SE;
     DominatorTree   *DT;
+    TargetData      *TD;
     SmallVector<WeakVH, 16> DeadInsts;
     bool Changed;
   public:
 
     static char ID; // Pass identification, replacement for typeid
-    IndVarSimplify() : LoopPass(ID) {
+    IndVarSimplify() : LoopPass(ID), IU(0), LI(0), SE(0), DT(0), TD(0) {
       initializeIndVarSimplifyPass(*PassRegistry::getPassRegistry());
     }
 
@@ -101,15 +112,18 @@ namespace {
   private:
     bool isValidRewrite(Value *FromVal, Value *ToVal);
 
-    void EliminateIVComparisons();
-    void EliminateIVRemainders();
+    void SimplifyIVUsers(SCEVExpander &Rewriter);
+    void EliminateIVComparison(ICmpInst *ICmp, Value *IVOperand);
+    void EliminateIVRemainder(BinaryOperator *Rem,
+                              Value *IVOperand,
+                              bool IsSigned,
+                              PHINode *IVPhi);
     void RewriteNonIntegerIVs(Loop *L);
 
     ICmpInst *LinearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount,
-                                   PHINode *IndVar,
-                                   BasicBlock *ExitingBlock,
-                                   BranchInst *BI,
-                                   SCEVExpander &Rewriter);
+                                        PHINode *IndVar,
+                                        SCEVExpander &Rewriter);
+
     void RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter);
 
     void RewriteIVExpressions(Loop *L, SCEVExpander &Rewriter);
@@ -122,7 +136,7 @@ namespace {
 
 char IndVarSimplify::ID = 0;
 INITIALIZE_PASS_BEGIN(IndVarSimplify, "indvars",
-                "Canonicalize Induction Variables", false, false)
+                "Induction Variable Simplification", false, false)
 INITIALIZE_PASS_DEPENDENCY(DominatorTree)
 INITIALIZE_PASS_DEPENDENCY(LoopInfo)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
@@ -130,7 +144,7 @@ INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_DEPENDENCY(LCSSA)
 INITIALIZE_PASS_DEPENDENCY(IVUsers)
 INITIALIZE_PASS_END(IndVarSimplify, "indvars",
-                "Canonicalize Induction Variables", false, false)
+                "Induction Variable Simplification", false, false)
 
 Pass *llvm::createIndVarSimplifyPass() {
   return new IndVarSimplify();
@@ -183,17 +197,23 @@ bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) {
   return true;
 }
 
-/// LinearFunctionTestReplace - This method rewrites the exit condition of the
-/// loop to be a canonical != comparison against the incremented loop induction
-/// variable.  This pass is able to rewrite the exit tests of any loop where the
-/// SCEV analysis can determine a loop-invariant trip count of the loop, which
-/// is actually a much broader range than just linear tests.
-ICmpInst *IndVarSimplify::LinearFunctionTestReplace(Loop *L,
-                                   const SCEV *BackedgeTakenCount,
-                                   PHINode *IndVar,
-                                   BasicBlock *ExitingBlock,
-                                   BranchInst *BI,
-                                   SCEVExpander &Rewriter) {
+/// canExpandBackedgeTakenCount - Return true if this loop's backedge taken
+/// count expression can be safely and cheaply expanded into an instruction
+/// sequence that can be used by LinearFunctionTestReplace.
+static bool canExpandBackedgeTakenCount(Loop *L, ScalarEvolution *SE) {
+  const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount) ||
+      BackedgeTakenCount->isZero())
+    return false;
+
+  if (!L->getExitingBlock())
+    return false;
+
+  // Can't rewrite non-branch yet.
+  BranchInst *BI = dyn_cast<BranchInst>(L->getExitingBlock()->getTerminator());
+  if (!BI)
+    return false;
+
   // Special case: If the backedge-taken count is a UDiv, it's very likely a
   // UDiv that ScalarEvolution produced in order to compute a precise
   // expression, rather than a UDiv from the user's code. If we can't find a
@@ -201,23 +221,68 @@ ICmpInst *IndVarSimplify::LinearFunctionTestReplace(Loop *L,
   // rewriting the loop.
   if (isa<SCEVUDivExpr>(BackedgeTakenCount)) {
     ICmpInst *OrigCond = dyn_cast<ICmpInst>(BI->getCondition());
-    if (!OrigCond) return 0;
+    if (!OrigCond) return false;
     const SCEV *R = SE->getSCEV(OrigCond->getOperand(1));
     R = SE->getMinusSCEV(R, SE->getConstant(R->getType(), 1));
     if (R != BackedgeTakenCount) {
       const SCEV *L = SE->getSCEV(OrigCond->getOperand(0));
       L = SE->getMinusSCEV(L, SE->getConstant(L->getType(), 1));
       if (L != BackedgeTakenCount)
-        return 0;
+        return false;
     }
   }
+  return true;
+}
+
+/// getBackedgeIVType - Get the widest type used by the loop test after peeking
+/// through Truncs.
+///
+/// TODO: Unnecessary once LinearFunctionTestReplace is removed.
+static const Type *getBackedgeIVType(Loop *L) {
+  if (!L->getExitingBlock())
+    return 0;
+
+  // Can't rewrite non-branch yet.
+  BranchInst *BI = dyn_cast<BranchInst>(L->getExitingBlock()->getTerminator());
+  if (!BI)
+    return 0;
+
+  ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
+  if (!Cond)
+    return 0;
+
+  const Type *Ty = 0;
+  for(User::op_iterator OI = Cond->op_begin(), OE = Cond->op_end();
+      OI != OE; ++OI) {
+    assert((!Ty || Ty == (*OI)->getType()) && "bad icmp operand types");
+    TruncInst *Trunc = dyn_cast<TruncInst>(*OI);
+    if (!Trunc)
+      continue;
+
+    return Trunc->getSrcTy();
+  }
+  return Ty;
+}
+
+/// LinearFunctionTestReplace - This method rewrites the exit condition of the
+/// loop to be a canonical != comparison against the incremented loop induction
+/// variable.  This pass is able to rewrite the exit tests of any loop where the
+/// SCEV analysis can determine a loop-invariant trip count of the loop, which
+/// is actually a much broader range than just linear tests.
+ICmpInst *IndVarSimplify::
+LinearFunctionTestReplace(Loop *L,
+                          const SCEV *BackedgeTakenCount,
+                          PHINode *IndVar,
+                          SCEVExpander &Rewriter) {
+  assert(canExpandBackedgeTakenCount(L, SE) && "precondition");
+  BranchInst *BI = cast<BranchInst>(L->getExitingBlock()->getTerminator());
 
   // If the exiting block is not the same as the backedge block, we must compare
   // against the preincremented value, otherwise we prefer to compare against
   // the post-incremented value.
   Value *CmpIndVar;
   const SCEV *RHS = BackedgeTakenCount;
-  if (ExitingBlock == L->getLoopLatch()) {
+  if (L->getExitingBlock() == L->getLoopLatch()) {
     // Add one to the "backedge-taken" count to get the trip count.
     // If this addition may overflow, we have to be more pessimistic and
     // cast the induction variable before doing the add.
@@ -240,7 +305,7 @@ ICmpInst *IndVarSimplify::LinearFunctionTestReplace(Loop *L,
     // The BackedgeTaken expression contains the number of times that the
     // backedge branches to the loop header.  This is one less than the
     // number of times the loop executes, so use the incremented indvar.
-    CmpIndVar = IndVar->getIncomingValueForBlock(ExitingBlock);
+    CmpIndVar = IndVar->getIncomingValueForBlock(L->getExitingBlock());
   } else {
     // We have to use the preincremented value...
     RHS = SE->getTruncateOrZeroExtend(BackedgeTakenCount,
@@ -418,96 +483,519 @@ void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) {
     SE->forgetLoop(L);
 }
 
-void IndVarSimplify::EliminateIVComparisons() {
-  // Look for ICmp users.
-  for (IVUsers::iterator I = IU->begin(), E = IU->end(); I != E; ++I) {
-    IVStrideUse &UI = *I;
-    ICmpInst *ICmp = dyn_cast<ICmpInst>(UI.getUser());
-    if (!ICmp) continue;
-
-    bool Swapped = UI.getOperandValToReplace() == ICmp->getOperand(1);
-    ICmpInst::Predicate Pred = ICmp->getPredicate();
-    if (Swapped) Pred = ICmpInst::getSwappedPredicate(Pred);
-
-    // Get the SCEVs for the ICmp operands.
-    const SCEV *S = IU->getReplacementExpr(UI);
-    const SCEV *X = SE->getSCEV(ICmp->getOperand(!Swapped));
-
-    // Simplify unnecessary loops away.
-    const Loop *ICmpLoop = LI->getLoopFor(ICmp->getParent());
-    S = SE->getSCEVAtScope(S, ICmpLoop);
-    X = SE->getSCEVAtScope(X, ICmpLoop);
-
-    // If the condition is always true or always false, replace it with
-    // a constant value.
-    if (SE->isKnownPredicate(Pred, S, X))
-      ICmp->replaceAllUsesWith(ConstantInt::getTrue(ICmp->getContext()));
-    else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X))
-      ICmp->replaceAllUsesWith(ConstantInt::getFalse(ICmp->getContext()));
-    else
-      continue;
+namespace {
+  // Collect information about induction variables that are used by sign/zero
+  // extend operations. This information is recorded by CollectExtend and
+  // provides the input to WidenIV.
+  struct WideIVInfo {
+    const Type *WidestNativeType; // Widest integer type created [sz]ext
+    bool IsSigned;                // Was an sext user seen before a zext?
+
+    WideIVInfo() : WidestNativeType(0), IsSigned(false) {}
+  };
+  typedef std::map<PHINode *, WideIVInfo> WideIVMap;
+}
+
+/// CollectExtend - Update information about the induction variable that is
+/// extended by this sign or zero extend operation. This is used to determine
+/// the final width of the IV before actually widening it.
+static void CollectExtend(CastInst *Cast, PHINode *Phi, bool IsSigned,
+                          WideIVMap &IVMap, ScalarEvolution *SE,
+                          const TargetData *TD) {
+  const Type *Ty = Cast->getType();
+  uint64_t Width = SE->getTypeSizeInBits(Ty);
+  if (TD && !TD->isLegalInteger(Width))
+    return;
 
-    DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n');
-    DeadInsts.push_back(ICmp);
+  WideIVInfo &IVInfo = IVMap[Phi];
+  if (!IVInfo.WidestNativeType) {
+    IVInfo.WidestNativeType = SE->getEffectiveSCEVType(Ty);
+    IVInfo.IsSigned = IsSigned;
+    return;
   }
+
+  // We extend the IV to satisfy the sign of its first user, arbitrarily.
+  if (IVInfo.IsSigned != IsSigned)
+    return;
+
+  if (Width > SE->getTypeSizeInBits(IVInfo.WidestNativeType))
+    IVInfo.WidestNativeType = SE->getEffectiveSCEVType(Ty);
 }
 
-void IndVarSimplify::EliminateIVRemainders() {
-  // Look for SRem and URem users.
-  for (IVUsers::iterator I = IU->begin(), E = IU->end(); I != E; ++I) {
-    IVStrideUse &UI = *I;
-    BinaryOperator *Rem = dyn_cast<BinaryOperator>(UI.getUser());
-    if (!Rem) continue;
+namespace {
+/// WidenIV - The goal of this transform is to remove sign and zero extends
+/// without creating any new induction variables. To do this, it creates a new
+/// phi of the wider type and redirects all users, either removing extends or
+/// inserting truncs whenever we stop propagating the type.
+///
+class WidenIV {
+  PHINode *OrigPhi;
+  const Type *WideType;
+  bool IsSigned;
+
+  IVUsers *IU;
+  LoopInfo *LI;
+  Loop *L;
+  ScalarEvolution *SE;
+  DominatorTree *DT;
+  SmallVectorImpl<WeakVH> &DeadInsts;
+
+  PHINode *WidePhi;
+  Instruction *WideInc;
+  const SCEV *WideIncExpr;
+
+  SmallPtrSet<Instruction*,16> Processed;
+
+public:
+  WidenIV(PHINode *PN, const WideIVInfo &IVInfo, IVUsers *IUsers,
+          LoopInfo *LInfo, ScalarEvolution *SEv, DominatorTree *DTree,
+          SmallVectorImpl<WeakVH> &DI) :
+    OrigPhi(PN),
+    WideType(IVInfo.WidestNativeType),
+    IsSigned(IVInfo.IsSigned),
+    IU(IUsers),
+    LI(LInfo),
+    L(LI->getLoopFor(OrigPhi->getParent())),
+    SE(SEv),
+    DT(DTree),
+    DeadInsts(DI),
+    WidePhi(0),
+    WideInc(0),
+    WideIncExpr(0) {
+    assert(L->getHeader() == OrigPhi->getParent() && "Phi must be an IV");
+  }
 
-    bool isSigned = Rem->getOpcode() == Instruction::SRem;
-    if (!isSigned && Rem->getOpcode() != Instruction::URem)
-      continue;
+  bool CreateWideIV(SCEVExpander &Rewriter);
 
-    // We're only interested in the case where we know something about
-    // the numerator.
-    if (UI.getOperandValToReplace() != Rem->getOperand(0))
-      continue;
+protected:
+  Instruction *CloneIVUser(Instruction *NarrowUse,
+                           Instruction *NarrowDef,
+                           Instruction *WideDef);
 
-    // Get the SCEVs for the ICmp operands.
-    const SCEV *S = SE->getSCEV(Rem->getOperand(0));
-    const SCEV *X = SE->getSCEV(Rem->getOperand(1));
-
-    // Simplify unnecessary loops away.
-    const Loop *ICmpLoop = LI->getLoopFor(Rem->getParent());
-    S = SE->getSCEVAtScope(S, ICmpLoop);
-    X = SE->getSCEVAtScope(X, ICmpLoop);
-
-    // i % n  -->  i  if i is in [0,n).
-    if ((!isSigned || SE->isKnownNonNegative(S)) &&
-        SE->isKnownPredicate(isSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT,
-                             S, X))
-      Rem->replaceAllUsesWith(Rem->getOperand(0));
-    else {
-      // (i+1) % n  -->  (i+1)==n?0:(i+1)  if i is in [0,n).
-      const SCEV *LessOne =
-        SE->getMinusSCEV(S, SE->getConstant(S->getType(), 1));
-      if ((!isSigned || SE->isKnownNonNegative(LessOne)) &&
-          SE->isKnownPredicate(isSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT,
-                               LessOne, X)) {
-        ICmpInst *ICmp = new ICmpInst(Rem, ICmpInst::ICMP_EQ,
-                                      Rem->getOperand(0), Rem->getOperand(1),
-                                      "tmp");
-        SelectInst *Sel =
-          SelectInst::Create(ICmp,
-                             ConstantInt::get(Rem->getType(), 0),
-                             Rem->getOperand(0), "tmp", Rem);
-        Rem->replaceAllUsesWith(Sel);
-      } else
+  const SCEVAddRecExpr *GetWideRecurrence(Instruction *NarrowUse);
+
+  Instruction *WidenIVUse(Instruction *NarrowUse,
+                          Instruction *NarrowDef,
+                          Instruction *WideDef);
+};
+} // anonymous namespace
+
+/// SimplifyIVUsers - Iteratively perform simplification on IVUsers within this
+/// loop. IVUsers is treated as a worklist. Each successive simplification may
+/// push more users which may themselves be candidates for simplification.
+///
+void IndVarSimplify::SimplifyIVUsers(SCEVExpander &Rewriter) {
+  WideIVMap IVMap;
+
+  // Each round of simplification involves a round of eliminating operations
+  // followed by a round of widening IVs. A single IVUsers worklist is used
+  // across all rounds. The inner loop advances the user. If widening exposes
+  // more uses, then another pass through the outer loop is triggered.
+  for (IVUsers::iterator I = IU->begin(), E = IU->end(); I != E;) {
+    for(; I != E; ++I) {
+      Instruction *UseInst = I->getUser();
+      Value *IVOperand = I->getOperandValToReplace();
+
+      if (DisableIVRewrite) {
+        if (CastInst *Cast = dyn_cast<CastInst>(UseInst)) {
+          bool IsSigned = Cast->getOpcode() == Instruction::SExt;
+          if (IsSigned || Cast->getOpcode() == Instruction::ZExt) {
+            CollectExtend(Cast, I->getPhi(), IsSigned, IVMap, SE, TD);
+            continue;
+          }
+        }
+      }
+      if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
+        EliminateIVComparison(ICmp, IVOperand);
         continue;
+      }
+      if (BinaryOperator *Rem = dyn_cast<BinaryOperator>(UseInst)) {
+        bool IsSigned = Rem->getOpcode() == Instruction::SRem;
+        if (IsSigned || Rem->getOpcode() == Instruction::URem) {
+          EliminateIVRemainder(Rem, IVOperand, IsSigned, I->getPhi());
+          continue;
+        }
+      }
+    }
+    for (WideIVMap::const_iterator I = IVMap.begin(), E = IVMap.end();
+         I != E; ++I) {
+      WidenIV Widener(I->first, I->second, IU, LI, SE, DT, DeadInsts);
+      if (Widener.CreateWideIV(Rewriter))
+        Changed = true;
     }
+  }
+}
 
-    // Inform IVUsers about the new users.
-    if (Instruction *I = dyn_cast<Instruction>(Rem->getOperand(0)))
-      IU->AddUsersIfInteresting(I);
+static Value *getExtend( Value *NarrowOper, const Type *WideType,
+                               bool IsSigned, IRBuilder<> &Builder) {
+  return IsSigned ? Builder.CreateSExt(NarrowOper, WideType) :
+                    Builder.CreateZExt(NarrowOper, WideType);
+}
 
-    DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n');
-    DeadInsts.push_back(Rem);
+/// CloneIVUser - Instantiate a wide operation to replace a narrow
+/// operation. This only needs to handle operations that can evaluation to
+/// SCEVAddRec. It can safely return 0 for any operation we decide not to clone.
+Instruction *WidenIV::CloneIVUser(Instruction *NarrowUse,
+                                  Instruction *NarrowDef,
+                                  Instruction *WideDef) {
+  unsigned Opcode = NarrowUse->getOpcode();
+  switch (Opcode) {
+  default:
+    return 0;
+  case Instruction::Add:
+  case Instruction::Mul:
+  case Instruction::UDiv:
+  case Instruction::Sub:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    DEBUG(dbgs() << "Cloning IVUser: " << *NarrowUse << "\n");
+
+    IRBuilder<> Builder(NarrowUse);
+
+    // Replace NarrowDef operands with WideDef. Otherwise, we don't know
+    // anything about the narrow operand yet so must insert a [sz]ext. It is
+    // probably loop invariant and will be folded or hoisted. If it actually
+    // comes from a widened IV, it should be removed during a future call to
+    // WidenIVUse.
+    Value *LHS = (NarrowUse->getOperand(0) == NarrowDef) ? WideDef :
+      getExtend(NarrowUse->getOperand(0), WideType, IsSigned, Builder);
+    Value *RHS = (NarrowUse->getOperand(1) == NarrowDef) ? WideDef :
+      getExtend(NarrowUse->getOperand(1), WideType, IsSigned, Builder);
+
+    BinaryOperator *NarrowBO = cast<BinaryOperator>(NarrowUse);
+    BinaryOperator *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(),
+                                                    LHS, RHS,
+                                                    NarrowBO->getName());
+    Builder.Insert(WideBO);
+    if (NarrowBO->hasNoUnsignedWrap()) WideBO->setHasNoUnsignedWrap();
+    if (NarrowBO->hasNoSignedWrap()) WideBO->setHasNoSignedWrap();
+
+    return WideBO;
   }
+  llvm_unreachable(0);
+}
+
+// GetWideRecurrence - Is this instruction potentially interesting from IVUsers'
+// perspective after widening it's type? In other words, can the extend be
+// safely hoisted out of the loop with SCEV reducing the value to a recurrence
+// on the same loop. If so, return the sign or zero extended
+// recurrence. Otherwise return NULL.
+const SCEVAddRecExpr *WidenIV::GetWideRecurrence(Instruction *NarrowUse) {
+  if (!SE->isSCEVable(NarrowUse->getType()))
+    return 0;
+
+  const SCEV *NarrowExpr = SE->getSCEV(NarrowUse);
+  const SCEV *WideExpr = IsSigned ?
+    SE->getSignExtendExpr(NarrowExpr, WideType) :
+    SE->getZeroExtendExpr(NarrowExpr, WideType);
+  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(WideExpr);
+  if (!AddRec || AddRec->getLoop() != L)
+    return 0;
+
+  return AddRec;
+}
+
+/// HoistStep - Attempt to hoist an IV increment above a potential use.
+///
+/// To successfully hoist, two criteria must be met:
+/// - IncV operands dominate InsertPos and
+/// - InsertPos dominates IncV
+///
+/// Meeting the second condition means that we don't need to check all of IncV's
+/// existing uses (it's moving up in the domtree).
+///
+/// This does not yet recursively hoist the operands, although that would
+/// not be difficult.
+static bool HoistStep(Instruction *IncV, Instruction *InsertPos,
+                      const DominatorTree *DT)
+{
+  if (DT->dominates(IncV, InsertPos))
+    return true;
+
+  if (!DT->dominates(InsertPos->getParent(), IncV->getParent()))
+    return false;
+
+  if (IncV->mayHaveSideEffects())
+    return false;
+
+  // Attempt to hoist IncV
+  for (User::op_iterator OI = IncV->op_begin(), OE = IncV->op_end();
+       OI != OE; ++OI) {
+    Instruction *OInst = dyn_cast<Instruction>(OI);
+    if (OInst && !DT->dominates(OInst, InsertPos))
+      return false;
+  }
+  IncV->moveBefore(InsertPos);
+  return true;
+}
+
+/// WidenIVUse - Determine whether an individual user of the narrow IV can be
+/// widened. If so, return the wide clone of the user.
+Instruction *WidenIV::WidenIVUse(Instruction *NarrowUse,
+                                 Instruction *NarrowDef,
+                                 Instruction *WideDef) {
+  // To be consistent with IVUsers, stop traversing the def-use chain at
+  // inner-loop phis or post-loop phis.
+  if (isa<PHINode>(NarrowUse) && LI->getLoopFor(NarrowUse->getParent()) != L)
+    return 0;
+
+  // Handle data flow merges and bizarre phi cycles.
+  if (!Processed.insert(NarrowUse))
+    return 0;
+
+  // Our raison d'etre! Eliminate sign and zero extension.
+  if (IsSigned ? isa<SExtInst>(NarrowUse) : isa<ZExtInst>(NarrowUse)) {
+    Value *NewDef = WideDef;
+    if (NarrowUse->getType() != WideType) {
+      unsigned CastWidth = SE->getTypeSizeInBits(NarrowUse->getType());
+      unsigned IVWidth = SE->getTypeSizeInBits(WideType);
+      if (CastWidth < IVWidth) {
+        // The cast isn't as wide as the IV, so insert a Trunc.
+        IRBuilder<> Builder(NarrowUse);
+        NewDef = Builder.CreateTrunc(WideDef, NarrowUse->getType());
+      }
+      else {
+        // A wider extend was hidden behind a narrower one. This may induce
+        // another round of IV widening in which the intermediate IV becomes
+        // dead. It should be very rare.
+        DEBUG(dbgs() << "INDVARS: New IV " << *WidePhi
+              << " not wide enough to subsume " << *NarrowUse << "\n");
+        NarrowUse->replaceUsesOfWith(NarrowDef, WideDef);
+        NewDef = NarrowUse;
+      }
+    }
+    if (NewDef != NarrowUse) {
+      DEBUG(dbgs() << "INDVARS: eliminating " << *NarrowUse
+            << " replaced by " << *WideDef << "\n");
+      ++NumElimExt;
+      NarrowUse->replaceAllUsesWith(NewDef);
+      DeadInsts.push_back(NarrowUse);
+    }
+    // Now that the extend is gone, expose it's uses to IVUsers for potential
+    // further simplification within SimplifyIVUsers.
+    IU->AddUsersIfInteresting(WideDef, WidePhi);
+
+    // No further widening is needed. The deceased [sz]ext had done it for us.
+    return 0;
+  }
+  const SCEVAddRecExpr *WideAddRec = GetWideRecurrence(NarrowUse);
+  if (!WideAddRec) {
+    // This user does not evaluate to a recurence after widening, so don't
+    // follow it. Instead insert a Trunc to kill off the original use,
+    // eventually isolating the original narrow IV so it can be removed.
+    IRBuilder<> Builder(NarrowUse);
+    Value *Trunc = Builder.CreateTrunc(WideDef, NarrowDef->getType());
+    NarrowUse->replaceUsesOfWith(NarrowDef, Trunc);
+    return 0;
+  }
+  // Reuse the IV increment that SCEVExpander created as long as it dominates
+  // NarrowUse.
+  Instruction *WideUse = 0;
+  if (WideAddRec == WideIncExpr && HoistStep(WideInc, NarrowUse, DT)) {
+    WideUse = WideInc;
+  }
+  else {
+    WideUse = CloneIVUser(NarrowUse, NarrowDef, WideDef);
+    if (!WideUse)
+      return 0;
+  }
+  // GetWideRecurrence ensured that the narrow expression could be extended
+  // outside the loop without overflow. This suggests that the wide use
+  // evaluates to the same expression as the extended narrow use, but doesn't
+  // absolutely guarantee it. Hence the following failsafe check. In rare cases
+  // where it fails, we simple throw away the newly created wide use.
+  if (WideAddRec != SE->getSCEV(WideUse)) {
+    DEBUG(dbgs() << "Wide use expression mismatch: " << *WideUse
+          << ": " << *SE->getSCEV(WideUse) << " != " << *WideAddRec << "\n");
+    DeadInsts.push_back(WideUse);
+    return 0;
+  }
+
+  // Returning WideUse pushes it on the worklist.
+  return WideUse;
+}
+
+/// CreateWideIV - Process a single induction variable. First use the
+/// SCEVExpander to create a wide induction variable that evaluates to the same
+/// recurrence as the original narrow IV. Then use a worklist to forward
+/// traverse the narrow IV's def-use chain. After WidenIVUse as processed all
+/// interesting IV users, the narrow IV will be isolated for removal by
+/// DeleteDeadPHIs.
+///
+/// It would be simpler to delete uses as they are processed, but we must avoid
+/// invalidating SCEV expressions.
+///
+bool WidenIV::CreateWideIV(SCEVExpander &Rewriter) {
+  // Is this phi an induction variable?
+  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(OrigPhi));
+  if (!AddRec)
+    return false;
+
+  // Widen the induction variable expression.
+  const SCEV *WideIVExpr = IsSigned ?
+    SE->getSignExtendExpr(AddRec, WideType) :
+    SE->getZeroExtendExpr(AddRec, WideType);
+
+  assert(SE->getEffectiveSCEVType(WideIVExpr->getType()) == WideType &&
+         "Expect the new IV expression to preserve its type");
+
+  // Can the IV be extended outside the loop without overflow?
+  AddRec = dyn_cast<SCEVAddRecExpr>(WideIVExpr);
+  if (!AddRec || AddRec->getLoop() != L)
+    return false;
+
+  // An AddRec must have loop-invariant operands. Since this AddRec it
+  // materialized by a loop header phi, the expression cannot have any post-loop
+  // operands, so they must dominate the loop header.
+  assert(SE->properlyDominates(AddRec->getStart(), L->getHeader()) &&
+         SE->properlyDominates(AddRec->getStepRecurrence(*SE), L->getHeader())
+         && "Loop header phi recurrence inputs do not dominate the loop");
+
+  // The rewriter provides a value for the desired IV expression. This may
+  // either find an existing phi or materialize a new one. Either way, we
+  // expect a well-formed cyclic phi-with-increments. i.e. any operand not part
+  // of the phi-SCC dominates the loop entry.
+  Instruction *InsertPt = L->getHeader()->begin();
+  WidePhi = cast<PHINode>(Rewriter.expandCodeFor(AddRec, WideType, InsertPt));
+
+  // Remembering the WideIV increment generated by SCEVExpander allows
+  // WidenIVUse to reuse it when widening the narrow IV's increment. We don't
+  // employ a general reuse mechanism because the call above is the only call to
+  // SCEVExpander. Henceforth, we produce 1-to-1 narrow to wide uses.
+  if (BasicBlock *LatchBlock = L->getLoopLatch()) {
+    WideInc =
+      cast<Instruction>(WidePhi->getIncomingValueForBlock(LatchBlock));
+    WideIncExpr = SE->getSCEV(WideInc);
+  }
+
+  DEBUG(dbgs() << "Wide IV: " << *WidePhi << "\n");
+  ++NumWidened;
+
+  // Traverse the def-use chain using a worklist starting at the original IV.
+  assert(Processed.empty() && "expect initial state" );
+
+  // Each worklist entry has a Narrow def-use link and Wide def.
+  SmallVector<std::pair<Use *, Instruction *>, 8> NarrowIVUsers;
+  for (Value::use_iterator UI = OrigPhi->use_begin(),
+         UE = OrigPhi->use_end(); UI != UE; ++UI) {
+    NarrowIVUsers.push_back(std::make_pair(&UI.getUse(), WidePhi));
+  }
+  while (!NarrowIVUsers.empty()) {
+    Use *NarrowDefUse;
+    Instruction *WideDef;
+    tie(NarrowDefUse, WideDef) = NarrowIVUsers.pop_back_val();
+
+    // Process a def-use edge. This may replace the use, so don't hold a
+    // use_iterator across it.
+    Instruction *NarrowDef = cast<Instruction>(NarrowDefUse->get());
+    Instruction *NarrowUse = cast<Instruction>(NarrowDefUse->getUser());
+    Instruction *WideUse = WidenIVUse(NarrowUse, NarrowDef, WideDef);
+
+    // Follow all def-use edges from the previous narrow use.
+    if (WideUse) {
+      for (Value::use_iterator UI = NarrowUse->use_begin(),
+             UE = NarrowUse->use_end(); UI != UE; ++UI) {
+        NarrowIVUsers.push_back(std::make_pair(&UI.getUse(), WideUse));
+      }
+    }
+    // WidenIVUse may have removed the def-use edge.
+    if (NarrowDef->use_empty())
+      DeadInsts.push_back(NarrowDef);
+  }
+  return true;
+}
+
+void IndVarSimplify::EliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) {
+  unsigned IVOperIdx = 0;
+  ICmpInst::Predicate Pred = ICmp->getPredicate();
+  if (IVOperand != ICmp->getOperand(0)) {
+    // Swapped
+    assert(IVOperand == ICmp->getOperand(1) && "Can't find IVOperand");
+    IVOperIdx = 1;
+    Pred = ICmpInst::getSwappedPredicate(Pred);
+  }
+
+  // Get the SCEVs for the ICmp operands.
+  const SCEV *S = SE->getSCEV(ICmp->getOperand(IVOperIdx));
+  const SCEV *X = SE->getSCEV(ICmp->getOperand(1 - IVOperIdx));
+
+  // Simplify unnecessary loops away.
+  const Loop *ICmpLoop = LI->getLoopFor(ICmp->getParent());
+  S = SE->getSCEVAtScope(S, ICmpLoop);
+  X = SE->getSCEVAtScope(X, ICmpLoop);
+
+  // If the condition is always true or always false, replace it with
+  // a constant value.
+  if (SE->isKnownPredicate(Pred, S, X))
+    ICmp->replaceAllUsesWith(ConstantInt::getTrue(ICmp->getContext()));
+  else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X))
+    ICmp->replaceAllUsesWith(ConstantInt::getFalse(ICmp->getContext()));
+  else
+    return;
+
+  DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n');
+  ++NumElimCmp;
+  Changed = true;
+  DeadInsts.push_back(ICmp);
+}
+
+void IndVarSimplify::EliminateIVRemainder(BinaryOperator *Rem,
+                                          Value *IVOperand,
+                                          bool IsSigned,
+                                          PHINode *IVPhi) {
+  // We're only interested in the case where we know something about
+  // the numerator.
+  if (IVOperand != Rem->getOperand(0))
+    return;
+
+  // Get the SCEVs for the ICmp operands.
+  const SCEV *S = SE->getSCEV(Rem->getOperand(0));
+  const SCEV *X = SE->getSCEV(Rem->getOperand(1));
+
+  // Simplify unnecessary loops away.
+  const Loop *ICmpLoop = LI->getLoopFor(Rem->getParent());
+  S = SE->getSCEVAtScope(S, ICmpLoop);
+  X = SE->getSCEVAtScope(X, ICmpLoop);
+
+  // i % n  -->  i  if i is in [0,n).
+  if ((!IsSigned || SE->isKnownNonNegative(S)) &&
+      SE->isKnownPredicate(IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT,
+                           S, X))
+    Rem->replaceAllUsesWith(Rem->getOperand(0));
+  else {
+    // (i+1) % n  -->  (i+1)==n?0:(i+1)  if i is in [0,n).
+    const SCEV *LessOne =
+      SE->getMinusSCEV(S, SE->getConstant(S->getType(), 1));
+    if (IsSigned && !SE->isKnownNonNegative(LessOne))
+      return;
+
+    if (!SE->isKnownPredicate(IsSigned ?
+                              ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT,
+                              LessOne, X))
+      return;
+
+    ICmpInst *ICmp = new ICmpInst(Rem, ICmpInst::ICMP_EQ,
+                                  Rem->getOperand(0), Rem->getOperand(1),
+                                  "tmp");
+    SelectInst *Sel =
+      SelectInst::Create(ICmp,
+                         ConstantInt::get(Rem->getType(), 0),
+                         Rem->getOperand(0), "tmp", Rem);
+    Rem->replaceAllUsesWith(Sel);
+  }
+
+  // Inform IVUsers about the new users.
+  if (Instruction *I = dyn_cast<Instruction>(Rem->getOperand(0)))
+    IU->AddUsersIfInteresting(I, IVPhi);
+
+  DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n');
+  ++NumElimRem;
+  Changed = true;
+  DeadInsts.push_back(Rem);
 }
 
 bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
@@ -526,6 +1014,8 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
   LI = &getAnalysis<LoopInfo>();
   SE = &getAnalysis<ScalarEvolution>();
   DT = &getAnalysis<DominatorTree>();
+  TD = getAnalysisIfAvailable<TargetData>();
+
   DeadInsts.clear();
   Changed = false;
 
@@ -533,11 +1023,12 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
   // transform them to use integer recurrences.
   RewriteNonIntegerIVs(L);
 
-  BasicBlock *ExitingBlock = L->getExitingBlock(); // may be null
   const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
 
   // Create a rewriter object which we'll use to transform the code with.
   SCEVExpander Rewriter(*SE);
+  if (DisableIVRewrite)
+    Rewriter.disableCanonicalMode();
 
   // Check to see if this loop has a computable loop-invariant execution count.
   // If so, this means that we can compute the final value of any expressions
@@ -548,33 +1039,42 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
   if (!isa<SCEVCouldNotCompute>(BackedgeTakenCount))
     RewriteLoopExitValues(L, Rewriter);
 
-  // Simplify ICmp IV users.
-  EliminateIVComparisons();
-
-  // Simplify SRem and URem IV users.
-  EliminateIVRemainders();
+  // Eliminate redundant IV users.
+  SimplifyIVUsers(Rewriter);
 
   // Compute the type of the largest recurrence expression, and decide whether
   // a canonical induction variable should be inserted.
   const Type *LargestType = 0;
   bool NeedCannIV = false;
-  if (!isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
-    LargestType = BackedgeTakenCount->getType();
-    LargestType = SE->getEffectiveSCEVType(LargestType);
+  bool ExpandBECount = canExpandBackedgeTakenCount(L, SE);
+  if (ExpandBECount) {
     // If we have a known trip count and a single exit block, we'll be
     // rewriting the loop exit test condition below, which requires a
     // canonical induction variable.
-    if (ExitingBlock)
-      NeedCannIV = true;
-  }
-  for (IVUsers::const_iterator I = IU->begin(), E = IU->end(); I != E; ++I) {
-    const Type *Ty =
-      SE->getEffectiveSCEVType(I->getOperandValToReplace()->getType());
+    NeedCannIV = true;
+    const Type *Ty = BackedgeTakenCount->getType();
+    if (DisableIVRewrite) {
+      // In this mode, SimplifyIVUsers may have already widened the IV used by
+      // the backedge test and inserted a Trunc on the compare's operand. Get
+      // the wider type to avoid creating a redundant narrow IV only used by the
+      // loop test.
+      LargestType = getBackedgeIVType(L);
+    }
     if (!LargestType ||
         SE->getTypeSizeInBits(Ty) >
+        SE->getTypeSizeInBits(LargestType))
+      LargestType = SE->getEffectiveSCEVType(Ty);
+  }
+  if (!DisableIVRewrite) {
+    for (IVUsers::const_iterator I = IU->begin(), E = IU->end(); I != E; ++I) {
+      NeedCannIV = true;
+      const Type *Ty =
+        SE->getEffectiveSCEVType(I->getOperandValToReplace()->getType());
+      if (!LargestType ||
+          SE->getTypeSizeInBits(Ty) >
           SE->getTypeSizeInBits(LargestType))
-      LargestType = Ty;
-    NeedCannIV = true;
+        LargestType = Ty;
+    }
   }
 
   // Now that we know the largest of the induction variable expressions
@@ -614,19 +1114,17 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
   // If we have a trip count expression, rewrite the loop's exit condition
   // using it.  We can currently only handle loops with a single exit.
   ICmpInst *NewICmp = 0;
-  if (!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
-      !BackedgeTakenCount->isZero() &&
-      ExitingBlock) {
+  if (ExpandBECount) {
+    assert(canExpandBackedgeTakenCount(L, SE) &&
+           "canonical IV disrupted BackedgeTaken expansion");
     assert(NeedCannIV &&
            "LinearFunctionTestReplace requires a canonical induction variable");
-    // Can't rewrite non-branch yet.
-    if (BranchInst *BI = dyn_cast<BranchInst>(ExitingBlock->getTerminator()))
-      NewICmp = LinearFunctionTestReplace(L, BackedgeTakenCount, IndVar,
-                                          ExitingBlock, BI, Rewriter);
+    NewICmp = LinearFunctionTestReplace(L, BackedgeTakenCount, IndVar,
+                                        Rewriter);
   }
-
   // Rewrite IV-derived expressions.
-  RewriteIVExpressions(L, Rewriter);
+  if (!DisableIVRewrite)
+    RewriteIVExpressions(L, Rewriter);
 
   // Clear the rewriter cache, because values that are in the rewriter's cache
   // can be deleted in the loop below, causing the AssertingVH in the cache to
@@ -649,7 +1147,8 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
   // For completeness, inform IVUsers of the IV use in the newly-created
   // loop exit test instruction.
   if (NewICmp)
-    IU->AddUsersIfInteresting(cast<Instruction>(NewICmp->getOperand(0)));
+    IU->AddUsersIfInteresting(cast<Instruction>(NewICmp->getOperand(0)),
+                              IndVar);
 
   // Clean up dead instructions.
   Changed |= DeleteDeadPHIs(L->getHeader());
@@ -1080,5 +1579,5 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) {
   }
 
   // Add a new IVUsers entry for the newly-created integer PHI.
-  IU->AddUsersIfInteresting(NewPHI);
+  IU->AddUsersIfInteresting(NewPHI, NewPHI);
 }
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index 7168177..cf18ff0 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -706,7 +706,7 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
     DEBUG(dbgs() << "  In block '" << BB->getName()
           << "' folding terminator: " << *BB->getTerminator() << '\n');
     ++NumFolds;
-    ConstantFoldTerminator(BB);
+    ConstantFoldTerminator(BB, true);
     return true;
   }
 
@@ -929,9 +929,10 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
   if (UnavailablePred) {
     assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 &&
            "Can't handle critical edge here!");
-    Value *NewVal = new LoadInst(LoadedPtr, LI->getName()+".pr", false,
+    LoadInst *NewVal = new LoadInst(LoadedPtr, LI->getName()+".pr", false,
                                  LI->getAlignment(),
                                  UnavailablePred->getTerminator());
+    NewVal->setDebugLoc(LI->getDebugLoc());
     AvailablePreds.push_back(std::make_pair(UnavailablePred, NewVal));
   }
 
@@ -944,6 +945,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
   PHINode *PN = PHINode::Create(LI->getType(), std::distance(PB, PE), "",
                                 LoadBB->begin());
   PN->takeName(LI);
+  PN->setDebugLoc(LI->getDebugLoc());
 
   // Insert new entries into the PHI for each predecessor.  A single block may
   // have multiple entries here.
@@ -1375,7 +1377,8 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB,
 
   // We didn't copy the terminator from BB over to NewBB, because there is now
   // an unconditional jump to SuccBB.  Insert the unconditional jump.
-  BranchInst::Create(SuccBB, NewBB);
+  BranchInst *NewBI =BranchInst::Create(SuccBB, NewBB);
+  NewBI->setDebugLoc(BB->getTerminator()->getDebugLoc());
 
   // Check to see if SuccBB has PHI nodes. If so, we need to add entries to the
   // PHI nodes for NewBB now.
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 93de9cf..13bd022 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -372,7 +372,11 @@ bool LICM::canSinkOrHoistInst(Instruction &I) {
     return !pointerInvalidatedByLoop(LI->getOperand(0), Size,
                                      LI->getMetadata(LLVMContext::MD_tbaa));
   } else if (CallInst *CI = dyn_cast<CallInst>(&I)) {
-    // Handle obvious cases efficiently.
+    // Don't sink or hoist dbg info; it's legal, but not useful.
+    if (isa<DbgInfoIntrinsic>(I))
+      return false;
+
+    // Handle simple cases by querying alias analysis.
     AliasAnalysis::ModRefBehavior Behavior = AA->getModRefBehavior(CI);
     if (Behavior == AliasAnalysis::DoesNotAccessMemory)
       return true;
@@ -445,8 +449,7 @@ void LICM::sink(Instruction &I) {
   // enough that we handle it as a special (more efficient) case.  It is more
   // efficient to handle because there are no PHI nodes that need to be placed.
   if (ExitBlocks.size() == 1) {
-    if (!isa<DbgInfoIntrinsic>(I) && 
-        !DT->dominates(I.getParent(), ExitBlocks[0])) {
+    if (!DT->dominates(I.getParent(), ExitBlocks[0])) {
       // Instruction is not used, just delete it.
       CurAST->deleteValue(&I);
       // If I has users in unreachable blocks, eliminate.
@@ -602,13 +605,15 @@ namespace {
     SmallPtrSet<Value*, 4> &PointerMustAliases;
     SmallVectorImpl<BasicBlock*> &LoopExitBlocks;
     AliasSetTracker &AST;
+    DebugLoc DL;
   public:
     LoopPromoter(Value *SP,
                  const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S,
                  SmallPtrSet<Value*, 4> &PMA,
-                 SmallVectorImpl<BasicBlock*> &LEB, AliasSetTracker &ast)
-      : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA),
-        LoopExitBlocks(LEB), AST(ast) {}
+                 SmallVectorImpl<BasicBlock*> &LEB, AliasSetTracker &ast,
+                 DebugLoc dl)
+      : LoadAndStorePromoter(Insts, S, 0, 0), SomePtr(SP),
+        PointerMustAliases(PMA), LoopExitBlocks(LEB), AST(ast), DL(dl) {}
     
     virtual bool isInstInList(Instruction *I,
                               const SmallVectorImpl<Instruction*> &) const {
@@ -629,7 +634,8 @@ namespace {
         BasicBlock *ExitBlock = LoopExitBlocks[i];
         Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock);
         Instruction *InsertPos = ExitBlock->getFirstNonPHI();
-        new StoreInst(LiveInValue, SomePtr, InsertPos);
+        StoreInst *NewSI = new StoreInst(LiveInValue, SomePtr, InsertPos);
+        NewSI->setDebugLoc(DL);
       }
     }
 
@@ -727,6 +733,12 @@ void LICM::PromoteAliasSet(AliasSet &AS) {
   Changed = true;
   ++NumPromoted;
 
+  // Grab a debug location for the inserted loads/stores; given that the
+  // inserted loads/stores have little relation to the original loads/stores,
+  // this code just arbitrarily picks a location from one, since any debug
+  // location is better than none.
+  DebugLoc DL = LoopUses[0]->getDebugLoc();
+
   SmallVector<BasicBlock*, 8> ExitBlocks;
   CurLoop->getUniqueExitBlocks(ExitBlocks);
   
@@ -734,13 +746,14 @@ void LICM::PromoteAliasSet(AliasSet &AS) {
   SmallVector<PHINode*, 16> NewPHIs;
   SSAUpdater SSA(&NewPHIs);
   LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks,
-                        *CurAST);
+                        *CurAST, DL);
   
   // Set up the preheader to have a definition of the value.  It is the live-out
   // value from the preheader that uses in the loop will use.
   LoadInst *PreheaderLoad =
     new LoadInst(SomePtr, SomePtr->getName()+".promoted",
                  Preheader->getTerminator());
+  PreheaderLoad->setDebugLoc(DL);
   SSA.AddAvailableValue(Preheader, PreheaderLoad);
 
   // Rewrite all the loads in the loop and remember all the definitions from
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 1366231..dbf6eec 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -128,11 +128,11 @@ INITIALIZE_PASS_END(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms",
 
 Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognize(); }
 
-/// DeleteDeadInstruction - Delete this instruction.  Before we do, go through
+/// deleteDeadInstruction - Delete this instruction.  Before we do, go through
 /// and zero out all the operands of this instruction.  If any of them become
 /// dead, delete them and the computation tree that feeds them.
 ///
-static void DeleteDeadInstruction(Instruction *I, ScalarEvolution &SE) {
+static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE) {
   SmallVector<Instruction*, 32> NowDeadInsts;
 
   NowDeadInsts.push_back(I);
@@ -162,6 +162,14 @@ static void DeleteDeadInstruction(Instruction *I, ScalarEvolution &SE) {
   } while (!NowDeadInsts.empty());
 }
 
+/// deleteIfDeadInstruction - If the specified value is a dead instruction,
+/// delete it and any recursively used instructions.
+static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE) {
+  if (Instruction *I = dyn_cast<Instruction>(V))
+    if (isInstructionTriviallyDead(I))
+      deleteDeadInstruction(I, SE);    
+}
+
 bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
   CurLoop = L;
 
@@ -454,31 +462,35 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
     return false;
   }
 
-
-  // Okay, we have a strided store "p[i]" of a splattable value.  We can turn
-  // this into a memset in the loop preheader now if we want.  However, this
-  // would be unsafe to do if there is anything else in the loop that may read
-  // or write to the aliased location.  Check for an alias.
-  if (mayLoopAccessLocation(DestPtr, AliasAnalysis::ModRef,
-                            CurLoop, BECount,
-                            StoreSize, getAnalysis<AliasAnalysis>(), TheStore))
-    return false;
-
-  // Okay, everything looks good, insert the memset.
-  BasicBlock *Preheader = CurLoop->getLoopPreheader();
-
-  IRBuilder<> Builder(Preheader->getTerminator());
-
   // The trip count of the loop and the base pointer of the addrec SCEV is
   // guaranteed to be loop invariant, which means that it should dominate the
-  // header.  Just insert code for it in the preheader.
+  // header.  This allows us to insert code for it in the preheader.
+  BasicBlock *Preheader = CurLoop->getLoopPreheader();
+  IRBuilder<> Builder(Preheader->getTerminator());
   SCEVExpander Expander(*SE);
-
+  
+  // Okay, we have a strided store "p[i]" of a splattable value.  We can turn
+  // this into a memset in the loop preheader now if we want.  However, this
+  // would be unsafe to do if there is anything else in the loop that may read
+  // or write to the aliased location.  Check for any overlap by generating the
+  // base pointer and checking the region.
   unsigned AddrSpace = cast<PointerType>(DestPtr->getType())->getAddressSpace();
   Value *BasePtr =
     Expander.expandCodeFor(Ev->getStart(), Builder.getInt8PtrTy(AddrSpace),
                            Preheader->getTerminator());
 
+
+  if (mayLoopAccessLocation(BasePtr, AliasAnalysis::ModRef,
+                            CurLoop, BECount,
+                            StoreSize, getAnalysis<AliasAnalysis>(), TheStore)){
+    Expander.clear();
+    // If we generated new code for the base pointer, clean up.
+    deleteIfDeadInstruction(BasePtr, *SE);
+    return false;
+  }
+  
+  // Okay, everything looks good, insert the memset.
+
   // The # stored bytes is (BECount+1)*Size.  Expand the trip count out to
   // pointer size if it isn't already.
   const Type *IntPtr = TD->getIntPtrType(DestPtr->getContext());
@@ -521,7 +533,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
 
   // Okay, the memset has been formed.  Zap the original store and anything that
   // feeds into it.
-  DeleteDeadInstruction(TheStore, *SE);
+  deleteDeadInstruction(TheStore, *SE);
   ++NumMemSet;
   return true;
 }
@@ -539,41 +551,51 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
 
   LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
 
+  // The trip count of the loop and the base pointer of the addrec SCEV is
+  // guaranteed to be loop invariant, which means that it should dominate the
+  // header.  This allows us to insert code for it in the preheader.
+  BasicBlock *Preheader = CurLoop->getLoopPreheader();
+  IRBuilder<> Builder(Preheader->getTerminator());
+  SCEVExpander Expander(*SE);
+  
   // Okay, we have a strided store "p[i]" of a loaded value.  We can turn
   // this into a memcpy in the loop preheader now if we want.  However, this
   // would be unsafe to do if there is anything else in the loop that may read
-  // or write to the stored location (including the load feeding the stores).
-  // Check for an alias.
-  if (mayLoopAccessLocation(SI->getPointerOperand(), AliasAnalysis::ModRef,
+  // or write the memory region we're storing to.  This includes the load that
+  // feeds the stores.  Check for an alias by generating the base address and
+  // checking everything.
+  Value *StoreBasePtr =
+    Expander.expandCodeFor(StoreEv->getStart(),
+                           Builder.getInt8PtrTy(SI->getPointerAddressSpace()),
+                           Preheader->getTerminator());
+  
+  if (mayLoopAccessLocation(StoreBasePtr, AliasAnalysis::ModRef,
                             CurLoop, BECount, StoreSize,
-                            getAnalysis<AliasAnalysis>(), SI))
+                            getAnalysis<AliasAnalysis>(), SI)) {
+    Expander.clear();
+    // If we generated new code for the base pointer, clean up.
+    deleteIfDeadInstruction(StoreBasePtr, *SE);
     return false;
+  }
 
   // For a memcpy, we have to make sure that the input array is not being
   // mutated by the loop.
-  if (mayLoopAccessLocation(LI->getPointerOperand(), AliasAnalysis::Mod,
-                            CurLoop, BECount, StoreSize,
-                            getAnalysis<AliasAnalysis>(), SI))
-    return false;
-
-  // Okay, everything looks good, insert the memcpy.
-  BasicBlock *Preheader = CurLoop->getLoopPreheader();
-
-  IRBuilder<> Builder(Preheader->getTerminator());
-
-  // The trip count of the loop and the base pointer of the addrec SCEV is
-  // guaranteed to be loop invariant, which means that it should dominate the
-  // header.  Just insert code for it in the preheader.
-  SCEVExpander Expander(*SE);
-
   Value *LoadBasePtr =
     Expander.expandCodeFor(LoadEv->getStart(),
                            Builder.getInt8PtrTy(LI->getPointerAddressSpace()),
                            Preheader->getTerminator());
-  Value *StoreBasePtr =
-    Expander.expandCodeFor(StoreEv->getStart(),
-                           Builder.getInt8PtrTy(SI->getPointerAddressSpace()),
-                           Preheader->getTerminator());
+
+  if (mayLoopAccessLocation(LoadBasePtr, AliasAnalysis::Mod, CurLoop, BECount,
+                            StoreSize, getAnalysis<AliasAnalysis>(), SI)) {
+    Expander.clear();
+    // If we generated new code for the base pointer, clean up.
+    deleteIfDeadInstruction(LoadBasePtr, *SE);
+    deleteIfDeadInstruction(StoreBasePtr, *SE);
+    return false;
+  }
+  
+  // Okay, everything is safe, we can transform this!
+  
 
   // The # stored bytes is (BECount+1)*Size.  Expand the trip count out to
   // pointer size if it isn't already.
@@ -589,18 +611,19 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
   Value *NumBytes =
     Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator());
 
-  Value *NewCall =
+  CallInst *NewCall =
     Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes,
                          std::min(SI->getAlignment(), LI->getAlignment()));
+  NewCall->setDebugLoc(SI->getDebugLoc());
 
   DEBUG(dbgs() << "  Formed memcpy: " << *NewCall << "\n"
                << "    from load ptr=" << *LoadEv << " at: " << *LI << "\n"
                << "    from store ptr=" << *StoreEv << " at: " << *SI << "\n");
-  (void)NewCall;
+  
 
   // Okay, the memset has been formed.  Zap the original store and anything that
   // feeds into it.
-  DeleteDeadInstruction(SI, *SE);
+  deleteDeadInstruction(SI, *SE);
   ++NumMemCpy;
   return true;
 }
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 5abc790..73ebd61 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -209,7 +209,12 @@ struct Formula {
   /// when AM.Scale is not zero.
   const SCEV *ScaledReg;
 
-  Formula() : ScaledReg(0) {}
+  /// UnfoldedOffset - An additional constant offset which added near the
+  /// use. This requires a temporary register, but the offset itself can
+  /// live in an add immediate field rather than a register.
+  int64_t UnfoldedOffset;
+
+  Formula() : ScaledReg(0), UnfoldedOffset(0) {}
 
   void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
 
@@ -379,6 +384,10 @@ void Formula::print(raw_ostream &OS) const {
       OS << "<unknown>";
     OS << ')';
   }
+  if (UnfoldedOffset != 0) {
+    if (!First) OS << " + "; else First = false;
+    OS << "imm(" << UnfoldedOffset << ')';
+  }
 }
 
 void Formula::dump() const {
@@ -771,8 +780,10 @@ void Cost::RateFormula(const Formula &F,
     RatePrimaryRegister(BaseReg, Regs, L, SE, DT);
   }
 
-  if (F.BaseRegs.size() > 1)
-    NumBaseAdds += F.BaseRegs.size() - 1;
+  // Determine how many (unfolded) adds we'll need inside the loop.
+  size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0);
+  if (NumBaseParts > 1)
+    NumBaseAdds += NumBaseParts - 1;
 
   // Tally up the non-zero immediates.
   for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
@@ -1793,7 +1804,8 @@ LSRInstance::OptimizeLoopTermCond() {
         ExitingBlock->getInstList().insert(TermBr, Cond);
 
         // Clone the IVUse, as the old use still exists!
-        CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
+        CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace(),
+                              CondUse->getPhi());
         TermBr->replaceUsesOfWith(OldCond, Cond);
       }
     }
@@ -1945,7 +1957,8 @@ LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
         if (F.BaseRegs == OrigF.BaseRegs &&
             F.ScaledReg == OrigF.ScaledReg &&
             F.AM.BaseGV == OrigF.AM.BaseGV &&
-            F.AM.Scale == OrigF.AM.Scale) {
+            F.AM.Scale == OrigF.AM.Scale &&
+            F.UnfoldedOffset == OrigF.UnfoldedOffset) {
           if (F.AM.BaseOffs == 0)
             return &LU;
           // This is the formula where all the registers and symbols matched;
@@ -2061,6 +2074,10 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
         // x == y  -->  x - y == 0
         const SCEV *N = SE.getSCEV(NV);
         if (SE.isLoopInvariant(N, L)) {
+          // S is normalized, so normalize N before folding it into S
+          // to keep the result normalized.
+          N = TransformForPostIncUse(Normalize, N, CI, 0,
+                                     LF.PostIncLoops, SE, DT);
           Kind = LSRUse::ICmpZero;
           S = SE.getMinusSCEV(N, S);
         }
@@ -2313,8 +2330,29 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
       if (InnerSum->isZero())
         continue;
       Formula F = Base;
-      F.BaseRegs[i] = InnerSum;
-      F.BaseRegs.push_back(*J);
+
+      // Add the remaining pieces of the add back into the new formula.
+      const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
+      if (TLI && InnerSumSC &&
+          SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
+          TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
+                                   InnerSumSC->getValue()->getZExtValue())) {
+        F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset +
+                           InnerSumSC->getValue()->getZExtValue();
+        F.BaseRegs.erase(F.BaseRegs.begin() + i);
+      } else
+        F.BaseRegs[i] = InnerSum;
+
+      // Add J as its own register, or an unfolded immediate.
+      const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
+      if (TLI && SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
+          TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
+                                   SC->getValue()->getZExtValue()))
+        F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset +
+                           SC->getValue()->getZExtValue();
+      else
+        F.BaseRegs.push_back(*J);
+
       if (InsertFormula(LU, LUIdx, F))
         // If that formula hadn't been seen before, recurse to find more like
         // it.
@@ -2482,6 +2520,15 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
         continue;
     }
 
+    // Check that multiplying with the unfolded offset doesn't overflow.
+    if (F.UnfoldedOffset != 0) {
+      if (F.UnfoldedOffset == INT64_MIN && Factor == -1)
+        continue;
+      F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor;
+      if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset)
+        continue;
+    }
+
     // If we make it here and it's legal, add it.
     (void)InsertFormula(LU, LUIdx, F);
   next:;
@@ -2664,7 +2711,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
       // other orig regs.
       ImmMapTy::const_iterator OtherImms[] = {
         Imms.begin(), prior(Imms.end()),
-        Imms.upper_bound((Imms.begin()->first + prior(Imms.end())->first) / 2)
+        Imms.lower_bound((Imms.begin()->first + prior(Imms.end())->first) / 2)
       };
       for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
         ImmMapTy::const_iterator M = OtherImms[i];
@@ -2738,8 +2785,13 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
           Formula NewF = F;
           NewF.AM.BaseOffs = (uint64_t)NewF.AM.BaseOffs + Imm;
           if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset,
-                          LU.Kind, LU.AccessTy, TLI))
-            continue;
+                          LU.Kind, LU.AccessTy, TLI)) {
+            if (!TLI ||
+                !TLI->isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
+              continue;
+            NewF = F;
+            NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm;
+          }
           NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
 
           // If the new formula has a constant in a register, and adding the
@@ -3488,6 +3540,14 @@ Value *LSRInstance::Expand(const LSRFixup &LF,
     }
   }
 
+  // Expand the unfolded offset portion.
+  int64_t UnfoldedOffset = F.UnfoldedOffset;
+  if (UnfoldedOffset != 0) {
+    // Just add the immediate values.
+    Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy,
+                                                       UnfoldedOffset)));
+  }
+
   // Emit instructions summing all the operands.
   const SCEV *FullS = Ops.empty() ?
                       SE.getConstant(IntTy, 0) :
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index b4e3d31..e05f29c 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -258,6 +258,7 @@ bool LoopUnswitch::processCurrentLoop() {
       if (LoopCond && SI->getNumCases() > 1) {
         // Find a value to unswitch on:
         // FIXME: this should chose the most expensive case!
+        // FIXME: scan for a case with a non-critical edge?
         Constant *UnswitchVal = SI->getCaseValue(1);
         // Do not process same value again and again.
         if (!UnswitchedVals.insert(UnswitchVal))
@@ -560,6 +561,8 @@ void LoopUnswitch::SplitExitEdges(Loop *L,
     BasicBlock *ExitBlock = ExitBlocks[i];
     SmallVector<BasicBlock *, 4> Preds(pred_begin(ExitBlock),
                                        pred_end(ExitBlock));
+    // Although SplitBlockPredecessors doesn't preserve loop-simplify in
+    // general, if we call it on all predecessors of all exits then it does.
     SplitBlockPredecessors(ExitBlock, Preds.data(), Preds.size(),
                            ".us-lcssa", this);
   }
@@ -786,8 +789,13 @@ void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB,
   // If this is the edge to the header block for a loop, remove the loop and
   // promote all subloops.
   if (Loop *BBLoop = LI->getLoopFor(BB)) {
-    if (BBLoop->getLoopLatch() == BB)
+    if (BBLoop->getLoopLatch() == BB) {
       RemoveLoopFromHierarchy(BBLoop);
+      if (currentLoop == BBLoop) {
+        currentLoop = 0;
+        redoLoop = false;
+      }
+    }
   }
 
   // Remove the block from the loop info, which removes it from any loops it
@@ -859,7 +867,6 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
   
   // FOLD boolean conditions (X|LIC), (X&LIC).  Fold conditional branches,
   // selects, switches.
-  std::vector<User*> Users(LIC->use_begin(), LIC->use_end());
   std::vector<Instruction*> Worklist;
   LLVMContext &Context = Val->getContext();
 
@@ -875,13 +882,14 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
       Replacement = ConstantInt::get(Type::getInt1Ty(Val->getContext()), 
                                      !cast<ConstantInt>(Val)->getZExtValue());
     
-    for (unsigned i = 0, e = Users.size(); i != e; ++i)
-      if (Instruction *U = cast<Instruction>(Users[i])) {
-        if (!L->contains(U))
-          continue;
-        U->replaceUsesOfWith(LIC, Replacement);
-        Worklist.push_back(U);
-      }
+    for (Value::use_iterator UI = LIC->use_begin(), E = LIC->use_end();
+         UI != E; ++UI) {
+      Instruction *U = dyn_cast<Instruction>(*UI);
+      if (!U || !L->contains(U))
+        continue;
+      U->replaceUsesOfWith(LIC, Replacement);
+      Worklist.push_back(U);
+    }
     SimplifyCode(Worklist, L);
     return;
   }
@@ -889,9 +897,10 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
   // Otherwise, we don't know the precise value of LIC, but we do know that it
   // is certainly NOT "Val".  As such, simplify any uses in the loop that we
   // can.  This case occurs when we unswitch switch statements.
-  for (unsigned i = 0, e = Users.size(); i != e; ++i) {
-    Instruction *U = cast<Instruction>(Users[i]);
-    if (!L->contains(U))
+  for (Value::use_iterator UI = LIC->use_begin(), E = LIC->use_end();
+       UI != E; ++UI) {
+    Instruction *U = dyn_cast<Instruction>(*UI);
+    if (!U || !L->contains(U))
       continue;
 
     Worklist.push_back(U);
@@ -909,13 +918,22 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
     // Found a dead case value.  Don't remove PHI nodes in the 
     // successor if they become single-entry, those PHI nodes may
     // be in the Users list.
-        
+
+    BasicBlock *Switch = SI->getParent();
+    BasicBlock *SISucc = SI->getSuccessor(DeadCase);
+    BasicBlock *Latch = L->getLoopLatch();
+    if (!SI->findCaseDest(SISucc)) continue;  // Edge is critical.
+    // If the DeadCase successor dominates the loop latch, then the
+    // transformation isn't safe since it will delete the sole predecessor edge
+    // to the latch.
+    if (Latch && DT->dominates(SISucc, Latch))
+      continue;
+
     // FIXME: This is a hack.  We need to keep the successor around
     // and hooked up so as to preserve the loop structure, because
     // trying to update it is complicated.  So instead we preserve the
     // loop structure and put the block on a dead code path.
-    BasicBlock *Switch = SI->getParent();
-    SplitEdge(Switch, SI->getSuccessor(DeadCase), this);
+    SplitEdge(Switch, SISucc, this);
     // Compute the successors instead of relying on the return value
     // of SplitEdge, since it may have split the switch successor
     // after PHI nodes.
diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index a3035cb..be5aa2e 100644
--- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -23,6 +23,7 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/MemoryDependenceAnalysis.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/GetElementPtrTypeIterator.h"
 #include "llvm/Support/IRBuilder.h"
@@ -459,7 +460,10 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,
           for (unsigned i = 0, e = Range.TheStores.size(); i != e; ++i)
             dbgs() << *Range.TheStores[i] << '\n';
           dbgs() << "With: " << *AMemSet << '\n');
-    
+
+    if (!Range.TheStores.empty())
+      AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());
+
     // Zap all the stores.
     for (SmallVector<Instruction*, 16>::const_iterator
          SI = Range.TheStores.begin(),
@@ -484,11 +488,28 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
   // a memcpy.
   if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) {
     if (!LI->isVolatile() && LI->hasOneUse()) {
-      MemDepResult dep = MD->getDependency(LI);
+      MemDepResult ldep = MD->getDependency(LI);
       CallInst *C = 0;
-      if (dep.isClobber() && !isa<MemCpyInst>(dep.getInst()))
-        C = dyn_cast<CallInst>(dep.getInst());
-      
+      if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst()))
+        C = dyn_cast<CallInst>(ldep.getInst());
+
+      if (C) {
+        // Check that nothing touches the dest of the "copy" between
+        // the call and the store.
+        MemDepResult sdep = MD->getDependency(SI);
+        if (!sdep.isNonLocal()) {
+          bool FoundCall = false;
+          for (BasicBlock::iterator I = SI, E = sdep.getInst(); I != E; --I) {
+            if (&*I == C) {
+              FoundCall = true;
+              break;
+            }
+          }
+          if (!FoundCall)
+            C = 0;
+        }
+      }
+
       if (C) {
         bool changed = performCallSlotOptzn(LI,
                         SI->getPointerOperand()->stripPointerCasts(), 
@@ -863,12 +884,16 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) {
   if (C1 == 0 || C1->getValue().getZExtValue() < ByValSize)
     return false;
 
-  // Get the alignment of the byval.  If it is greater than the memcpy, then we
-  // can't do the substitution.  If the call doesn't specify the alignment, then
-  // it is some target specific value that we can't know.
+  // Get the alignment of the byval.  If the call doesn't specify the alignment,
+  // then it is some target specific value that we can't know.
   unsigned ByValAlign = CS.getParamAlignment(ArgNo+1);
-  if (ByValAlign == 0 || MDep->getAlignment() < ByValAlign)
-    return false;  
+  if (ByValAlign == 0) return false;
+  
+  // If it is greater than the memcpy, then we check to see if we can force the
+  // source of the memcpy to the alignment we need.  If we fail, we bail out.
+  if (MDep->getAlignment() < ByValAlign &&
+      getOrEnforceKnownAlignment(MDep->getSource(),ByValAlign, TD) < ByValAlign)
+    return false;
   
   // Verify that the copied-from memory doesn't change in between the memcpy and
   // the byval call.
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index db8eb85..083412e 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -655,7 +655,7 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) {
   
   // Just mark all destinations executable!
   // TODO: This could be improved if the operand is a [cast of a] BlockAddress.
-  if (isa<IndirectBrInst>(&TI))
+  if (isa<IndirectBrInst>(TI))
     return true;
   
 #ifndef NDEBUG
diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
index 8178c27..8938b28 100644
--- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp
+++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
@@ -30,6 +30,7 @@
 #include "llvm/LLVMContext.h"
 #include "llvm/Module.h"
 #include "llvm/Pass.h"
+#include "llvm/Analysis/DIBuilder.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -341,7 +342,8 @@ void ConvertToScalarInfo::MergeInType(const Type *In, uint64_t Offset,
     // If we're accessing something that could be an element of a vector, see
     // if the implied vector agrees with what we already have and if Offset is
     // compatible with it.
-    if (Offset % EltSize == 0 && AllocaSize % EltSize == 0) {
+    if (Offset % EltSize == 0 && AllocaSize % EltSize == 0 &&
+        (!VectorTy || Offset * 8 < VectorTy->getPrimitiveSizeInBits())) {
       if (!VectorTy) {
         VectorTy = VectorType::get(In, AllocaSize/EltSize);
         return;
@@ -741,8 +743,9 @@ ConvertScalar_ExtractValue(Value *FromVal, const Type *ToType,
   // If the result alloca is a vector type, this is either an element
   // access or a bitcast to another vector type of the same size.
   if (const VectorType *VTy = dyn_cast<VectorType>(FromType)) {
+    unsigned FromTypeSize = TD.getTypeAllocSize(FromType);
     unsigned ToTypeSize = TD.getTypeAllocSize(ToType);
-    if (ToTypeSize == AllocaSize) {
+    if (FromTypeSize == ToTypeSize) {
       // If the two types have the same primitive size, use a bit cast.
       // Otherwise, it is two vectors with the same element type that has
       // the same allocation size but different number of elements so use
@@ -754,13 +757,13 @@ ConvertScalar_ExtractValue(Value *FromVal, const Type *ToType,
         return CreateShuffleVectorCast(FromVal, ToType, Builder);
     }
 
-    if (isPowerOf2_64(AllocaSize / ToTypeSize)) {
+    if (isPowerOf2_64(FromTypeSize / ToTypeSize)) {
       assert(!(ToType->isVectorTy() && Offset != 0) && "Can't extract a value "
              "of a smaller vector type at a nonzero offset.");
 
       const Type *CastElementTy = getScaledElementType(FromType, ToType,
                                                        ToTypeSize * 8);
-      unsigned NumCastVectorElements = AllocaSize / ToTypeSize;
+      unsigned NumCastVectorElements = FromTypeSize / ToTypeSize;
 
       LLVMContext &Context = FromVal->getContext();
       const Type *CastTy = VectorType::get(CastElementTy,
@@ -1051,8 +1054,9 @@ namespace {
 class AllocaPromoter : public LoadAndStorePromoter {
   AllocaInst *AI;
 public:
-  AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S)
-    : LoadAndStorePromoter(Insts, S), AI(0) {}
+  AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S,
+                 DbgDeclareInst *DD, DIBuilder *&DB)
+    : LoadAndStorePromoter(Insts, S, DD, DB), AI(0) {}
   
   void run(AllocaInst *AI, const SmallVectorImpl<Instruction*> &Insts) {
     // Remember which alloca we're promoting (for isInstInList).
@@ -1329,7 +1333,6 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) {
   return true;
 }
 
-
 bool SROA::performPromotion(Function &F) {
   std::vector<AllocaInst*> Allocas;
   DominatorTree *DT = 0;
@@ -1340,6 +1343,7 @@ bool SROA::performPromotion(Function &F) {
 
   bool Changed = false;
   SmallVector<Instruction*, 64> Insts;
+  DIBuilder *DIB = 0;
   while (1) {
     Allocas.clear();
 
@@ -1363,8 +1367,11 @@ bool SROA::performPromotion(Function &F) {
         for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end();
              UI != E; ++UI)
           Insts.push_back(cast<Instruction>(*UI));
-        
-        AllocaPromoter(Insts, SSA).run(AI, Insts);
+
+        DbgDeclareInst *DDI = FindAllocaDbgDeclare(AI);
+        if (DDI && !DIB)
+          DIB = new DIBuilder(*AI->getParent()->getParent()->getParent());
+        AllocaPromoter(Insts, SSA, DDI, DIB).run(AI, Insts);
         Insts.clear();
       }
     }
@@ -1372,6 +1379,10 @@ bool SROA::performPromotion(Function &F) {
     Changed = true;
   }
 
+  // FIXME: Is there a better way to handle the lazy initialization of DIB
+  // so that there doesn't need to be an explicit delete?
+  delete DIB;
+
   return Changed;
 }
 
@@ -1831,9 +1842,10 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
         //   %insert = insertvalue { i32, i32 } %insert.0, i32 %load.1, 1
         // (Also works for arrays instead of structs)
         Value *Insert = UndefValue::get(LIType);
+        IRBuilder<> Builder(LI);
         for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
-          Value *Load = new LoadInst(NewElts[i], "load", LI);
-          Insert = InsertValueInst::Create(Insert, Load, i, "insert", LI);
+          Value *Load = Builder.CreateLoad(NewElts[i], "load");
+          Insert = Builder.CreateInsertValue(Insert, Load, i, "insert");
         }
         LI->replaceAllUsesWith(Insert);
         DeadInsts.push_back(LI);
@@ -1858,9 +1870,10 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
         //   %val.1 = extractvalue { i32, i32 } %val, 1
         //   store i32 %val.1, i32* %alloc.1
         // (Also works for arrays instead of structs)
+        IRBuilder<> Builder(SI);
         for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
-          Value *Extract = ExtractValueInst::Create(Val, i, Val->getName(), SI);
-          new StoreInst(Extract, NewElts[i], SI);
+          Value *Extract = Builder.CreateExtractValue(Val, i, Val->getName());
+          Builder.CreateStore(Extract, NewElts[i]);
         }
         DeadInsts.push_back(SI);
       } else if (SIType->isIntegerTy() &&
@@ -2481,19 +2494,22 @@ static bool isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy,
     }
 
     if (CallSite CS = U) {
-      // If this is a readonly/readnone call site, then we know it is just a
-      // load and we can ignore it.
-      if (CS.onlyReadsMemory())
-        continue;
-
       // If this is the function being called then we treat it like a load and
       // ignore it.
       if (CS.isCallee(UI))
         continue;
 
+      // If this is a readonly/readnone call site, then we know it is just a
+      // load (but one that potentially returns the value itself), so we can
+      // ignore it if we know that the value isn't captured.
+      unsigned ArgNo = CS.getArgumentNo(UI);
+      if (CS.onlyReadsMemory() &&
+          (CS.getInstruction()->use_empty() ||
+           CS.paramHasAttr(ArgNo+1, Attribute::NoCapture)))
+        continue;
+
       // If this is being passed as a byval argument, the caller is making a
       // copy, so it is only a read of the alloca.
-      unsigned ArgNo = CS.getArgumentNo(UI);
       if (CS.paramHasAttr(ArgNo+1, Attribute::ByVal))
         continue;
     }
diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 1137c2b..7e9cc80 100644
--- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -96,6 +96,7 @@ static void ChangeToCall(InvokeInst *II) {
   NewCall->takeName(II);
   NewCall->setCallingConv(II->getCallingConv());
   NewCall->setAttributes(II->getAttributes());
+  NewCall->setDebugLoc(II->getDebugLoc());
   II->replaceAllUsesWith(NewCall);
 
   // Follow the call by a branch to the normal destination.
@@ -163,7 +164,7 @@ static bool MarkAliveBlocks(BasicBlock *BB,
         Changed = true;
       }
 
-    Changed |= ConstantFoldTerminator(BB);
+    Changed |= ConstantFoldTerminator(BB, true);
     for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
       Worklist.push_back(*SI);
   } while (!Worklist.empty());
diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 539cc6f..e21eb9d 100644
--- a/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -59,6 +59,7 @@
 #include "llvm/Function.h"
 #include "llvm/Instructions.h"
 #include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
 #include "llvm/Pass.h"
 #include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/InlineCost.h"
@@ -209,10 +210,10 @@ bool TailCallElim::runOnFunction(Function &F) {
     }
   }
 
-  // Finally, if this function contains no non-escaping allocas, mark all calls
-  // in the function as eligible for tail calls (there is no stack memory for
-  // them to access).
-  if (!FunctionContainsEscapingAllocas)
+  // Finally, if this function contains no non-escaping allocas, or calls
+  // setjmp, mark all calls in the function as eligible for tail calls
+  //(there is no stack memory for them to access).
+  if (!FunctionContainsEscapingAllocas && !F.callsFunctionThatReturnsTwice())
     for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
       for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
         if (CallInst *CI = dyn_cast<CallInst>(I)) {
diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp
index c705cc5..92464e8 100644
--- a/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -542,11 +542,9 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
 /// GetFirstDebugLocInBasicBlock - Return first valid DebugLoc entry in a 
 /// given basic block.
 DebugLoc llvm::GetFirstDebugLocInBasicBlock(const BasicBlock *BB) {
-  for (BasicBlock::const_iterator BI = BB->begin(), BE = BB->end(); 
-       BI != BE; ++BI) {
-    DebugLoc DL = BI->getDebugLoc();
-    if (!DL.isUnknown())
-      return DL;
-  }
+  if (const Instruction *I = BB->getFirstNonPHI())
+    return I->getDebugLoc();
+  // Scanning entire block may be too expensive, if the first instruction
+  // does not have valid location info.
   return DebugLoc();
 }
diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp
index caf2aeb..d6206a3 100644
--- a/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -180,7 +180,8 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
   BasicBlock *NewBB = BasicBlock::Create(TI->getContext(),
                       TIBB->getName() + "." + DestBB->getName() + "_crit_edge");
   // Create our unconditional branch.
-  BranchInst::Create(DestBB, NewBB);
+  BranchInst *NewBI = BranchInst::Create(DestBB, NewBB);
+  NewBI->setDebugLoc(TI->getDebugLoc());
 
   // Branch to the new block, breaking the edge.
   TI->setSuccessor(SuccNum, NewBB);
diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp
index 4a90751..14bb17f 100644
--- a/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -362,12 +362,8 @@ bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const TargetData *TD) {
   Function *Callee = CI->getCalledFunction();
   StringRef Name = Callee->getName();
   const FunctionType *FT = Callee->getFunctionType();
-  BasicBlock *BB = CI->getParent();
   LLVMContext &Context = CI->getParent()->getContext();
-  IRBuilder<> B(Context);
-
-  // Set the builder to the instruction after the call.
-  B.SetInsertPoint(BB, CI);
+  IRBuilder<> B(CI);
 
   if (Name == "__memcpy_chk") {
     // Check if this has the right signature.
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
index 7d17909..8416170 100644
--- a/lib/Transforms/Utils/InlineFunction.cpp
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -10,6 +10,13 @@
 // This file implements inlining of a function into a call site, resolving
 // parameters and the return value as appropriate.
 //
+// The code in this file for handling inlines through invoke
+// instructions preserves semantics only under some assumptions about
+// the behavior of unwinders which correspond to gcc-style libUnwind
+// exception personality functions.  Eventually the IR will be
+// improved to make this unnecessary, but until then, this code is
+// marked [LIBUNWIND].
+//
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/Cloning.h"
@@ -28,6 +35,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/CallSite.h"
+#include "llvm/Support/IRBuilder.h"
 using namespace llvm;
 
 bool llvm::InlineFunction(CallInst *CI, InlineFunctionInfo &IFI) {
@@ -37,6 +45,372 @@ bool llvm::InlineFunction(InvokeInst *II, InlineFunctionInfo &IFI) {
   return InlineFunction(CallSite(II), IFI);
 }
 
+/// [LIBUNWIND] Look for an llvm.eh.exception call in the given block.
+static EHExceptionInst *findExceptionInBlock(BasicBlock *bb) {
+  for (BasicBlock::iterator i = bb->begin(), e = bb->end(); i != e; i++) {
+    EHExceptionInst *exn = dyn_cast<EHExceptionInst>(i);
+    if (exn) return exn;
+  }
+
+  return 0;
+}
+
+/// [LIBUNWIND] Look for the 'best' llvm.eh.selector instruction for
+/// the given llvm.eh.exception call.
+static EHSelectorInst *findSelectorForException(EHExceptionInst *exn) {
+  BasicBlock *exnBlock = exn->getParent();
+
+  EHSelectorInst *outOfBlockSelector = 0;
+  for (Instruction::use_iterator
+         ui = exn->use_begin(), ue = exn->use_end(); ui != ue; ++ui) {
+    EHSelectorInst *sel = dyn_cast<EHSelectorInst>(*ui);
+    if (!sel) continue;
+
+    // Immediately accept an eh.selector in the same block as the
+    // excepton call.
+    if (sel->getParent() == exnBlock) return sel;
+
+    // Otherwise, use the first selector we see.
+    if (!outOfBlockSelector) outOfBlockSelector = sel;
+  }
+
+  return outOfBlockSelector;
+}
+
+/// [LIBUNWIND] Find the (possibly absent) call to @llvm.eh.selector
+/// in the given landing pad.  In principle, llvm.eh.exception is
+/// required to be in the landing pad; in practice, SplitCriticalEdge
+/// can break that invariant, and then inlining can break it further.
+/// There's a real need for a reliable solution here, but until that
+/// happens, we have some fragile workarounds here.
+static EHSelectorInst *findSelectorForLandingPad(BasicBlock *lpad) {
+  // Look for an exception call in the actual landing pad.
+  EHExceptionInst *exn = findExceptionInBlock(lpad);
+  if (exn) return findSelectorForException(exn);
+
+  // Okay, if that failed, look for one in an obvious successor.  If
+  // we find one, we'll fix the IR by moving things back to the
+  // landing pad.
+
+  bool dominates = true; // does the lpad dominate the exn call
+  BasicBlock *nonDominated = 0; // if not, the first non-dominated block
+  BasicBlock *lastDominated = 0; // and the block which branched to it
+
+  BasicBlock *exnBlock = lpad;
+
+  // We need to protect against lpads that lead into infinite loops.
+  SmallPtrSet<BasicBlock*,4> visited;
+  visited.insert(exnBlock);
+
+  do {
+    // We're not going to apply this hack to anything more complicated
+    // than a series of unconditional branches, so if the block
+    // doesn't terminate in an unconditional branch, just fail.  More
+    // complicated cases can arise when, say, sinking a call into a
+    // split unwind edge and then inlining it; but that can do almost
+    // *anything* to the CFG, including leaving the selector
+    // completely unreachable.  The only way to fix that properly is
+    // to (1) prohibit transforms which move the exception or selector
+    // values away from the landing pad, e.g. by producing them with
+    // instructions that are pinned to an edge like a phi, or
+    // producing them with not-really-instructions, and (2) making
+    // transforms which split edges deal with that.
+    BranchInst *branch = dyn_cast<BranchInst>(&exnBlock->back());
+    if (!branch || branch->isConditional()) return 0;
+
+    BasicBlock *successor = branch->getSuccessor(0);
+
+    // Fail if we found an infinite loop.
+    if (!visited.insert(successor)) return 0;
+
+    // If the successor isn't dominated by exnBlock:
+    if (!successor->getSinglePredecessor()) {
+      // We don't want to have to deal with threading the exception
+      // through multiple levels of phi, so give up if we've already
+      // followed a non-dominating edge.
+      if (!dominates) return 0;
+
+      // Otherwise, remember this as a non-dominating edge.
+      dominates = false;
+      nonDominated = successor;
+      lastDominated = exnBlock;
+    }
+
+    exnBlock = successor;
+
+    // Can we stop here?
+    exn = findExceptionInBlock(exnBlock);
+  } while (!exn);
+
+  // Look for a selector call for the exception we found.
+  EHSelectorInst *selector = findSelectorForException(exn);
+  if (!selector) return 0;
+
+  // The easy case is when the landing pad still dominates the
+  // exception call, in which case we can just move both calls back to
+  // the landing pad.
+  if (dominates) {
+    selector->moveBefore(lpad->getFirstNonPHI());
+    exn->moveBefore(selector);
+    return selector;
+  }
+
+  // Otherwise, we have to split at the first non-dominating block.
+  // The CFG looks basically like this:
+  //    lpad:
+  //      phis_0
+  //      insnsAndBranches_1
+  //      br label %nonDominated
+  //    nonDominated:
+  //      phis_2
+  //      insns_3
+  //      %exn = call i8* @llvm.eh.exception()
+  //      insnsAndBranches_4
+  //      %selector = call @llvm.eh.selector(i8* %exn, ...
+  // We need to turn this into:
+  //    lpad:
+  //      phis_0
+  //      %exn0 = call i8* @llvm.eh.exception()
+  //      %selector0 = call @llvm.eh.selector(i8* %exn0, ...
+  //      insnsAndBranches_1
+  //      br label %split // from lastDominated
+  //    nonDominated:
+  //      phis_2 (without edge from lastDominated)
+  //      %exn1 = call i8* @llvm.eh.exception()
+  //      %selector1 = call i8* @llvm.eh.selector(i8* %exn1, ...
+  //      br label %split
+  //    split:
+  //      phis_2 (edge from lastDominated, edge from split)
+  //      %exn = phi ...
+  //      %selector = phi ...
+  //      insns_3
+  //      insnsAndBranches_4
+
+  assert(nonDominated);
+  assert(lastDominated);
+
+  // First, make clones of the intrinsics to go in lpad.
+  EHExceptionInst *lpadExn = cast<EHExceptionInst>(exn->clone());
+  EHSelectorInst *lpadSelector = cast<EHSelectorInst>(selector->clone());
+  lpadSelector->setArgOperand(0, lpadExn);
+  lpadSelector->insertBefore(lpad->getFirstNonPHI());
+  lpadExn->insertBefore(lpadSelector);
+
+  // Split the non-dominated block.
+  BasicBlock *split =
+    nonDominated->splitBasicBlock(nonDominated->getFirstNonPHI(),
+                                  nonDominated->getName() + ".lpad-fix");
+
+  // Redirect the last dominated branch there.
+  cast<BranchInst>(lastDominated->back()).setSuccessor(0, split);
+
+  // Move the existing intrinsics to the end of the old block.
+  selector->moveBefore(&nonDominated->back());
+  exn->moveBefore(selector);
+
+  Instruction *splitIP = &split->front();
+
+  // For all the phis in nonDominated, make a new phi in split to join
+  // that phi with the edge from lastDominated.
+  for (BasicBlock::iterator
+         i = nonDominated->begin(), e = nonDominated->end(); i != e; ++i) {
+    PHINode *phi = dyn_cast<PHINode>(i);
+    if (!phi) break;
+
+    PHINode *splitPhi = PHINode::Create(phi->getType(), 2, phi->getName(),
+                                        splitIP);
+    phi->replaceAllUsesWith(splitPhi);
+    splitPhi->addIncoming(phi, nonDominated);
+    splitPhi->addIncoming(phi->removeIncomingValue(lastDominated),
+                          lastDominated);
+  }
+
+  // Make new phis for the exception and selector.
+  PHINode *exnPhi = PHINode::Create(exn->getType(), 2, "", splitIP);
+  exn->replaceAllUsesWith(exnPhi);
+  selector->setArgOperand(0, exn); // except for this use
+  exnPhi->addIncoming(exn, nonDominated);
+  exnPhi->addIncoming(lpadExn, lastDominated);
+
+  PHINode *selectorPhi = PHINode::Create(selector->getType(), 2, "", splitIP);
+  selector->replaceAllUsesWith(selectorPhi);
+  selectorPhi->addIncoming(selector, nonDominated);
+  selectorPhi->addIncoming(lpadSelector, lastDominated);
+
+  return lpadSelector;
+}
+
+namespace {
+  /// A class for recording information about inlining through an invoke.
+  class InvokeInliningInfo {
+    BasicBlock *OuterUnwindDest;
+    EHSelectorInst *OuterSelector;
+    BasicBlock *InnerUnwindDest;
+    PHINode *InnerExceptionPHI;
+    PHINode *InnerSelectorPHI;
+    SmallVector<Value*, 8> UnwindDestPHIValues;
+
+  public:
+    InvokeInliningInfo(InvokeInst *II) :
+      OuterUnwindDest(II->getUnwindDest()), OuterSelector(0),
+      InnerUnwindDest(0), InnerExceptionPHI(0), InnerSelectorPHI(0) {
+
+      // If there are PHI nodes in the unwind destination block, we
+      // need to keep track of which values came into them from the
+      // invoke before removing the edge from this block.
+      llvm::BasicBlock *invokeBB = II->getParent();
+      for (BasicBlock::iterator I = OuterUnwindDest->begin();
+             isa<PHINode>(I); ++I) {
+        // Save the value to use for this edge.
+        PHINode *phi = cast<PHINode>(I);
+        UnwindDestPHIValues.push_back(phi->getIncomingValueForBlock(invokeBB));
+      }
+    }
+
+    /// The outer unwind destination is the target of unwind edges
+    /// introduced for calls within the inlined function.
+    BasicBlock *getOuterUnwindDest() const {
+      return OuterUnwindDest;
+    }
+
+    EHSelectorInst *getOuterSelector() {
+      if (!OuterSelector)
+        OuterSelector = findSelectorForLandingPad(OuterUnwindDest);
+      return OuterSelector;
+    }
+
+    BasicBlock *getInnerUnwindDest();
+
+    bool forwardEHResume(CallInst *call, BasicBlock *src);
+
+    /// Add incoming-PHI values to the unwind destination block for
+    /// the given basic block, using the values for the original
+    /// invoke's source block.
+    void addIncomingPHIValuesFor(BasicBlock *BB) const {
+      addIncomingPHIValuesForInto(BB, OuterUnwindDest);
+    }
+
+    void addIncomingPHIValuesForInto(BasicBlock *src, BasicBlock *dest) const {
+      BasicBlock::iterator I = dest->begin();
+      for (unsigned i = 0, e = UnwindDestPHIValues.size(); i != e; ++i, ++I) {
+        PHINode *phi = cast<PHINode>(I);
+        phi->addIncoming(UnwindDestPHIValues[i], src);
+      }
+    }
+  };
+}
+
+/// Get or create a target for the branch out of rewritten calls to
+/// llvm.eh.resume.
+BasicBlock *InvokeInliningInfo::getInnerUnwindDest() {
+  if (InnerUnwindDest) return InnerUnwindDest;
+
+  // Find and hoist the llvm.eh.exception and llvm.eh.selector calls
+  // in the outer landing pad to immediately following the phis.
+  EHSelectorInst *selector = getOuterSelector();
+  if (!selector) return 0;
+
+  // The call to llvm.eh.exception *must* be in the landing pad.
+  Instruction *exn = cast<Instruction>(selector->getArgOperand(0));
+  assert(exn->getParent() == OuterUnwindDest);
+
+  // TODO: recognize when we've already done this, so that we don't
+  // get a linear number of these when inlining calls into lots of
+  // invokes with the same landing pad.
+
+  // Do the hoisting.
+  Instruction *splitPoint = exn->getParent()->getFirstNonPHI();
+  assert(splitPoint != selector && "selector-on-exception dominance broken!");
+  if (splitPoint == exn) {
+    selector->removeFromParent();
+    selector->insertAfter(exn);
+    splitPoint = selector->getNextNode();
+  } else {
+    exn->moveBefore(splitPoint);
+    selector->moveBefore(splitPoint);
+  }
+
+  // Split the landing pad.
+  InnerUnwindDest = OuterUnwindDest->splitBasicBlock(splitPoint,
+                                        OuterUnwindDest->getName() + ".body");
+
+  // The number of incoming edges we expect to the inner landing pad.
+  const unsigned phiCapacity = 2;
+
+  // Create corresponding new phis for all the phis in the outer landing pad.
+  BasicBlock::iterator insertPoint = InnerUnwindDest->begin();
+  BasicBlock::iterator I = OuterUnwindDest->begin();
+  for (unsigned i = 0, e = UnwindDestPHIValues.size(); i != e; ++i, ++I) {
+    PHINode *outerPhi = cast<PHINode>(I);
+    PHINode *innerPhi = PHINode::Create(outerPhi->getType(), phiCapacity,
+                                        outerPhi->getName() + ".lpad-body",
+                                        insertPoint);
+    outerPhi->replaceAllUsesWith(innerPhi);
+    innerPhi->addIncoming(outerPhi, OuterUnwindDest);
+  }
+
+  // Create a phi for the exception value...
+  InnerExceptionPHI = PHINode::Create(exn->getType(), phiCapacity,
+                                      "exn.lpad-body", insertPoint);
+  exn->replaceAllUsesWith(InnerExceptionPHI);
+  selector->setArgOperand(0, exn); // restore this use
+  InnerExceptionPHI->addIncoming(exn, OuterUnwindDest);
+
+  // ...and the selector.
+  InnerSelectorPHI = PHINode::Create(selector->getType(), phiCapacity,
+                                     "selector.lpad-body", insertPoint);
+  selector->replaceAllUsesWith(InnerSelectorPHI);
+  InnerSelectorPHI->addIncoming(selector, OuterUnwindDest);
+
+  // All done.
+  return InnerUnwindDest;
+}
+
+/// [LIBUNWIND] Try to forward the given call, which logically occurs
+/// at the end of the given block, as a branch to the inner unwind
+/// block.  Returns true if the call was forwarded.
+bool InvokeInliningInfo::forwardEHResume(CallInst *call, BasicBlock *src) {
+  // First, check whether this is a call to the intrinsic.
+  Function *fn = dyn_cast<Function>(call->getCalledValue());
+  if (!fn || fn->getName() != "llvm.eh.resume")
+    return false;
+  
+  // At this point, we need to return true on all paths, because
+  // otherwise we'll construct an invoke of the intrinsic, which is
+  // not well-formed.
+
+  // Try to find or make an inner unwind dest, which will fail if we
+  // can't find a selector call for the outer unwind dest.
+  BasicBlock *dest = getInnerUnwindDest();
+  bool hasSelector = (dest != 0);
+
+  // If we failed, just use the outer unwind dest, dropping the
+  // exception and selector on the floor.
+  if (!hasSelector)
+    dest = OuterUnwindDest;
+
+  // Make a branch.
+  BranchInst::Create(dest, src);
+
+  // Update the phis in the destination.  They were inserted in an
+  // order which makes this work.
+  addIncomingPHIValuesForInto(src, dest);
+
+  if (hasSelector) {
+    InnerExceptionPHI->addIncoming(call->getArgOperand(0), src);
+    InnerSelectorPHI->addIncoming(call->getArgOperand(1), src);
+  }
+
+  return true;
+}
+
+/// [LIBUNWIND] Check whether this selector is "only cleanups":
+///   call i32 @llvm.eh.selector(blah, blah, i32 0)
+static bool isCleanupOnlySelector(EHSelectorInst *selector) {
+  if (selector->getNumArgOperands() != 3) return false;
+  ConstantInt *val = dyn_cast<ConstantInt>(selector->getArgOperand(2));
+  return (val && val->isZero());
+}
 
 /// HandleCallsInBlockInlinedThroughInvoke - When we inline a basic block into
 /// an invoke, we have to turn all of the calls that can throw into
@@ -44,9 +418,9 @@ bool llvm::InlineFunction(InvokeInst *II, InlineFunctionInfo &IFI) {
 /// it rewrites them to be invokes that jump to InvokeDest and fills in the PHI
 /// nodes in that block with the values specified in InvokeDestPHIValues.
 ///
-static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB,
-                                                   BasicBlock *InvokeDest,
-                           const SmallVectorImpl<Value*> &InvokeDestPHIValues) {
+/// Returns true to indicate that the next block should be skipped.
+static bool HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB,
+                                                   InvokeInliningInfo &Invoke) {
   for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) {
     Instruction *I = BBI++;
     
@@ -54,6 +428,38 @@ static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB,
     // instructions require no special handling.
     CallInst *CI = dyn_cast<CallInst>(I);
     if (CI == 0) continue;
+
+    // LIBUNWIND: merge selector instructions.
+    if (EHSelectorInst *Inner = dyn_cast<EHSelectorInst>(CI)) {
+      EHSelectorInst *Outer = Invoke.getOuterSelector();
+      if (!Outer) continue;
+
+      bool innerIsOnlyCleanup = isCleanupOnlySelector(Inner);
+      bool outerIsOnlyCleanup = isCleanupOnlySelector(Outer);
+
+      // If both selectors contain only cleanups, we don't need to do
+      // anything.  TODO: this is really just a very specific instance
+      // of a much more general optimization.
+      if (innerIsOnlyCleanup && outerIsOnlyCleanup) continue;
+
+      // Otherwise, we just append the outer selector to the inner selector.
+      SmallVector<Value*, 16> NewSelector;
+      for (unsigned i = 0, e = Inner->getNumArgOperands(); i != e; ++i)
+        NewSelector.push_back(Inner->getArgOperand(i));
+      for (unsigned i = 2, e = Outer->getNumArgOperands(); i != e; ++i)
+        NewSelector.push_back(Outer->getArgOperand(i));
+
+      CallInst *NewInner = CallInst::Create(Inner->getCalledValue(),
+                                            NewSelector.begin(),
+                                            NewSelector.end(),
+                                            "",
+                                            Inner);
+      // No need to copy attributes, calling convention, etc.
+      NewInner->takeName(Inner);
+      Inner->replaceAllUsesWith(NewInner);
+      Inner->eraseFromParent();
+      continue;
+    }
     
     // If this call cannot unwind, don't convert it to an invoke.
     if (CI->doesNotThrow())
@@ -62,37 +468,45 @@ static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB,
     // Convert this function call into an invoke instruction.
     // First, split the basic block.
     BasicBlock *Split = BB->splitBasicBlock(CI, CI->getName()+".noexc");
-    
-    // Next, create the new invoke instruction, inserting it at the end
-    // of the old basic block.
+
+    // Delete the unconditional branch inserted by splitBasicBlock
+    BB->getInstList().pop_back();
+
+    // LIBUNWIND: If this is a call to @llvm.eh.resume, just branch
+    // directly to the new landing pad.
+    if (Invoke.forwardEHResume(CI, BB)) {
+      // TODO: 'Split' is now unreachable; clean it up.
+
+      // We want to leave the original call intact so that the call
+      // graph and other structures won't get misled.  We also have to
+      // avoid processing the next block, or we'll iterate here forever.
+      return true;
+    }
+
+    // Otherwise, create the new invoke instruction.
     ImmutableCallSite CS(CI);
     SmallVector<Value*, 8> InvokeArgs(CS.arg_begin(), CS.arg_end());
     InvokeInst *II =
-      InvokeInst::Create(CI->getCalledValue(), Split, InvokeDest,
+      InvokeInst::Create(CI->getCalledValue(), Split,
+                         Invoke.getOuterUnwindDest(),
                          InvokeArgs.begin(), InvokeArgs.end(),
-                         CI->getName(), BB->getTerminator());
+                         CI->getName(), BB);
     II->setCallingConv(CI->getCallingConv());
     II->setAttributes(CI->getAttributes());
     
     // Make sure that anything using the call now uses the invoke!  This also
     // updates the CallGraph if present, because it uses a WeakVH.
     CI->replaceAllUsesWith(II);
-    
-    // Delete the unconditional branch inserted by splitBasicBlock
-    BB->getInstList().pop_back();
+
     Split->getInstList().pop_front();  // Delete the original call
-    
+
     // Update any PHI nodes in the exceptional block to indicate that
     // there is now a new entry in them.
-    unsigned i = 0;
-    for (BasicBlock::iterator I = InvokeDest->begin();
-         isa<PHINode>(I); ++I, ++i)
-      cast<PHINode>(I)->addIncoming(InvokeDestPHIValues[i], BB);
-    
-    // This basic block is now complete, the caller will continue scanning the
-    // next one.
-    return;
+    Invoke.addIncomingPHIValuesFor(BB);
+    return false;
   }
+
+  return false;
 }
   
 
@@ -106,17 +520,6 @@ static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB,
 static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock,
                                 ClonedCodeInfo &InlinedCodeInfo) {
   BasicBlock *InvokeDest = II->getUnwindDest();
-  SmallVector<Value*, 8> InvokeDestPHIValues;
-
-  // If there are PHI nodes in the unwind destination block, we need to
-  // keep track of which values came into them from this invoke, then remove
-  // the entry for this block.
-  BasicBlock *InvokeBlock = II->getParent();
-  for (BasicBlock::iterator I = InvokeDest->begin(); isa<PHINode>(I); ++I) {
-    PHINode *PN = cast<PHINode>(I);
-    // Save the value to use for this edge.
-    InvokeDestPHIValues.push_back(PN->getIncomingValueForBlock(InvokeBlock));
-  }
 
   Function *Caller = FirstNewBlock->getParent();
 
@@ -132,11 +535,17 @@ static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock,
     InvokeDest->removePredecessor(II->getParent());
     return;
   }
+
+  InvokeInliningInfo Invoke(II);
   
   for (Function::iterator BB = FirstNewBlock, E = Caller->end(); BB != E; ++BB){
     if (InlinedCodeInfo.ContainsCalls)
-      HandleCallsInBlockInlinedThroughInvoke(BB, InvokeDest,
-                                             InvokeDestPHIValues);
+      if (HandleCallsInBlockInlinedThroughInvoke(BB, Invoke)) {
+        // Honor a request to skip the next block.  We don't need to
+        // consider UnwindInsts in this case either.
+        ++BB;
+        continue;
+      }
 
     if (UnwindInst *UI = dyn_cast<UnwindInst>(BB->getTerminator())) {
       // An UnwindInst requires special handling when it gets inlined into an
@@ -150,12 +559,7 @@ static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock,
 
       // Update any PHI nodes in the exceptional block to indicate that
       // there is now a new entry in them.
-      unsigned i = 0;
-      for (BasicBlock::iterator I = InvokeDest->begin();
-           isa<PHINode>(I); ++I, ++i) {
-        PHINode *PN = cast<PHINode>(I);
-        PN->addIncoming(InvokeDestPHIValues[i], BB);
-      }
+      Invoke.addIncomingPHIValuesFor(BB);
     }
   }
 
@@ -299,21 +703,48 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall,
     ConstantInt::get(Type::getInt32Ty(Context), 1),
     ConstantInt::getFalse(Context) // isVolatile
   };
-  CallInst *TheMemCpy =
-    CallInst::Create(MemCpyFn, CallArgs, CallArgs+5, "", TheCall);
-  
-  // If we have a call graph, update it.
-  if (CallGraph *CG = IFI.CG) {
-    CallGraphNode *MemCpyCGN = CG->getOrInsertFunction(MemCpyFn);
-    CallGraphNode *CallerNode = (*CG)[Caller];
-    CallerNode->addCalledFunction(TheMemCpy, MemCpyCGN);
-  }
+  CallInst::Create(MemCpyFn, CallArgs, CallArgs+5, "", TheCall);
   
   // Uses of the argument in the function should use our new alloca
   // instead.
   return NewAlloca;
 }
 
+// isUsedByLifetimeMarker - Check whether this Value is used by a lifetime
+// intrinsic.
+static bool isUsedByLifetimeMarker(Value *V) {
+  for (Value::use_iterator UI = V->use_begin(), UE = V->use_end(); UI != UE;
+       ++UI) {
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(*UI)) {
+      switch (II->getIntrinsicID()) {
+      default: break;
+      case Intrinsic::lifetime_start:
+      case Intrinsic::lifetime_end:
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+// hasLifetimeMarkers - Check whether the given alloca already has
+// lifetime.start or lifetime.end intrinsics.
+static bool hasLifetimeMarkers(AllocaInst *AI) {
+  const Type *Int8PtrTy = Type::getInt8PtrTy(AI->getType()->getContext());
+  if (AI->getType() == Int8PtrTy)
+    return isUsedByLifetimeMarker(AI);
+
+  // Do a scan to find all the bitcasts to i8*.
+  for (Value::use_iterator I = AI->use_begin(), E = AI->use_end(); I != E;
+       ++I) {
+    if (I->getType() != Int8PtrTy) continue;
+    if (!isa<BitCastInst>(*I)) continue;
+    if (isUsedByLifetimeMarker(*I))
+      return true;
+  }
+  return false;
+}
+
 // InlineFunction - This function inlines the called function into the basic
 // block of the caller.  This returns false if it is not possible to inline this
 // call.  The program is still in a well defined state if this occurs though.
@@ -460,6 +891,26 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI) {
     }
   }
 
+  // Leave lifetime markers for the static alloca's, scoping them to the
+  // function we just inlined.
+  if (!IFI.StaticAllocas.empty()) {
+    IRBuilder<> builder(FirstNewBlock->begin());
+    for (unsigned ai = 0, ae = IFI.StaticAllocas.size(); ai != ae; ++ai) {
+      AllocaInst *AI = IFI.StaticAllocas[ai];
+
+      // If the alloca is already scoped to something smaller than the whole
+      // function then there's no need to add redundant, less accurate markers.
+      if (hasLifetimeMarkers(AI))
+        continue;
+
+      builder.CreateLifetimeStart(AI);
+      for (unsigned ri = 0, re = Returns.size(); ri != re; ++ri) {
+        IRBuilder<> builder(Returns[ri]);
+        builder.CreateLifetimeEnd(AI);
+      }
+    }
+  }
+
   // If the inlined code contained dynamic alloca instructions, wrap the inlined
   // code with llvm.stacksave/llvm.stackrestore intrinsics.
   if (InlinedFunctionInfo.ContainsDynamicAllocas) {
@@ -468,25 +919,14 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI) {
     Function *StackSave = Intrinsic::getDeclaration(M, Intrinsic::stacksave);
     Function *StackRestore=Intrinsic::getDeclaration(M,Intrinsic::stackrestore);
 
-    // If we are preserving the callgraph, add edges to the stacksave/restore
-    // functions for the calls we insert.
-    CallGraphNode *StackSaveCGN = 0, *StackRestoreCGN = 0, *CallerNode = 0;
-    if (CallGraph *CG = IFI.CG) {
-      StackSaveCGN    = CG->getOrInsertFunction(StackSave);
-      StackRestoreCGN = CG->getOrInsertFunction(StackRestore);
-      CallerNode = (*CG)[Caller];
-    }
-
     // Insert the llvm.stacksave.
     CallInst *SavedPtr = CallInst::Create(StackSave, "savedstack",
                                           FirstNewBlock->begin());
-    if (IFI.CG) CallerNode->addCalledFunction(SavedPtr, StackSaveCGN);
 
     // Insert a call to llvm.stackrestore before any return instructions in the
     // inlined function.
     for (unsigned i = 0, e = Returns.size(); i != e; ++i) {
-      CallInst *CI = CallInst::Create(StackRestore, SavedPtr, "", Returns[i]);
-      if (IFI.CG) CallerNode->addCalledFunction(CI, StackRestoreCGN);
+      CallInst::Create(StackRestore, SavedPtr, "", Returns[i]);
     }
 
     // Count the number of StackRestore calls we insert.
@@ -498,8 +938,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI) {
       for (Function::iterator BB = FirstNewBlock, E = Caller->end();
            BB != E; ++BB)
         if (UnwindInst *UI = dyn_cast<UnwindInst>(BB->getTerminator())) {
-          CallInst *CI = CallInst::Create(StackRestore, SavedPtr, "", UI);
-          if (IFI.CG) CallerNode->addCalledFunction(CI, StackRestoreCGN);
+          CallInst::Create(StackRestore, SavedPtr, "", UI);
           ++NumStackRestores;
         }
     }
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index 4bca2fc..3bdbaa5 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -20,6 +20,7 @@
 #include "llvm/Instructions.h"
 #include "llvm/Intrinsics.h"
 #include "llvm/IntrinsicInst.h"
+#include "llvm/Metadata.h"
 #include "llvm/Operator.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallPtrSet.h"
@@ -34,6 +35,7 @@
 #include "llvm/Support/CFG.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/IRBuilder.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/ValueHandle.h"
 #include "llvm/Support/raw_ostream.h"
@@ -43,12 +45,16 @@ using namespace llvm;
 //  Local constant propagation.
 //
 
-// ConstantFoldTerminator - If a terminator instruction is predicated on a
-// constant value, convert it into an unconditional branch to the constant
-// destination.
-//
-bool llvm::ConstantFoldTerminator(BasicBlock *BB) {
+/// ConstantFoldTerminator - If a terminator instruction is predicated on a
+/// constant value, convert it into an unconditional branch to the constant
+/// destination.  This is a nontrivial operation because the successors of this
+/// basic block must have their PHI nodes updated.
+/// Also calls RecursivelyDeleteTriviallyDeadInstructions() on any branch/switch
+/// conditions and indirectbr addresses this might make dead if
+/// DeleteDeadConditions is true.
+bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions) {
   TerminatorInst *T = BB->getTerminator();
+  IRBuilder<> Builder(T);
 
   // Branch - See if we are conditional jumping on constant
   if (BranchInst *BI = dyn_cast<BranchInst>(T)) {
@@ -71,7 +77,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB) {
       OldDest->removePredecessor(BB);
 
       // Replace the conditional branch with an unconditional one.
-      BranchInst::Create(Destination, BI);
+      Builder.CreateBr(Destination);
       BI->eraseFromParent();
       return true;
     }
@@ -86,8 +92,11 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB) {
       Dest1->removePredecessor(BI->getParent());
 
       // Replace the conditional branch with an unconditional one.
-      BranchInst::Create(Dest1, BI);
+      Builder.CreateBr(Dest1);
+      Value *Cond = BI->getCondition();
       BI->eraseFromParent();
+      if (DeleteDeadConditions)
+        RecursivelyDeleteTriviallyDeadInstructions(Cond);
       return true;
     }
     return false;
@@ -136,7 +145,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB) {
     // now.
     if (TheOnlyDest) {
       // Insert the new branch.
-      BranchInst::Create(TheOnlyDest, SI);
+      Builder.CreateBr(TheOnlyDest);
       BasicBlock *BB = SI->getParent();
 
       // Remove entries from PHI nodes which we no longer branch to...
@@ -150,17 +159,21 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB) {
       }
 
       // Delete the old switch.
-      BB->getInstList().erase(SI);
+      Value *Cond = SI->getCondition();
+      SI->eraseFromParent();
+      if (DeleteDeadConditions)
+        RecursivelyDeleteTriviallyDeadInstructions(Cond);
       return true;
     }
     
     if (SI->getNumSuccessors() == 2) {
       // Otherwise, we can fold this switch into a conditional branch
       // instruction if it has only one non-default destination.
-      Value *Cond = new ICmpInst(SI, ICmpInst::ICMP_EQ, SI->getCondition(),
-                                 SI->getSuccessorValue(1), "cond");
+      Value *Cond = Builder.CreateICmpEQ(SI->getCondition(),
+                                         SI->getSuccessorValue(1), "cond");
+
       // Insert the new branch.
-      BranchInst::Create(SI->getSuccessor(1), SI->getSuccessor(0), Cond, SI);
+      Builder.CreateCondBr(Cond, SI->getSuccessor(1), SI->getSuccessor(0));
 
       // Delete the old switch.
       SI->eraseFromParent();
@@ -175,7 +188,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB) {
           dyn_cast<BlockAddress>(IBI->getAddress()->stripPointerCasts())) {
       BasicBlock *TheOnlyDest = BA->getBasicBlock();
       // Insert the new branch.
-      BranchInst::Create(TheOnlyDest, IBI);
+      Builder.CreateBr(TheOnlyDest);
       
       for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) {
         if (IBI->getDestination(i) == TheOnlyDest)
@@ -183,7 +196,10 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB) {
         else
           IBI->getDestination(i)->removePredecessor(IBI->getParent());
       }
+      Value *Address = IBI->getAddress();
       IBI->eraseFromParent();
+      if (DeleteDeadConditions)
+        RecursivelyDeleteTriviallyDeadInstructions(Address);
       
       // If we didn't find our destination in the IBI successor list, then we
       // have undefined behavior.  Replace the unconditional branch with an
@@ -785,10 +801,19 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
   if (!DIVar.Verify())
     return false;
 
-  Instruction *DbgVal = 
-    Builder.insertDbgValueIntrinsic(SI->getOperand(0), 0,
-                                    DIVar, SI);
-  
+  Instruction *DbgVal = NULL;
+  // If an argument is zero extended then use argument directly. The ZExt
+  // may be zapped by an optimization pass in future.
+  Argument *ExtendedArg = NULL;
+  if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0)))
+    ExtendedArg = dyn_cast<Argument>(ZExt->getOperand(0));
+  if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0)))
+    ExtendedArg = dyn_cast<Argument>(SExt->getOperand(0));
+  if (ExtendedArg)
+    DbgVal = Builder.insertDbgValueIntrinsic(ExtendedArg, 0, DIVar, SI);
+  else
+    DbgVal = Builder.insertDbgValueIntrinsic(SI->getOperand(0), 0, DIVar, SI);
+
   // Propagate any debug metadata from the store onto the dbg.value.
   DebugLoc SIDL = SI->getDebugLoc();
   if (!SIDL.isUnknown())
@@ -853,3 +878,15 @@ bool llvm::LowerDbgDeclare(Function &F) {
   }
   return true;
 }
+
+/// FindAllocaDbgDeclare - Finds the llvm.dbg.declare intrinsic describing the
+/// alloca 'V', if any.
+DbgDeclareInst *llvm::FindAllocaDbgDeclare(Value *V) {
+  if (MDNode *DebugNode = MDNode::getIfExists(V->getContext(), V))
+    for (Value::use_iterator UI = DebugNode->use_begin(),
+         E = DebugNode->use_end(); UI != E; ++UI)
+      if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(*UI))
+        return DDI;
+
+  return 0;
+}
diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index 50c9ae2..a1736b9 100644
--- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -100,18 +100,6 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) {
   return true;
 }
 
-/// FindAllocaDbgDeclare - Finds the llvm.dbg.declare intrinsic describing the
-/// alloca 'V', if any.
-static DbgDeclareInst *FindAllocaDbgDeclare(Value *V) {
-  if (MDNode *DebugNode = MDNode::getIfExists(V->getContext(), V))
-    for (Value::use_iterator UI = DebugNode->use_begin(),
-         E = DebugNode->use_end(); UI != E; ++UI)
-      if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(*UI))
-        return DDI;
-
-  return 0;
-}
-
 namespace {
   struct AllocaInfo;
 
diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp
index 2860c3e..b336194 100644
--- a/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/lib/Transforms/Utils/SSAUpdater.cpp
@@ -14,7 +14,9 @@
 #define DEBUG_TYPE "ssaupdater"
 #include "llvm/Constants.h"
 #include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/DIBuilder.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Support/AlignOf.h"
 #include "llvm/Support/Allocator.h"
@@ -22,6 +24,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
 #include "llvm/Transforms/Utils/SSAUpdaterImpl.h"
 
@@ -355,7 +358,8 @@ Value *SSAUpdater::GetValueAtEndOfBlockInternal(BasicBlock *BB) {
 
 LoadAndStorePromoter::
 LoadAndStorePromoter(const SmallVectorImpl<Instruction*> &Insts,
-                     SSAUpdater &S, StringRef BaseName) : SSA(S) {
+                     SSAUpdater &S, DbgDeclareInst *DD, DIBuilder *DB,
+                     StringRef BaseName) : SSA(S), DDI(DD), DIB(DB) {
   if (Insts.empty()) return;
   
   Value *SomeVal;
@@ -402,9 +406,11 @@ run(const SmallVectorImpl<Instruction*> &Insts) const {
     // single user in it, we can rewrite it trivially.
     if (BlockUses.size() == 1) {
       // If it is a store, it is a trivial def of the value in the block.
-      if (StoreInst *SI = dyn_cast<StoreInst>(User))
+      if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+        if (DDI)
+          ConvertDebugDeclareToDebugValue(DDI, SI, *DIB);
         SSA.AddAvailableValue(BB, SI->getOperand(0));
-      else 
+      } else 
         // Otherwise it is a load, queue it to rewrite as a live-in load.
         LiveInLoads.push_back(cast<LoadInst>(User));
       BlockUses.clear();
@@ -453,12 +459,15 @@ run(const SmallVectorImpl<Instruction*> &Insts) const {
         continue;
       }
       
-      if (StoreInst *S = dyn_cast<StoreInst>(II)) {
+      if (StoreInst *SI = dyn_cast<StoreInst>(II)) {
         // If this is a store to an unrelated pointer, ignore it.
-        if (!isInstInList(S, Insts)) continue;
-        
+        if (!isInstInList(SI, Insts)) continue;
+
+        if (DDI)
+          ConvertDebugDeclareToDebugValue(DDI, SI, *DIB);
+
         // Remember that this is the active value in the block.
-        StoredValue = S->getOperand(0);
+        StoredValue = SI->getOperand(0);
       }
     }
     
@@ -513,4 +522,7 @@ run(const SmallVectorImpl<Instruction*> &Insts) const {
     instructionDeleted(User);
     User->eraseFromParent();
   }
+
+  if (DDI)
+    DDI->eraseFromParent();
 }
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index 18b8573..6df846c 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -20,6 +20,7 @@
 #include "llvm/DerivedTypes.h"
 #include "llvm/GlobalVariable.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Target/TargetData.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/ADT/DenseMap.h"
@@ -31,6 +32,8 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ConstantRange.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/IRBuilder.h"
+#include "llvm/Support/NoFolder.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <set>
@@ -55,16 +58,18 @@ class SimplifyCFGOpt {
   BasicBlock *GetValueEqualityComparisonCases(TerminatorInst *TI,
     std::vector<std::pair<ConstantInt*, BasicBlock*> > &Cases);
   bool SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
-                                                     BasicBlock *Pred);
-  bool FoldValueComparisonIntoPredecessors(TerminatorInst *TI);
+                                                     BasicBlock *Pred,
+                                                     IRBuilder<> &Builder);
+  bool FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
+                                           IRBuilder<> &Builder);
 
-  bool SimplifyReturn(ReturnInst *RI);
-  bool SimplifyUnwind(UnwindInst *UI);
+  bool SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder);
+  bool SimplifyUnwind(UnwindInst *UI, IRBuilder<> &Builder);
   bool SimplifyUnreachable(UnreachableInst *UI);
-  bool SimplifySwitch(SwitchInst *SI);
+  bool SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder);
   bool SimplifyIndirectBr(IndirectBrInst *IBI);
-  bool SimplifyUncondBranch(BranchInst *BI);
-  bool SimplifyCondBranch(BranchInst *BI);
+  bool SimplifyUncondBranch(BranchInst *BI, IRBuilder <> &Builder);
+  bool SimplifyCondBranch(BranchInst *BI, IRBuilder <>&Builder);
 
 public:
   explicit SimplifyCFGOpt(const TargetData *td) : TD(td) {}
@@ -541,7 +546,8 @@ ValuesOverlap(std::vector<std::pair<ConstantInt*, BasicBlock*> > &C1,
 /// form of jump threading.
 bool SimplifyCFGOpt::
 SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
-                                              BasicBlock *Pred) {
+                                              BasicBlock *Pred,
+                                              IRBuilder<> &Builder) {
   Value *PredVal = isValueEqualityComparison(Pred->getTerminator());
   if (!PredVal) return false;  // Not a value comparison in predecessor.
 
@@ -574,7 +580,7 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
       // uncond br.
       assert(ThisCases.size() == 1 && "Branch can only have one case!");
       // Insert the new branch.
-      Instruction *NI = BranchInst::Create(ThisDef, TI);
+      Instruction *NI = Builder.CreateBr(ThisDef);
       (void) NI;
 
       // Remove PHI node entries for the dead edge.
@@ -639,7 +645,7 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
       CheckEdge = 0;
 
   // Insert the new branch.
-  Instruction *NI = BranchInst::Create(TheRealDest, TI);
+  Instruction *NI = Builder.CreateBr(TheRealDest);
   (void) NI;
 
   DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
@@ -674,7 +680,8 @@ static int ConstantIntSortPredicate(const void *P1, const void *P2) {
 /// equality comparison instruction (either a switch or a branch on "X == c").
 /// See if any of the predecessors of the terminator block are value comparisons
 /// on the same value.  If so, and if safe to do so, fold them together.
-bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI) {
+bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
+                                                         IRBuilder<> &Builder) {
   BasicBlock *BB = TI->getParent();
   Value *CV = isValueEqualityComparison(TI);  // CondVal
   assert(CV && "Not a comparison?");
@@ -767,16 +774,18 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI) {
       for (unsigned i = 0, e = NewSuccessors.size(); i != e; ++i)
         AddPredecessorToBlock(NewSuccessors[i], Pred, BB);
 
+      Builder.SetInsertPoint(PTI);
       // Convert pointer to int before we switch.
       if (CV->getType()->isPointerTy()) {
         assert(TD && "Cannot switch on pointer without TargetData");
-        CV = new PtrToIntInst(CV, TD->getIntPtrType(CV->getContext()),
-                              "magicptr", PTI);
+        CV = Builder.CreatePtrToInt(CV, TD->getIntPtrType(CV->getContext()),
+                                    "magicptr");
       }
 
       // Now that the successors are updated, create the new Switch instruction.
-      SwitchInst *NewSI = SwitchInst::Create(CV, PredDefault,
-                                             PredCases.size(), PTI);
+      SwitchInst *NewSI = Builder.CreateSwitch(CV, PredDefault,
+                                               PredCases.size());
+      NewSI->setDebugLoc(PTI->getDebugLoc());
       for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
         NewSI->addCase(PredCases[i].first, PredCases[i].second);
 
@@ -900,6 +909,7 @@ HoistTerminator:
     NT->takeName(I1);
   }
 
+  IRBuilder<true, NoFolder> Builder(NT);
   // Hoisting one of the terminators from our successor is a great thing.
   // Unfortunately, the successors of the if/else blocks may have PHI nodes in
   // them.  If they do, all PHI entries for BB1/BB2 must agree for all PHI
@@ -916,9 +926,11 @@ HoistTerminator:
       // These values do not agree.  Insert a select instruction before NT
       // that determines the right value.
       SelectInst *&SI = InsertedSelects[std::make_pair(BB1V, BB2V)];
-      if (SI == 0)
-        SI = SelectInst::Create(BI->getCondition(), BB1V, BB2V,
-                                BB1V->getName()+"."+BB2V->getName(), NT);
+      if (SI == 0) 
+        SI = cast<SelectInst>
+          (Builder.CreateSelect(BI->getCondition(), BB1V, BB2V,
+                                BB1V->getName()+"."+BB2V->getName()));
+
       // Make the PHI node use the select for all incoming values for BB1/BB2
       for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
         if (PN->getIncomingBlock(i) == BB1 || PN->getIncomingBlock(i) == BB2)
@@ -1076,13 +1088,16 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *BB1) {
 
   // Create a select whose true value is the speculatively executed value and
   // false value is the previously determined FalseV.
+  IRBuilder<true, NoFolder> Builder(BI);
   SelectInst *SI;
   if (Invert)
-    SI = SelectInst::Create(BrCond, FalseV, HInst,
-                            FalseV->getName() + "." + HInst->getName(), BI);
+    SI = cast<SelectInst>
+      (Builder.CreateSelect(BrCond, FalseV, HInst,
+                            FalseV->getName() + "." + HInst->getName()));
   else
-    SI = SelectInst::Create(BrCond, HInst, FalseV,
-                            HInst->getName() + "." + FalseV->getName(), BI);
+    SI = cast<SelectInst>
+      (Builder.CreateSelect(BrCond, HInst, FalseV,
+                            HInst->getName() + "." + FalseV->getName()));
 
   // Make the PHI node use the select for all incoming values for "then" and
   // "if" blocks.
@@ -1156,6 +1171,8 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const TargetData *TD) {
     BasicBlock *RealDest = BI->getSuccessor(!CB->getZExtValue());
     
     if (RealDest == BB) continue;  // Skip self loops.
+    // Skip if the predecessor's terminator is an indirect branch.
+    if (isa<IndirectBrInst>(PredBB->getTerminator())) continue;
     
     // The dest block might have PHI nodes, other predecessors and other
     // difficult cases.  Instead of being smart about this, just insert a new
@@ -1211,7 +1228,7 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const TargetData *TD) {
         BB->removePredecessor(PredBB);
         PredBBTI->setSuccessor(i, EdgeBB);
       }
-    
+
     // Recurse, simplifying any other constants.
     return FoldCondBranchOnPHI(BI, TD) | true;
   }
@@ -1320,6 +1337,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) {
   // If we can still promote the PHI nodes after this gauntlet of tests,
   // do all of the PHI's now.
   Instruction *InsertPt = DomBlock->getTerminator();
+  IRBuilder<true, NoFolder> Builder(InsertPt);
   
   // Move all 'aggressive' instructions, which are defined in the
   // conditional parts of the if's up to the dominating block.
@@ -1337,7 +1355,8 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) {
     Value *TrueVal  = PN->getIncomingValue(PN->getIncomingBlock(0) == IfFalse);
     Value *FalseVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfTrue);
     
-    Value *NV = SelectInst::Create(IfCond, TrueVal, FalseVal, "", InsertPt);
+    SelectInst *NV = 
+      cast<SelectInst>(Builder.CreateSelect(IfCond, TrueVal, FalseVal, ""));
     PN->replaceAllUsesWith(NV);
     NV->takeName(PN);
     PN->eraseFromParent();
@@ -1347,7 +1366,8 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) {
   // has been flattened.  Change DomBlock to jump directly to our new block to
   // avoid other simplifycfg's kicking in on the diamond.
   TerminatorInst *OldTI = DomBlock->getTerminator();
-  BranchInst::Create(BB, OldTI);
+  Builder.SetInsertPoint(OldTI);
+  Builder.CreateBr(BB);
   OldTI->eraseFromParent();
   return true;
 }
@@ -1355,7 +1375,8 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) {
 /// SimplifyCondBranchToTwoReturns - If we found a conditional branch that goes
 /// to two returning blocks, try to merge them together into one return,
 /// introducing a select if the return values disagree.
-static bool SimplifyCondBranchToTwoReturns(BranchInst *BI) {
+static bool SimplifyCondBranchToTwoReturns(BranchInst *BI, 
+                                           IRBuilder<> &Builder) {
   assert(BI->isConditional() && "Must be a conditional branch");
   BasicBlock *TrueSucc = BI->getSuccessor(0);
   BasicBlock *FalseSucc = BI->getSuccessor(1);
@@ -1370,13 +1391,14 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI) {
   if (!FalseSucc->getFirstNonPHIOrDbg()->isTerminator())
     return false;
 
+  Builder.SetInsertPoint(BI);
   // Okay, we found a branch that is going to two return nodes.  If
   // there is no return value for this function, just change the
   // branch into a return.
   if (FalseRet->getNumOperands() == 0) {
     TrueSucc->removePredecessor(BI->getParent());
     FalseSucc->removePredecessor(BI->getParent());
-    ReturnInst::Create(BI->getContext(), 0, BI);
+    Builder.CreateRetVoid();
     EraseTerminatorInstAndDCECond(BI);
     return true;
   }
@@ -1419,14 +1441,14 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI) {
     } else if (isa<UndefValue>(TrueValue)) {
       TrueValue = FalseValue;
     } else {
-      TrueValue = SelectInst::Create(BrCond, TrueValue,
-                                     FalseValue, "retval", BI);
+      TrueValue = Builder.CreateSelect(BrCond, TrueValue,
+                                       FalseValue, "retval");
     }
   }
 
-  Value *RI = !TrueValue ?
-              ReturnInst::Create(BI->getContext(), BI) :
-              ReturnInst::Create(BI->getContext(), TrueValue, BI);
+  Value *RI = !TrueValue ? 
+    Builder.CreateRetVoid() : Builder.CreateRet(TrueValue);
+
   (void) RI;
       
   DEBUG(dbgs() << "\nCHANGING BRANCH TO TWO RETURNS INTO SELECT:"
@@ -1443,6 +1465,7 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI) {
 /// the predecessor and use logical operations to pick the right destination.
 bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
   BasicBlock *BB = BI->getParent();
+
   Instruction *Cond = dyn_cast<Instruction>(BI->getCondition());
   if (Cond == 0 || (!isa<CmpInst>(Cond) && !isa<BinaryOperator>(Cond)) ||
     Cond->getParent() != BB || !Cond->hasOneUse())
@@ -1563,7 +1586,8 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
     }
 
     DEBUG(dbgs() << "FOLDING BRANCH TO COMMON DEST:\n" << *PBI << *BB);
-    
+    IRBuilder<> Builder(PBI);    
+
     // If we need to invert the condition in the pred block to match, do so now.
     if (InvertPredCond) {
       Value *NewCond = PBI->getCondition();
@@ -1572,8 +1596,8 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
         CmpInst *CI = cast<CmpInst>(NewCond);
         CI->setPredicate(CI->getInversePredicate());
       } else {
-        NewCond = BinaryOperator::CreateNot(NewCond,
-                                  PBI->getCondition()->getName()+".not", PBI);
+        NewCond = Builder.CreateNot(NewCond, 
+                                    PBI->getCondition()->getName()+".not");
       }
       
       PBI->setCondition(NewCond);
@@ -1600,8 +1624,9 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
     New->takeName(Cond);
     Cond->setName(New->getName()+".old");
     
-    Value *NewCond = BinaryOperator::Create(Opc, PBI->getCondition(),
-                                            New, "or.cond", PBI);
+    Instruction *NewCond = 
+      cast<Instruction>(Builder.CreateBinOp(Opc, PBI->getCondition(),
+                                            New, "or.cond"));
     PBI->setCondition(NewCond);
     if (PBI->getSuccessor(0) == BB) {
       AddPredecessorToBlock(TrueDest, PredBlock, BB);
@@ -1744,23 +1769,22 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) {
   }  
   
   DEBUG(dbgs() << *PBI->getParent()->getParent());
-  
+
   // BI may have other predecessors.  Because of this, we leave
   // it alone, but modify PBI.
   
   // Make sure we get to CommonDest on True&True directions.
   Value *PBICond = PBI->getCondition();
+  IRBuilder<true, NoFolder> Builder(PBI);
   if (PBIOp)
-    PBICond = BinaryOperator::CreateNot(PBICond,
-                                        PBICond->getName()+".not",
-                                        PBI);
+    PBICond = Builder.CreateNot(PBICond, PBICond->getName()+".not");
+
   Value *BICond = BI->getCondition();
   if (BIOp)
-    BICond = BinaryOperator::CreateNot(BICond,
-                                       BICond->getName()+".not",
-                                       PBI);
+    BICond = Builder.CreateNot(BICond, BICond->getName()+".not");
+
   // Merge the conditions.
-  Value *Cond = BinaryOperator::CreateOr(PBICond, BICond, "brmerge", PBI);
+  Value *Cond = Builder.CreateOr(PBICond, BICond, "brmerge");
   
   // Modify PBI to branch on the new condition to the new dests.
   PBI->setCondition(Cond);
@@ -1783,8 +1807,8 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) {
     Value *PBIV = PN->getIncomingValue(PBBIdx);
     if (BIV != PBIV) {
       // Insert a select in PBI to pick the right value.
-      Value *NV = SelectInst::Create(PBICond, PBIV, BIV,
-                                     PBIV->getName()+".mux", PBI);
+      Value *NV = cast<SelectInst>
+        (Builder.CreateSelect(PBICond, PBIV, BIV, PBIV->getName()+".mux"));
       PN->setIncomingValue(PBBIdx, NV);
     }
   }
@@ -1823,16 +1847,19 @@ static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond,
       Succ->removePredecessor(OldTerm->getParent());
   }
 
+  IRBuilder<> Builder(OldTerm);
+  Builder.SetCurrentDebugLocation(OldTerm->getDebugLoc());
+
   // Insert an appropriate new terminator.
   if ((KeepEdge1 == 0) && (KeepEdge2 == 0)) {
     if (TrueBB == FalseBB)
       // We were only looking for one successor, and it was present.
       // Create an unconditional branch to it.
-      BranchInst::Create(TrueBB, OldTerm);
+      Builder.CreateBr(TrueBB);
     else
       // We found both of the successors we were looking for.
       // Create a conditional branch sharing the condition of the select.
-      BranchInst::Create(TrueBB, FalseBB, Cond, OldTerm);
+      Builder.CreateCondBr(Cond, TrueBB, FalseBB);
   } else if (KeepEdge1 && (KeepEdge2 || TrueBB == FalseBB)) {
     // Neither of the selected blocks were successors, so this
     // terminator must be unreachable.
@@ -1843,10 +1870,10 @@ static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond,
     // the edge to the one that wasn't must be unreachable.
     if (KeepEdge1 == 0)
       // Only TrueBB was found.
-      BranchInst::Create(TrueBB, OldTerm);
+      Builder.CreateBr(TrueBB);
     else
       // Only FalseBB was found.
-      BranchInst::Create(FalseBB, OldTerm);
+      Builder.CreateBr(FalseBB);
   }
 
   EraseTerminatorInstAndDCECond(OldTerm);
@@ -1911,8 +1938,10 @@ static bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI) {
 /// We prefer to split the edge to 'end' so that there is a true/false entry to
 /// the PHI, merging the third icmp into the switch.
 static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
-                                                  const TargetData *TD) {
+                                                  const TargetData *TD,
+                                                  IRBuilder<> &Builder) {
   BasicBlock *BB = ICI->getParent();
+
   // If the block has any PHIs in it or the icmp has multiple uses, it is too
   // complex.
   if (isa<PHINode>(BB->begin()) || !ICI->hasOneUse()) return false;
@@ -1990,7 +2019,9 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
   SI->addCase(Cst, NewBB);
   
   // NewBB branches to the phi block, add the uncond branch and the phi entry.
-  BranchInst::Create(SuccBlock, NewBB);
+  Builder.SetInsertPoint(NewBB);
+  Builder.SetCurrentDebugLocation(SI->getDebugLoc());
+  Builder.CreateBr(SuccBlock);
   PHIUse->addIncoming(NewCst, NewBB);
   return true;
 }
@@ -1998,7 +2029,8 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
 /// SimplifyBranchOnICmpChain - The specified branch is a conditional branch.
 /// Check to see if it is branching on an or/and chain of icmp instructions, and
 /// fold it into a switch instruction if so.
-static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD) {
+static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD,
+                                      IRBuilder<> &Builder) {
   Instruction *Cond = dyn_cast<Instruction>(BI->getCondition());
   if (Cond == 0) return false;
   
@@ -2054,11 +2086,12 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD) {
     BasicBlock *NewBB = BB->splitBasicBlock(BI, "switch.early.test");
     // Remove the uncond branch added to the old block.
     TerminatorInst *OldTI = BB->getTerminator();
-    
+    Builder.SetInsertPoint(OldTI);
+
     if (TrueWhenEqual)
-      BranchInst::Create(EdgeBB, NewBB, ExtraCase, OldTI);
+      Builder.CreateCondBr(ExtraCase, EdgeBB, NewBB);
     else
-      BranchInst::Create(NewBB, EdgeBB, ExtraCase, OldTI);
+      Builder.CreateCondBr(ExtraCase, NewBB, EdgeBB);
       
     OldTI->eraseFromParent();
     
@@ -2070,18 +2103,19 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD) {
           << "\nEXTRABB = " << *BB);
     BB = NewBB;
   }
-  
+
+  Builder.SetInsertPoint(BI);
   // Convert pointer to int before we switch.
   if (CompVal->getType()->isPointerTy()) {
     assert(TD && "Cannot switch on pointer without TargetData");
-    CompVal = new PtrToIntInst(CompVal,
-                               TD->getIntPtrType(CompVal->getContext()),
-                               "magicptr", BI);
+    CompVal = Builder.CreatePtrToInt(CompVal,
+                                     TD->getIntPtrType(CompVal->getContext()),
+                                     "magicptr");
   }
   
   // Create the new switch instruction now.
-  SwitchInst *New = SwitchInst::Create(CompVal, DefaultBB, Values.size(), BI);
-  
+  SwitchInst *New = Builder.CreateSwitch(CompVal, DefaultBB, Values.size());
+
   // Add all of the 'cases' to the switch instruction.
   for (unsigned i = 0, e = Values.size(); i != e; ++i)
     New->addCase(Values[i], EdgeBB);
@@ -2104,7 +2138,7 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD) {
   return true;
 }
 
-bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI) {
+bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) {
   BasicBlock *BB = RI->getParent();
   if (!BB->getFirstNonPHIOrDbg()->isTerminator()) return false;
   
@@ -2148,13 +2182,13 @@ bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI) {
     // Check to see if the non-BB successor is also a return block.
     if (isa<ReturnInst>(BI->getSuccessor(0)->getTerminator()) &&
         isa<ReturnInst>(BI->getSuccessor(1)->getTerminator()) &&
-        SimplifyCondBranchToTwoReturns(BI))
+        SimplifyCondBranchToTwoReturns(BI, Builder))
       return true;
   }
   return false;
 }
 
-bool SimplifyCFGOpt::SimplifyUnwind(UnwindInst *UI) {
+bool SimplifyCFGOpt::SimplifyUnwind(UnwindInst *UI, IRBuilder<> &Builder) {
   // Check to see if the first instruction in this block is just an unwind.
   // If so, replace any invoke instructions which use this as an exception
   // destination with call instructions.
@@ -2169,14 +2203,16 @@ bool SimplifyCFGOpt::SimplifyUnwind(UnwindInst *UI) {
     if (II && II->getUnwindDest() == BB) {
       // Insert a new branch instruction before the invoke, because this
       // is now a fall through.
-      BranchInst *BI = BranchInst::Create(II->getNormalDest(), II);
+      Builder.SetInsertPoint(II);
+      BranchInst *BI = Builder.CreateBr(II->getNormalDest());
       Pred->getInstList().remove(II);   // Take out of symbol table
       
       // Insert the call now.
       SmallVector<Value*,8> Args(II->op_begin(), II->op_end()-3);
-      CallInst *CI = CallInst::Create(II->getCalledValue(),
-                                      Args.begin(), Args.end(),
-                                      II->getName(), BI);
+      Builder.SetInsertPoint(BI);
+      CallInst *CI = Builder.CreateCall(II->getCalledValue(),
+                                        Args.begin(), Args.end(),
+                                        II->getName());
       CI->setCallingConv(II->getCallingConv());
       CI->setAttributes(II->getAttributes());
       // If the invoke produced a value, the Call now does instead.
@@ -2235,7 +2271,7 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
   SmallVector<BasicBlock*, 8> Preds(pred_begin(BB), pred_end(BB));
   for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
     TerminatorInst *TI = Preds[i]->getTerminator();
-    
+    IRBuilder<> Builder(TI);
     if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
       if (BI->isUnconditional()) {
         if (BI->getSuccessor(0) == BB) {
@@ -2245,10 +2281,10 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
         }
       } else {
         if (BI->getSuccessor(0) == BB) {
-          BranchInst::Create(BI->getSuccessor(1), BI);
+          Builder.CreateBr(BI->getSuccessor(1));
           EraseTerminatorInstAndDCECond(BI);
         } else if (BI->getSuccessor(1) == BB) {
-          BranchInst::Create(BI->getSuccessor(0), BI);
+          Builder.CreateBr(BI->getSuccessor(0));
           EraseTerminatorInstAndDCECond(BI);
           Changed = true;
         }
@@ -2312,14 +2348,15 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
       if (II->getUnwindDest() == BB) {
         // Convert the invoke to a call instruction.  This would be a good
         // place to note that the call does not throw though.
-        BranchInst *BI = BranchInst::Create(II->getNormalDest(), II);
+        BranchInst *BI = Builder.CreateBr(II->getNormalDest());
         II->removeFromParent();   // Take out of symbol table
         
         // Insert the call now...
         SmallVector<Value*, 8> Args(II->op_begin(), II->op_end()-3);
-        CallInst *CI = CallInst::Create(II->getCalledValue(),
-                                        Args.begin(), Args.end(),
-                                        II->getName(), BI);
+        Builder.SetInsertPoint(BI);
+        CallInst *CI = Builder.CreateCall(II->getCalledValue(),
+                                          Args.begin(), Args.end(),
+                                          II->getName());
         CI->setCallingConv(II->getCallingConv());
         CI->setAttributes(II->getAttributes());
         // If the invoke produced a value, the call does now instead.
@@ -2343,7 +2380,7 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
 
 /// TurnSwitchRangeIntoICmp - Turns a switch with that contains only a
 /// integer range comparison into a sub, an icmp and a branch.
-static bool TurnSwitchRangeIntoICmp(SwitchInst *SI) {
+static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) {
   assert(SI->getNumCases() > 2 && "Degenerate switch?");
 
   // Make sure all cases point to the same destination and gather the values.
@@ -2368,9 +2405,9 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI) {
 
   Value *Sub = SI->getCondition();
   if (!Offset->isNullValue())
-    Sub = BinaryOperator::CreateAdd(Sub, Offset, Sub->getName()+".off", SI);
-  Value *Cmp = new ICmpInst(SI, ICmpInst::ICMP_ULT, Sub, NumCases, "switch");
-  BranchInst::Create(SI->getSuccessor(1), SI->getDefaultDest(), Cmp, SI);
+    Sub = Builder.CreateAdd(Sub, Offset, Sub->getName()+".off");
+  Value *Cmp = Builder.CreateICmpULT(Sub, NumCases, "switch");
+  Builder.CreateCondBr(Cmp, SI->getSuccessor(1), SI->getDefaultDest());
 
   // Prune obsolete incoming values off the successor's PHI nodes.
   for (BasicBlock::iterator BBI = SI->getSuccessor(1)->begin();
@@ -2383,7 +2420,37 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI) {
   return true;
 }
 
-bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI) {
+/// EliminateDeadSwitchCases - Compute masked bits for the condition of a switch
+/// and use it to remove dead cases.
+static bool EliminateDeadSwitchCases(SwitchInst *SI) {
+  Value *Cond = SI->getCondition();
+  unsigned Bits = cast<IntegerType>(Cond->getType())->getBitWidth();
+  APInt KnownZero(Bits, 0), KnownOne(Bits, 0);
+  ComputeMaskedBits(Cond, APInt::getAllOnesValue(Bits), KnownZero, KnownOne);
+
+  // Gather dead cases.
+  SmallVector<ConstantInt*, 8> DeadCases;
+  for (unsigned I = 1, E = SI->getNumCases(); I != E; ++I) {
+    if ((SI->getCaseValue(I)->getValue() & KnownZero) != 0 ||
+        (SI->getCaseValue(I)->getValue() & KnownOne) != KnownOne) {
+      DeadCases.push_back(SI->getCaseValue(I));
+      DEBUG(dbgs() << "SimplifyCFG: switch case '"
+                   << SI->getCaseValue(I)->getValue() << "' is dead.\n");
+    }
+  }
+
+  // Remove dead cases from the switch.
+  for (unsigned I = 0, E = DeadCases.size(); I != E; ++I) {
+    unsigned Case = SI->findCaseValue(DeadCases[I]);
+    // Prune unused values from PHI nodes.
+    SI->getSuccessor(Case)->removePredecessor(SI->getParent());
+    SI->removeCase(Case);
+  }
+
+  return !DeadCases.empty();
+}
+
+bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
   // If this switch is too complex to want to look at, ignore it.
   if (!isValueEqualityComparison(SI))
     return false;
@@ -2393,7 +2460,7 @@ bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI) {
   // If we only have one predecessor, and if it is a branch on this value,
   // see if that predecessor totally determines the outcome of this switch.
   if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
-    if (SimplifyEqualityComparisonWithOnlyPredecessor(SI, OnlyPred))
+    if (SimplifyEqualityComparisonWithOnlyPredecessor(SI, OnlyPred, Builder))
       return SimplifyCFG(BB) | true;
 
   Value *Cond = SI->getCondition();
@@ -2408,13 +2475,17 @@ bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI) {
   while (isa<DbgInfoIntrinsic>(BBI))
     ++BBI;
   if (SI == &*BBI)
-    if (FoldValueComparisonIntoPredecessors(SI))
+    if (FoldValueComparisonIntoPredecessors(SI, Builder))
       return SimplifyCFG(BB) | true;
 
   // Try to transform the switch into an icmp and a branch.
-  if (TurnSwitchRangeIntoICmp(SI))
+  if (TurnSwitchRangeIntoICmp(SI, Builder))
     return SimplifyCFG(BB) | true;
-  
+
+  // Remove unreachable cases.
+  if (EliminateDeadSwitchCases(SI))
+    return SimplifyCFG(BB) | true;
+
   return false;
 }
 
@@ -2455,7 +2526,7 @@ bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) {
   return Changed;
 }
 
-bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI) {
+bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){
   BasicBlock *BB = BI->getParent();
   
   // If the Terminator is the only non-phi instruction, simplify the block.
@@ -2470,7 +2541,8 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI) {
     if (ICI->isEquality() && isa<ConstantInt>(ICI->getOperand(1))) {
       for (++I; isa<DbgInfoIntrinsic>(I); ++I)
         ;
-      if (I->isTerminator() && TryToSimplifyUncondBranchWithICmpInIt(ICI, TD))
+      if (I->isTerminator() 
+          && TryToSimplifyUncondBranchWithICmpInIt(ICI, TD, Builder))
         return true;
     }
   
@@ -2478,7 +2550,7 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI) {
 }
 
 
-bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI) {
+bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
   BasicBlock *BB = BI->getParent();
   
   // Conditional branch
@@ -2487,7 +2559,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI) {
     // see if that predecessor totally determines the outcome of this
     // switch.
     if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
-      if (SimplifyEqualityComparisonWithOnlyPredecessor(BI, OnlyPred))
+      if (SimplifyEqualityComparisonWithOnlyPredecessor(BI, OnlyPred, Builder))
         return SimplifyCFG(BB) | true;
     
     // This block must be empty, except for the setcond inst, if it exists.
@@ -2497,20 +2569,20 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI) {
     while (isa<DbgInfoIntrinsic>(I))
       ++I;
     if (&*I == BI) {
-      if (FoldValueComparisonIntoPredecessors(BI))
+      if (FoldValueComparisonIntoPredecessors(BI, Builder))
         return SimplifyCFG(BB) | true;
     } else if (&*I == cast<Instruction>(BI->getCondition())){
       ++I;
       // Ignore dbg intrinsics.
       while (isa<DbgInfoIntrinsic>(I))
         ++I;
-      if (&*I == BI && FoldValueComparisonIntoPredecessors(BI))
+      if (&*I == BI && FoldValueComparisonIntoPredecessors(BI, Builder))
         return SimplifyCFG(BB) | true;
     }
   }
   
   // Try to turn "br (X == 0 | X == 1), T, F" into a switch instruction.
-  if (SimplifyBranchOnICmpChain(BI, TD))
+  if (SimplifyBranchOnICmpChain(BI, TD, Builder))
     return true;
   
   // We have a conditional branch to two blocks that are only reachable
@@ -2581,7 +2653,7 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {
 
   // Check to see if we can constant propagate this terminator instruction
   // away...
-  Changed |= ConstantFoldTerminator(BB);
+  Changed |= ConstantFoldTerminator(BB, true);
 
   // Check for and eliminate duplicate PHI nodes in this block.
   Changed |= EliminateDuplicatePHINodes(BB);
@@ -2593,27 +2665,30 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {
   if (MergeBlockIntoPredecessor(BB))
     return true;
   
+  IRBuilder<> Builder(BB);
+
   // If there is a trivial two-entry PHI node in this basic block, and we can
   // eliminate it, do so now.
   if (PHINode *PN = dyn_cast<PHINode>(BB->begin()))
     if (PN->getNumIncomingValues() == 2)
       Changed |= FoldTwoEntryPHINode(PN, TD);
 
+  Builder.SetInsertPoint(BB->getTerminator());
   if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
     if (BI->isUnconditional()) {
-      if (SimplifyUncondBranch(BI)) return true;
+      if (SimplifyUncondBranch(BI, Builder)) return true;
     } else {
-      if (SimplifyCondBranch(BI)) return true;
+      if (SimplifyCondBranch(BI, Builder)) return true;
     }
   } else if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) {
-    if (SimplifyReturn(RI)) return true;
+    if (SimplifyReturn(RI, Builder)) return true;
   } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
-    if (SimplifySwitch(SI)) return true;
+    if (SimplifySwitch(SI, Builder)) return true;
   } else if (UnreachableInst *UI =
                dyn_cast<UnreachableInst>(BB->getTerminator())) {
     if (SimplifyUnreachable(UI)) return true;
   } else if (UnwindInst *UI = dyn_cast<UnwindInst>(BB->getTerminator())) {
-    if (SimplifyUnwind(UI)) return true;
+    if (SimplifyUnwind(UI, Builder)) return true;
   } else if (IndirectBrInst *IBI =
                dyn_cast<IndirectBrInst>(BB->getTerminator())) {
     if (SimplifyIndirectBr(IBI)) return true;
diff --git a/lib/VMCore/Attributes.cpp b/lib/VMCore/Attributes.cpp
index 92152a3..ee257db 100644
--- a/lib/VMCore/Attributes.cpp
+++ b/lib/VMCore/Attributes.cpp
@@ -36,6 +36,8 @@ std::string Attribute::getAsString(Attributes Attrs) {
     Result += "noreturn ";
   if (Attrs & Attribute::NoUnwind)
     Result += "nounwind ";
+  if (Attrs & Attribute::UWTable)
+    Result += "uwtable ";
   if (Attrs & Attribute::InReg)
     Result += "inreg ";
   if (Attrs & Attribute::NoAlias)
diff --git a/lib/VMCore/AutoUpgrade.cpp b/lib/VMCore/AutoUpgrade.cpp
index 4541f38..f8f15ca 100644
--- a/lib/VMCore/AutoUpgrade.cpp
+++ b/lib/VMCore/AutoUpgrade.cpp
@@ -285,7 +285,33 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
     }
 
     break;
-  case 'x': 
+  case 'x':
+    // This fixes the poorly named crc32 intrinsics
+    if (Name.compare(5, 13, "x86.sse42.crc", 13) == 0) {
+      const char* NewFnName = NULL;
+      if (Name.compare(18, 2, "32", 2) == 0) {
+        if (Name.compare(20, 2, ".8") == 0 && Name.length() == 22) {
+          NewFnName = "llvm.x86.sse42.crc32.32.8";
+        } else if (Name.compare(20, 3, ".16") == 0 && Name.length() == 23) {
+          NewFnName = "llvm.x86.sse42.crc32.32.16";
+        } else if (Name.compare(20, 3, ".32") == 0 && Name.length() == 23) {
+          NewFnName = "llvm.x86.sse42.crc32.32.32";
+        }
+      }
+      else if (Name.compare(18, 2, "64", 2) == 0) {
+        if (Name.compare(20, 2, ".8") == 0 && Name.length() == 22) {
+          NewFnName = "llvm.x86.sse42.crc32.64.8";
+        } else if (Name.compare(20, 3, ".64") == 0 && Name.length() == 23) {
+          NewFnName = "llvm.x86.sse42.crc32.64.64";
+        }
+      }
+      if (NewFnName) {
+        F->setName(NewFnName);
+        NewFn = F;
+        return true;
+      }
+    }
+
     // This fixes all MMX shift intrinsic instructions to take a
     // x86_mmx instead of a v1i64, v2i32, v4i16, or v8i8.
     if (Name.compare(5, 8, "x86.mmx.", 8) == 0) {
@@ -533,6 +559,13 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
       // Calls to these instructions are transformed into unaligned loads.
       NewFn = 0;
       return true;
+    } else if (Name.compare(5, 16, "x86.sse.movnt.ps", 16) == 0 ||
+               Name.compare(5, 17, "x86.sse2.movnt.dq", 17) == 0 ||
+               Name.compare(5, 17, "x86.sse2.movnt.pd", 17) == 0 ||
+               Name.compare(5, 17, "x86.sse2.movnt.i", 16) == 0) {
+      // Calls to these instructions are transformed into nontemporal stores.
+      NewFn = 0;
+      return true;
     } else if (Name.compare(5, 17, "x86.ssse3.pshuf.w", 17) == 0) {
       // This is an SSE/MMX instruction.
       const Type *X86_MMXTy = VectorType::getX86_MMXTy(FTy->getContext());
@@ -975,6 +1008,31 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
 
       // Remove intrinsic.
       CI->eraseFromParent();
+    } else if (F->getName() == "llvm.x86.sse.movnt.ps" ||
+               F->getName() == "llvm.x86.sse2.movnt.dq" ||
+               F->getName() == "llvm.x86.sse2.movnt.pd" ||
+               F->getName() == "llvm.x86.sse2.movnt.i") {
+      IRBuilder<> Builder(C);
+      Builder.SetInsertPoint(CI->getParent(), CI);
+
+      Module *M = F->getParent();
+      SmallVector<Value *, 1> Elts;
+      Elts.push_back(ConstantInt::get(Type::getInt32Ty(C), 1));
+      MDNode *Node = MDNode::get(C, Elts);
+
+      Value *Arg0 = CI->getArgOperand(0);
+      Value *Arg1 = CI->getArgOperand(1);
+
+      // Convert the type of the pointer to a pointer to the stored type.
+      Value *BC = Builder.CreateBitCast(Arg0,
+                                        PointerType::getUnqual(Arg1->getType()),
+                                        "cast");
+      StoreInst *SI = Builder.CreateStore(Arg1, BC);
+      SI->setMetadata(M->getMDKindID("nontemporal"), Node);
+      SI->setAlignment(16);
+
+      // Remove intrinsic.
+      CI->eraseFromParent();
     } else {
       llvm_unreachable("Unknown function for CallInst upgrade.");
     }
diff --git a/lib/VMCore/DebugInfoProbe.cpp b/lib/VMCore/DebugInfoProbe.cpp
index 334c3d8..d1275ff 100644
--- a/lib/VMCore/DebugInfoProbe.cpp
+++ b/lib/VMCore/DebugInfoProbe.cpp
@@ -51,7 +51,6 @@ namespace llvm {
     unsigned NumDbgLineLost, NumDbgValueLost;
     std::string PassName;
     Function *TheFn;
-    std::set<unsigned> LineNos;
     std::set<MDNode *> DbgVariables;
     std::set<Instruction *> MissingDebugLoc;
   };
@@ -60,37 +59,19 @@ namespace llvm {
 //===----------------------------------------------------------------------===//
 // DebugInfoProbeImpl
 
-static void collect(Function &F, std::set<unsigned> &Lines) {
-  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI)
-    for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); 
-         BI != BE; ++BI) {
-      const DebugLoc &DL = BI->getDebugLoc();
-      unsigned LineNo = 0;
-      if (!DL.isUnknown()) {
-        if (MDNode *N = DL.getInlinedAt(F.getContext()))
-          LineNo = DebugLoc::getFromDILocation(N).getLine();
-        else
-          LineNo = DL.getLine();
-
-        Lines.insert(LineNo);
-      }
-    }
-}
-
 /// initialize - Collect information before running an optimization pass.
 void DebugInfoProbeImpl::initialize(StringRef PName, Function &F) {
   if (!EnableDebugInfoProbe) return;
   PassName = PName;
 
-  LineNos.clear();
   DbgVariables.clear();
+  MissingDebugLoc.clear();
   TheFn = &F;
-  collect(F, LineNos);
 
   for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI)
     for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); 
          BI != BE; ++BI) {
-      if (BI->getDebugLoc().isUnknown())
+      if (!isa<PHINode>(BI) && BI->getDebugLoc().isUnknown())
         MissingDebugLoc.insert(BI);
       if (!isa<DbgInfoIntrinsic>(BI)) continue;
       Value *Addr = NULL;
@@ -130,30 +111,16 @@ void DebugInfoProbeImpl::report() {
 /// must be used after initialization.
 void DebugInfoProbeImpl::finalize(Function &F) {
   if (!EnableDebugInfoProbe) return;
-  std::set<unsigned> LineNos2;
-  collect(F, LineNos2);
   assert (TheFn == &F && "Invalid function to measure!");
 
-  for (std::set<unsigned>::iterator I = LineNos.begin(),
-         E = LineNos.end(); I != E; ++I) {
-    unsigned LineNo = *I;
-    if (LineNos2.count(LineNo) == 0) {
-      DEBUG(dbgs() 
-            << "DebugInfoProbe("
-            << PassName
-            << "): Losing dbg info for source line " 
-            << LineNo << "\n");
-      ++NumDbgLineLost;
-    }
-  }
-
   std::set<MDNode *>DbgVariables2;
   for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI)
     for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); 
          BI != BE; ++BI) {
-      if (BI->getDebugLoc().isUnknown() &&
+      if (!isa<PHINode>(BI) && BI->getDebugLoc().isUnknown() &&
           MissingDebugLoc.count(BI) == 0) {
-        DEBUG(dbgs() << "DebugInfoProbe(" << PassName << "): --- ");
+        ++NumDbgLineLost;
+        DEBUG(dbgs() << "DebugInfoProbe (" << PassName << "): --- ");
         DEBUG(BI->print(dbgs()));
         DEBUG(dbgs() << "\n");
       }
diff --git a/lib/VMCore/Function.cpp b/lib/VMCore/Function.cpp
index 013c458..0ae0bdb 100644
--- a/lib/VMCore/Function.cpp
+++ b/lib/VMCore/Function.cpp
@@ -24,6 +24,7 @@
 #include "llvm/Support/Threading.h"
 #include "SymbolTableListTraitsImpl.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
 using namespace llvm;
 
@@ -78,6 +79,12 @@ bool Argument::hasByValAttr() const {
   return getParent()->paramHasAttr(getArgNo()+1, Attribute::ByVal);
 }
 
+unsigned Argument::getParamAlignment() const {
+  assert(getType()->isPointerTy() && "Only pointers have alignments");
+  return getParent()->getParamAlignment(getArgNo()+1);
+  
+}
+
 /// hasNestAttr - Return true if this argument has the nest attribute on
 /// it in its containing function.
 bool Argument::hasNestAttr() const {
@@ -406,4 +413,36 @@ bool Function::hasAddressTaken(const User* *PutOffender) const {
   return false;
 }
 
+/// callsFunctionThatReturnsTwice - Return true if the function has a call to
+/// setjmp or other function that gcc recognizes as "returning twice".
+///
+/// FIXME: Remove after <rdar://problem/8031714> is fixed.
+/// FIXME: Is the obove FIXME valid?
+bool Function::callsFunctionThatReturnsTwice() const {
+  const Module *M = this->getParent();
+  static const char *ReturnsTwiceFns[] = {
+    "_setjmp",
+    "setjmp",
+    "sigsetjmp",
+    "setjmp_syscall",
+    "savectx",
+    "qsetjmp",
+    "vfork",
+    "getcontext"
+  };
+
+  for (unsigned I = 0; I < array_lengthof(ReturnsTwiceFns); ++I)
+    if (const Function *Callee = M->getFunction(ReturnsTwiceFns[I])) {
+      if (!Callee->use_empty())
+        for (Value::const_use_iterator
+               I = Callee->use_begin(), E = Callee->use_end();
+             I != E; ++I)
+          if (const CallInst *CI = dyn_cast<CallInst>(*I))
+            if (CI->getParent()->getParent() == this)
+              return true;
+    }
+
+  return false;
+}
+
 // vim: sw=2 ai
diff --git a/lib/VMCore/IRBuilder.cpp b/lib/VMCore/IRBuilder.cpp
index 2149155..f2d469a 100644
--- a/lib/VMCore/IRBuilder.cpp
+++ b/lib/VMCore/IRBuilder.cpp
@@ -60,7 +60,6 @@ static CallInst *createCallHelper(Value *Callee, Value *const* Ops,
   return CI;  
 }
 
-
 CallInst *IRBuilderBase::
 CreateMemSet(Value *Ptr, Value *Val, Value *Size, unsigned Align,
              bool isVolatile, MDNode *TBAATag) {
@@ -118,3 +117,33 @@ CreateMemMove(Value *Dst, Value *Src, Value *Size, unsigned Align,
   
   return CI;  
 }
+
+CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr, ConstantInt *Size) {
+  assert(isa<PointerType>(Ptr->getType()) &&
+	 "lifetime.start only applies to pointers.");
+  Ptr = getCastedInt8PtrValue(Ptr);
+  if (!Size)
+    Size = getInt64(-1);
+  else
+    assert(Size->getType() == getInt64Ty() &&
+	   "lifetime.start requires the size to be an i64");
+  Value *Ops[] = { Size, Ptr };
+  Module *M = BB->getParent()->getParent();
+  Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::lifetime_start);
+  return createCallHelper(TheFn, Ops, 2, this);
+}
+
+CallInst *IRBuilderBase::CreateLifetimeEnd(Value *Ptr, ConstantInt *Size) {
+  assert(isa<PointerType>(Ptr->getType()) &&
+	 "lifetime.end only applies to pointers.");
+  Ptr = getCastedInt8PtrValue(Ptr);
+  if (!Size)
+    Size = getInt64(-1);
+  else
+    assert(Size->getType() == getInt64Ty() &&
+	   "lifetime.end requires the size to be an i64");
+  Value *Ops[] = { Size, Ptr };
+  Module *M = BB->getParent()->getParent();
+  Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::lifetime_end);
+  return createCallHelper(TheFn, Ops, 2, this);
+}
diff --git a/lib/VMCore/InlineAsm.cpp b/lib/VMCore/InlineAsm.cpp
index e4f99f0..bd3667d 100644
--- a/lib/VMCore/InlineAsm.cpp
+++ b/lib/VMCore/InlineAsm.cpp
@@ -181,6 +181,11 @@ bool InlineAsm::ConstraintInfo::Parse(StringRef Str,
       multipleAlternativeIndex++;
       pCodes = &multipleAlternatives[multipleAlternativeIndex].Codes;
       ++I;
+    } else if (*I == '^') {
+      // Multi-letter constraint
+      // FIXME: For now assuming these are 2-character constraints.
+      pCodes->push_back(std::string(I+1, I+3));
+      I += 3;
     } else {
       // Single letter constraint.
       pCodes->push_back(std::string(I, I+1));
diff --git a/lib/VMCore/Instructions.cpp b/lib/VMCore/Instructions.cpp
index 61da9b6..8f4eabe 100644
--- a/lib/VMCore/Instructions.cpp
+++ b/lib/VMCore/Instructions.cpp
@@ -2076,6 +2076,7 @@ unsigned CastInst::isEliminableCastPair(
 
 CastInst *CastInst::Create(Instruction::CastOps op, Value *S, const Type *Ty, 
   const Twine &Name, Instruction *InsertBefore) {
+  assert(castIsValid(op, S, Ty) && "Invalid cast!");
   // Construct and return the appropriate CastInst subclass
   switch (op) {
     case Trunc:    return new TruncInst    (S, Ty, Name, InsertBefore);
@@ -2098,6 +2099,7 @@ CastInst *CastInst::Create(Instruction::CastOps op, Value *S, const Type *Ty,
 
 CastInst *CastInst::Create(Instruction::CastOps op, Value *S, const Type *Ty,
   const Twine &Name, BasicBlock *InsertAtEnd) {
+  assert(castIsValid(op, S, Ty) && "Invalid cast!");
   // Construct and return the appropriate CastInst subclass
   switch (op) {
     case Trunc:    return new TruncInst    (S, Ty, Name, InsertAtEnd);
@@ -2254,60 +2256,56 @@ bool CastInst::isCastable(const Type *SrcTy, const Type *DestTy) {
   if (SrcTy == DestTy)
     return true;
 
+  if (const VectorType *SrcVecTy = dyn_cast<VectorType>(SrcTy))
+    if (const VectorType *DestVecTy = dyn_cast<VectorType>(DestTy))
+      if (SrcVecTy->getNumElements() == DestVecTy->getNumElements()) {
+        // An element by element cast.  Valid if casting the elements is valid.
+        SrcTy = SrcVecTy->getElementType();
+        DestTy = DestVecTy->getElementType();
+      }
+
   // Get the bit sizes, we'll need these
-  unsigned SrcBits = SrcTy->getScalarSizeInBits();   // 0 for ptr
-  unsigned DestBits = DestTy->getScalarSizeInBits(); // 0 for ptr
+  unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();   // 0 for ptr
+  unsigned DestBits = DestTy->getPrimitiveSizeInBits(); // 0 for ptr
 
   // Run through the possibilities ...
-  if (DestTy->isIntegerTy()) {                   // Casting to integral
-    if (SrcTy->isIntegerTy()) {                  // Casting from integral
+  if (DestTy->isIntegerTy()) {               // Casting to integral
+    if (SrcTy->isIntegerTy()) {                // Casting from integral
         return true;
-    } else if (SrcTy->isFloatingPointTy()) {     // Casting from floating pt
+    } else if (SrcTy->isFloatingPointTy()) {   // Casting from floating pt
       return true;
-    } else if (const VectorType *PTy = dyn_cast<VectorType>(SrcTy)) {
-                                               // Casting from vector
-      return DestBits == PTy->getBitWidth();
+    } else if (SrcTy->isVectorTy()) {          // Casting from vector
+      return DestBits == SrcBits;
     } else {                                   // Casting from something else
       return SrcTy->isPointerTy();
     }
-  } else if (DestTy->isFloatingPointTy()) {      // Casting to floating pt
-    if (SrcTy->isIntegerTy()) {                  // Casting from integral
+  } else if (DestTy->isFloatingPointTy()) {  // Casting to floating pt
+    if (SrcTy->isIntegerTy()) {                // Casting from integral
       return true;
-    } else if (SrcTy->isFloatingPointTy()) {     // Casting from floating pt
+    } else if (SrcTy->isFloatingPointTy()) {   // Casting from floating pt
       return true;
-    } else if (const VectorType *PTy = dyn_cast<VectorType>(SrcTy)) {
-                                               // Casting from vector
-      return DestBits == PTy->getBitWidth();
+    } else if (SrcTy->isVectorTy()) {          // Casting from vector
+      return DestBits == SrcBits;
     } else {                                   // Casting from something else
       return false;
     }
-  } else if (const VectorType *DestPTy = dyn_cast<VectorType>(DestTy)) {
-                                                // Casting to vector
-    if (const VectorType *SrcPTy = dyn_cast<VectorType>(SrcTy)) {
-                                                // Casting from vector
-      return DestPTy->getBitWidth() == SrcPTy->getBitWidth();
-    } else if (DestPTy->getBitWidth() == SrcBits) {
-      return true;                              // float/int -> vector
-    } else if (SrcTy->isX86_MMXTy()) {
-      return DestPTy->getBitWidth() == 64;      // MMX to 64-bit vector
-    } else {
-      return false;
-    }
+  } else if (DestTy->isVectorTy()) {         // Casting to vector
+    return DestBits == SrcBits;
   } else if (DestTy->isPointerTy()) {        // Casting to pointer
-    if (SrcTy->isPointerTy()) {              // Casting from pointer
+    if (SrcTy->isPointerTy()) {                // Casting from pointer
       return true;
-    } else if (SrcTy->isIntegerTy()) {            // Casting from integral
+    } else if (SrcTy->isIntegerTy()) {         // Casting from integral
       return true;
-    } else {                                    // Casting from something else
+    } else {                                   // Casting from something else
       return false;
     }
   } else if (DestTy->isX86_MMXTy()) {
-    if (const VectorType *SrcPTy = dyn_cast<VectorType>(SrcTy)) {
-      return SrcPTy->getBitWidth() == 64;       // 64-bit vector to MMX
+    if (SrcTy->isVectorTy()) {
+      return DestBits == SrcBits;       // 64-bit vector to MMX
     } else {
       return false;
     }
-  } else {                                      // Casting to something else
+  } else {                                   // Casting to something else
     return false;
   }
 }
@@ -2322,14 +2320,27 @@ bool CastInst::isCastable(const Type *SrcTy, const Type *DestTy) {
 Instruction::CastOps
 CastInst::getCastOpcode(
   const Value *Src, bool SrcIsSigned, const Type *DestTy, bool DestIsSigned) {
-  // Get the bit sizes, we'll need these
   const Type *SrcTy = Src->getType();
-  unsigned SrcBits = SrcTy->getScalarSizeInBits();   // 0 for ptr
-  unsigned DestBits = DestTy->getScalarSizeInBits(); // 0 for ptr
 
   assert(SrcTy->isFirstClassType() && DestTy->isFirstClassType() &&
          "Only first class types are castable!");
 
+  if (SrcTy == DestTy)
+    return BitCast;
+
+  if (const VectorType *SrcVecTy = dyn_cast<VectorType>(SrcTy))
+    if (const VectorType *DestVecTy = dyn_cast<VectorType>(DestTy))
+      if (SrcVecTy->getNumElements() == DestVecTy->getNumElements()) {
+        // An element by element cast.  Find the appropriate opcode based on the
+        // element types.
+        SrcTy = SrcVecTy->getElementType();
+        DestTy = DestVecTy->getElementType();
+      }
+
+  // Get the bit sizes, we'll need these
+  unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();   // 0 for ptr
+  unsigned DestBits = DestTy->getPrimitiveSizeInBits(); // 0 for ptr
+
   // Run through the possibilities ...
   if (DestTy->isIntegerTy()) {                      // Casting to integral
     if (SrcTy->isIntegerTy()) {                     // Casting from integral
@@ -2348,10 +2359,9 @@ CastInst::getCastOpcode(
         return FPToSI;                              // FP -> sint
       else
         return FPToUI;                              // FP -> uint 
-    } else if (const VectorType *PTy = dyn_cast<VectorType>(SrcTy)) {
-      assert(DestBits == PTy->getBitWidth() &&
-               "Casting vector to integer of different width");
-      PTy = NULL;
+    } else if (SrcTy->isVectorTy()) {
+      assert(DestBits == SrcBits &&
+             "Casting vector to integer of different width");
       return BitCast;                             // Same size, no-op cast
     } else {
       assert(SrcTy->isPointerTy() &&
@@ -2372,29 +2382,17 @@ CastInst::getCastOpcode(
       } else  {
         return BitCast;                             // same size, no-op cast
       }
-    } else if (const VectorType *PTy = dyn_cast<VectorType>(SrcTy)) {
-      assert(DestBits == PTy->getBitWidth() &&
+    } else if (SrcTy->isVectorTy()) {
+      assert(DestBits == SrcBits &&
              "Casting vector to floating point of different width");
-      PTy = NULL;
       return BitCast;                             // same size, no-op cast
     } else {
       llvm_unreachable("Casting pointer or non-first class to float");
     }
-  } else if (const VectorType *DestPTy = dyn_cast<VectorType>(DestTy)) {
-    if (const VectorType *SrcPTy = dyn_cast<VectorType>(SrcTy)) {
-      assert(DestPTy->getBitWidth() == SrcPTy->getBitWidth() &&
-             "Casting vector to vector of different widths");
-      SrcPTy = NULL;
-      return BitCast;                             // vector -> vector
-    } else if (DestPTy->getBitWidth() == SrcBits) {
-      return BitCast;                               // float/int -> vector
-    } else if (SrcTy->isX86_MMXTy()) {
-      assert(DestPTy->getBitWidth()==64 &&
-             "Casting X86_MMX to vector of wrong width");
-      return BitCast;                             // MMX to 64-bit vector
-    } else {
-      assert(!"Illegal cast to vector (wrong type or size)");
-    }
+  } else if (DestTy->isVectorTy()) {
+    assert(DestBits == SrcBits &&
+           "Illegal cast to vector (wrong type or size)");
+    return BitCast;
   } else if (DestTy->isPointerTy()) {
     if (SrcTy->isPointerTy()) {
       return BitCast;                               // ptr -> ptr
@@ -2404,9 +2402,8 @@ CastInst::getCastOpcode(
       assert(!"Casting pointer to other than pointer or int");
     }
   } else if (DestTy->isX86_MMXTy()) {
-    if (isa<VectorType>(SrcTy)) {
-      assert(cast<VectorType>(SrcTy)->getBitWidth() == 64 &&
-             "Casting vector of wrong width to X86_MMX");
+    if (SrcTy->isVectorTy()) {
+      assert(DestBits == SrcBits && "Casting vector of wrong width to X86_MMX");
       return BitCast;                               // 64-bit vector to MMX
     } else {
       assert(!"Illegal cast to X86_MMX");
@@ -2442,46 +2439,40 @@ CastInst::castIsValid(Instruction::CastOps op, Value *S, const Type *DstTy) {
   unsigned SrcBitSize = SrcTy->getScalarSizeInBits();
   unsigned DstBitSize = DstTy->getScalarSizeInBits();
 
+  // If these are vector types, get the lengths of the vectors (using zero for
+  // scalar types means that checking that vector lengths match also checks that
+  // scalars are not being converted to vectors or vectors to scalars).
+  unsigned SrcLength = SrcTy->isVectorTy() ?
+    cast<VectorType>(SrcTy)->getNumElements() : 0;
+  unsigned DstLength = DstTy->isVectorTy() ?
+    cast<VectorType>(DstTy)->getNumElements() : 0;
+
   // Switch on the opcode provided
   switch (op) {
   default: return false; // This is an input error
   case Instruction::Trunc:
-    return SrcTy->isIntOrIntVectorTy() &&
-           DstTy->isIntOrIntVectorTy()&& SrcBitSize > DstBitSize;
+    return SrcTy->isIntOrIntVectorTy() && DstTy->isIntOrIntVectorTy() &&
+      SrcLength == DstLength && SrcBitSize > DstBitSize;
   case Instruction::ZExt:
-    return SrcTy->isIntOrIntVectorTy() &&
-           DstTy->isIntOrIntVectorTy()&& SrcBitSize < DstBitSize;
+    return SrcTy->isIntOrIntVectorTy() && DstTy->isIntOrIntVectorTy() &&
+      SrcLength == DstLength && SrcBitSize < DstBitSize;
   case Instruction::SExt: 
-    return SrcTy->isIntOrIntVectorTy() &&
-           DstTy->isIntOrIntVectorTy()&& SrcBitSize < DstBitSize;
+    return SrcTy->isIntOrIntVectorTy() && DstTy->isIntOrIntVectorTy() &&
+      SrcLength == DstLength && SrcBitSize < DstBitSize;
   case Instruction::FPTrunc:
-    return SrcTy->isFPOrFPVectorTy() &&
-           DstTy->isFPOrFPVectorTy() && 
-           SrcBitSize > DstBitSize;
+    return SrcTy->isFPOrFPVectorTy() && DstTy->isFPOrFPVectorTy() &&
+      SrcLength == DstLength && SrcBitSize > DstBitSize;
   case Instruction::FPExt:
-    return SrcTy->isFPOrFPVectorTy() &&
-           DstTy->isFPOrFPVectorTy() && 
-           SrcBitSize < DstBitSize;
+    return SrcTy->isFPOrFPVectorTy() && DstTy->isFPOrFPVectorTy() &&
+      SrcLength == DstLength && SrcBitSize < DstBitSize;
   case Instruction::UIToFP:
   case Instruction::SIToFP:
-    if (const VectorType *SVTy = dyn_cast<VectorType>(SrcTy)) {
-      if (const VectorType *DVTy = dyn_cast<VectorType>(DstTy)) {
-        return SVTy->getElementType()->isIntOrIntVectorTy() &&
-               DVTy->getElementType()->isFPOrFPVectorTy() &&
-               SVTy->getNumElements() == DVTy->getNumElements();
-      }
-    }
-    return SrcTy->isIntOrIntVectorTy() && DstTy->isFPOrFPVectorTy();
+    return SrcTy->isIntOrIntVectorTy() && DstTy->isFPOrFPVectorTy() &&
+      SrcLength == DstLength;
   case Instruction::FPToUI:
   case Instruction::FPToSI:
-    if (const VectorType *SVTy = dyn_cast<VectorType>(SrcTy)) {
-      if (const VectorType *DVTy = dyn_cast<VectorType>(DstTy)) {
-        return SVTy->getElementType()->isFPOrFPVectorTy() &&
-               DVTy->getElementType()->isIntOrIntVectorTy() &&
-               SVTy->getNumElements() == DVTy->getNumElements();
-      }
-    }
-    return SrcTy->isFPOrFPVectorTy() && DstTy->isIntOrIntVectorTy();
+    return SrcTy->isFPOrFPVectorTy() && DstTy->isIntOrIntVectorTy() &&
+      SrcLength == DstLength;
   case Instruction::PtrToInt:
     return SrcTy->isPointerTy() && DstTy->isIntegerTy();
   case Instruction::IntToPtr:
diff --git a/lib/VMCore/PassManager.cpp b/lib/VMCore/PassManager.cpp
index 637fa79..5cf2905 100644
--- a/lib/VMCore/PassManager.cpp
+++ b/lib/VMCore/PassManager.cpp
@@ -449,9 +449,9 @@ namespace {
 static DebugInfoProbeInfo *TheDebugProbe;
 static void createDebugInfoProbe() {
   if (TheDebugProbe) return;
-      
-  // Constructed the first time this is called. This guarantees that the 
-  // object will be constructed, if -enable-debug-info-probe is set, 
+
+  // Constructed the first time this is called. This guarantees that the
+  // object will be constructed, if -enable-debug-info-probe is set,
   // before static globals, thus it will be destroyed before them.
   static ManagedStatic<DebugInfoProbeInfo> DIP;
   TheDebugProbe = &*DIP;
@@ -632,6 +632,7 @@ void PMTopLevelManager::schedulePass(Pass *P) {
       Pass *AnalysisPass = findAnalysisPass(*I);
       if (!AnalysisPass) {
         const PassInfo *PI = PassRegistry::getPassRegistry()->getPassInfo(*I);
+        assert(PI && "Expected required passes to be initialized");
         AnalysisPass = PI->createPass();
         if (P->getPotentialPassManagerType () ==
             AnalysisPass->getPotentialPassManagerType())
@@ -686,6 +687,7 @@ Pass *PMTopLevelManager::findAnalysisPass(AnalysisID AID) {
     // If Pass not found then check the interfaces implemented by Immutable Pass
     const PassInfo *PassInf =
       PassRegistry::getPassRegistry()->getPassInfo(PI);
+    assert(PassInf && "Expected all immutable passes to be initialized");
     const std::vector<const PassInfo*> &ImmPI =
       PassInf->getInterfacesImplemented();
     for (std::vector<const PassInfo*>::const_iterator II = ImmPI.begin(),
@@ -727,9 +729,11 @@ void PMTopLevelManager::dumpArguments() const {
   for (SmallVector<ImmutablePass *, 8>::const_iterator I =
        ImmutablePasses.begin(), E = ImmutablePasses.end(); I != E; ++I)
     if (const PassInfo *PI =
-          PassRegistry::getPassRegistry()->getPassInfo((*I)->getPassID()))
+        PassRegistry::getPassRegistry()->getPassInfo((*I)->getPassID())) {
+      assert(PI && "Expected all immutable passes to be initialized");
       if (!PI->isAnalysisGroup())
         dbgs() << " -" << PI->getPassArgument();
+    }
   for (SmallVector<PMDataManager *, 8>::const_iterator I = PassManagers.begin(),
          E = PassManagers.end(); I != E; ++I)
     (*I)->dumpPassArguments();
@@ -1183,6 +1187,12 @@ void PMDataManager::dumpAnalysisUsage(StringRef Msg, const Pass *P,
   for (unsigned i = 0; i != Set.size(); ++i) {
     if (i) dbgs() << ',';
     const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(Set[i]);
+    if (!PInf) {
+      // Some preserved passes, such as AliasAnalysis, may not be initialized by
+      // all drivers.
+      dbgs() << " Uninitialized Pass";
+      continue;
+    }
     dbgs() << ' ' << PInf->getPassName();
   }
   dbgs() << '\n';
diff --git a/lib/VMCore/Type.cpp b/lib/VMCore/Type.cpp
index b15304c..e4496db 100644
--- a/lib/VMCore/Type.cpp
+++ b/lib/VMCore/Type.cpp
@@ -197,6 +197,25 @@ bool Type::canLosslesslyBitCastTo(const Type *Ty) const {
   return false;  // Other types have no identity values
 }
 
+bool Type::isEmptyTy() const {
+  const ArrayType *ATy = dyn_cast<ArrayType>(this);
+  if (ATy) {
+    unsigned NumElements = ATy->getNumElements();
+    return NumElements == 0 || ATy->getElementType()->isEmptyTy();
+  }
+
+  const StructType *STy = dyn_cast<StructType>(this);
+  if (STy) {
+    unsigned NumElements = STy->getNumElements();
+    for (unsigned i = 0; i < NumElements; ++i)
+      if (!STy->getElementType(i)->isEmptyTy())
+        return false;
+    return true;
+  }
+
+  return false;
+}
+
 unsigned Type::getPrimitiveSizeInBits() const {
   switch (getTypeID()) {
   case Type::FloatTyID: return 32;
diff --git a/lib/VMCore/Verifier.cpp b/lib/VMCore/Verifier.cpp
index 8b89110..139e035 100644
--- a/lib/VMCore/Verifier.cpp
+++ b/lib/VMCore/Verifier.cpp
@@ -1645,6 +1645,9 @@ void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) {
     Assert1(isa<ConstantInt>(CI.getArgOperand(3)),
             "alignment argument of memory intrinsics must be a constant int",
             &CI);
+    Assert1(isa<ConstantInt>(CI.getArgOperand(4)),
+            "isvolatile argument of memory intrinsics must be a constant int",
+            &CI);
     break;
   case Intrinsic::gcroot:
   case Intrinsic::gcwrite:
diff --git a/runtime/libprofile/BasicBlockTracing.c b/runtime/libprofile/BasicBlockTracing.c
index dbe81e3..0815e2e 100644
--- a/runtime/libprofile/BasicBlockTracing.c
+++ b/runtime/libprofile/BasicBlockTracing.c
@@ -30,7 +30,7 @@ static void WriteAndFlushBBTraceData () {
 /* BBTraceAtExitHandler - When the program exits, just write out any remaining 
  * data and free the trace buffer.
  */
-static void BBTraceAtExitHandler() {
+static void BBTraceAtExitHandler(void) {
   WriteAndFlushBBTraceData ();
   free (ArrayStart);
 }
diff --git a/runtime/libprofile/EdgeProfiling.c b/runtime/libprofile/EdgeProfiling.c
index 4a68a08..f19e188 100644
--- a/runtime/libprofile/EdgeProfiling.c
+++ b/runtime/libprofile/EdgeProfiling.c
@@ -22,7 +22,7 @@ static unsigned NumElements;
 /* EdgeProfAtExitHandler - When the program exits, just write out the profiling
  * data.
  */
-static void EdgeProfAtExitHandler() {
+static void EdgeProfAtExitHandler(void) {
   /* Note that if this were doing something more intelligent with the
    * instrumentation, we could do some computation here to expand what we
    * collected into simple edge profiles.  Since we directly count each edge, we
diff --git a/runtime/libprofile/GCDAProfiling.c b/runtime/libprofile/GCDAProfiling.c
index 2dcf22d..09a1aec 100644
--- a/runtime/libprofile/GCDAProfiling.c
+++ b/runtime/libprofile/GCDAProfiling.c
@@ -24,6 +24,11 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#ifdef _MSC_VER
+#include <direct.h>
+#endif
 
 /* #define DEBUG_GCDAPROFILING */
 
@@ -46,6 +51,17 @@ static void write_int64(uint64_t i) {
   write_int32(hi);
 }
 
+static uint32_t length_of_string(const char *s) {
+  return (strlen(s) / 4) + 1;
+}
+
+static void write_string(const char *s) {
+  uint32_t len = length_of_string(s);
+  write_int32(len);
+  fwrite(s, strlen(s), 1, output_file);
+  fwrite("\0\0\0\0", 4 - (strlen(s) % 4), 1, output_file);
+}
+
 static char *mangle_filename(const char *orig_filename) {
   /* TODO: handle GCOV_PREFIX_STRIP */
   const char *prefix;
@@ -54,7 +70,7 @@ static char *mangle_filename(const char *orig_filename) {
   prefix = getenv("GCOV_PREFIX");
 
   if (!prefix)
-    return strdup(filename);
+    return strdup(orig_filename);
 
   filename = malloc(strlen(prefix) + 1 + strlen(orig_filename) + 1);
   strcpy(filename, prefix);
@@ -64,6 +80,25 @@ static char *mangle_filename(const char *orig_filename) {
   return filename;
 }
 
+static void recursive_mkdir(const char *filename) {
+  char *pathname;
+  int i, e;
+
+  for (i = 1, e = strlen(filename); i != e; ++i) {
+    if (filename[i] == '/') {
+      pathname = malloc(i + 1);
+      strncpy(pathname, filename, i);
+      pathname[i] = '\0';
+#ifdef _MSC_VER
+      _mkdir(pathname);
+#else
+      mkdir(pathname, 0750);  /* some of these will fail, ignore it. */
+#endif
+      free(pathname);
+    }
+  }
+}
+
 /*
  * --- LLVM line counter API ---
  */
@@ -75,6 +110,7 @@ static char *mangle_filename(const char *orig_filename) {
 void llvm_gcda_start_file(const char *orig_filename) {
   char *filename;
   filename = mangle_filename(orig_filename);
+  recursive_mkdir(filename);
   output_file = fopen(filename, "wb");
 
   /* gcda file, version 404*, stamp LLVM. */
@@ -111,16 +147,18 @@ void llvm_gcda_increment_indirect_counter(uint32_t *predecessor,
 #endif
 }
 
-void llvm_gcda_emit_function(uint32_t ident) {
+void llvm_gcda_emit_function(uint32_t ident, const char *function_name) {
 #ifdef DEBUG_GCDAPROFILING
   printf("llvmgcda: function id=%x\n", ident);
 #endif
 
   /* function tag */  
   fwrite("\0\0\0\1", 4, 1, output_file);
-  write_int32(2);
+  write_int32(3 + 1 + length_of_string(function_name));
   write_int32(ident);
   write_int32(0);
+  write_int32(0);
+  write_string(function_name);
 }
 
 void llvm_gcda_emit_arcs(uint32_t num_counters, uint64_t *counters) {
diff --git a/runtime/libprofile/OptimalEdgeProfiling.c b/runtime/libprofile/OptimalEdgeProfiling.c
index 947da53..3a7631b 100644
--- a/runtime/libprofile/OptimalEdgeProfiling.c
+++ b/runtime/libprofile/OptimalEdgeProfiling.c
@@ -22,7 +22,7 @@ static unsigned NumElements;
 /* OptEdgeProfAtExitHandler - When the program exits, just write out the
  * profiling data.
  */
-static void OptEdgeProfAtExitHandler() {
+static void OptEdgeProfAtExitHandler(void) {
   /* Note that, although the array has a counter for each edge, not all
    * counters are updated, the ones that are not used are initialised with -1.
    * When loading this information the counters with value -1 have to be
diff --git a/test/Analysis/BasicAA/2010-09-15-GEP-SignedArithmetic.ll b/test/Analysis/BasicAA/2010-09-15-GEP-SignedArithmetic.ll
index 2b0cd78..7b5584e 100644
--- a/test/Analysis/BasicAA/2010-09-15-GEP-SignedArithmetic.ll
+++ b/test/Analysis/BasicAA/2010-09-15-GEP-SignedArithmetic.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output |& grep {1 may alias}
+; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output |& grep {1 partial alias}
 ; PR7959
 
 target datalayout = "e-p:32:32:32"
diff --git a/test/Analysis/BasicAA/dag.ll b/test/Analysis/BasicAA/dag.ll
new file mode 100644
index 0000000..501f4c3
--- /dev/null
+++ b/test/Analysis/BasicAA/dag.ll
@@ -0,0 +1,41 @@
+; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info |& FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+; BasicAA's guard against use-def cycles shouldn't prevent it from
+; analyzing use-def dags.
+
+; CHECK: MustAlias:  i8* %base, i8* %phi
+; CHECK: MustAlias: i8* %phi, i8* %wwa
+; CHECK: MustAlias: i8* %phi, i8* %wwb
+; CHECK: MustAlias: i16* %bigbase, i8* %phi
+define i8 @foo(i8* %base, i1 %x, i1 %w) {
+entry:
+  br i1 %w, label %wa, label %wb
+wa:
+  %wwa = bitcast i8* %base to i8*
+  br label %wc
+wb:
+  %wwb = bitcast i8* %base to i8*
+  br label %wc
+wc:
+  %first = phi i8* [ %wwa, %wa ], [ %wwb, %wb ]
+  %fc = bitcast i8* %first to i8*
+  br i1 %x, label %xa, label %xb
+xa:
+  %xxa = bitcast i8* %fc to i8*
+  br label %xc
+xb:
+  %xxb = bitcast i8* %fc to i8*
+  br label %xc
+xc:
+  %phi = phi i8* [ %xxa, %xa ], [ %xxb, %xb ]
+
+  store i8 0, i8* %phi
+
+  %bigbase = bitcast i8* %base to i16*
+  store i16 -1, i16* %bigbase
+
+  %loaded = load i8* %phi
+  ret i8 %loaded
+}
diff --git a/test/Analysis/BasicAA/modref.ll b/test/Analysis/BasicAA/modref.ll
index ec0c8a7..7a71e3e 100644
--- a/test/Analysis/BasicAA/modref.ll
+++ b/test/Analysis/BasicAA/modref.ll
@@ -102,7 +102,7 @@ define i32 @test4(i8* %P) {
   %sub = sub i32 %tmp2, %tmp
   ret i32 %sub
 ; CHECK: @test4
-; CHECK: load i32* @G
+; CHECK-NOT: load
 ; CHECK: memset.p0i8.i32
 ; CHECK-NOT: load
 ; CHECK: ret i32 0
@@ -117,7 +117,7 @@ define i32 @test5(i8* %P, i32 %Len) {
   %sub = sub i32 %tmp2, %tmp
   ret i32 %sub
 ; CHECK: @test5
-; CHECK: load i32* @G
+; CHECK-NOT: load
 ; CHECK: memcpy.p0i8.p0i8.i32
 ; CHECK-NOT: load
 ; CHECK: ret i32 0
diff --git a/test/Analysis/BasicAA/must-and-partial.ll b/test/Analysis/BasicAA/must-and-partial.ll
new file mode 100644
index 0000000..93b6184
--- /dev/null
+++ b/test/Analysis/BasicAA/must-and-partial.ll
@@ -0,0 +1,39 @@
+; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info |& FileCheck %s
+
+; When merging MustAlias and PartialAlias, merge to PartialAlias
+; instead of MayAlias.
+
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+; CHECK: PartialAlias:  i16* %bigbase0, i8* %phi
+define i8 @test0(i8* %base, i1 %x) {
+entry:
+  %baseplusone = getelementptr i8* %base, i64 1
+  br i1 %x, label %red, label %green
+red:
+  br label %green
+green:
+  %phi = phi i8* [ %baseplusone, %red ], [ %base, %entry ]
+  store i8 0, i8* %phi
+
+  %bigbase0 = bitcast i8* %base to i16*
+  store i16 -1, i16* %bigbase0
+
+  %loaded = load i8* %phi
+  ret i8 %loaded
+}
+
+; CHECK: PartialAlias:  i16* %bigbase1, i8* %sel
+define i8 @test1(i8* %base, i1 %x) {
+entry:
+  %baseplusone = getelementptr i8* %base, i64 1
+  %sel = select i1 %x, i8* %baseplusone, i8* %base
+  store i8 0, i8* %sel
+
+  %bigbase1 = bitcast i8* %base to i16*
+  store i16 -1, i16* %bigbase1
+
+  %loaded = load i8* %sel
+  ret i8 %loaded
+}
diff --git a/test/Analysis/BasicAA/underlying-value.ll b/test/Analysis/BasicAA/underlying-value.ll
new file mode 100644
index 0000000..0671c82
--- /dev/null
+++ b/test/Analysis/BasicAA/underlying-value.ll
@@ -0,0 +1,25 @@
+; RUN: opt -basicaa -licm -S < %s
+; PR9931
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+define void @func_20() nounwind {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.cond2, %entry
+  br i1 undef, label %for.cond2, label %for.end22
+
+for.cond2:                                        ; preds = %for.body5, %for.cond
+  br i1 false, label %for.body5, label %for.cond
+
+for.body5:                                        ; preds = %for.cond2
+  %arrayidx = getelementptr inbounds [2 x i64]* undef, i32 0, i64 0
+  %tmp7 = load i64* %arrayidx, align 8
+  %arrayidx9 = getelementptr inbounds [2 x i64]* undef, i32 0, i64 undef
+  %tmp10 = load i64* %arrayidx9, align 8
+  br label %for.cond2
+
+for.end22:                                        ; preds = %for.cond
+  ret void
+}
diff --git a/test/Analysis/TypeBasedAliasAnalysis/dynamic-indices.ll b/test/Analysis/TypeBasedAliasAnalysis/dynamic-indices.ll
new file mode 100644
index 0000000..52e394b
--- /dev/null
+++ b/test/Analysis/TypeBasedAliasAnalysis/dynamic-indices.ll
@@ -0,0 +1,131 @@
+; RUN: opt -tbaa -basicaa -gvn -S < %s | FileCheck %s
+; PR9971
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+%struct.X = type { i32, float }
+%union.vector_t = type { [2 x i64] }
+
+; Don't delete the load after the loop, because it loads values stored
+; inside the loop.
+
+; CHECK: define void @vrlh(
+
+; CHECK: for.end:
+; CHECK:   %arrayidx31 = getelementptr inbounds %union.vector_t* %t, i64 0, i32 0, i64 1
+; CHECK:   %tmp32 = load i64* %arrayidx31, align 8, !tbaa !3
+
+define void @vrlh(%union.vector_t* %va, %union.vector_t* %vb, %union.vector_t* %vd) nounwind {
+entry:
+  %t = alloca %union.vector_t, align 8
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %sub = sub nsw i32 7, %i.01
+  %idxprom = sext i32 %sub to i64
+  %half = bitcast %union.vector_t* %vb to [8 x i16]*
+  %arrayidx = getelementptr inbounds [8 x i16]* %half, i64 0, i64 %idxprom
+  %tmp4 = load i16* %arrayidx, align 2, !tbaa !0
+  %conv = zext i16 %tmp4 to i32
+  %and = and i32 %conv, 15
+  %sub6 = sub nsw i32 7, %i.01
+  %idxprom7 = sext i32 %sub6 to i64
+  %half9 = bitcast %union.vector_t* %va to [8 x i16]*
+  %arrayidx10 = getelementptr inbounds [8 x i16]* %half9, i64 0, i64 %idxprom7
+  %tmp11 = load i16* %arrayidx10, align 2, !tbaa !0
+  %conv12 = zext i16 %tmp11 to i32
+  %shl = shl i32 %conv12, %and
+  %sub15 = sub nsw i32 7, %i.01
+  %idxprom16 = sext i32 %sub15 to i64
+  %half18 = bitcast %union.vector_t* %va to [8 x i16]*
+  %arrayidx19 = getelementptr inbounds [8 x i16]* %half18, i64 0, i64 %idxprom16
+  %tmp20 = load i16* %arrayidx19, align 2, !tbaa !0
+  %conv21 = zext i16 %tmp20 to i32
+  %sub23 = sub nsw i32 16, %and
+  %shr = lshr i32 %conv21, %sub23
+  %or = or i32 %shl, %shr
+  %conv24 = trunc i32 %or to i16
+  %sub26 = sub nsw i32 7, %i.01
+  %idxprom27 = sext i32 %sub26 to i64
+  %half28 = bitcast %union.vector_t* %t to [8 x i16]*
+  %arrayidx29 = getelementptr inbounds [8 x i16]* %half28, i64 0, i64 %idxprom27
+  store i16 %conv24, i16* %arrayidx29, align 2, !tbaa !0
+  %inc = add nsw i32 %i.01, 1
+  %cmp = icmp slt i32 %inc, 8
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %arrayidx31 = getelementptr inbounds %union.vector_t* %t, i64 0, i32 0, i64 1
+  %tmp32 = load i64* %arrayidx31, align 8, !tbaa !3
+  %arrayidx35 = getelementptr inbounds %union.vector_t* %vd, i64 0, i32 0, i64 1
+  store i64 %tmp32, i64* %arrayidx35, align 8, !tbaa !3
+  %arrayidx37 = getelementptr inbounds %union.vector_t* %t, i64 0, i32 0, i64 0
+  %tmp38 = load i64* %arrayidx37, align 8, !tbaa !3
+  %arrayidx41 = getelementptr inbounds %union.vector_t* %vd, i64 0, i32 0, i64 0
+  store i64 %tmp38, i64* %arrayidx41, align 8, !tbaa !3
+  ret void
+}
+
+; Do delete the load after the loop.
+
+; CHECK: define i32 @test0(
+
+; CHECK:   ret i32 0
+
+define i32 @test0(%struct.X* %a) nounwind {
+entry:
+  %i = getelementptr inbounds %struct.X* %a, i64 0, i32 0
+  store i32 0, i32* %i, align 4, !tbaa !4
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i2.01 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %f = getelementptr inbounds %struct.X* %a, i64 %i2.01, i32 1
+  %tmp6 = load float* %f, align 4, !tbaa !5
+  %mul = fmul float %tmp6, 0x40019999A0000000
+  store float %mul, float* %f, align 4, !tbaa !5
+  %inc = add nsw i64 %i2.01, 1
+  %cmp = icmp slt i64 %inc, 10000
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %i9 = getelementptr inbounds %struct.X* %a, i64 0, i32 0
+  %tmp10 = load i32* %i9, align 4, !tbaa !4
+  ret i32 %tmp10
+}
+
+; Do delete the load after the loop.
+
+; CHECK: define float @test1(
+
+; CHECK:   ret float 0x3FD3333340000000
+
+define float @test1(%struct.X* %a) nounwind {
+entry:
+  %f = getelementptr inbounds %struct.X* %a, i64 0, i32 1
+  store float 0x3FD3333340000000, float* %f, align 4, !tbaa !5
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.01 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %i5 = getelementptr inbounds %struct.X* %a, i64 %i.01, i32 0
+  %tmp6 = load i32* %i5, align 4, !tbaa !4
+  %mul = mul nsw i32 %tmp6, 3
+  store i32 %mul, i32* %i5, align 4, !tbaa !4
+  %inc = add nsw i64 %i.01, 1
+  %cmp = icmp slt i64 %inc, 10000
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %f9 = getelementptr inbounds %struct.X* %a, i64 0, i32 1
+  %tmp10 = load float* %f9, align 4, !tbaa !5
+  ret float %tmp10
+}
+
+!0 = metadata !{metadata !"short", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
+!3 = metadata !{metadata !"long long", metadata !1}
+!4 = metadata !{metadata !"int", metadata !1}
+!5 = metadata !{metadata !"float", metadata !1}
diff --git a/test/Archive/check_binary_output.ll b/test/Archive/check_binary_output.ll
new file mode 100644
index 0000000..60ab5ca
--- /dev/null
+++ b/test/Archive/check_binary_output.ll
@@ -0,0 +1,4 @@
+; This is not an assembly file, this is just to run the test.
+; The test verifies that llvm-ar produces a binary output.
+
+;RUN: llvm-ar p %p/GNU.a very_long_bytecode_file_name.bc | cmp -s %p/very_long_bytecode_file_name.bc -
diff --git a/test/Assembler/AutoUpgradeIntrinsics.ll b/test/Assembler/AutoUpgradeIntrinsics.ll
index e4e2d3a..417493f 100644
--- a/test/Assembler/AutoUpgradeIntrinsics.ll
+++ b/test/Assembler/AutoUpgradeIntrinsics.ll
@@ -10,6 +10,7 @@
 ; RUN:   not grep {llvm\\.x86\\.sse2\\.loadu}
 ; RUN: llvm-as < %s | llvm-dis | \
 ; RUN:   grep {llvm\\.x86\\.mmx\\.ps} | grep {x86_mmx} | count 16
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
 
 declare i32 @llvm.ctpop.i28(i28 %val)
 declare i32 @llvm.cttz.i29(i29 %val)
@@ -91,3 +92,20 @@ define void @test_loadu(i8* %a, double* %b) {
   %v2 = call <2 x double> @llvm.x86.sse2.loadu.pd(double* %b)
   ret void
 }
+
+declare void @llvm.x86.sse.movnt.ps(i8*, <4 x float>) nounwind readnone 
+declare void @llvm.x86.sse2.movnt.dq(i8*, <2 x double>) nounwind readnone 
+declare void @llvm.x86.sse2.movnt.pd(i8*, <2 x double>) nounwind readnone 
+declare void @llvm.x86.sse2.movnt.i(i8*, i32) nounwind readnone 
+
+define void @f(<4 x float> %A, i8* %B, <2 x double> %C, i32 %D) {
+; CHECK: store{{.*}}nontemporal
+  call void @llvm.x86.sse.movnt.ps(i8* %B, <4 x float> %A)
+; CHECK: store{{.*}}nontemporal
+  call void @llvm.x86.sse2.movnt.dq(i8* %B, <2 x double> %C)
+; CHECK: store{{.*}}nontemporal
+  call void @llvm.x86.sse2.movnt.pd(i8* %B, <2 x double> %C)
+; CHECK: store{{.*}}nontemporal
+  call void @llvm.x86.sse2.movnt.i(i8* %B, i32 %D)
+  ret void
+}
diff --git a/test/Assembler/invalid_cast.ll b/test/Assembler/invalid_cast.ll
new file mode 100644
index 0000000..c5b082b
--- /dev/null
+++ b/test/Assembler/invalid_cast.ll
@@ -0,0 +1,6 @@
+; RUN: not llvm-as < %s |& grep {invalid cast opcode}
+
+define <3 x i8> @foo(<4 x i64> %x) {
+  %y = trunc <4 x i64> %x to <3 x i8>
+  ret <3 x i8> %y
+}
diff --git a/test/Assembler/invalid_cast2.ll b/test/Assembler/invalid_cast2.ll
new file mode 100644
index 0000000..f2e7c41
--- /dev/null
+++ b/test/Assembler/invalid_cast2.ll
@@ -0,0 +1,6 @@
+; RUN: not llvm-as < %s |& grep {invalid cast opcode}
+
+define i8 @foo(<4 x i64> %x) {
+  %y = trunc <4 x i64> %x to i8
+  ret i8 %y
+}
diff --git a/test/Bitcode/2006-12-11-Cast-ConstExpr.ll b/test/Bitcode/2006-12-11-Cast-ConstExpr.ll
index 6df8711..e704627 100644
--- a/test/Bitcode/2006-12-11-Cast-ConstExpr.ll
+++ b/test/Bitcode/2006-12-11-Cast-ConstExpr.ll
@@ -1,7 +1,7 @@
 ; This test ensures that we get a bitcast constant expression in and out,
 ; not a sitofp constant expression. 
-; RUN: llvm-as < %s | llvm-dis | \
-; RUN:   grep {bitcast (}
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; CHECK: bitcast (
 
 @G = external global i32
 
diff --git a/test/Bitcode/AutoUpgradeGlobals.ll b/test/Bitcode/AutoUpgradeGlobals.ll
index 8a87673..a5af2b8 100644
--- a/test/Bitcode/AutoUpgradeGlobals.ll
+++ b/test/Bitcode/AutoUpgradeGlobals.ll
@@ -1,3 +1,4 @@
 ; This isn't really an assembly file. It just runs test on bitcode to ensure
 ; it is auto-upgraded.
-; RUN: llvm-dis < %s.bc | not grep {i32 @\\.llvm\\.eh}
+; RUN: llvm-dis < %s.bc | FileCheck %s 
+; CHECK-NOT: {i32 @\\.llvm\\.eh}
diff --git a/test/Bitcode/AutoUpgradeIntrinsics.ll b/test/Bitcode/AutoUpgradeIntrinsics.ll
index 5f9bcd5..c3e2e9e 100644
--- a/test/Bitcode/AutoUpgradeIntrinsics.ll
+++ b/test/Bitcode/AutoUpgradeIntrinsics.ll
@@ -1,10 +1,8 @@
 ; This isn't really an assembly file. It just runs test on bitcode to ensure
 ; it is auto-upgraded.
-; RUN: llvm-dis < %s.bc | not grep {i32 @llvm\\.ct}
-; RUN: llvm-dis < %s.bc | \
-; RUN:   not grep {llvm\\.part\\.set\\.i\[0-9\]*\\.i\[0-9\]*\\.i\[0-9\]*}
-; RUN: llvm-dis < %s.bc | \
-; RUN:   not grep {llvm\\.part\\.select\\.i\[0-9\]*\\.i\[0-9\]*}
-; RUN: llvm-dis < %s.bc | \
-; RUN:   not grep {llvm\\.bswap\\.i\[0-9\]*\\.i\[0-9\]*}
+; RUN: llvm-dis < %s.bc | FileCheck %s
+; CHECK-NOT: {i32 @llvm\\.ct}
+; CHECK-NOT: {llvm\\.part\\.set\\.i\[0-9\]*\\.i\[0-9\]*\\.i\[0-9\]*}
+; CHECK-NOT: {llvm\\.part\\.select\\.i\[0-9\]*\\.i\[0-9\]*}
+; CHECK-NOT: {llvm\\.bswap\\.i\[0-9\]*\\.i\[0-9\]*}
 
diff --git a/test/Bitcode/blockaddress.ll b/test/Bitcode/blockaddress.ll
new file mode 100644
index 0000000..b9f3341
--- /dev/null
+++ b/test/Bitcode/blockaddress.ll
@@ -0,0 +1,30 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+; PR9857
+
+define void @f(i8** nocapture %ptr1) {
+; CHECK: define void @f
+entry:
+  br label %here.i
+
+here.i:
+  store i8* blockaddress(@doit, %here), i8** %ptr1, align 8
+; CHECK: blockaddress(@doit, %here)
+  br label %doit.exit
+
+doit.exit:
+  ret void
+}
+
+define void @doit(i8** nocapture %pptr) {
+; CHECK: define void @doit
+entry:
+  br label %here
+
+here:
+  store i8* blockaddress(@doit, %here), i8** %pptr, align 8
+; CHECK: blockaddress(@doit, %here)
+  br label %end
+
+end:
+  ret void
+}
diff --git a/test/Bitcode/sse2_loadl_pd.ll b/test/Bitcode/sse2_loadl_pd.ll
index b0bea16..6cb0da5 100644
--- a/test/Bitcode/sse2_loadl_pd.ll
+++ b/test/Bitcode/sse2_loadl_pd.ll
@@ -1,2 +1,3 @@
-; RUN: llvm-dis < %s.bc | not grep {i32 @llvm\\.loadl.pd}
-; RUN: llvm-dis < %s.bc | grep shufflevector
+; RUN: llvm-dis < %s.bc | FileCheck %s
+; CHECK-NOT: {i32 @llvm\\.loadl.pd} 
+; CHECK: shufflevector
diff --git a/test/Bitcode/sse2_movl_dq.ll b/test/Bitcode/sse2_movl_dq.ll
index 093d821..2fc0149 100644
--- a/test/Bitcode/sse2_movl_dq.ll
+++ b/test/Bitcode/sse2_movl_dq.ll
@@ -1,2 +1,3 @@
-; RUN: llvm-dis < %s.bc | not grep {i32 @llvm\\.movl.dq}
-; RUN: llvm-dis < %s.bc | grep shufflevector
+; RUN: llvm-dis < %s.bc | FileCheck %s 
+; CHECK-NOT: {i32 @llvm\\.movl.dq}
+; CHECK: shufflevector
diff --git a/test/Bitcode/sse2_movs_d.ll b/test/Bitcode/sse2_movs_d.ll
index 25a35b6..ab82c43 100644
--- a/test/Bitcode/sse2_movs_d.ll
+++ b/test/Bitcode/sse2_movs_d.ll
@@ -1,2 +1,3 @@
-; RUN: llvm-dis < %s.bc | not grep {i32 @llvm\\.movs.d}
-; RUN: llvm-dis < %s.bc | grep shufflevector
+; RUN: llvm-dis < %s.bc | FileCheck %s
+; CHECK-NOT: {i32 @llvm\\.movs.d}
+; CHECK: shufflevector
diff --git a/test/Bitcode/sse2_punpck_qdq.ll b/test/Bitcode/sse2_punpck_qdq.ll
index b9d711c..4c68af5 100644
--- a/test/Bitcode/sse2_punpck_qdq.ll
+++ b/test/Bitcode/sse2_punpck_qdq.ll
@@ -1,3 +1,4 @@
-; RUN: llvm-dis < %s.bc | not grep {i32 @llvm\\.punpckh.qdq}
-; RUN: llvm-dis < %s.bc | not grep {i32 @llvm\\.punpckl.qdq}
-; RUN: llvm-dis < %s.bc | grep shufflevector
+; RUN: llvm-dis < %s.bc | FileCheck %s
+; CHECK-NOT: {i32 @llvm\\.punpckh.qdq}
+; CHECK-NOT: {i32 @llvm\\.punpckl.qdq}
+; CHECK: shufflevector
diff --git a/test/Bitcode/sse2_shuf_pd.ll b/test/Bitcode/sse2_shuf_pd.ll
index 5829edb..1ba6a1d 100644
--- a/test/Bitcode/sse2_shuf_pd.ll
+++ b/test/Bitcode/sse2_shuf_pd.ll
@@ -1,2 +1,3 @@
-; RUN: llvm-dis < %s.bc | not grep {i32 @llvm\\.shuf.pd}
-; RUN: llvm-dis < %s.bc | grep shufflevector
+; RUN: llvm-dis < %s.bc | FileCheck %s
+; CHECK-NOT: {i32 @llvm\\.shuf.pd}
+; CHECK: shufflevector
diff --git a/test/Bitcode/sse2_unpck_pd.ll b/test/Bitcode/sse2_unpck_pd.ll
index f4e5d54..99b61b6 100644
--- a/test/Bitcode/sse2_unpck_pd.ll
+++ b/test/Bitcode/sse2_unpck_pd.ll
@@ -1,3 +1,4 @@
-; RUN: llvm-dis < %s.bc | not grep {i32 @llvm\\.unpckh.pd}
-; RUN: llvm-dis < %s.bc | not grep {i32 @llvm\\.unpckl.pd}
-; RUN: llvm-dis < %s.bc | grep shufflevector
+; RUN: llvm-dis < %s.bc | FileCheck %s
+; CHECK-NOT: {i32 @llvm\\.unpckh.pd}
+; CHECK-NOT: {i32 @llvm\\.unpckl.pd}
+; CHECK: shufflevector
diff --git a/test/Bitcode/sse41_pmulld.ll b/test/Bitcode/sse41_pmulld.ll
index 6872cc0..752786d 100644
--- a/test/Bitcode/sse41_pmulld.ll
+++ b/test/Bitcode/sse41_pmulld.ll
@@ -1,2 +1,3 @@
-; RUN: llvm-dis < %s.bc | not grep {i32 @llvm\\.pmulld}
-; RUN: llvm-dis < %s.bc | grep mul
+; RUN: llvm-dis < %s.bc | FileCheck %s
+; CHECK-NOT: {i32 @llvm\\.pmulld}
+; CHECK: mul
diff --git a/test/Bitcode/sse42_crc32.ll b/test/Bitcode/sse42_crc32.ll
new file mode 100644
index 0000000..1c371c3
--- /dev/null
+++ b/test/Bitcode/sse42_crc32.ll
@@ -0,0 +1,28 @@
+; Check to make sure old CRC32 intrinsics are auto-upgraded
+; correctly.
+;
+; Rdar: 9472944
+;
+; RUN: llvm-dis < %s.bc | FileCheck %s
+
+; crc32.8 should upgrade to crc32.32.8
+; CHECK: i32 @llvm.x86.sse42.crc32.32.8(
+; CHECK-NOT: i32 @llvm.x86.sse42.crc32.8(
+
+; crc32.16 should upgrade to crc32.32.16
+; CHECK: i32 @llvm.x86.sse42.crc32.32.16(
+; CHECK-NOT: i32 @llvm.x86.sse42.crc32.16(
+
+; crc32.32 should upgrade to crc32.32.32
+; CHECK: i32 @llvm.x86.sse42.crc32.32.32(
+; CHECK-NOT: i32 @llvm.x86.sse42.crc32.32(
+
+; crc64.8 should upgrade to crc32.64.8
+; CHECK: i64 @llvm.x86.sse42.crc32.64.8(
+; CHECK-NOT: i64 @llvm.x86.sse42.crc64.8(
+
+; crc64.64 should upgrade to crc32.64.64
+; CHECK: i64 @llvm.x86.sse42.crc32.64.64(
+; CHECK-NOT: i64 @llvm.x86.sse42.crc64.64(
+
+
diff --git a/test/Bitcode/sse42_crc32.ll.bc b/test/Bitcode/sse42_crc32.ll.bc
new file mode 100644
index 0000000..d895fad
--- /dev/null
+++ b/test/Bitcode/sse42_crc32.ll.bc
diff --git a/test/Bitcode/ssse3_palignr.ll b/test/Bitcode/ssse3_palignr.ll
index d596dd5..f62ca11 100644
--- a/test/Bitcode/ssse3_palignr.ll
+++ b/test/Bitcode/ssse3_palignr.ll
@@ -1 +1,2 @@
-; RUN: llvm-dis < %s.bc | not grep {@llvm\\.palign}
+; RUN: llvm-dis < %s.bc | FileCheck %s 
+; CHECK-NOT: {@llvm\\.palign}
diff --git a/test/CodeGen/ARM/2009-09-09-fpcmp-ole.ll b/test/CodeGen/ARM/2009-09-09-fpcmp-ole.ll
index 3909c6a..0a157c9 100644
--- a/test/CodeGen/ARM/2009-09-09-fpcmp-ole.ll
+++ b/test/CodeGen/ARM/2009-09-09-fpcmp-ole.ll
@@ -1,16 +1,16 @@
-; RUN: llc -O1 -march=arm -mattr=+vfp2 < %s | FileCheck %s
+; RUN: llc -O1 -march=arm -mattr=+vfp2 -mtriple=arm-linux-gnueabi < %s | FileCheck %s
 ; pr4939
 
 define void @test(double* %x, double* %y) nounwind {
-  %1 = load double* %x, align 4
-  %2 = load double* %y, align 4
+  %1 = load double* %x
+  %2 = load double* %y
   %3 = fsub double -0.000000e+00, %1
   %4 = fcmp ugt double %2, %3
   br i1 %4, label %bb1, label %bb2
 
 bb1:
 ;CHECK: vstrhi.64
-  store double %1, double* %y, align 4
+  store double %1, double* %y
   br label %bb2
 
 bb2:
diff --git a/test/CodeGen/ARM/2011-04-07-schediv.ll b/test/CodeGen/ARM/2011-04-07-schediv.ll
index a61908f..19f756f 100644
--- a/test/CodeGen/ARM/2011-04-07-schediv.ll
+++ b/test/CodeGen/ARM/2011-04-07-schediv.ll
@@ -13,6 +13,7 @@ entry:
 ; Make sure the scheduler schedules all uses of the preincrement
 ; induction variable before defining the postincrement value.
 ; CHECK: t:
+; CHECK: %bb
 ; CHECK-NOT: mov
 bb:                                               ; preds = %entry, %bb
   %j.05 = phi i32 [ %2, %bb ], [ 0, %entry ]
diff --git a/test/CodeGen/ARM/2011-04-11-MachineLICMBug.ll b/test/CodeGen/ARM/2011-04-11-MachineLICMBug.ll
index a9dd971..568718c 100644
--- a/test/CodeGen/ARM/2011-04-11-MachineLICMBug.ll
+++ b/test/CodeGen/ARM/2011-04-11-MachineLICMBug.ll
@@ -14,15 +14,15 @@ for.cond:
   br i1 %cmp, label %for.body, label %return
 
 for.body:
-; CHECK: %for.body
-; CHECK: movs r{{[0-9]+}}, #1
+; CHECK: %for.
+; CHECK: movs r{{[0-9]+}}, #{{[01]}}
   %arrayidx = getelementptr i32* %A, i32 %0
   %tmp4 = load i32* %arrayidx, align 4
   %cmp6 = icmp eq i32 %tmp4, %value
   br i1 %cmp6, label %return, label %for.inc
 
-; CHECK: %for.cond
-; CHECK: movs r{{[0-9]+}}, #0
+; CHECK: %for.
+; CHECK: movs r{{[0-9]+}}, #{{[01]}}
 
 for.inc:
   %inc = add i32 %0, 1
diff --git a/test/CodeGen/ARM/2011-05-04-MultipleLandingPadSuccs.ll b/test/CodeGen/ARM/2011-05-04-MultipleLandingPadSuccs.ll
new file mode 100644
index 0000000..0b5f962
--- /dev/null
+++ b/test/CodeGen/ARM/2011-05-04-MultipleLandingPadSuccs.ll
@@ -0,0 +1,93 @@
+; RUN: llc < %s -verify-machineinstrs
+; <rdar://problem/9187612>
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32-n32"
+target triple = "thumbv7-apple-darwin"
+
+define void @func() unnamed_addr align 2 {
+entry:
+  br label %for.cond
+
+for.cond:
+  %tmp2 = phi i32 [ 0, %entry ], [ %add, %for.cond.backedge ]
+  %cmp = icmp ult i32 %tmp2, 14
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  %add = add i32 %tmp2, 1
+  switch i32 %tmp2, label %sw.default [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb
+    i32 2, label %sw.bb
+    i32 4, label %sw.bb
+    i32 5, label %sw.bb
+    i32 10, label %sw.bb
+  ]
+
+sw.bb:
+  invoke void @foo()
+          to label %invoke.cont17 unwind label %lpad
+
+invoke.cont17:
+  invoke void @foo()
+          to label %for.cond.backedge unwind label %lpad26
+
+for.cond.backedge:
+  br label %for.cond
+
+lpad:
+  %exn = tail call i8* @llvm.eh.exception() nounwind
+  %eh.selector = tail call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* %exn, i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*), i8* null) nounwind
+  invoke void @foo()
+          to label %eh.resume unwind label %terminate.lpad
+
+lpad26:
+  %exn27 = tail call i8* @llvm.eh.exception() nounwind
+  %eh.selector28 = tail call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* %exn27, i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*), i8* null) nounwind
+  invoke void @foo()
+          to label %eh.resume unwind label %terminate.lpad
+
+sw.default:
+  br label %for.cond.backedge
+
+for.end:
+  invoke void @foo()
+          to label %call8.i.i.i.noexc unwind label %lpad44
+
+call8.i.i.i.noexc:
+  ret void
+
+lpad44:
+  %exn45 = tail call i8* @llvm.eh.exception() nounwind
+  %eh.selector46 = tail call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* %exn45, i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*), i8* null) nounwind
+  invoke void @foo()
+          to label %eh.resume unwind label %terminate.lpad
+
+eh.resume:
+  %exn.slot.0 = phi i8* [ %exn27, %lpad26 ], [ %exn, %lpad ], [ %exn45, %lpad44 ]
+  tail call void @_Unwind_SjLj_Resume_or_Rethrow(i8* %exn.slot.0) noreturn
+  unreachable
+
+terminate.lpad:
+  %exn51 = tail call i8* @llvm.eh.exception() nounwind
+  %eh.selector52 = tail call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* %exn51, i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*), i8* null) nounwind
+  tail call void @_ZSt9terminatev() noreturn nounwind
+  unreachable
+}
+
+declare void @foo()
+
+declare i8* @llvm.eh.exception() nounwind readonly
+
+declare i32 @__gxx_personality_sj0(...)
+
+declare i32 @llvm.eh.selector(i8*, i8*, ...) nounwind
+
+declare void @_Unwind_SjLj_Resume_or_Rethrow(i8*)
+
+declare void @_ZSt9terminatev()
+
+!0 = metadata !{metadata !"any pointer", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
+!3 = metadata !{metadata !"bool", metadata !1}
+!4 = metadata !{metadata !"int", metadata !1}
diff --git a/test/CodeGen/ARM/2011-06-09-TailCallByVal.ll b/test/CodeGen/ARM/2011-06-09-TailCallByVal.ll
new file mode 100644
index 0000000..4db3acf
--- /dev/null
+++ b/test/CodeGen/ARM/2011-06-09-TailCallByVal.ll
@@ -0,0 +1,39 @@
+; RUN: llc < %s -relocation-model=pic -mcpu=cortex-a8 -arm-tail-calls=1 | FileCheck %s
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
+target triple = "thumbv7-apple-darwin10"
+
+%struct._RuneCharClass = type { [14 x i8], i32 }
+%struct._RuneEntry = type { i32, i32, i32, i32* }
+%struct._RuneLocale = type { [8 x i8], [32 x i8], i32 (i8*, i32, i8**)*, i32 (i32, i8*, i32, i8**)*, i32, [256 x i32], [256 x i32], [256 x i32], %struct._RuneRange, %struct._RuneRange, %struct._RuneRange, i8*, i32, i32, %struct._RuneCharClass* }
+%struct._RuneRange = type { i32, %struct._RuneEntry* }
+%struct.__collate_st_chain_pri = type { [10 x i32], [2 x i32] }
+%struct.__collate_st_char_pri = type { [2 x i32] }
+%struct.__collate_st_info = type { [2 x i8], i8, i8, [2 x i32], [2 x i32], i32, i32 }
+%struct.__collate_st_large_char_pri = type { i32, %struct.__collate_st_char_pri }
+%struct.__collate_st_subst = type { i32, [10 x i32] }
+%struct.__xlocale_st_collate = type { i32, void (i8*)*, [32 x i8], %struct.__collate_st_info, [2 x %struct.__collate_st_subst*], %struct.__collate_st_chain_pri*, %struct.__collate_st_large_char_pri*, [256 x %struct.__collate_st_char_pri] }
+%struct.__xlocale_st_messages = type { i32, void (i8*)*, i8*, %struct.lc_messages_T }
+%struct.__xlocale_st_monetary = type { i32, void (i8*)*, i8*, %struct.lc_monetary_T }
+%struct.__xlocale_st_numeric = type { i32, void (i8*)*, i8*, %struct.lc_numeric_T }
+%struct.__xlocale_st_runelocale = type { i32, void (i8*)*, [32 x i8], i32, i32, i32 (i32*, i8*, i32, %union.__mbstate_t*, %struct._xlocale*)*, i32 (%union.__mbstate_t*, %struct._xlocale*)*, i32 (i32*, i8**, i32, i32, %union.__mbstate_t*, %struct._xlocale*)*, i32 (i8*, i32, %union.__mbstate_t*, %struct._xlocale*)*, i32 (i8*, i32**, i32, i32, %union.__mbstate_t*, %struct._xlocale*)*, i32, %struct._RuneLocale }
+%struct.__xlocale_st_time = type { i32, void (i8*)*, i8*, %struct.lc_time_T }
+%struct._xlocale = type { i32, void (i8*)*, %union.__mbstate_t, %union.__mbstate_t, %union.__mbstate_t, %union.__mbstate_t, %union.__mbstate_t, %union.__mbstate_t, %union.__mbstate_t, %union.__mbstate_t, %union.__mbstate_t, %union.__mbstate_t, i32, i64, i8, i8, i8, i8, i8, i8, i8, i8, i8, %struct.__xlocale_st_collate*, %struct.__xlocale_st_runelocale*, %struct.__xlocale_st_messages*, %struct.__xlocale_st_monetary*, %struct.__xlocale_st_numeric*, %struct._xlocale*, %struct.__xlocale_st_time*, %struct.lconv }
+%struct.lc_messages_T = type { i8*, i8*, i8*, i8* }
+%struct.lc_monetary_T = type { i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8* }
+%struct.lc_numeric_T = type { i8*, i8*, i8* }
+%struct.lc_time_T = type { [12 x i8*], [12 x i8*], [7 x i8*], [7 x i8*], i8*, i8*, i8*, i8*, i8*, i8*, [12 x i8*], i8*, i8* }
+%struct.lconv = type { i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 }
+%union.__mbstate_t = type { i64, [120 x i8] }
+
+@"\01_fnmatch.initial" = external constant %union.__mbstate_t, align 4
+
+; CHECK: _fnmatch
+; CHECK: blx _fnmatch1
+
+define i32 @"\01_fnmatch"(i8* %pattern, i8* %string, i32 %flags) nounwind optsize {
+entry:
+  %call4 = tail call i32 @fnmatch1(i8* %pattern, i8* %string, i8* %string, i32 %flags, %union.__mbstate_t* byval @"\01_fnmatch.initial", %union.__mbstate_t* byval @"\01_fnmatch.initial", %struct._xlocale* undef, i32 64) optsize
+  ret i32 %call4
+}
+
+declare i32 @fnmatch1(i8*, i8*, i8*, i32, %union.__mbstate_t* byval, %union.__mbstate_t* byval, %struct._xlocale*, i32) nounwind optsize
diff --git a/test/CodeGen/ARM/arm-modifier.ll b/test/CodeGen/ARM/arm-modifier.ll
new file mode 100644
index 0000000..0a7bb6c
--- /dev/null
+++ b/test/CodeGen/ARM/arm-modifier.ll
@@ -0,0 +1,59 @@
+; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s
+
+define i32 @foo(float %scale, float %scale2) nounwind {
+entry:
+  %scale.addr = alloca float, align 4
+  %scale2.addr = alloca float, align 4
+  store float %scale, float* %scale.addr, align 4
+  store float %scale2, float* %scale2.addr, align 4
+  %tmp = load float* %scale.addr, align 4
+  %tmp1 = load float* %scale2.addr, align 4
+  call void asm sideeffect "vmul.f32    q0, q0, ${0:y} \0A\09vmul.f32    q1, q1, ${0:y} \0A\09vmul.f32    q1, q0, ${1:y} \0A\09", "w,w,~{q0},~{q1}"(float %tmp, float %tmp1) nounwind
+  ret i32 0
+}
+
+define void @f0() nounwind {
+entry:
+; CHECK: f0
+; CHECK: .word -1
+call void asm sideeffect ".word ${0:B} \0A\09", "i"(i32 0) nounwind
+ret void
+}
+
+define void @f1() nounwind {
+entry:
+; CHECK: f1
+; CHECK: .word 65535
+call void asm sideeffect ".word ${0:L} \0A\09", "i"(i32 -1) nounwind
+ret void
+}
+
+@f2_ptr = internal global i32* @f2_var, align 4
+@f2_var = external global i32
+
+define void @f2() nounwind {
+entry:
+; CHECK: f2
+; CHECK: ldr r0, [r{{[0-9]+}}]
+call void asm sideeffect "ldr r0, [${0:m}]\0A\09", "*m,~{r0}"(i32** @f2_ptr) nounwind
+ret void
+}
+
+@f3_ptr = internal global i64* @f3_var, align 4
+@f3_var = external global i64
+@f3_var2 = external global i64
+
+define void @f3() nounwind {
+entry:
+; CHECK: f3
+; CHECK: stm r{{[0-9]+}}, {[[REG1:(r[0-9]+)]], r{{[0-9]+}}}
+; CHECK: adds lr, [[REG1]]
+; CHECK: ldm r{{[0-9]+}}, {r{{[0-9]+}}, r{{[0-9]+}}}
+%tmp = load i64* @f3_var, align 4
+%tmp1 = load i64* @f3_var2, align 4
+%0 = call i64 asm sideeffect "stm ${0:m}, ${1:M}\0A\09adds $3, $1\0A\09", "=*m,=r,1,r"(i64** @f3_ptr, i64 %tmp, i64 %tmp1) nounwind
+store i64 %0, i64* @f3_var, align 4
+%1 = call i64 asm sideeffect "ldm ${1:m}, ${0:M}\0A\09", "=r,*m"(i64** @f3_ptr) nounwind
+store i64 %1, i64* @f3_var, align 4
+ret void
+}
diff --git a/test/CodeGen/ARM/atomic-op.ll b/test/CodeGen/ARM/atomic-op.ll
new file mode 100644
index 0000000..03940e3
--- /dev/null
+++ b/test/CodeGen/ARM/atomic-op.ll
@@ -0,0 +1,103 @@
+; RUN: llc < %s -mtriple=armv7-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin10 | FileCheck %s
+
+define void @func(i32 %argc, i8** %argv) nounwind {
+entry:
+	%argc.addr = alloca i32		; <i32*> [#uses=1]
+	%argv.addr = alloca i8**		; <i8***> [#uses=1]
+	%val1 = alloca i32		; <i32*> [#uses=2]
+	%val2 = alloca i32		; <i32*> [#uses=15]
+	%andt = alloca i32		; <i32*> [#uses=2]
+	%ort = alloca i32		; <i32*> [#uses=2]
+	%xort = alloca i32		; <i32*> [#uses=2]
+	%old = alloca i32		; <i32*> [#uses=18]
+	%temp = alloca i32		; <i32*> [#uses=2]
+	store i32 %argc, i32* %argc.addr
+	store i8** %argv, i8*** %argv.addr
+	store i32 0, i32* %val1
+	store i32 31, i32* %val2
+	store i32 3855, i32* %andt
+	store i32 3855, i32* %ort
+	store i32 3855, i32* %xort
+	store i32 4, i32* %temp
+	%tmp = load i32* %temp
+  ; CHECK: ldrex
+  ; CHECK: add
+  ; CHECK: strex
+	call i32 @llvm.atomic.load.add.i32.p0i32( i32* %val1, i32 %tmp )		; <i32>:0 [#uses=1]
+	store i32 %0, i32* %old
+  ; CHECK: ldrex
+  ; CHECK: sub
+  ; CHECK: strex
+	call i32 @llvm.atomic.load.sub.i32.p0i32( i32* %val2, i32 30 )		; <i32>:1 [#uses=1]
+	store i32 %1, i32* %old
+  ; CHECK: ldrex
+  ; CHECK: add
+  ; CHECK: strex
+	call i32 @llvm.atomic.load.add.i32.p0i32( i32* %val2, i32 1 )		; <i32>:2 [#uses=1]
+	store i32 %2, i32* %old
+  ; CHECK: ldrex
+  ; CHECK: sub
+  ; CHECK: strex
+	call i32 @llvm.atomic.load.sub.i32.p0i32( i32* %val2, i32 1 )		; <i32>:3 [#uses=1]
+	store i32 %3, i32* %old
+  ; CHECK: ldrex
+  ; CHECK: and
+  ; CHECK: strex
+	call i32 @llvm.atomic.load.and.i32.p0i32( i32* %andt, i32 4080 )		; <i32>:4 [#uses=1]
+	store i32 %4, i32* %old
+  ; CHECK: ldrex
+  ; CHECK: or
+  ; CHECK: strex
+	call i32 @llvm.atomic.load.or.i32.p0i32( i32* %ort, i32 4080 )		; <i32>:5 [#uses=1]
+	store i32 %5, i32* %old
+  ; CHECK: ldrex
+  ; CHECK: eor
+  ; CHECK: strex
+	call i32 @llvm.atomic.load.xor.i32.p0i32( i32* %xort, i32 4080 )		; <i32>:6 [#uses=1]
+	store i32 %6, i32* %old
+  ; CHECK: ldrex
+  ; CHECK: cmp
+  ; CHECK: strex
+	call i32 @llvm.atomic.load.min.i32.p0i32( i32* %val2, i32 16 )		; <i32>:7 [#uses=1]
+	store i32 %7, i32* %old
+	%neg = sub i32 0, 1		; <i32> [#uses=1]
+  ; CHECK: ldrex
+  ; CHECK: cmp
+  ; CHECK: strex
+	call i32 @llvm.atomic.load.min.i32.p0i32( i32* %val2, i32 %neg )		; <i32>:8 [#uses=1]
+	store i32 %8, i32* %old
+  ; CHECK: ldrex
+  ; CHECK: cmp
+  ; CHECK: strex
+	call i32 @llvm.atomic.load.max.i32.p0i32( i32* %val2, i32 1 )		; <i32>:9 [#uses=1]
+	store i32 %9, i32* %old
+  ; CHECK: ldrex
+  ; CHECK: cmp
+  ; CHECK: strex
+	call i32 @llvm.atomic.load.max.i32.p0i32( i32* %val2, i32 0 )		; <i32>:10 [#uses=1]
+	store i32 %10, i32* %old
+	ret void
+}
+
+declare i32 @llvm.atomic.load.add.i32.p0i32(i32*, i32) nounwind 
+
+declare i32 @llvm.atomic.load.sub.i32.p0i32(i32*, i32) nounwind 
+
+declare i32 @llvm.atomic.load.and.i32.p0i32(i32*, i32) nounwind 
+
+declare i32 @llvm.atomic.load.or.i32.p0i32(i32*, i32) nounwind 
+
+declare i32 @llvm.atomic.load.xor.i32.p0i32(i32*, i32) nounwind 
+
+declare i32 @llvm.atomic.load.min.i32.p0i32(i32*, i32) nounwind 
+
+declare i32 @llvm.atomic.load.max.i32.p0i32(i32*, i32) nounwind 
+
+declare i32 @llvm.atomic.load.umax.i32.p0i32(i32*, i32) nounwind 
+
+declare i32 @llvm.atomic.load.umin.i32.p0i32(i32*, i32) nounwind 
+
+declare i32 @llvm.atomic.swap.i32.p0i32(i32*, i32) nounwind 
+
+declare i32 @llvm.atomic.cmp.swap.i32.p0i32(i32*, i32, i32) nounwind 
diff --git a/test/CodeGen/ARM/bfi.ll b/test/CodeGen/ARM/bfi.ll
index 946db19..c94b096 100644
--- a/test/CodeGen/ARM/bfi.ll
+++ b/test/CodeGen/ARM/bfi.ll
@@ -31,8 +31,7 @@ define i32 @f3(i32 %A, i32 %B) nounwind {
 entry:
 ; CHECK: f3
 ; CHECK: lsr{{.*}} #7
-; CHECK: mov r0, r1
-; CHECK: bfi r0, r2, #7, #16
+; CHECK: bfi {{.*}}, #7, #16
   %and = and i32 %A, 8388480                      ; <i32> [#uses=1]
   %and2 = and i32 %B, -8388481                    ; <i32> [#uses=1]
   %or = or i32 %and2, %and                        ; <i32> [#uses=1]
@@ -42,8 +41,8 @@ entry:
 ; rdar://8752056
 define i32 @f4(i32 %a) nounwind {
 ; CHECK: f4
-; CHECK: movw r1, #3137
-; CHECK: bfi r1, r0, #15, #5
+; CHECK: movw [[R1:r[0-9]+]], #3137
+; CHECK: bfi [[R1]], {{r[0-9]+}}, #15, #5
   %1 = shl i32 %a, 15
   %ins7 = and i32 %1, 1015808
   %ins12 = or i32 %ins7, 3137
diff --git a/test/CodeGen/ARM/call-tc.ll b/test/CodeGen/ARM/call-tc.ll
index 4dc37aa..c460f7a 100644
--- a/test/CodeGen/ARM/call-tc.ll
+++ b/test/CodeGen/ARM/call-tc.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -mtriple=armv6-apple-darwin -mattr=+vfp2 -arm-tail-calls | FileCheck %s -check-prefix=CHECKV6
 ; RUN: llc < %s -mtriple=armv6-linux-gnueabi -relocation-model=pic -mattr=+vfp2 -arm-tail-calls | FileCheck %s -check-prefix=CHECKELF
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -arm-tail-calls | FileCheck %s -check-prefix=CHECKT2
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -arm-tail-calls | FileCheck %s -check-prefix=CHECKT2D
 
 @t = weak global i32 ()* null           ; <i32 ()**> [#uses=1]
 
@@ -16,6 +16,10 @@ define void @t1() {
 define void @t2() {
 ; CHECKV6: t2:
 ; CHECKV6: bx r0 @ TAILCALL
+; CHECKT2D: t2:
+; CHECKT2D: ldr
+; CHECKT2D-NEXT: ldr
+; CHECKT2D-NEXT: bx r0 @ TAILCALL
         %tmp = load i32 ()** @t         ; <i32 ()*> [#uses=1]
         %tmp.upgrd.2 = tail call i32 %tmp( )            ; <i32> [#uses=0]
         ret void
@@ -26,6 +30,9 @@ define void @t3() {
 ; CHECKV6: b _t2  @ TAILCALL
 ; CHECKELF: t3:
 ; CHECKELF: b t2(PLT) @ TAILCALL
+; CHECKT2D: t3:
+; CHECKT2D: b.w _t2  @ TAILCALL
+
         tail call void @t2( )            ; <i32> [#uses=0]
         ret void
 }
@@ -71,10 +78,10 @@ declare void @foo() nounwind
 
 define void @t7() nounwind {
 entry:
-; CHECKT2: t7:
-; CHECKT2: blxeq _foo
-; CHECKT2-NEXT: pop.w
-; CHECKT2-NEXT: b _foo
+; CHECKT2D: t7:
+; CHECKT2D: blxeq _foo
+; CHECKT2D-NEXT: pop.w
+; CHECKT2D-NEXT: b.w _foo
   br i1 undef, label %bb, label %bb1.lr.ph
 
 bb1.lr.ph:
diff --git a/test/CodeGen/ARM/debug-info-branch-folding.ll b/test/CodeGen/ARM/debug-info-branch-folding.ll
new file mode 100644
index 0000000..9bdae43
--- /dev/null
+++ b/test/CodeGen/ARM/debug-info-branch-folding.ll
@@ -0,0 +1,94 @@
+; RUN: llc < %s - | FileCheck %s
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
+target triple = "thumbv7-apple-macosx10.6.7"
+
+;CHECK: 	vadd.f32	q4, q8, q8
+;CHECK-NEXT: Ltmp
+;CHECK-NEXT: 	@DEBUG_VALUE: y <- Q4+0
+;CHECK-NEXT:    @DEBUG_VALUE: x <- Q4+0
+
+
+@.str = external constant [13 x i8]
+
+declare <4 x float> @test0001(float) nounwind readnone ssp
+
+define i32 @main(i32 %argc, i8** nocapture %argv) nounwind ssp {
+entry:
+  br label %for.body9
+
+for.body9:                                        ; preds = %for.body9, %entry
+  %add19 = fadd <4 x float> undef, <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00>, !dbg !39
+  tail call void @llvm.dbg.value(metadata !{<4 x float> %add19}, i64 0, metadata !27), !dbg !39
+  %add20 = fadd <4 x float> undef, <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00>, !dbg !39
+  tail call void @llvm.dbg.value(metadata !{<4 x float> %add20}, i64 0, metadata !28), !dbg !39
+  br i1 undef, label %for.end54, label %for.body9, !dbg !44
+
+for.end54:                                        ; preds = %for.body9
+  %tmp115 = extractelement <4 x float> %add19, i32 1
+  %conv6.i75 = fpext float %tmp115 to double, !dbg !45
+  %call.i82 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([13 x i8]* @.str, i32 0, i32 0), double undef, double %conv6.i75, double undef, double undef) nounwind, !dbg !45
+  %tmp116 = extractelement <4 x float> %add20, i32 1
+  %conv6.i76 = fpext float %tmp116 to double, !dbg !45
+  %call.i83 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([13 x i8]* @.str, i32 0, i32 0), double undef, double %conv6.i76, double undef, double undef) nounwind, !dbg !45
+  ret i32 0, !dbg !49
+}
+
+declare i32 @printf(i8* nocapture, ...) nounwind
+
+declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+
+!llvm.dbg.sp = !{!0, !10, !14}
+!llvm.dbg.lv.test0001 = !{!18}
+!llvm.dbg.lv.main = !{!19, !20, !24, !26, !27, !28, !29}
+!llvm.dbg.lv.printFV = !{!30}
+
+!0 = metadata !{i32 589870, i32 0, metadata !1, metadata !"test0001", metadata !"test0001", metadata !"", metadata !1, i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, <4 x float> (float)* @test0001, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 589865, metadata !"build2.c", metadata !"/private/tmp", metadata !2} ; [ DW_TAG_file_type ]
+!2 = metadata !{i32 589841, i32 0, i32 12, metadata !"build2.c", metadata !"/private/tmp", metadata !"clang version 3.0 (trunk 129915)", i1 true, i1 true, metadata !"", i32 0} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{i32 589845, metadata !1, metadata !"", metadata !1, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 589846, metadata !2, metadata !"v4f32", metadata !1, i32 14, i64 0, i64 0, i64 0, i32 0, metadata !6} ; [ DW_TAG_typedef ]
+!6 = metadata !{i32 590083, metadata !2, metadata !"", metadata !2, i32 0, i64 128, i64 128, i32 0, i32 0, metadata !7, metadata !8, i32 0, i32 0} ; [ DW_TAG_vector_type ]
+!7 = metadata !{i32 589860, metadata !2, metadata !"float", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
+!8 = metadata !{metadata !9}
+!9 = metadata !{i32 589857, i64 0, i64 3}         ; [ DW_TAG_subrange_type ]
+!10 = metadata !{i32 589870, i32 0, metadata !1, metadata !"main", metadata !"main", metadata !"", metadata !1, i32 59, metadata !11, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32, i8**)* @main, null} ; [ DW_TAG_subprogram ]
+!11 = metadata !{i32 589845, metadata !1, metadata !"", metadata !1, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !12, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!12 = metadata !{metadata !13}
+!13 = metadata !{i32 589860, metadata !2, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!14 = metadata !{i32 589870, i32 0, metadata !15, metadata !"printFV", metadata !"printFV", metadata !"", metadata !15, i32 41, metadata !16, i1 true, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, null, null} ; [ DW_TAG_subprogram ]
+!15 = metadata !{i32 589865, metadata !"/Volumes/Lalgate/work/llvm/projects/llvm-test/SingleSource/UnitTests/Vector/helpers.h", metadata !"/private/tmp", metadata !2} ; [ DW_TAG_file_type ]
+!16 = metadata !{i32 589845, metadata !15, metadata !"", metadata !15, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !17, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!17 = metadata !{null}
+!18 = metadata !{i32 590081, metadata !0, metadata !"a", metadata !1, i32 16777219, metadata !7, i32 0} ; [ DW_TAG_arg_variable ]
+!19 = metadata !{i32 590081, metadata !10, metadata !"argc", metadata !1, i32 16777275, metadata !13, i32 0} ; [ DW_TAG_arg_variable ]
+!20 = metadata !{i32 590081, metadata !10, metadata !"argv", metadata !1, i32 33554491, metadata !21, i32 0} ; [ DW_TAG_arg_variable ]
+!21 = metadata !{i32 589839, metadata !2, metadata !"", null, i32 0, i64 32, i64 32, i64 0, i32 0, metadata !22} ; [ DW_TAG_pointer_type ]
+!22 = metadata !{i32 589839, metadata !2, metadata !"", null, i32 0, i64 32, i64 32, i64 0, i32 0, metadata !23} ; [ DW_TAG_pointer_type ]
+!23 = metadata !{i32 589860, metadata !2, metadata !"char", null, i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
+!24 = metadata !{i32 590080, metadata !25, metadata !"i", metadata !1, i32 60, metadata !13, i32 0} ; [ DW_TAG_auto_variable ]
+!25 = metadata !{i32 589835, metadata !10, i32 59, i32 33, metadata !1, i32 14} ; [ DW_TAG_lexical_block ]
+!26 = metadata !{i32 590080, metadata !25, metadata !"j", metadata !1, i32 60, metadata !13, i32 0} ; [ DW_TAG_auto_variable ]
+!27 = metadata !{i32 590080, metadata !25, metadata !"x", metadata !1, i32 61, metadata !5, i32 0} ; [ DW_TAG_auto_variable ]
+!28 = metadata !{i32 590080, metadata !25, metadata !"y", metadata !1, i32 62, metadata !5, i32 0} ; [ DW_TAG_auto_variable ]
+!29 = metadata !{i32 590080, metadata !25, metadata !"z", metadata !1, i32 63, metadata !5, i32 0} ; [ DW_TAG_auto_variable ]
+!30 = metadata !{i32 590081, metadata !14, metadata !"F", metadata !15, i32 16777257, metadata !31, i32 0} ; [ DW_TAG_arg_variable ]
+!31 = metadata !{i32 589839, metadata !2, metadata !"", null, i32 0, i64 32, i64 32, i64 0, i32 0, metadata !32} ; [ DW_TAG_pointer_type ]
+!32 = metadata !{i32 589846, metadata !2, metadata !"FV", metadata !15, i32 25, i64 0, i64 0, i64 0, i32 0, metadata !33} ; [ DW_TAG_typedef ]
+!33 = metadata !{i32 589847, metadata !2, metadata !"", metadata !15, i32 22, i64 128, i64 128, i64 0, i32 0, i32 0, metadata !34, i32 0, i32 0} ; [ DW_TAG_union_type ]
+!34 = metadata !{metadata !35, metadata !37}
+!35 = metadata !{i32 589837, metadata !15, metadata !"V", metadata !15, i32 23, i64 128, i64 128, i64 0, i32 0, metadata !36} ; [ DW_TAG_member ]
+!36 = metadata !{i32 589846, metadata !2, metadata !"v4sf", metadata !15, i32 3, i64 0, i64 0, i64 0, i32 0, metadata !6} ; [ DW_TAG_typedef ]
+!37 = metadata !{i32 589837, metadata !15, metadata !"A", metadata !15, i32 24, i64 128, i64 32, i64 0, i32 0, metadata !38} ; [ DW_TAG_member ]
+!38 = metadata !{i32 589825, metadata !2, metadata !"", metadata !2, i32 0, i64 128, i64 32, i32 0, i32 0, metadata !7, metadata !8, i32 0, i32 0} ; [ DW_TAG_array_type ]
+!39 = metadata !{i32 79, i32 7, metadata !40, null}
+!40 = metadata !{i32 589835, metadata !41, i32 75, i32 35, metadata !1, i32 18} ; [ DW_TAG_lexical_block ]
+!41 = metadata !{i32 589835, metadata !42, i32 75, i32 5, metadata !1, i32 17} ; [ DW_TAG_lexical_block ]
+!42 = metadata !{i32 589835, metadata !43, i32 71, i32 32, metadata !1, i32 16} ; [ DW_TAG_lexical_block ]
+!43 = metadata !{i32 589835, metadata !25, i32 71, i32 3, metadata !1, i32 15} ; [ DW_TAG_lexical_block ]
+!44 = metadata !{i32 75, i32 5, metadata !42, null}
+!45 = metadata !{i32 42, i32 2, metadata !46, metadata !48}
+!46 = metadata !{i32 589835, metadata !47, i32 42, i32 2, metadata !15, i32 20} ; [ DW_TAG_lexical_block ]
+!47 = metadata !{i32 589835, metadata !14, i32 41, i32 28, metadata !15, i32 19} ; [ DW_TAG_lexical_block ]
+!48 = metadata !{i32 95, i32 3, metadata !25, null}
+!49 = metadata !{i32 99, i32 3, metadata !25, null}
diff --git a/test/CodeGen/ARM/debug-info-sreg2.ll b/test/CodeGen/ARM/debug-info-sreg2.ll
new file mode 100644
index 0000000..16aeab3
--- /dev/null
+++ b/test/CodeGen/ARM/debug-info-sreg2.ll
@@ -0,0 +1,61 @@
+; RUN: llc < %s - | FileCheck %s
+; Radar 9376013
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
+target triple = "thumbv7-apple-macosx10.6.7"
+
+;CHECK: Ldebug_loc0:
+;CHECK-NEXT:        .long   Ltmp1
+;CHECK-NEXT:        .long   Ltmp3
+;CHECK-NEXT: Lset9 = Ltmp10-Ltmp9                    @ Loc expr size
+;CHECK-NEXT:        .short  Lset9
+;CHECK-NEXT: Ltmp9:
+;CHECK-NEXT:        .byte   144                     @ DW_OP_regx for S register
+
+define void @_Z3foov() optsize ssp {
+entry:
+  %call = tail call float @_Z3barv() optsize, !dbg !11
+  tail call void @llvm.dbg.value(metadata !{float %call}, i64 0, metadata !5), !dbg !11
+  %call16 = tail call float @_Z2f2v() optsize, !dbg !12
+  %cmp7 = fcmp olt float %call, %call16, !dbg !12
+  br i1 %cmp7, label %for.body, label %for.end, !dbg !12
+
+for.body:                                         ; preds = %entry, %for.body
+  %k.08 = phi float [ %inc, %for.body ], [ %call, %entry ]
+  %call4 = tail call float @_Z2f3f(float %k.08) optsize, !dbg !13
+  %inc = fadd float %k.08, 1.000000e+00, !dbg !14
+  %call1 = tail call float @_Z2f2v() optsize, !dbg !12
+  %cmp = fcmp olt float %inc, %call1, !dbg !12
+  br i1 %cmp, label %for.body, label %for.end, !dbg !12
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void, !dbg !15
+}
+
+declare float @_Z3barv() optsize
+
+declare float @_Z2f2v() optsize
+
+declare float @_Z2f3f(float) optsize
+
+declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+
+!llvm.dbg.cu = !{!0}
+!llvm.dbg.sp = !{!1}
+!llvm.dbg.lv._Z3foov = !{!5, !8}
+
+!0 = metadata !{i32 589841, i32 0, i32 4, metadata !"k.cc", metadata !"/private/tmp", metadata !"clang version 3.0 (trunk 130845)", i1 true, i1 true, metadata !"", i32 0} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{i32 589870, i32 0, metadata !2, metadata !"foo", metadata !"foo", metadata !"_Z3foov", metadata !2, i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, void ()* @_Z3foov, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{i32 589865, metadata !"k.cc", metadata !"/private/tmp", metadata !0} ; [ DW_TAG_file_type ]
+!3 = metadata !{i32 589845, metadata !2, metadata !"", metadata !2, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{null}
+!5 = metadata !{i32 590080, metadata !6, metadata !"k", metadata !2, i32 6, metadata !7, i32 0} ; [ DW_TAG_auto_variable ]
+!6 = metadata !{i32 589835, metadata !1, i32 5, i32 12, metadata !2, i32 0} ; [ DW_TAG_lexical_block ]
+!7 = metadata !{i32 589860, metadata !0, metadata !"float", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
+!8 = metadata !{i32 590080, metadata !9, metadata !"y", metadata !2, i32 8, metadata !7, i32 0} ; [ DW_TAG_auto_variable ]
+!9 = metadata !{i32 589835, metadata !10, i32 7, i32 25, metadata !2, i32 2} ; [ DW_TAG_lexical_block ]
+!10 = metadata !{i32 589835, metadata !6, i32 7, i32 3, metadata !2, i32 1} ; [ DW_TAG_lexical_block ]
+!11 = metadata !{i32 6, i32 18, metadata !6, null}
+!12 = metadata !{i32 7, i32 3, metadata !6, null}
+!13 = metadata !{i32 8, i32 20, metadata !9, null}
+!14 = metadata !{i32 7, i32 20, metadata !10, null}
+!15 = metadata !{i32 10, i32 1, metadata !6, null}
diff --git a/test/CodeGen/ARM/divmod.ll b/test/CodeGen/ARM/divmod.ll
deleted file mode 100644
index 34313aa..0000000
--- a/test/CodeGen/ARM/divmod.ll
+++ /dev/null
@@ -1,58 +0,0 @@
-; RUN: llc < %s -mtriple=arm-apple-ios | FileCheck %s
-
-define void @foo(i32 %x, i32 %y, i32* nocapture %P) nounwind ssp {
-entry:
-; CHECK: foo:
-; CHECK: bl ___divmodsi4
-; CHECK-NOT: bl ___divmodsi4
-  %div = sdiv i32 %x, %y
-  store i32 %div, i32* %P, align 4
-  %rem = srem i32 %x, %y
-  %arrayidx6 = getelementptr inbounds i32* %P, i32 1
-  store i32 %rem, i32* %arrayidx6, align 4
-  ret void
-}
-
-define void @bar(i32 %x, i32 %y, i32* nocapture %P) nounwind ssp {
-entry:
-; CHECK: bar:
-; CHECK: bl ___udivmodsi4
-; CHECK-NOT: bl ___udivmodsi4
-  %div = udiv i32 %x, %y
-  store i32 %div, i32* %P, align 4
-  %rem = urem i32 %x, %y
-  %arrayidx6 = getelementptr inbounds i32* %P, i32 1
-  store i32 %rem, i32* %arrayidx6, align 4
-  ret void
-}
-
-; rdar://9280991
-@flags = external unnamed_addr global i32
-@tabsize = external unnamed_addr global i32
-
-define void @do_indent(i32 %cols) nounwind {
-entry:
-; CHECK: do_indent:
-  %0 = load i32* @flags, align 4
-  %1 = and i32 %0, 67108864
-  %2 = icmp eq i32 %1, 0
-  br i1 %2, label %bb1, label %bb
-
-bb:
-; CHECK: bl ___divmodsi4
-  %3 = load i32* @tabsize, align 4
-  %4 = srem i32 %cols, %3
-  %5 = sdiv i32 %cols, %3
-  %6 = tail call i32 @llvm.objectsize.i32(i8* null, i1 false)
-  %7 = tail call i8* @__memset_chk(i8* null, i32 9, i32 %5, i32 %6) nounwind
-  br label %bb1
-
-bb1:
-  %line_indent_len.0 = phi i32 [ %4, %bb ], [ 0, %entry ]
-  %8 = getelementptr inbounds i8* null, i32 %line_indent_len.0
-  store i8 0, i8* %8, align 1
-  ret void
-}
-
-declare i32 @llvm.objectsize.i32(i8*, i1) nounwind readnone
-declare i8* @__memset_chk(i8*, i32, i32, i32) nounwind
diff --git a/test/CodeGen/ARM/eh-resume-darwin.ll b/test/CodeGen/ARM/eh-resume-darwin.ll
new file mode 100644
index 0000000..e475508
--- /dev/null
+++ b/test/CodeGen/ARM/eh-resume-darwin.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -march=arm | FileCheck %s
+target triple = "armv6-apple-macosx10.6"
+
+declare void @func()
+
+declare i8* @llvm.eh.exception() nounwind readonly
+
+declare i32 @llvm.eh.selector(i8*, i8*, ...) nounwind
+
+declare void @llvm.eh.resume(i8*, i32)
+
+declare i32 @__gxx_personality_sj0(...)
+
+define void @test0() {
+entry:
+  invoke void @func()
+    to label %cont unwind label %lpad
+
+cont:
+  ret void
+
+lpad:
+  %exn = call i8* @llvm.eh.exception()
+  %sel = call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* %exn, i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*), i32 0)
+  call void @llvm.eh.resume(i8* %exn, i32 %sel) noreturn
+  unreachable
+}
+
+; CHECK: __Unwind_SjLj_Resume
diff --git a/test/CodeGen/ARM/fast-isel-crash2.ll b/test/CodeGen/ARM/fast-isel-crash2.ll
new file mode 100644
index 0000000..aa06299
--- /dev/null
+++ b/test/CodeGen/ARM/fast-isel-crash2.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -O0 -mtriple=thumbv7-apple-darwin
+; rdar://9515076
+; (Make sure this doesn't crash.)
+
+define i32 @test(i32 %i) {
+  %t = trunc i32 %i to i4
+  %r = sext i4 %t to i32
+  ret i32 %r
+}
diff --git a/test/CodeGen/ARM/fast-isel.ll b/test/CodeGen/ARM/fast-isel.ll
index 6aad92f..499c97f 100644
--- a/test/CodeGen/ARM/fast-isel.ll
+++ b/test/CodeGen/ARM/fast-isel.ll
@@ -61,3 +61,100 @@ b3:
 ; THUMB: orr {{.*}} #4
 ; ARM: orr {{.*}} #4
 }
+
+define void @test3(i32 %tmp, i32* %ptr1, i16* %ptr2, i8* %ptr3) nounwind {
+; THUMB: test3:
+; ARM: test3:
+
+bb1:
+  %a1 = trunc i32 %tmp to i16
+  %a2 = trunc i16 %a1 to i8
+  %a3 = trunc i8 %a2 to i1
+  %a4 = zext i1 %a3 to i8
+  store i8 %a4, i8* %ptr3
+  %a5 = zext i8 %a4 to i16
+  store i16 %a5, i16* %ptr2
+  %a6 = zext i16 %a5 to i32
+  store i32 %a6, i32* %ptr1
+  br label %bb2
+
+; THUMB: and
+; THUMB: strb
+; THUMB: uxtb
+; THUMB: strh
+; THUMB: uxth
+; ARM: and
+; ARM: strb
+; ARM: uxtb
+; ARM: strh
+; ARM: uxth
+
+bb2:
+  %b1 = trunc i32 %tmp to i16
+  %b2 = trunc i16 %b1 to i8
+  store i8 %b2, i8* %ptr3
+  %b3 = sext i8 %b2 to i16
+  store i16 %b3, i16* %ptr2
+  %b4 = sext i16 %b3 to i32
+  store i32 %b4, i32* %ptr1
+  br label %bb3
+
+; THUMB: strb
+; THUMB: sxtb
+; THUMB: strh
+; THUMB: sxth
+; ARM: strb
+; ARM: sxtb
+; ARM: strh
+; ARM: sxth
+
+bb3:
+  %c1 = load i8* %ptr3
+  %c2 = load i16* %ptr2
+  %c3 = load i32* %ptr1
+  %c4 = zext i8 %c1 to i32
+  %c5 = sext i16 %c2 to i32
+  %c6 = add i32 %c4, %c5
+  %c7 = sub i32 %c3, %c6
+  store i32 %c7, i32* %ptr1
+  ret void
+
+; THUMB: ldrb
+; THUMB: ldrh
+; THUMB: uxtb
+; THUMB: sxth
+; THUMB: add
+; THUMB: sub
+; ARM: ldrb
+; ARM: ldrh
+; ARM: uxtb
+; ARM: sxth
+; ARM: add
+; ARM: sub
+}
+
+; Check loads/stores with globals
+@test4g = external global i32
+
+define void @test4() {
+  %a = load i32* @test4g
+  %b = add i32 %a, 1
+  store i32 %b, i32* @test4g
+  ret void
+
+; THUMB: ldr.n r0, LCPI4_1
+; THUMB: ldr r0, [r0]
+; THUMB: ldr r0, [r0]
+; THUMB: adds r0, #1
+; THUMB: ldr.n r1, LCPI4_0
+; THUMB: ldr r1, [r1]
+; THUMB: str r0, [r1]
+
+; ARM: ldr r0, LCPI4_1
+; ARM: ldr r0, [r0]
+; ARM: ldr r0, [r0]
+; ARM: add r0, r0, #1
+; ARM: ldr r1, LCPI4_0
+; ARM: ldr r1, [r1]
+; ARM: str r0, [r1]
+}
diff --git a/test/CodeGen/ARM/fcopysign.ll b/test/CodeGen/ARM/fcopysign.ll
index f241c26..c4dbeb9 100644
--- a/test/CodeGen/ARM/fcopysign.ll
+++ b/test/CodeGen/ARM/fcopysign.ll
@@ -10,7 +10,7 @@ entry:
 
 ; HARD: test1:
 ; HARD: vmov.i32 [[REG1:(d[0-9]+)]], #0x80000000
-; HARD: vbsl [[REG1]], d2, d0
+; HARD: vbsl [[REG1]], d
   %0 = tail call float @copysignf(float %x, float %y) nounwind
   ret float %0
 }
diff --git a/test/CodeGen/ARM/fnmscs.ll b/test/CodeGen/ARM/fnmscs.ll
index 9facf20..6081712 100644
--- a/test/CodeGen/ARM/fnmscs.ll
+++ b/test/CodeGen/ARM/fnmscs.ll
@@ -29,7 +29,7 @@ entry:
 ; NEON: vnmla.f32
 
 ; A8: t2:
-; A8: vnmul.f32 s{{[0123]}}, s{{[0123]}}, s{{[0123]}}
+; A8: vnmul.f32 s{{[01234]}}, s{{[01234]}}, s{{[01234]}}
 ; A8: vsub.f32 d{{[0-9]}}, d{{[0-9]}}, d{{[0-9]}}
 	%0 = fmul float %a, %b
 	%1 = fmul float -1.0, %0
diff --git a/test/CodeGen/ARM/inlineasm3.ll b/test/CodeGen/ARM/inlineasm3.ll
index 9d6eba8..58687b9 100644
--- a/test/CodeGen/ARM/inlineasm3.ll
+++ b/test/CodeGen/ARM/inlineasm3.ll
@@ -23,3 +23,38 @@ entry:
   %asmtmp2 = tail call i32 asm sideeffect "vmov d30, $1\0Avmov.32 $0, d30[0]\0A", "=r,w,~{d30}"(<2 x i32> undef) nounwind
   ret void
 }
+
+; Radar 9306086
+
+%0 = type { <8 x i8>, <16 x i8>* }
+
+define hidden void @conv4_8_E() nounwind {
+entry:
+%asmtmp31 = call %0 asm "vld1.u8  {$0}, [$1, :128]!\0A", "=w,=r,1"(<16 x i8>* undef) nounwind
+unreachable
+}
+
+; Radar 9037836 & 9119939
+
+define i32 @t3() nounwind {
+entry:
+tail call void asm sideeffect "flds s15, $0 \0A", "^Uv|m,~{s15}"(float 1.000000e+00) nounwind
+ret i32 0
+}
+
+; Radar 9037836 & 9119939
+
+@k.2126 = internal unnamed_addr global float 1.000000e+00
+define i32 @t4() nounwind {
+entry:
+call void asm sideeffect "flds s15, $0 \0A", "*^Uv,~{s15}"(float* @k.2126) nounwind
+ret i32 0
+}
+
+; Radar 9037836 & 9119939
+
+define i32 @t5() nounwind {
+entry:
+call void asm sideeffect "flds s15, $0 \0A", "*^Uvm,~{s15}"(float* @k.2126) nounwind
+ret i32 0
+}
diff --git a/test/CodeGen/ARM/intrinsics.ll b/test/CodeGen/ARM/intrinsics.ll
new file mode 100644
index 0000000..54cc3e0
--- /dev/null
+++ b/test/CodeGen/ARM/intrinsics.ll
@@ -0,0 +1,39 @@
+; RUN: llc < %s -mtriple=armv7-eabi -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc < %s -march=thumb -mtriple=thumbv7-eabi -mcpu=cortex-a8 | FileCheck %s
+
+define void @coproc() nounwind {
+entry:
+  ; CHECK: mrc
+  %0 = tail call i32 @llvm.arm.mrc(i32 7, i32 1, i32 1, i32 1, i32 4) nounwind
+  ; CHECK: mcr
+  tail call void @llvm.arm.mcr(i32 7, i32 1, i32 %0, i32 1, i32 1, i32 4) nounwind
+  ; CHECK: mrc2
+  %1 = tail call i32 @llvm.arm.mrc2(i32 7, i32 1, i32 1, i32 1, i32 4) nounwind
+  ; CHECK: mcr2
+  tail call void @llvm.arm.mcr2(i32 7, i32 1, i32 %1, i32 1, i32 1, i32 4) nounwind
+  ; CHECK: mcrr
+  tail call void @llvm.arm.mcrr(i32 7, i32 1, i32 %0, i32 %1, i32 1) nounwind
+  ; CHECK: mcrr2
+  tail call void @llvm.arm.mcrr2(i32 7, i32 1, i32 %0, i32 %1, i32 1) nounwind
+  ; CHECK: cdp
+  tail call void @llvm.arm.cdp(i32 7, i32 3, i32 1, i32 1, i32 1, i32 5) nounwind
+  ; CHECK: cdp2
+  tail call void @llvm.arm.cdp2(i32 7, i32 3, i32 1, i32 1, i32 1, i32 5) nounwind
+  ret void
+}
+
+declare void @llvm.arm.cdp2(i32, i32, i32, i32, i32, i32) nounwind
+
+declare void @llvm.arm.cdp(i32, i32, i32, i32, i32, i32) nounwind
+
+declare void @llvm.arm.mcrr2(i32, i32, i32, i32, i32) nounwind
+
+declare void @llvm.arm.mcrr(i32, i32, i32, i32, i32) nounwind
+
+declare void @llvm.arm.mcr2(i32, i32, i32, i32, i32, i32) nounwind
+
+declare i32 @llvm.arm.mrc2(i32, i32, i32, i32, i32) nounwind
+
+declare void @llvm.arm.mcr(i32, i32, i32, i32, i32, i32) nounwind
+
+declare i32 @llvm.arm.mrc(i32, i32, i32, i32, i32) nounwind
diff --git a/test/CodeGen/ARM/ldst-f32-2-i32.ll b/test/CodeGen/ARM/ldst-f32-2-i32.ll
index 2d016f6..1c69e15 100644
--- a/test/CodeGen/ARM/ldst-f32-2-i32.ll
+++ b/test/CodeGen/ARM/ldst-f32-2-i32.ll
@@ -10,8 +10,8 @@ entry:
   br i1 %0, label %return, label %bb
 
 bb:
-; CHECK: ldr [[REGISTER:(r[0-9]+)]], [r1], r3
-; CHECK: str [[REGISTER]], [r2], #4
+; CHECK: ldr [[REGISTER:(r[0-9]+)]], [{{r[0-9]+}}], {{r[0-9]+}}
+; CHECK: str [[REGISTER]], [{{r[0-9]+}}], #4
   %j.05 = phi i32 [ %2, %bb ], [ 0, %entry ]
   %tmp = mul i32 %j.05, %index
   %uglygep = getelementptr i8* %src6, i32 %tmp
diff --git a/test/CodeGen/ARM/ldstrexd.ll b/test/CodeGen/ARM/ldstrexd.ll
new file mode 100644
index 0000000..0c0911a
--- /dev/null
+++ b/test/CodeGen/ARM/ldstrexd.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -mtriple=armv7-apple-darwin   | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin | FileCheck %s
+
+%0 = type { i32, i32 }
+
+; CHECK: f0:
+; CHECK: ldrexd
+define i64 @f0(i8* %p) nounwind readonly {
+entry:
+  %ldrexd = tail call %0 @llvm.arm.ldrexd(i8* %p)
+  %0 = extractvalue %0 %ldrexd, 1
+  %1 = extractvalue %0 %ldrexd, 0
+  %2 = zext i32 %0 to i64
+  %3 = zext i32 %1 to i64
+  %shl = shl nuw i64 %2, 32
+  %4 = or i64 %shl, %3
+  ret i64 %4
+}
+
+; CHECK: f1:
+; CHECK: strexd
+define i32 @f1(i8* %ptr, i64 %val) nounwind {
+entry:
+  %tmp4 = trunc i64 %val to i32
+  %tmp6 = lshr i64 %val, 32
+  %tmp7 = trunc i64 %tmp6 to i32
+  %strexd = tail call i32 @llvm.arm.strexd(i32 %tmp4, i32 %tmp7, i8* %ptr)
+  ret i32 %strexd
+}
+
+declare %0 @llvm.arm.ldrexd(i8*) nounwind readonly
+declare i32 @llvm.arm.strexd(i32, i32, i8*) nounwind
+
diff --git a/test/CodeGen/ARM/lsr-unfolded-offset.ll b/test/CodeGen/ARM/lsr-unfolded-offset.ll
new file mode 100644
index 0000000..e3e6eae
--- /dev/null
+++ b/test/CodeGen/ARM/lsr-unfolded-offset.ll
@@ -0,0 +1,80 @@
+; RUN: llc -regalloc=greedy < %s | FileCheck %s
+
+; LSR shouldn't introduce more induction variables than needed, increasing
+; register pressure and therefore spilling. There is more room for improvement
+; here.
+
+; CHECK: sub sp, #{{32|24}}
+
+; CHECK:      ldr r{{.*}}, [sp, #4]
+; CHECK-NEXT: ldr r{{.*}}, [sp, #16]
+; CHECK-NEXT: ldr r{{.*}}, [sp, #12]
+; CHECK-NEXT: adds
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
+target triple = "thumbv7-apple-macosx10.7.0"
+
+%struct.partition_entry = type { i32, i32, i64, i64 }
+
+define i32 @partition_overlap_check(%struct.partition_entry* nocapture %part, i32 %num_entries) nounwind readonly optsize ssp {
+entry:
+  %cmp79 = icmp sgt i32 %num_entries, 0
+  br i1 %cmp79, label %outer.loop, label %for.end72
+
+outer.loop:                                 ; preds = %for.inc69, %entry
+  %overlap.081 = phi i32 [ %overlap.4, %for.inc69 ], [ 0, %entry ]
+  %0 = phi i32 [ %inc71, %for.inc69 ], [ 0, %entry ]
+  %offset = getelementptr %struct.partition_entry* %part, i32 %0, i32 2
+  %len = getelementptr %struct.partition_entry* %part, i32 %0, i32 3
+  %tmp5 = load i64* %offset, align 4, !tbaa !0
+  %tmp15 = load i64* %len, align 4, !tbaa !0
+  %add = add nsw i64 %tmp15, %tmp5
+  br label %inner.loop
+
+inner.loop:                                       ; preds = %for.inc, %outer.loop
+  %overlap.178 = phi i32 [ %overlap.081, %outer.loop ], [ %overlap.4, %for.inc ]
+  %1 = phi i32 [ 0, %outer.loop ], [ %inc, %for.inc ]
+  %cmp23 = icmp eq i32 %0, %1
+  br i1 %cmp23, label %for.inc, label %if.end
+
+if.end:                                           ; preds = %inner.loop
+  %len39 = getelementptr %struct.partition_entry* %part, i32 %1, i32 3
+  %offset28 = getelementptr %struct.partition_entry* %part, i32 %1, i32 2
+  %tmp29 = load i64* %offset28, align 4, !tbaa !0
+  %tmp40 = load i64* %len39, align 4, !tbaa !0
+  %add41 = add nsw i64 %tmp40, %tmp29
+  %cmp44 = icmp sge i64 %tmp29, %tmp5
+  %cmp47 = icmp slt i64 %tmp29, %add
+  %or.cond = and i1 %cmp44, %cmp47
+  %overlap.2 = select i1 %or.cond, i32 1, i32 %overlap.178
+  %cmp52 = icmp sle i64 %add41, %add
+  %cmp56 = icmp sgt i64 %add41, %tmp5
+  %or.cond74 = and i1 %cmp52, %cmp56
+  %overlap.3 = select i1 %or.cond74, i32 1, i32 %overlap.2
+  %cmp61 = icmp sgt i64 %tmp29, %tmp5
+  %cmp65 = icmp slt i64 %add41, %add
+  %or.cond75 = or i1 %cmp61, %cmp65
+  br i1 %or.cond75, label %for.inc, label %if.then66
+
+if.then66:                                        ; preds = %if.end
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end, %if.then66, %inner.loop
+  %overlap.4 = phi i32 [ %overlap.178, %inner.loop ], [ 1, %if.then66 ], [ %overlap.3, %if.end ]
+  %inc = add nsw i32 %1, 1
+  %exitcond = icmp eq i32 %inc, %num_entries
+  br i1 %exitcond, label %for.inc69, label %inner.loop
+
+for.inc69:                                        ; preds = %for.inc
+  %inc71 = add nsw i32 %0, 1
+  %exitcond83 = icmp eq i32 %inc71, %num_entries
+  br i1 %exitcond83, label %for.end72, label %outer.loop
+
+for.end72:                                        ; preds = %for.inc69, %entry
+  %overlap.0.lcssa = phi i32 [ 0, %entry ], [ %overlap.4, %for.inc69 ]
+  ret i32 %overlap.0.lcssa
+}
+
+!0 = metadata !{metadata !"long long", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/test/CodeGen/ARM/memfunc.ll b/test/CodeGen/ARM/memfunc.ll
index 41d5944..032129d 100644
--- a/test/CodeGen/ARM/memfunc.ll
+++ b/test/CodeGen/ARM/memfunc.ll
@@ -1,10 +1,26 @@
-; RUN: llc < %s -march=arm
+; RUN: llc < %s -march=arm -o - | FileCheck %s
+; RUN: llc < %s -mtriple=arm-none-eabi -o - | FileCheck --check-prefix=EABI %s
+
+@from = common global [500 x i32] zeroinitializer, align 4
+@to = common global [500 x i32] zeroinitializer, align 4
 
 define void @f() {
 entry:
-        call void @llvm.memmove.i32( i8* null, i8* null, i32 64, i32 0 )
-        call void @llvm.memcpy.i32( i8* null, i8* null, i32 64, i32 0 )
-        call void @llvm.memset.i32( i8* null, i8 64, i32 0, i32 0 )
+
+        ; CHECK: memmove
+        ; EABI: __aeabi_memmove
+        call void @llvm.memmove.i32( i8* bitcast ([500 x i32]* @from to i8*), i8* bitcast ([500 x i32]* @to to i8*), i32 500, i32 0 )
+
+        ; CHECK: memcpy
+        ; EABI: __aeabi_memcpy
+        call void @llvm.memcpy.i32( i8* bitcast ([500 x i32]* @from to i8*), i8* bitcast ([500 x i32]* @to to i8*), i32 500, i32 0 )
+
+        ; EABI memset swaps arguments
+        ; CHECK: mov r1, #0
+        ; CHECK: memset
+        ; EABI: mov r2, #0
+        ; EABI: __aeabi_memset
+        call void @llvm.memset.i32( i8* bitcast ([500 x i32]* @from to i8*), i8 0, i32 500, i32 0 )
         unreachable
 }
 
diff --git a/test/CodeGen/ARM/movt-movw-global.ll b/test/CodeGen/ARM/movt-movw-global.ll
index 886ff3f..991d728 100644
--- a/test/CodeGen/ARM/movt-movw-global.ll
+++ b/test/CodeGen/ARM/movt-movw-global.ll
@@ -1,20 +1,39 @@
-; RUN: llc < %s | FileCheck %s
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
-target triple = "armv7-eabi"
+; RUN: llc < %s -mtriple=armv7-eabi      | FileCheck %s -check-prefix=EABI
+; RUN: llc < %s -mtriple=armv7-apple-ios -relocation-model=dynamic-no-pic | FileCheck %s -check-prefix=IOS
+; RUN: llc < %s -mtriple=armv7-apple-ios -relocation-model=pic            | FileCheck %s -check-prefix=IOS-PIC
+; RUN: llc < %s -mtriple=armv7-apple-ios -relocation-model=static         | FileCheck %s -check-prefix=IOS-STATIC
 
-@foo = common global i32 0                        ; <i32*> [#uses=1]
+@foo = common global i32 0
 
-define arm_aapcs_vfpcc i32* @bar1() nounwind readnone {
+define i32* @bar1() nounwind readnone {
 entry:
-; CHECK:      movw    r0, :lower16:foo
-; CHECK-NEXT: movt    r0, :upper16:foo
+; EABI:      movw    r0, :lower16:foo
+; EABI-NEXT: movt    r0, :upper16:foo
+
+; IOS:      movw    r0, :lower16:L_foo$non_lazy_ptr
+; IOS-NEXT: movt    r0, :upper16:L_foo$non_lazy_ptr
+
+; IOS-PIC:      movw    r0, :lower16:(L_foo$non_lazy_ptr-(LPC0_0+8))
+; IOS-PIC-NEXT: movt    r0, :upper16:(L_foo$non_lazy_ptr-(LPC0_0+8))
+
+; IOS-STATIC-NOT:      movw    r0, :lower16:_foo
+; IOS-STATIC-NOT:       movt    r0, :upper16:_foo
   ret i32* @foo
 }
 
-define arm_aapcs_vfpcc void @bar2(i32 %baz) nounwind {
+define void @bar2(i32 %baz) nounwind {
 entry:
-; CHECK:      movw    r1, :lower16:foo
-; CHECK-NEXT: movt    r1, :upper16:foo
+; EABI:      movw    r1, :lower16:foo
+; EABI-NEXT: movt    r1, :upper16:foo
+
+; IOS:      movw    r1, :lower16:L_foo$non_lazy_ptr
+; IOS-NEXT: movt    r1, :upper16:L_foo$non_lazy_ptr
+
+; IOS-PIC:      movw    r1, :lower16:(L_foo$non_lazy_ptr-(LPC1_0+8))
+; IOS-PIC-NEXT: movt    r1, :upper16:(L_foo$non_lazy_ptr-(LPC1_0+8))
+
+; IOS-STATIC-NOT:      movw    r1, :lower16:_foo
+; IOS-STATIC-NOT:      movt    r1, :upper16:_foo
   store i32 %baz, i32* @foo, align 4
   ret void
 }
diff --git a/test/CodeGen/ARM/select-imm.ll b/test/CodeGen/ARM/select-imm.ll
index 82ed018..43f8a66 100644
--- a/test/CodeGen/ARM/select-imm.ll
+++ b/test/CodeGen/ARM/select-imm.ll
@@ -5,8 +5,8 @@
 define i32 @t1(i32 %c) nounwind readnone {
 entry:
 ; ARM: t1:
-; ARM: mov r1, #101
-; ARM: orr r1, r1, #1, #24
+; ARM: mov [[R1:r[0-9]+]], #101
+; ARM: orr [[R1b:r[0-9]+]], [[R1]], #1, #24
 ; ARM: movgt r0, #123
 
 ; ARMT2: t1:
@@ -34,7 +34,7 @@ entry:
 ; ARMT2: movwgt r0, #357
 
 ; THUMB2: t2:
-; THUMB2: mov.w r0, #123
+; THUMB2: mov{{(s|\.w)}} r0, #123
 ; THUMB2: movwgt r0, #357
 
   %0 = icmp sgt i32 %c, 1
@@ -53,7 +53,7 @@ entry:
 ; ARMT2: moveq r0, #1
 
 ; THUMB2: t3:
-; THUMB2: mov.w r0, #0
+; THUMB2: mov{{(s|\.w)}} r0, #0
 ; THUMB2: moveq r0, #1
   %0 = icmp eq i32 %a, 160
   %1 = zext i1 %0 to i32
@@ -67,11 +67,11 @@ entry:
 ; ARM: movlt
 
 ; ARMT2: t4:
-; ARMT2: movwlt r0, #65365
-; ARMT2: movtlt r0, #65365
+; ARMT2: movwlt [[R0:r[0-9]+]], #65365
+; ARMT2: movtlt [[R0]], #65365
 
 ; THUMB2: t4:
-; THUMB2: mvnlt.w r0, #11141290
+; THUMB2: mvnlt.w [[R0:r[0-9]+]], #11141290
   %0 = icmp slt i32 %a, %b
   %1 = select i1 %0, i32 4283826005, i32 %x
   ret i32 %1
diff --git a/test/CodeGen/ARM/stm.ll b/test/CodeGen/ARM/stm.ll
index 2f5fadb..82dc14d 100644
--- a/test/CodeGen/ARM/stm.ll
+++ b/test/CodeGen/ARM/stm.ll
@@ -9,7 +9,7 @@ define i32 @main() nounwind {
 entry:
 ; CHECK: main
 ; CHECK: push
-; CHECK: stmib
+; CHECK: stm
 	%0 = tail call i32 (i8*, ...)* @printf(i8* getelementptr ([26 x i8]* @"\01LC1", i32 0, i32 0), i32 -2, i32 -3, i32 2, i32 -6) nounwind		; <i32> [#uses=0]
 	%1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr ([32 x i8]* @"\01LC", i32 0, i32 0), i32 0, i32 1, i32 0, i32 1, i32 0, i32 1) nounwind		; <i32> [#uses=0]
 	ret i32 0
diff --git a/test/CodeGen/ARM/vldlane.ll b/test/CodeGen/ARM/vldlane.ll
index 805aad5..0d7d4ec 100644
--- a/test/CodeGen/ARM/vldlane.ll
+++ b/test/CodeGen/ARM/vldlane.ll
@@ -125,7 +125,7 @@ define <2 x i32> @vld2lanei32(i32* %A, <2 x i32>* %B) nounwind {
 ;Check for a post-increment updating load.
 define <2 x i32> @vld2lanei32_update(i32** %ptr, <2 x i32>* %B) nounwind {
 ;CHECK: vld2lanei32_update:
-;CHECK: vld2.32 {d16[1], d17[1]}, [r1]!
+;CHECK: vld2.32 {d16[1], d17[1]}, [{{r[0-9]+}}]!
 	%A = load i32** %ptr
 	%tmp0 = bitcast i32* %A to i8*
 	%tmp1 = load <2 x i32>* %B
@@ -153,7 +153,7 @@ define <2 x float> @vld2lanef(float* %A, <2 x float>* %B) nounwind {
 define <8 x i16> @vld2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
 ;CHECK: vld2laneQi16:
 ;Check the (default) alignment.
-;CHECK: vld2.16 {d17[1], d19[1]}, [r0]
+;CHECK: vld2.16 {d17[1], d19[1]}, [{{r[0-9]+}}]
 	%tmp0 = bitcast i16* %A to i8*
 	%tmp1 = load <8 x i16>* %B
 	%tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1)
@@ -166,7 +166,7 @@ define <8 x i16> @vld2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
 define <4 x i32> @vld2laneQi32(i32* %A, <4 x i32>* %B) nounwind {
 ;CHECK: vld2laneQi32:
 ;Check the alignment value.  Max for this instruction is 64 bits:
-;CHECK: vld2.32 {d17[0], d19[0]}, [r0, :64]
+;CHECK: vld2.32 {d17[0], d19[0]}, [{{r[0-9]+}}, :64]
 	%tmp0 = bitcast i32* %A to i8*
 	%tmp1 = load <4 x i32>* %B
 	%tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16)
@@ -222,7 +222,7 @@ define <8 x i8> @vld3lanei8(i8* %A, <8 x i8>* %B) nounwind {
 define <4 x i16> @vld3lanei16(i16* %A, <4 x i16>* %B) nounwind {
 ;CHECK: vld3lanei16:
 ;Check the (default) alignment value.  VLD3 does not support alignment.
-;CHECK: vld3.16 {d16[1], d17[1], d18[1]}, [r0]
+;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}]
 	%tmp0 = bitcast i16* %A to i8*
 	%tmp1 = load <4 x i16>* %B
 	%tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
@@ -265,7 +265,7 @@ define <2 x float> @vld3lanef(float* %A, <2 x float>* %B) nounwind {
 define <8 x i16> @vld3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
 ;CHECK: vld3laneQi16:
 ;Check the (default) alignment value.  VLD3 does not support alignment.
-;CHECK: vld3.16 {d16[1], d18[1], d20[1]}, [r0]
+;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}]
 	%tmp0 = bitcast i16* %A to i8*
 	%tmp1 = load <8 x i16>* %B
 	%tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
@@ -280,7 +280,7 @@ define <8 x i16> @vld3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
 ;Check for a post-increment updating load with register increment.
 define <8 x i16> @vld3laneQi16_update(i16** %ptr, <8 x i16>* %B, i32 %inc) nounwind {
 ;CHECK: vld3laneQi16_update:
-;CHECK: vld3.16 {d16[1], d18[1], d20[1]}, [{{r[0-9]+}}], {{r[0-9]+}}
+;CHECK: vld3.16 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}], {{r[0-9]+}}
 	%A = load i16** %ptr
 	%tmp0 = bitcast i16* %A to i8*
 	%tmp1 = load <8 x i16>* %B
@@ -344,7 +344,7 @@ declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x flo
 define <8 x i8> @vld4lanei8(i8* %A, <8 x i8>* %B) nounwind {
 ;CHECK: vld4lanei8:
 ;Check the alignment value.  Max for this instruction is 32 bits:
-;CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0, :32]
+;CHECK: vld4.8 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}, :32]
 	%tmp1 = load <8 x i8>* %B
 	%tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
         %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
@@ -360,7 +360,7 @@ define <8 x i8> @vld4lanei8(i8* %A, <8 x i8>* %B) nounwind {
 ;Check for a post-increment updating load.
 define <8 x i8> @vld4lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind {
 ;CHECK: vld4lanei8_update:
-;CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r1, :32]!
+;CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}, :32]!
 	%A = load i8** %ptr
 	%tmp1 = load <8 x i8>* %B
 	%tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
@@ -380,7 +380,7 @@ define <4 x i16> @vld4lanei16(i16* %A, <4 x i16>* %B) nounwind {
 ;CHECK: vld4lanei16:
 ;Check that a power-of-two alignment smaller than the total size of the memory
 ;being loaded is ignored.
-;CHECK: vld4.16 {d16[1], d17[1], d18[1], d19[1]}, [r0]
+;CHECK: vld4.16 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}]
 	%tmp0 = bitcast i16* %A to i8*
 	%tmp1 = load <4 x i16>* %B
 	%tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 4)
@@ -398,7 +398,7 @@ define <2 x i32> @vld4lanei32(i32* %A, <2 x i32>* %B) nounwind {
 ;CHECK: vld4lanei32:
 ;Check the alignment value.  An 8-byte alignment is allowed here even though
 ;it is smaller than the total size of the memory being loaded.
-;CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0, :64]
+;CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}, :64]
 	%tmp0 = bitcast i32* %A to i8*
 	%tmp1 = load <2 x i32>* %B
 	%tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 8)
@@ -431,7 +431,7 @@ define <2 x float> @vld4lanef(float* %A, <2 x float>* %B) nounwind {
 define <8 x i16> @vld4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
 ;CHECK: vld4laneQi16:
 ;Check the alignment value.  Max for this instruction is 64 bits:
-;CHECK: vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r0, :64]
+;CHECK: vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [{{r[0-9]+}}, :64]
 	%tmp0 = bitcast i16* %A to i8*
 	%tmp1 = load <8 x i16>* %B
 	%tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 16)
@@ -448,7 +448,7 @@ define <8 x i16> @vld4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
 define <4 x i32> @vld4laneQi32(i32* %A, <4 x i32>* %B) nounwind {
 ;CHECK: vld4laneQi32:
 ;Check the (default) alignment.
-;CHECK: vld4.32 {d17[0], d19[0], d21[0], d23[0]}, [r0]
+;CHECK: vld4.32 {d17[0], d19[0], d21[0], d23[0]}, [{{r[0-9]+}}]
 	%tmp0 = bitcast i32* %A to i8*
 	%tmp1 = load <4 x i32>* %B
 	%tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
diff --git a/test/CodeGen/ARM/vrev.ll b/test/CodeGen/ARM/vrev.ll
index f0f9e4e..34acd16 100644
--- a/test/CodeGen/ARM/vrev.ll
+++ b/test/CodeGen/ARM/vrev.ll
@@ -147,3 +147,34 @@ define void @test_with_vcombine(<4 x float>* %v) nounwind {
   store <4 x float> %tmp8, <4 x float>* %v, align 16
   ret void
 }
+
+; vrev <4 x i16> should use VREV32 and not VREV64
+define void @test_vrev64(<4 x i16>* nocapture %source, <2 x i16>* nocapture %dst) nounwind ssp {
+; CHECK: test_vrev64:
+; CHECK: vext.16
+; CHECK: vrev32.16
+entry:
+  %0 = bitcast <4 x i16>* %source to <8 x i16>*
+  %tmp2 = load <8 x i16>* %0, align 4
+  %tmp3 = extractelement <8 x i16> %tmp2, i32 6
+  %tmp5 = insertelement <2 x i16> undef, i16 %tmp3, i32 0
+  %tmp9 = extractelement <8 x i16> %tmp2, i32 5
+  %tmp11 = insertelement <2 x i16> %tmp5, i16 %tmp9, i32 1
+  store <2 x i16> %tmp11, <2 x i16>* %dst, align 4
+  ret void
+}
+
+; Test vrev of float4
+define void @float_vrev64(float* nocapture %source, <4 x float>* nocapture %dest) nounwind noinline ssp {
+; CHECK: float_vrev64
+; CHECK: vext.32
+; CHECK: vrev64.32
+entry:
+  %0 = bitcast float* %source to <4 x float>*
+  %tmp2 = load <4 x float>* %0, align 4
+  %tmp5 = shufflevector <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x float> %tmp2, <4 x i32> <i32 0, i32 7, i32 0, i32 0>
+  %arrayidx8 = getelementptr inbounds <4 x float>* %dest, i32 11
+  store <4 x float> %tmp5, <4 x float>* %arrayidx8, align 4
+  ret void
+}
+
diff --git a/test/CodeGen/ARM/vstlane.ll b/test/CodeGen/ARM/vstlane.ll
index d1bc15a..08b7232 100644
--- a/test/CodeGen/ARM/vstlane.ll
+++ b/test/CodeGen/ARM/vstlane.ll
@@ -54,7 +54,8 @@ define void @vst1lanef(float* %A, <2 x float>* %B) nounwind {
 
 define void @vst1laneQi8(i8* %A, <16 x i8>* %B) nounwind {
 ;CHECK: vst1laneQi8:
-;CHECK: vst1.8 {d17[1]}, [r0]
+; // Can use scalar load. No need to use vectors.
+; // CHE-CK: vst1.8 {d17[1]}, [r0]
 	%tmp1 = load <16 x i8>* %B
         %tmp2 = extractelement <16 x i8> %tmp1, i32 9
         store i8 %tmp2, i8* %A, align 8
@@ -72,7 +73,8 @@ define void @vst1laneQi16(i16* %A, <8 x i16>* %B) nounwind {
 
 define void @vst1laneQi32(i32* %A, <4 x i32>* %B) nounwind {
 ;CHECK: vst1laneQi32:
-;CHECK: vst1.32 {d17[1]}, [r0, :32]
+; // Can use scalar load. No need to use vectors.
+; // CHE-CK: vst1.32 {d17[1]}, [r0, :32]
 	%tmp1 = load <4 x i32>* %B
         %tmp2 = extractelement <4 x i32> %tmp1, i32 3
         store i32 %tmp2, i32* %A, align 8
@@ -82,7 +84,8 @@ define void @vst1laneQi32(i32* %A, <4 x i32>* %B) nounwind {
 ;Check for a post-increment updating store.
 define void @vst1laneQi32_update(i32** %ptr, <4 x i32>* %B) nounwind {
 ;CHECK: vst1laneQi32_update:
-;CHECK: vst1.32 {d17[1]}, [r1, :32]!
+; // Can use scalar load. No need to use vectors.
+; // CHE-CK: vst1.32 {d17[1]}, [r1, :32]!
 	%A = load i32** %ptr
 	%tmp1 = load <4 x i32>* %B
 	%tmp2 = extractelement <4 x i32> %tmp1, i32 3
@@ -94,7 +97,8 @@ define void @vst1laneQi32_update(i32** %ptr, <4 x i32>* %B) nounwind {
 
 define void @vst1laneQf(float* %A, <4 x float>* %B) nounwind {
 ;CHECK: vst1laneQf:
-;CHECK: vst1.32 {d17[1]}, [r0]
+; // Can use scalar load. No need to use vectors.
+; // CHE-CK: vst1.32 {d17[1]}, [r0]
 	%tmp1 = load <4 x float>* %B
         %tmp2 = extractelement <4 x float> %tmp1, i32 3
         store float %tmp2, float* %A
diff --git a/test/CodeGen/Generic/promote-integers.ll b/test/CodeGen/Generic/promote-integers.ll
new file mode 100644
index 0000000..5812592
--- /dev/null
+++ b/test/CodeGen/Generic/promote-integers.ll
@@ -0,0 +1,15 @@
+; Test that vectors are scalarized/lowered correctly.
+; RUN: llc -march=x86 -promote-elements < %s | FileCheck %s
+
+; This test is the poster-child for integer-element-promotion.
+; Until this feature is complete, we mark this test as expected to fail.
+; XFAIL: *
+; CHECK: vector_code
+; CHECK: ret
+define <4 x float> @vector_code(<4 x i64> %A, <4 x i64> %B, <4 x float> %R0, <4 x float> %R1 )  {
+   %C = icmp eq <4 x i64> %A, %B
+   %K = xor <4 x i1> <i1 1, i1 1, i1 1, i1 1>, %C
+   %D = select <4 x i1> %K, <4 x float> %R1, <4 x float> %R0
+   ret <4 x float> %D
+}
+
diff --git a/test/CodeGen/Generic/zero-sized-array.ll b/test/CodeGen/Generic/zero-sized-array.ll
new file mode 100644
index 0000000..280ba00
--- /dev/null
+++ b/test/CodeGen/Generic/zero-sized-array.ll
@@ -0,0 +1,81 @@
+; RUN: llc < %s
+; PR9900
+
+%zero = type [0 x i8]
+%foobar = type { i32, %zero }
+
+define void @f(%foobar %arg) {
+  %arg1 = extractvalue %foobar %arg, 0
+  %arg2 = extractvalue %foobar %arg, 1
+  call i32 @f2(%zero %arg2, i32 5, i32 42)
+  ret void
+}
+
+define i32 @f2(%zero %x, i32 %y, i32 %z) {
+  ret i32 %y
+}
+
+define void @f3(%zero %x, i32 %y) {
+  call i32 @f2(%zero %x, i32 5, i32 %y)
+  ret void
+}
+
+define void @f4(%zero %z) {
+  insertvalue %foobar undef, %zero %z, 1
+  ret void
+}
+
+define void @f5(%foobar %x) {
+allocas:
+  %y = extractvalue %foobar %x, 1
+  br  label %b1
+
+b1:
+  %insert120 = insertvalue %foobar undef, %zero %y, 1
+  ret void
+}
+
+define void @f6(%zero %x, %zero %y) {
+b1:
+  br i1 undef, label %end, label %b2
+
+b2:
+  br label %end
+
+end:
+  %z = phi %zero [ %y, %b1 ], [ %x, %b2 ]
+  call void @f4(%zero %z)
+  ret void
+}
+
+%zero2 = type {}
+
+define i32 @g1(%zero2 %x, i32 %y, i32 %z) {
+  ret i32 %y
+}
+
+define void @g2(%zero2 %x, i32 %y) {
+  call i32 @g1(%zero2 %x, i32 5, i32 %y)
+  ret void
+}
+
+%zero2r = type {%zero2}
+
+define i32 @h1(%zero2r %x, i32 %y, i32 %z) {
+  ret i32 %y
+}
+
+define void @h2(%zero2r %x, i32 %y) {
+  call i32 @h1(%zero2r %x, i32 5, i32 %y)
+  ret void
+}
+
+%foobar2 = type { i32, %zero2r }
+
+define void @h3(%foobar2 %arg) {
+  %arg1 = extractvalue %foobar2 %arg, 0
+  %arg2 = extractvalue %foobar2 %arg, 1
+  %arg21 = extractvalue %zero2r %arg2, 0
+  call void @g2(%zero2 %arg21, i32 5)
+  ret void
+}
diff --git a/test/CodeGen/MSP430/Inst8rr.ll b/test/CodeGen/MSP430/Inst8rr.ll
index 0f5fc12..45342e2 100644
--- a/test/CodeGen/MSP430/Inst8rr.ll
+++ b/test/CodeGen/MSP430/Inst8rr.ll
@@ -10,7 +10,7 @@ define i8 @mov(i8 %a, i8 %b) nounwind {
 
 define i8 @add(i8 %a, i8 %b) nounwind {
 ; CHECK: add:
-; CHECK: add.b	r12, r15
+; CHECK: add.b
 	%1 = add i8 %a, %b
 	ret i8 %1
 }
diff --git a/test/CodeGen/Mips/2008-07-31-fcopysign.ll b/test/CodeGen/Mips/2008-07-31-fcopysign.ll
index 47382f9..f152acc 100644
--- a/test/CodeGen/Mips/2008-07-31-fcopysign.ll
+++ b/test/CodeGen/Mips/2008-07-31-fcopysign.ll
@@ -2,6 +2,10 @@
 ; RUN: grep abs.s  %t | count 1
 ; RUN: grep neg.s %t | count 1
 
+; FIXME: Should not emit abs.s or neg.s since these instructions produce
+;        incorrect results if the operand is NaN.
+; REQUIRES: disabled
+
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
 target triple = "mipsallegrexel-unknown-psp-elf"
 
diff --git a/test/CodeGen/Mips/2011-05-26-BranchKillsVreg.ll b/test/CodeGen/Mips/2011-05-26-BranchKillsVreg.ll
new file mode 100644
index 0000000..1255949
--- /dev/null
+++ b/test/CodeGen/Mips/2011-05-26-BranchKillsVreg.ll
@@ -0,0 +1,43 @@
+; RUN: llc < %s -verify-coalescing
+; PR10046
+;
+; PHI elimination splits the critical edge from %while.end415 to %if.end427.
+; This requires updating the BNE-J terminators to a BEQ. The BNE instruction
+; kills a virtual register, and LiveVariables must be updated with the new kill
+; instruction.
+
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-n32"
+target triple = "mips-ellcc-linux"
+
+define i32 @mergesort(i8* %base, i32 %nmemb, i32 %size, i32 (i8*, i8*)* nocapture %cmp) nounwind {
+entry:
+  br i1 undef, label %return, label %if.end13
+
+if.end13:                                         ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body, %if.end13
+  %list1.0482 = phi i8* [ %base, %if.end13 ], [ null, %while.body ]
+  br i1 undef, label %while.end415, label %while.body
+
+while.end415:                                     ; preds = %while.body
+  br i1 undef, label %if.then419, label %if.end427
+
+if.then419:                                       ; preds = %while.end415
+  %call425 = tail call i8* @memmove(i8* %list1.0482, i8* undef, i32 undef) nounwind
+  br label %if.end427
+
+if.end427:                                        ; preds = %if.then419, %while.end415
+  %list2.1 = phi i8* [ undef, %if.then419 ], [ %list1.0482, %while.end415 ]
+  tail call void @free(i8* %list2.1)
+  unreachable
+
+return:                                           ; preds = %entry
+  ret i32 -1
+}
+
+
+declare i8* @memmove(i8*, i8*, i32)
+
+declare void @free(i8*)
+
diff --git a/test/CodeGen/Mips/alloca.ll b/test/CodeGen/Mips/alloca.ll
new file mode 100644
index 0000000..50eeecf
--- /dev/null
+++ b/test/CodeGen/Mips/alloca.ll
@@ -0,0 +1,31 @@
+; RUN: llc -march=mipsel -mcpu=4ke < %s | FileCheck %s
+
+define i32 @twoalloca(i32 %size) nounwind {
+entry:
+; CHECK: subu  $[[T0:[0-9]+]], $sp, $[[SZ:[0-9]+]]
+; CHECK: addu  $sp, $zero, $[[T0]]
+; CHECK: addu  $[[SP1:[0-9]+]], $zero, $sp
+; CHECK: subu  $[[T1:[0-9]+]], $sp, $[[SZ]]
+; CHECK: addu  $sp, $zero, $[[T1]]
+; CHECK: addu  $[[SP2:[0-9]+]], $zero, $sp
+; CHECK: lw  $25, %call16(foo)($gp)
+; CHECK: addiu $4, $[[SP1]], 24
+; CHECK: jalr  $25
+; CHECK: lw  $25, %call16(foo)($gp)
+; CHECK: addiu $4, $[[SP2]], 24
+; CHECK: jalr  $25
+  %tmp1 = alloca i8, i32 %size, align 4
+  %add.ptr = getelementptr inbounds i8* %tmp1, i32 5
+  store i8 97, i8* %add.ptr, align 1
+  %tmp4 = alloca i8, i32 %size, align 4
+  call void @foo2(double 1.000000e+00, double 2.000000e+00, i32 3) nounwind
+  %call = call i32 @foo(i8* %tmp1) nounwind
+  %call7 = call i32 @foo(i8* %tmp4) nounwind
+  %add = add nsw i32 %call7, %call
+  ret i32 %add
+}
+
+declare void @foo2(double, double, i32)
+
+declare i32 @foo(i8*)
+
diff --git a/test/CodeGen/Mips/atomic.ll b/test/CodeGen/Mips/atomic.ll
new file mode 100644
index 0000000..2d5555b
--- /dev/null
+++ b/test/CodeGen/Mips/atomic.ll
@@ -0,0 +1,253 @@
+; RUN: llc -march=mipsel -mcpu=mips2 < %s | FileCheck %s
+
+
+declare i32 @llvm.atomic.load.add.i32.p0i32(i32* nocapture, i32) nounwind
+declare i32 @llvm.atomic.load.nand.i32.p0i32(i32* nocapture, i32) nounwind
+declare i32 @llvm.atomic.swap.i32.p0i32(i32* nocapture, i32) nounwind
+declare i32 @llvm.atomic.cmp.swap.i32.p0i32(i32* nocapture, i32, i32) nounwind
+
+declare i8 @llvm.atomic.load.add.i8.p0i8(i8* nocapture, i8) nounwind
+declare i8 @llvm.atomic.load.sub.i8.p0i8(i8* nocapture, i8) nounwind
+declare i8 @llvm.atomic.load.nand.i8.p0i8(i8* nocapture, i8) nounwind
+declare i8 @llvm.atomic.swap.i8.p0i8(i8* nocapture, i8) nounwind
+declare i8 @llvm.atomic.cmp.swap.i8.p0i8(i8* nocapture, i8, i8) nounwind
+
+
+@x = common global i32 0, align 4
+
+define i32 @AtomicLoadAdd32(i32 %incr) nounwind {
+entry:
+  %0 = call i32 @llvm.atomic.load.add.i32.p0i32(i32* @x, i32 %incr)
+  ret i32 %0
+
+; CHECK:   AtomicLoadAdd32:
+; CHECK:   lw      $[[R0:[0-9]+]], %got(x)($gp)
+; CHECK:   $[[BB0:[A-Z_0-9]+]]:
+; CHECK:   ll      $[[R1:[0-9]+]], 0($[[R0]])
+; CHECK:   or      $2, $zero, $[[R1]]
+; CHECK:   addu    $[[R2:[0-9]+]], $[[R1]], $4
+; CHECK:   sc      $[[R2]], 0($[[R0]])
+; CHECK:   beq     $[[R2]], $zero, $[[BB0]]
+}
+
+define i32 @AtomicLoadNand32(i32 %incr) nounwind {
+entry:
+  %0 = call i32 @llvm.atomic.load.nand.i32.p0i32(i32* @x, i32 %incr)
+  ret i32 %0
+
+; CHECK:   AtomicLoadNand32:
+; CHECK:   lw      $[[R0:[0-9]+]], %got(x)($gp)
+; CHECK:   $[[BB0:[A-Z_0-9]+]]:
+; CHECK:   ll      $[[R1:[0-9]+]], 0($[[R0]])
+; CHECK:   or      $2, $zero, $[[R1]]
+; CHECK:   and     $[[R1]], $[[R1]], $4
+; CHECK:   nor     $[[R2:[0-9]+]], $zero, $[[R1]]
+; CHECK:   sc      $[[R2]], 0($[[R0]])
+; CHECK:   beq     $[[R2]], $zero, $[[BB0]]
+}
+
+define i32 @AtomicSwap32(i32 %oldval) nounwind {
+entry:
+  %0 = call i32 @llvm.atomic.swap.i32.p0i32(i32* @x, i32 %oldval)
+  ret i32 %0
+
+; CHECK:   AtomicSwap32:
+; CHECK:   lw      $[[R0:[0-9]+]], %got(x)($gp)
+; CHECK:   sw      $4, [[OFFSET:[0-9]+]]($sp)
+; CHECK:   $[[BB0:[A-Z_0-9]+]]:
+; CHECK:   ll      $[[R1:[0-9]+]], 0($[[R0]])
+; CHECK:   or      $2, $zero, $[[R1]]
+; CHECK:   lw      $[[R2:[0-9]+]], [[OFFSET]]($sp)
+; CHECK:   or      $[[R3:[0-9]+]], $zero, $[[R2]]
+; CHECK:   sc      $[[R3]], 0($[[R0]])
+; CHECK:   beq     $[[R3]], $zero, $[[BB0]]
+}
+
+define i32 @AtomicCmpSwap32(i32 %oldval, i32 %newval) nounwind {
+entry:
+  %0 = call i32 @llvm.atomic.cmp.swap.i32.p0i32(i32* @x, i32 %oldval, i32 %newval)
+  ret i32 %0
+
+; CHECK:   AtomicCmpSwap32:
+; CHECK:   lw      $[[R0:[0-9]+]], %got(x)($gp)
+; CHECK:   sw      $5, [[OFFSET:[0-9]+]]($sp)
+; CHECK:   $[[BB0:[A-Z_0-9]+]]:
+; CHECK:   ll      $2, 0($[[R0]])
+; CHECK:   bne     $2, $4, $[[BB1:[A-Z_0-9]+]]
+; CHECK:   lw      $[[R1:[0-9]+]], [[OFFSET]]($sp)
+; CHECK:   or      $[[R2:[0-9]+]], $zero, $[[R1]]
+; CHECK:   sc      $[[R2]], 0($[[R0]])
+; CHECK:   beq     $[[R2]], $zero, $[[BB0]]
+; CHECK:   $[[BB1]]:
+}
+
+
+
+@y = common global i8 0, align 1
+
+define signext i8 @AtomicLoadAdd8(i8 signext %incr) nounwind {
+entry:
+  %0 = call i8 @llvm.atomic.load.add.i8.p0i8(i8* @y, i8 %incr)
+  ret i8 %0
+
+; CHECK:   AtomicLoadAdd8:
+; CHECK:   lw      $[[R0:[0-9]+]], %got(y)($gp)
+; CHECK:   addiu   $[[R1:[0-9]+]], $zero, -4
+; CHECK:   and     $[[R2:[0-9]+]], $[[R0]], $[[R1]]
+; CHECK:   andi    $[[R3:[0-9]+]], $[[R0]], 3
+; CHECK:   sll     $[[R4:[0-9]+]], $[[R3]], 3
+; CHECK:   ori     $[[R5:[0-9]+]], $zero, 255
+; CHECK:   sll     $[[R6:[0-9]+]], $[[R5]], $[[R4]]
+; CHECK:   nor     $[[R7:[0-9]+]], $zero, $[[R6]]
+; CHECK:   andi    $[[R8:[0-9]+]], $4, 255
+; CHECK:   sll     $[[R9:[0-9]+]], $[[R8]], $[[R4]]
+
+; CHECK:   $[[BB0:[A-Z_0-9]+]]:
+; CHECK:   ll      $[[R10:[0-9]+]], 0($[[R2]])
+; CHECK:   addu    $[[R11:[0-9]+]], $[[R10]], $[[R9]]
+; CHECK:   and     $[[R12:[0-9]+]], $[[R11]], $[[R6]]
+; CHECK:   and     $[[R13:[0-9]+]], $[[R10]], $[[R7]]
+; CHECK:   or      $[[R14:[0-9]+]], $[[R13]], $[[R12]]
+; CHECK:   sc      $[[R14]], 0($[[R2]])
+; CHECK:   beq     $[[R14]], $zero, $[[BB0]]
+
+; CHECK:   and     $[[R15:[0-9]+]], $[[R10]], $[[R6]]
+; CHECK:   srl     $[[R16:[0-9]+]], $[[R15]], $[[R4]]
+; CHECK:   sll     $[[R17:[0-9]+]], $[[R16]], 24
+; CHECK:   sra     $2, $[[R17]], 24
+}
+
+define signext i8 @AtomicLoadSub8(i8 signext %incr) nounwind {
+entry:
+  %0 = call i8 @llvm.atomic.load.sub.i8.p0i8(i8* @y, i8 %incr)
+  ret i8 %0
+
+; CHECK:   AtomicLoadSub8:
+; CHECK:   lw      $[[R0:[0-9]+]], %got(y)($gp)
+; CHECK:   addiu   $[[R1:[0-9]+]], $zero, -4
+; CHECK:   and     $[[R2:[0-9]+]], $[[R0]], $[[R1]]
+; CHECK:   andi    $[[R3:[0-9]+]], $[[R0]], 3
+; CHECK:   sll     $[[R4:[0-9]+]], $[[R3]], 3
+; CHECK:   ori     $[[R5:[0-9]+]], $zero, 255
+; CHECK:   sll     $[[R6:[0-9]+]], $[[R5]], $[[R4]]
+; CHECK:   nor     $[[R7:[0-9]+]], $zero, $[[R6]]
+; CHECK:   subu    $[[R18:[0-9]+]], $zero, $4
+; CHECK:   andi    $[[R8:[0-9]+]], $[[R18]], 255
+; CHECK:   sll     $[[R9:[0-9]+]], $[[R8]], $[[R4]]
+
+; CHECK:   $[[BB0:[A-Z_0-9]+]]:
+; CHECK:   ll      $[[R10:[0-9]+]], 0($[[R2]])
+; CHECK:   addu    $[[R11:[0-9]+]], $[[R10]], $[[R9]]
+; CHECK:   and     $[[R12:[0-9]+]], $[[R11]], $[[R6]]
+; CHECK:   and     $[[R13:[0-9]+]], $[[R10]], $[[R7]]
+; CHECK:   or      $[[R14:[0-9]+]], $[[R13]], $[[R12]]
+; CHECK:   sc      $[[R14]], 0($[[R2]])
+; CHECK:   beq     $[[R14]], $zero, $[[BB0]]
+
+; CHECK:   and     $[[R15:[0-9]+]], $[[R10]], $[[R6]]
+; CHECK:   srl     $[[R16:[0-9]+]], $[[R15]], $[[R4]]
+; CHECK:   sll     $[[R17:[0-9]+]], $[[R16]], 24
+; CHECK:   sra     $2, $[[R17]], 24
+}
+
+define signext i8 @AtomicLoadNand8(i8 signext %incr) nounwind {
+entry:
+  %0 = call i8 @llvm.atomic.load.nand.i8.p0i8(i8* @y, i8 %incr)
+  ret i8 %0
+
+; CHECK:   AtomicLoadNand8:
+; CHECK:   lw      $[[R0:[0-9]+]], %got(y)($gp)
+; CHECK:   addiu   $[[R1:[0-9]+]], $zero, -4
+; CHECK:   and     $[[R2:[0-9]+]], $[[R0]], $[[R1]]
+; CHECK:   andi    $[[R3:[0-9]+]], $[[R0]], 3
+; CHECK:   sll     $[[R4:[0-9]+]], $[[R3]], 3
+; CHECK:   ori     $[[R5:[0-9]+]], $zero, 255
+; CHECK:   sll     $[[R6:[0-9]+]], $[[R5]], $[[R4]]
+; CHECK:   nor     $[[R7:[0-9]+]], $zero, $[[R6]]
+; CHECK:   andi    $[[R8:[0-9]+]], $4, 255
+; CHECK:   sll     $[[R9:[0-9]+]], $[[R8]], $[[R4]]
+
+; CHECK:   $[[BB0:[A-Z_0-9]+]]:
+; CHECK:   ll      $[[R10:[0-9]+]], 0($[[R2]])
+; CHECK:   and     $[[R18:[0-9]+]], $[[R10]], $[[R9]]
+; CHECK:   nor     $[[R11:[0-9]+]], $zero, $[[R18]]
+; CHECK:   and     $[[R12:[0-9]+]], $[[R11]], $[[R6]]
+; CHECK:   and     $[[R13:[0-9]+]], $[[R10]], $[[R7]]
+; CHECK:   or      $[[R14:[0-9]+]], $[[R13]], $[[R12]]
+; CHECK:   sc      $[[R14]], 0($[[R2]])
+; CHECK:   beq     $[[R14]], $zero, $[[BB0]]
+
+; CHECK:   and     $[[R15:[0-9]+]], $[[R10]], $[[R6]]
+; CHECK:   srl     $[[R16:[0-9]+]], $[[R15]], $[[R4]]
+; CHECK:   sll     $[[R17:[0-9]+]], $[[R16]], 24
+; CHECK:   sra     $2, $[[R17]], 24
+}
+
+define signext i8 @AtomicSwap8(i8 signext %oldval) nounwind {
+entry:
+  %0 = call i8 @llvm.atomic.swap.i8.p0i8(i8* @y, i8 %oldval)
+  ret i8 %0
+
+; CHECK:   AtomicSwap8:
+; CHECK:   lw      $[[R0:[0-9]+]], %got(y)($gp)
+; CHECK:   addiu   $[[R1:[0-9]+]], $zero, -4
+; CHECK:   and     $[[R2:[0-9]+]], $[[R0]], $[[R1]]
+; CHECK:   andi    $[[R3:[0-9]+]], $[[R0]], 3
+; CHECK:   sll     $[[R4:[0-9]+]], $[[R3]], 3
+; CHECK:   ori     $[[R5:[0-9]+]], $zero, 255
+; CHECK:   sll     $[[R6:[0-9]+]], $[[R5]], $[[R4]]
+; CHECK:   nor     $[[R7:[0-9]+]], $zero, $[[R6]]
+; CHECK:   andi    $[[R8:[0-9]+]], $4, 255
+; CHECK:   sll     $[[R9:[0-9]+]], $[[R8]], $[[R4]]
+; CHECK:   sw      $[[R9]], [[OFFSET:[0-9]+]]($sp)
+
+; CHECK:   $[[BB0:[A-Z_0-9]+]]:
+; CHECK:   ll      $[[R10:[0-9]+]], 0($[[R2]])
+; CHECK:   lw      $[[R18:[0-9]+]], [[OFFSET]]($sp)
+; CHECK:   or      $[[R11:[0-9]+]], $zero, $[[R18]]
+; CHECK:   and     $[[R12:[0-9]+]], $[[R11]], $[[R6]]
+; CHECK:   and     $[[R13:[0-9]+]], $[[R10]], $[[R7]]
+; CHECK:   or      $[[R14:[0-9]+]], $[[R13]], $[[R12]]
+; CHECK:   sc      $[[R14]], 0($[[R2]])
+; CHECK:   beq     $[[R14]], $zero, $[[BB0]]
+
+; CHECK:   and     $[[R15:[0-9]+]], $[[R10]], $[[R6]]
+; CHECK:   srl     $[[R16:[0-9]+]], $[[R15]], $[[R4]]
+; CHECK:   sll     $[[R17:[0-9]+]], $[[R16]], 24
+; CHECK:   sra     $2, $[[R17]], 24
+}
+
+define signext i8 @AtomicCmpSwap8(i8 signext %oldval, i8 signext %newval) nounwind {
+entry:
+  %0 = call i8 @llvm.atomic.cmp.swap.i8.p0i8(i8* @y, i8 %oldval, i8 %newval)
+  ret i8 %0
+
+; CHECK:   AtomicCmpSwap8:
+; CHECK:   lw      $[[R0:[0-9]+]], %got(y)($gp)
+; CHECK:   addiu   $[[R1:[0-9]+]], $zero, -4
+; CHECK:   and     $[[R2:[0-9]+]], $[[R0]], $[[R1]]
+; CHECK:   andi    $[[R3:[0-9]+]], $[[R0]], 3
+; CHECK:   sll     $[[R4:[0-9]+]], $[[R3]], 3
+; CHECK:   ori     $[[R5:[0-9]+]], $zero, 255
+; CHECK:   sll     $[[R6:[0-9]+]], $[[R5]], $[[R4]]
+; CHECK:   nor     $[[R7:[0-9]+]], $zero, $[[R6]]
+; CHECK:   andi    $[[R8:[0-9]+]], $4, 255
+; CHECK:   sll     $[[R9:[0-9]+]], $[[R8]], $[[R4]]
+; CHECK:   andi    $[[R10:[0-9]+]], $5, 255
+; CHECK:   sll     $[[R11:[0-9]+]], $[[R10]], $[[R4]]
+
+; CHECK:   $[[BB0:[A-Z_0-9]+]]:
+; CHECK:   ll      $[[R12:[0-9]+]], 0($[[R2]])
+; CHECK:   and     $[[R13:[0-9]+]], $[[R12]], $[[R6]]
+; CHECK:   bne     $[[R13]], $[[R9]], $[[BB1:[A-Z_0-9]+]]
+
+; CHECK:   and     $[[R14:[0-9]+]], $[[R12]], $[[R7]]
+; CHECK:   or      $[[R15:[0-9]+]], $[[R14]], $[[R11]]
+; CHECK:   sc      $[[R15]], 0($[[R2]])
+; CHECK:   beq     $[[R15]], $zero, $[[BB0]]
+
+; CHECK:   $[[BB1]]:
+; CHECK:   srl     $[[R16:[0-9]+]], $[[R13]], $[[R4]]
+; CHECK:   sll     $[[R17:[0-9]+]], $[[R16]], 24
+; CHECK:   sra     $2, $[[R17]], 24
+}
diff --git a/test/CodeGen/Mips/blockaddr.ll b/test/CodeGen/Mips/blockaddr.ll
index e9af304..6de6b77 100644
--- a/test/CodeGen/Mips/blockaddr.ll
+++ b/test/CodeGen/Mips/blockaddr.ll
@@ -8,14 +8,14 @@ entry:
   ret i8* %x
 }
 
-; CHECK-PIC: lw  $[[R0:[0-9]+]], %got($tmp1)($gp)
-; CHECK-PIC: addiu ${{[0-9]+}}, $[[R0]], %lo($tmp1)
-; CHECK-PIC: lw  $[[R1:[0-9]+]], %got($tmp2)($gp)
-; CHECK-PIC: addiu ${{[0-9]+}}, $[[R1]], %lo($tmp2)
-; CHECK-STATIC: lui  $[[R2:[0-9]+]], %hi($tmp1)
-; CHECK-STATIC: addiu ${{[0-9]+}}, $[[R2]], %lo($tmp1)
-; CHECK-STATIC: lui   $[[R3:[0-9]+]], %hi($tmp2)
-; CHECK-STATIC: addiu ${{[0-9]+}}, $[[R3]], %lo($tmp2)
+; CHECK-PIC: lw  $[[R0:[0-9]+]], %got($tmp[[T0:[0-9]+]])($gp)
+; CHECK-PIC: addiu ${{[0-9]+}}, $[[R0]], %lo($tmp[[T0]])
+; CHECK-PIC: lw  $[[R1:[0-9]+]], %got($tmp[[T1:[0-9]+]])($gp)
+; CHECK-PIC: addiu ${{[0-9]+}}, $[[R1]], %lo($tmp[[T1]])
+; CHECK-STATIC: lui  $[[R2:[0-9]+]], %hi($tmp[[T0:[0-9]+]])
+; CHECK-STATIC: addiu ${{[0-9]+}}, $[[R2]], %lo($tmp[[T0]])
+; CHECK-STATIC: lui   $[[R3:[0-9]+]], %hi($tmp[[T1:[0-9]+]])
+; CHECK-STATIC: addiu ${{[0-9]+}}, $[[R3]], %lo($tmp[[T1]])
 define void @f() nounwind {
 entry:
   %call = tail call i8* @dummy(i8* blockaddress(@f, %baz))
diff --git a/test/CodeGen/Mips/cmov.ll b/test/CodeGen/Mips/cmov.ll
index 8329c89..ec37961 100755
--- a/test/CodeGen/Mips/cmov.ll
+++ b/test/CodeGen/Mips/cmov.ll
@@ -4,8 +4,8 @@
 @i1 = global [3 x i32] [i32 1, i32 2, i32 3], align 4
 @i3 = common global i32* null, align 4
 
-; CHECK:  lw  ${{[0-9]+}}, %got(i3)($gp)
 ; CHECK:  addiu ${{[0-9]+}}, $gp, %got(i1)
+; CHECK:  lw  ${{[0-9]+}}, %got(i3)($gp)
 define i32* @cmov1(i32 %s) nounwind readonly {
 entry:
   %tobool = icmp ne i32 %s, 0
@@ -14,3 +14,19 @@ entry:
   ret i32* %cond
 }
 
+@c = global i32 1, align 4
+@d = global i32 0, align 4
+
+; CHECK: cmov2:
+; CHECK: addiu $[[R0:[0-9]+]], $gp, %got(c)
+; CHECK: addiu $[[R1:[0-9]+]], $gp, %got(d)
+; CHECK: movn  $[[R1]], $[[R0]], ${{[0-9]+}}
+define i32 @cmov2(i32 %s) nounwind readonly {
+entry:
+  %tobool = icmp ne i32 %s, 0
+  %tmp1 = load i32* @c, align 4
+  %tmp2 = load i32* @d, align 4
+  %cond = select i1 %tobool, i32 %tmp1, i32 %tmp2
+  ret i32 %cond
+}
+
diff --git a/test/CodeGen/Mips/double2int.ll b/test/CodeGen/Mips/double2int.ll
new file mode 100644
index 0000000..3d033e1
--- /dev/null
+++ b/test/CodeGen/Mips/double2int.ll
@@ -0,0 +1,8 @@
+; RUN: llc -march=mips -mcpu=4ke < %s | FileCheck %s
+
+define i32 @f1(double %d) nounwind readnone {
+entry:
+; CHECK: trunc.w.d $f{{[0-9]+}}, $f12
+  %conv = fptosi double %d to i32
+  ret i32 %conv
+}
diff --git a/test/CodeGen/Mips/eh.ll b/test/CodeGen/Mips/eh.ll
new file mode 100644
index 0000000..765b778
--- /dev/null
+++ b/test/CodeGen/Mips/eh.ll
@@ -0,0 +1,78 @@
+; RUN: llc  < %s -march=mipsel -mcpu=4ke | FileCheck %s -check-prefix=CHECK-EL
+; RUN: llc  < %s -march=mips   -mcpu=4ke | FileCheck %s -check-prefix=CHECK-EB
+
+@g1 = global double 0.000000e+00, align 8
+@_ZTId = external constant i8*
+
+define void @_Z1fd(double %i2) {
+entry:
+; CHECK-EL:  addiu $sp, $sp
+; CHECK-EL:  .cfi_def_cfa_offset
+; CHECK-EL:  sdc1 $f20
+; CHECK-EL:  sw  $ra
+; CHECK-EL:  sw  $17
+; CHECK-EL:  sw  $16
+; CHECK-EL:  .cfi_offset 52, -8
+; CHECK-EL:  .cfi_offset 53, -4
+; CHECK-EB:  .cfi_offset 53, -8
+; CHECK-EB:  .cfi_offset 52, -4
+; CHECK-EL:  .cfi_offset 31, -12
+; CHECK-EL:  .cfi_offset 17, -16
+; CHECK-EL:  .cfi_offset 16, -20
+; CHECK-EL:  .cprestore 
+
+  %exception = tail call i8* @__cxa_allocate_exception(i32 8) nounwind
+  %0 = bitcast i8* %exception to double*
+  store double 3.200000e+00, double* %0, align 8, !tbaa !0
+  invoke void @__cxa_throw(i8* %exception, i8* bitcast (i8** @_ZTId to i8*), i8* null) noreturn
+          to label %unreachable unwind label %lpad
+
+lpad:                                             ; preds = %entry
+; CHECK-EL:  # %lpad
+; CHECK-EL:  lw  $gp
+; CHECK-EL:  beq $5
+
+  %exn = tail call i8* @llvm.eh.exception() nounwind
+  %eh.selector = tail call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* %exn, i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*), i8* bitcast (i8** @_ZTId to i8*)) nounwind
+  %1 = tail call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTId to i8*)) nounwind
+  %2 = icmp eq i32 %eh.selector, %1
+  br i1 %2, label %catch, label %eh.resume
+
+catch:                                            ; preds = %lpad
+  %3 = tail call i8* @__cxa_begin_catch(i8* %exn) nounwind
+  %4 = bitcast i8* %3 to double*
+  %exn.scalar = load double* %4, align 8
+  %add = fadd double %exn.scalar, %i2
+  store double %add, double* @g1, align 8, !tbaa !0
+  tail call void @__cxa_end_catch() nounwind
+  ret void
+
+eh.resume:                                        ; preds = %lpad
+  tail call void @llvm.eh.resume(i8* %exn, i32 %eh.selector) noreturn
+  unreachable
+
+unreachable:                                      ; preds = %entry
+  unreachable
+}
+
+declare i8* @__cxa_allocate_exception(i32)
+
+declare i8* @llvm.eh.exception() nounwind readonly
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i32 @llvm.eh.selector(i8*, i8*, ...) nounwind
+
+declare i32 @llvm.eh.typeid.for(i8*) nounwind
+
+declare void @llvm.eh.resume(i8*, i32)
+
+declare void @__cxa_throw(i8*, i8*, i8*)
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @__cxa_end_catch()
+
+!0 = metadata !{metadata !"double", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/test/CodeGen/Mips/fcopysign.ll b/test/CodeGen/Mips/fcopysign.ll
new file mode 100644
index 0000000..14c6507
--- /dev/null
+++ b/test/CodeGen/Mips/fcopysign.ll
@@ -0,0 +1,55 @@
+; RUN: llc  < %s -march=mipsel -mcpu=4ke | FileCheck %s -check-prefix=CHECK-EL
+; RUN: llc  < %s -march=mips -mcpu=4ke | FileCheck %s -check-prefix=CHECK-EB
+
+define double @func0(double %d0, double %d1) nounwind readnone {
+entry:
+; CHECK-EL: func0:
+; CHECK-EL: lui $[[T0:[0-9]+]], 32767
+; CHECK-EL: lui $[[T1:[0-9]+]], 32768
+; CHECK-EL: mfc1 $[[HI0:[0-9]+]], $f13
+; CHECK-EL: ori $[[MSK0:[0-9]+]], $[[T0]], 65535
+; CHECK-EL: mfc1 $[[HI1:[0-9]+]], $f15
+; CHECK-EL: ori $[[MSK1:[0-9]+]], $[[T1]], 0
+; CHECK-EL: and $[[AND0:[0-9]+]], $[[HI0]], $[[MSK0]]
+; CHECK-EL: and $[[AND1:[0-9]+]], $[[HI1]], $[[MSK1]]
+; CHECK-EL: mfc1 $[[LO0:[0-9]+]], $f12
+; CHECK-EL: or  $[[OR:[0-9]+]], $[[AND0]], $[[AND1]]
+; CHECK-EL: mtc1 $[[LO0]], $f0
+; CHECK-EL: mtc1 $[[OR]], $f1
+;
+; CHECK-EB: lui $[[T0:[0-9]+]], 32767
+; CHECK-EB: lui $[[T1:[0-9]+]], 32768
+; CHECK-EB: mfc1 $[[HI0:[0-9]+]], $f12
+; CHECK-EB: ori $[[MSK0:[0-9]+]], $[[T0]], 65535
+; CHECK-EB: mfc1 $[[HI1:[0-9]+]], $f14
+; CHECK-EB: ori $[[MSK1:[0-9]+]], $[[T1]], 0
+; CHECK-EB: and $[[AND0:[0-9]+]], $[[HI0]], $[[MSK0]]
+; CHECK-EB: and $[[AND1:[0-9]+]], $[[HI1]], $[[MSK1]]
+; CHECK-EB: or  $[[OR:[0-9]+]], $[[AND0]], $[[AND1]]
+; CHECK-EB: mfc1 $[[LO0:[0-9]+]], $f13
+; CHECK-EB: mtc1 $[[OR]], $f0
+; CHECK-EB: mtc1 $[[LO0]], $f1
+  %call = tail call double @copysign(double %d0, double %d1) nounwind readnone
+  ret double %call
+}
+
+declare double @copysign(double, double) nounwind readnone
+
+define float @func1(float %f0, float %f1) nounwind readnone {
+entry:
+; CHECK-EL: func1:
+; CHECK-EL: lui $[[T0:[0-9]+]], 32767
+; CHECK-EL: lui $[[T1:[0-9]+]], 32768
+; CHECK-EL: mfc1 $[[ARG0:[0-9]+]], $f12
+; CHECK-EL: ori $[[MSK0:[0-9]+]], $[[T0]], 65535
+; CHECK-EL: mfc1 $[[ARG1:[0-9]+]], $f14
+; CHECK-EL: ori $[[MSK1:[0-9]+]], $[[T1]], 0
+; CHECK-EL: and $[[T2:[0-9]+]], $[[ARG0]], $[[MSK0]]
+; CHECK-EL: and $[[T3:[0-9]+]], $[[ARG1]], $[[MSK1]]
+; CHECK-EL: or  $[[T4:[0-9]+]], $[[T2]], $[[T3]]
+; CHECK-EL: mtc1 $[[T4]], $f0
+  %call = tail call float @copysignf(float %f0, float %f1) nounwind readnone
+  ret float %call
+}
+
+declare float @copysignf(float, float) nounwind readnone
diff --git a/test/CodeGen/Mips/frame-address.ll b/test/CodeGen/Mips/frame-address.ll
new file mode 100644
index 0000000..c48ce7e
--- /dev/null
+++ b/test/CodeGen/Mips/frame-address.ll
@@ -0,0 +1,12 @@
+; RUN: llc -march=mipsel -mcpu=mips2 < %s | FileCheck %s
+
+declare i8* @llvm.frameaddress(i32) nounwind readnone
+
+define i8* @f() nounwind {
+entry:
+  %0 = call i8* @llvm.frameaddress(i32 0)
+  ret i8* %0
+
+; CHECK:   addu    $fp, $sp, $zero
+; CHECK:   addu    $2, $zero, $fp
+}
diff --git a/test/CodeGen/Mips/gprestore.ll b/test/CodeGen/Mips/gprestore.ll
new file mode 100644
index 0000000..ee7e131
--- /dev/null
+++ b/test/CodeGen/Mips/gprestore.ll
@@ -0,0 +1,32 @@
+; RUN: llc -march=mips < %s | FileCheck %s
+
+@p = external global i32
+@q = external global i32
+@r = external global i32
+
+define void @f0() nounwind {
+entry:
+; CHECK: jalr
+; CHECK-NOT: got({{.*}})($gp)
+; CHECK: lw $gp
+; CHECK: jalr
+; CHECK-NOT: got({{.*}})($gp)
+; CHECK: lw $gp
+; CHECK: jalr
+; CHECK-NOT: got({{.*}})($gp)
+; CHECK: lw $gp
+  tail call void (...)* @f1() nounwind
+  %tmp = load i32* @p, align 4
+  tail call void @f2(i32 %tmp) nounwind
+  %tmp1 = load i32* @q, align 4
+  %tmp2 = load i32* @r, align 4
+  tail call void @f3(i32 %tmp1, i32 %tmp2) nounwind
+  ret void
+}
+
+declare void @f1(...)
+
+declare void @f2(i32)
+
+declare void @f3(i32, i32)
+
diff --git a/test/CodeGen/Mips/i64arg.ll b/test/CodeGen/Mips/i64arg.ll
new file mode 100644
index 0000000..9a30453
--- /dev/null
+++ b/test/CodeGen/Mips/i64arg.ll
@@ -0,0 +1,34 @@
+; RUN: llc -march=mips -mcpu=4ke < %s | FileCheck %s
+
+define void @f1(i64 %ll1, float %f, i64 %ll, i32 %i, float %f2) nounwind {
+entry:
+; CHECK: addu $[[R1:[0-9]+]], $zero, $5
+; CHECK: addu $[[R0:[0-9]+]], $zero, $4
+; CHECK: lw  $25, %call16(ff1)
+; CHECK: ori $6, ${{[0-9]+}}, 3855
+; CHECK: ori $7, ${{[0-9]+}}, 22136
+; CHECK: jalr
+  tail call void @ff1(i32 %i, i64 1085102592623924856) nounwind
+; CHECK: lw $25, %call16(ff2)
+; CHECK: lw $[[R2:[0-9]+]], 80($sp)
+; CHECK: lw $[[R3:[0-9]+]], 84($sp)
+; CHECK: addu $4, $zero, $[[R2]]
+; CHECK: addu $5, $zero, $[[R3]]
+; CHECK: jalr $25
+  tail call void @ff2(i64 %ll, double 3.000000e+00) nounwind
+  %sub = add nsw i32 %i, -1
+; CHECK: sw $[[R0]], 24($sp)
+; CHECK: sw $[[R1]], 28($sp)
+; CHECK: lw $25, %call16(ff3)
+; CHECK: addu $6, $zero, $[[R2]]
+; CHECK: addu $7, $zero, $[[R3]]
+; CHECK: jalr $25
+  tail call void @ff3(i32 %i, i64 %ll, i32 %sub, i64 %ll1) nounwind
+  ret void
+}
+
+declare void @ff1(i32, i64)
+
+declare void @ff2(i64, double)
+
+declare void @ff3(i32, i64, i32, i64)
diff --git a/test/CodeGen/Mips/internalfunc.ll b/test/CodeGen/Mips/internalfunc.ll
index fdfa01a..50d0993 100644
--- a/test/CodeGen/Mips/internalfunc.ll
+++ b/test/CodeGen/Mips/internalfunc.ll
@@ -1,4 +1,4 @@
-; RUN: llc  < %s -march=mips | FileCheck %s
+; RUN: llc  < %s -march=mipsel -mcpu=4ke  | FileCheck %s
 
 @caller.sf1 = internal unnamed_addr global void (...)* null, align 4
 @gf1 = external global void (...)*
diff --git a/test/CodeGen/Mips/largeimmprinting.ll b/test/CodeGen/Mips/largeimmprinting.ll
new file mode 100644
index 0000000..fd7ae9e
--- /dev/null
+++ b/test/CodeGen/Mips/largeimmprinting.ll
@@ -0,0 +1,23 @@
+; RUN: llc -march=mipsel -mcpu=4ke < %s | FileCheck %s
+
+%struct.S1 = type { [65536 x i8] }
+
+@s1 = external global %struct.S1
+
+define void @f() nounwind {
+entry:
+; CHECK:  lui $at, 65534
+; CHECK:  addu  $at, $sp, $at
+; CHECK:  addiu $sp, $at, -16
+; CHECK:  .cprestore  65536
+
+  %agg.tmp = alloca %struct.S1, align 1
+  %tmp = getelementptr inbounds %struct.S1* %agg.tmp, i32 0, i32 0, i32 0
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %tmp, i8* getelementptr inbounds (%struct.S1* @s1, i32 0, i32 0, i32 0), i32 65536, i32 1, i1 false)
+  call void @f2(%struct.S1* byval %agg.tmp) nounwind
+  ret void
+}
+
+declare void @f2(%struct.S1* byval)
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
diff --git a/test/CodeGen/Mips/o32_cc_byval.ll b/test/CodeGen/Mips/o32_cc_byval.ll
new file mode 100644
index 0000000..b78c393
--- /dev/null
+++ b/test/CodeGen/Mips/o32_cc_byval.ll
@@ -0,0 +1,127 @@
+; RUN: llc -march=mipsel -mcpu=4ke < %s | FileCheck %s
+
+%0 = type { i8, i16, i32, i64, double, i32, [4 x i8] }
+%struct.S1 = type { i8, i16, i32, i64, double, i32 }
+%struct.S2 = type { [4 x i32] }
+%struct.S3 = type { i8 }
+
+@f1.s1 = internal unnamed_addr constant %0 { i8 1, i16 2, i32 3, i64 4, double 5.000000e+00, i32 6, [4 x i8] undef }, align 8
+@f1.s2 = internal unnamed_addr constant %struct.S2 { [4 x i32] [i32 7, i32 8, i32 9, i32 10] }, align 4
+
+define void @f1() nounwind {
+entry:
+; CHECK: lw  $[[R1:[0-9]+]], %got(f1.s1)($gp)
+; CHECK: addiu $[[R0:[0-9]+]], $[[R1]], %lo(f1.s1)
+; CHECK: lw  $[[R2:[0-9]+]], 8($[[R0]])
+; CHECK: lw  $[[R7:[0-9]+]], 12($[[R0]])
+; CHECK: lw  $[[R3:[0-9]+]], 16($[[R0]])
+; CHECK: lw  $[[R4:[0-9]+]], 20($[[R0]])
+; CHECK: lw  $[[R5:[0-9]+]], 24($[[R0]])
+; CHECK: lw  $[[R6:[0-9]+]], 28($[[R0]])
+; CHECK: sw  $[[R2]], 16($sp)
+; CHECK: sw  $[[R7]], 20($sp)
+; CHECK: sw  $[[R3]], 24($sp)
+; CHECK: sw  $[[R4]], 28($sp)
+; CHECK: sw  $[[R5]], 32($sp)
+; CHECK: sw  $[[R6]], 36($sp)
+; CHECK: lw  $6, 0($[[R0]])
+; CHECK: lw  $7, 4($[[R0]])
+  %agg.tmp10 = alloca %struct.S3, align 4
+  call void @callee1(float 2.000000e+01, %struct.S1* byval bitcast (%0* @f1.s1 to %struct.S1*)) nounwind
+  call void @callee2(%struct.S2* byval @f1.s2) nounwind
+  %tmp11 = getelementptr inbounds %struct.S3* %agg.tmp10, i32 0, i32 0
+  store i8 11, i8* %tmp11, align 4
+  call void @callee3(float 2.100000e+01, %struct.S3* byval %agg.tmp10, %struct.S1* byval bitcast (%0* @f1.s1 to %struct.S1*)) nounwind
+  ret void
+}
+
+declare void @callee1(float, %struct.S1* byval)
+
+declare void @callee2(%struct.S2* byval)
+
+declare void @callee3(float, %struct.S3* byval, %struct.S1* byval)
+
+define void @f2(float %f, %struct.S1* nocapture byval %s1) nounwind {
+entry:
+; CHECK: addiu $sp, $sp, -56
+; CHECK: sw  $6, 64($sp)
+; CHECK: sw  $7, 68($sp)
+; CHECK: ldc1 $f[[F0:[0-9]+]], 80($sp)
+; CHECK: lw  $[[R2:[0-9]+]], 68($sp)
+; CHECK: lh  $[[R1:[0-9]+]], 66($sp)
+; CHECK: lb  $[[R0:[0-9]+]], 64($sp)
+; CHECK: lw  $[[R3:[0-9]+]], 72($sp)
+; CHECK: lw  $[[R4:[0-9]+]], 76($sp)
+; CHECK: lw  $4, 88($sp)
+; CHECK: sw  $[[R3]], 16($sp)
+; CHECK: sw  $[[R4]], 20($sp)
+; CHECK: sw  $[[R2]], 24($sp)
+; CHECK: sw  $[[R1]], 28($sp)
+; CHECK: sw  $[[R0]], 32($sp)
+; CHECK: mfc1 $6, $f[[F0]]
+
+  %i2 = getelementptr inbounds %struct.S1* %s1, i32 0, i32 5
+  %tmp = load i32* %i2, align 4, !tbaa !0
+  %d = getelementptr inbounds %struct.S1* %s1, i32 0, i32 4
+  %tmp1 = load double* %d, align 8, !tbaa !3
+  %ll = getelementptr inbounds %struct.S1* %s1, i32 0, i32 3
+  %tmp2 = load i64* %ll, align 8, !tbaa !4
+  %i = getelementptr inbounds %struct.S1* %s1, i32 0, i32 2
+  %tmp3 = load i32* %i, align 4, !tbaa !0
+  %s = getelementptr inbounds %struct.S1* %s1, i32 0, i32 1
+  %tmp4 = load i16* %s, align 2, !tbaa !5
+  %c = getelementptr inbounds %struct.S1* %s1, i32 0, i32 0
+  %tmp5 = load i8* %c, align 1, !tbaa !1
+  tail call void @callee4(i32 %tmp, double %tmp1, i64 %tmp2, i32 %tmp3, i16 signext %tmp4, i8 signext %tmp5, float %f) nounwind
+  ret void
+}
+
+declare void @callee4(i32, double, i64, i32, i16 signext, i8 signext, float)
+
+define void @f3(%struct.S2* nocapture byval %s2) nounwind {
+entry:
+; CHECK: addiu $sp, $sp, -56
+; CHECK: sw  $4, 56($sp)
+; CHECK: sw  $5, 60($sp)
+; CHECK: sw  $6, 64($sp)
+; CHECK: sw  $7, 68($sp)
+; CHECK: lw  $[[R0:[0-9]+]], 68($sp)
+; CHECK: lw  $4, 56($sp)
+; CHECK: sw  $[[R0]], 24($sp)
+
+  %arrayidx = getelementptr inbounds %struct.S2* %s2, i32 0, i32 0, i32 0
+  %tmp = load i32* %arrayidx, align 4, !tbaa !0
+  %arrayidx2 = getelementptr inbounds %struct.S2* %s2, i32 0, i32 0, i32 3
+  %tmp3 = load i32* %arrayidx2, align 4, !tbaa !0
+  tail call void @callee4(i32 %tmp, double 2.000000e+00, i64 3, i32 %tmp3, i16 signext 4, i8 signext 5, float 6.000000e+00) nounwind
+  ret void
+}
+
+define void @f4(float %f, %struct.S3* nocapture byval %s3, %struct.S1* nocapture byval %s1) nounwind {
+entry:
+; CHECK: addiu $sp, $sp, -56
+; CHECK: sw  $5, 60($sp)
+; CHECK: sw  $6, 64($sp)
+; CHECK: sw  $7, 68($sp)
+; CHECK: lw  $[[R1:[0-9]+]], 88($sp)
+; CHECK: lb  $[[R0:[0-9]+]], 60($sp)
+; CHECK: lw  $4, 68($sp)
+; CHECK: sw  $[[R1]], 24($sp)
+; CHECK: sw  $[[R0]], 32($sp)
+
+  %i = getelementptr inbounds %struct.S1* %s1, i32 0, i32 2
+  %tmp = load i32* %i, align 4, !tbaa !0
+  %i2 = getelementptr inbounds %struct.S1* %s1, i32 0, i32 5
+  %tmp1 = load i32* %i2, align 4, !tbaa !0
+  %c = getelementptr inbounds %struct.S3* %s3, i32 0, i32 0
+  %tmp2 = load i8* %c, align 1, !tbaa !1
+  tail call void @callee4(i32 %tmp, double 2.000000e+00, i64 3, i32 %tmp1, i16 signext 4, i8 signext %tmp2, float 6.000000e+00) nounwind
+  ret void
+}
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
+!3 = metadata !{metadata !"double", metadata !1}
+!4 = metadata !{metadata !"long long", metadata !1}
+!5 = metadata !{metadata !"short", metadata !1}
diff --git a/test/CodeGen/Mips/o32_cc_vararg.ll b/test/CodeGen/Mips/o32_cc_vararg.ll
index 1f71ed2..14ce04b 100644
--- a/test/CodeGen/Mips/o32_cc_vararg.ll
+++ b/test/CodeGen/Mips/o32_cc_vararg.ll
@@ -1,5 +1,4 @@
 ; RUN: llc -march=mipsel -mcpu=mips2 -pre-RA-sched=source < %s | FileCheck %s
-; RUN: llc -march=mipsel -mcpu=mips2 -pre-RA-sched=source < %s -regalloc=basic | FileCheck %s
 
 
 ; All test functions do the same thing - they return the first variable
@@ -30,11 +29,11 @@ entry:
   ret i32 %tmp
 
 ; CHECK: va1:
-; CHECK: addiu   $sp, $sp, -32
-; CHECK: sw      $7, 44($sp)
-; CHECK: sw      $6, 40($sp)
-; CHECK: sw      $5, 36($sp)
-; CHECK: lw      $2, 36($sp)
+; CHECK: addiu   $sp, $sp, -16
+; CHECK: sw      $7, 28($sp)
+; CHECK: sw      $6, 24($sp)
+; CHECK: sw      $5, 20($sp)
+; CHECK: lw      $2, 20($sp)
 }
 
 ; check whether the variable double argument will be accessed from the 8-byte
@@ -56,11 +55,11 @@ entry:
   ret double %tmp
 
 ; CHECK: va2:
-; CHECK: addiu   $sp, $sp, -40
-; CHECK: sw      $7, 52($sp)
-; CHECK: sw      $6, 48($sp)
-; CHECK: sw      $5, 44($sp)
-; CHECK: addiu   $[[R0:[0-9]+]], $sp, 44
+; CHECK: addiu   $sp, $sp, -16
+; CHECK: sw      $7, 28($sp)
+; CHECK: sw      $6, 24($sp)
+; CHECK: sw      $5, 20($sp)
+; CHECK: addiu   $[[R0:[0-9]+]], $sp, 20
 ; CHECK: addiu   $[[R1:[0-9]+]], $[[R0]], 7
 ; CHECK: addiu   $[[R2:[0-9]+]], $zero, -8
 ; CHECK: and     $[[R3:[0-9]+]], $[[R1]], $[[R2]]
@@ -84,10 +83,10 @@ entry:
   ret i32 %tmp
 
 ; CHECK: va3:
-; CHECK: addiu   $sp, $sp, -40
-; CHECK: sw      $7, 52($sp)
-; CHECK: sw      $6, 48($sp)
-; CHECK: lw      $2, 48($sp)
+; CHECK: addiu   $sp, $sp, -16
+; CHECK: sw      $7, 28($sp)
+; CHECK: sw      $6, 24($sp)
+; CHECK: lw      $2, 24($sp)
 }
 
 ; double
@@ -107,14 +106,11 @@ entry:
   ret double %tmp
 
 ; CHECK: va4:
-; CHECK: addiu   $sp, $sp, -48
-; CHECK: sw      $7, 60($sp)
-; CHECK: sw      $6, 56($sp)
-; CHECK: addiu   $[[R0:[0-9]+]], $sp, 56
-; CHECK: addiu   $[[R1:[0-9]+]], $[[R0]], 7
-; CHECK: addiu   $[[R2:[0-9]+]], $zero, -8
-; CHECK: and     $[[R3:[0-9]+]], $[[R1]], $[[R2]]
-; CHECK: ldc1    $f0, 0($[[R3]])
+; CHECK: addiu   $sp, $sp, -24
+; CHECK: sw      $7, 36($sp)
+; CHECK: sw      $6, 32($sp)
+; CHECK: addiu   ${{[0-9]+}}, $sp, 32
+; CHECK: ldc1    $f0, 32($sp)
 }
 
 ; int
@@ -138,9 +134,9 @@ entry:
   ret i32 %tmp
 
 ; CHECK: va5:
-; CHECK: addiu   $sp, $sp, -40
-; CHECK: sw      $7, 52($sp)
-; CHECK: lw      $2, 52($sp)
+; CHECK: addiu   $sp, $sp, -24
+; CHECK: sw      $7, 36($sp)
+; CHECK: lw      $2, 36($sp)
 }
 
 ; double
@@ -164,9 +160,9 @@ entry:
   ret double %tmp
 
 ; CHECK: va6:
-; CHECK: addiu   $sp, $sp, -48
-; CHECK: sw      $7, 60($sp)
-; CHECK: addiu   $[[R0:[0-9]+]], $sp, 60
+; CHECK: addiu   $sp, $sp, -24
+; CHECK: sw      $7, 36($sp)
+; CHECK: addiu   $[[R0:[0-9]+]], $sp, 36
 ; CHECK: addiu   $[[R1:[0-9]+]], $[[R0]], 7
 ; CHECK: addiu   $[[R2:[0-9]+]], $zero, -8
 ; CHECK: and     $[[R3:[0-9]+]], $[[R1]], $[[R2]]
@@ -192,8 +188,8 @@ entry:
   ret i32 %tmp
 
 ; CHECK: va7:
-; CHECK: addiu   $sp, $sp, -40
-; CHECK: lw      $2, 56($sp)
+; CHECK: addiu   $sp, $sp, -24
+; CHECK: lw      $2, 40($sp)
 }
 
 ; double
@@ -215,12 +211,9 @@ entry:
   ret double %tmp
 
 ; CHECK: va8:
-; CHECK: addiu   $sp, $sp, -48
-; CHECK: addiu   $[[R0:[0-9]+]], $sp, 64
-; CHECK: addiu   $[[R1:[0-9]+]], $[[R0]], 7
-; CHECK: addiu   $[[R2:[0-9]+]], $zero, -8
-; CHECK: and     $[[R3:[0-9]+]], $[[R1]], $[[R2]]
-; CHECK: ldc1    $f0, 0($[[R3]])
+; CHECK: addiu   $sp, $sp, -32
+; CHECK: addiu   ${{[0-9]+}}, $sp, 48
+; CHECK: ldc1    $f0, 48($sp)
 }
 
 ; int
@@ -244,8 +237,8 @@ entry:
   ret i32 %tmp
 
 ; CHECK: va9:
-; CHECK: addiu   $sp, $sp, -56
-; CHECK: lw      $2, 76($sp)
+; CHECK: addiu   $sp, $sp, -32
+; CHECK: lw      $2, 52($sp)
 }
 
 ; double
@@ -269,8 +262,8 @@ entry:
   ret double %tmp
 
 ; CHECK: va10:
-; CHECK: addiu   $sp, $sp, -56
-; CHECK: addiu   $[[R0:[0-9]+]], $sp, 76
+; CHECK: addiu   $sp, $sp, -32
+; CHECK: addiu   $[[R0:[0-9]+]], $sp, 52
 ; CHECK: addiu   $[[R1:[0-9]+]], $[[R0]], 7
 ; CHECK: addiu   $[[R2:[0-9]+]], $zero, -8
 ; CHECK: and     $[[R3:[0-9]+]], $[[R1]], $[[R2]]
diff --git a/test/CodeGen/Mips/tls.ll b/test/CodeGen/Mips/tls.ll
new file mode 100644
index 0000000..034738b
--- /dev/null
+++ b/test/CodeGen/Mips/tls.ll
@@ -0,0 +1,46 @@
+; RUN: llc -march=mipsel -mcpu=mips2 < %s | FileCheck %s -check-prefix=PIC
+; RUN: llc -march=mipsel -mcpu=mips2 -relocation-model=static < %s \
+; RUN:                             | FileCheck %s -check-prefix=STATIC
+
+
+@t1 = thread_local global i32 0, align 4
+
+define i32 @f1() nounwind {
+entry:
+  %tmp = load i32* @t1, align 4
+  ret i32 %tmp
+
+; CHECK: f1:
+
+; PIC:   lw      $25, %call16(__tls_get_addr)($gp)
+; PIC:   addiu   $4, $gp, %tlsgd(t1)
+; PIC:   jalr    $25
+; PIC:   lw      $2, 0($2)
+
+; STATIC:   rdhwr   $3, $29
+; STATIC:   lui     $[[R0:[0-9]+]], %tprel_hi(t1)
+; STATIC:   addiu   $[[R1:[0-9]+]], $[[R0]], %tprel_lo(t1)
+; STATIC:   addu    $[[R2:[0-9]+]], $3, $[[R1]]
+; STATIC:   lw      $2, 0($[[R2]])
+}
+
+
+@t2 = external thread_local global i32
+
+define i32 @f2() nounwind {
+entry:
+  %tmp = load i32* @t2, align 4
+  ret i32 %tmp
+
+; CHECK: f2:
+
+; PIC:   lw      $25, %call16(__tls_get_addr)($gp)
+; PIC:   addiu   $4, $gp, %tlsgd(t2)
+; PIC:   jalr    $25
+; PIC:   lw      $2, 0($2)
+
+; STATIC:   rdhwr   $3, $29
+; STATIC:   lw      $[[R0:[0-9]+]], %gottprel(t2)($gp)
+; STATIC:   addu    $[[R1:[0-9]+]], $3, $[[R0]]
+; STATIC:   lw      $2, 0($[[R1]])
+}
diff --git a/test/CodeGen/Mips/weak.ll b/test/CodeGen/Mips/weak.ll
new file mode 100644
index 0000000..09dd2a4
--- /dev/null
+++ b/test/CodeGen/Mips/weak.ll
@@ -0,0 +1,12 @@
+; RUN: llc -march=mips < %s | FileCheck %s
+
+@t = common global i32 (...)* null, align 4
+
+define void @f() nounwind {
+entry:
+  store i32 (...)* @test_weak, i32 (...)** @t, align 4
+  ret void
+}
+
+; CHECK: .weak test_weak
+declare extern_weak i32 @test_weak(...)
diff --git a/test/CodeGen/PTX/cvt.ll b/test/CodeGen/PTX/cvt.ll
new file mode 100644
index 0000000..2f793de
--- /dev/null
+++ b/test/CodeGen/PTX/cvt.ll
@@ -0,0 +1,234 @@
+; RUN: llc < %s -march=ptx32 | FileCheck %s
+
+; preds 
+; (note: we convert back to i32 to return)
+
+define ptx_device i32 @cvt_pred_i16(i16 %x, i1 %y) {
+; CHECK: cvt.pred.u16 p0, rh1;
+; CHECK: ret;
+	%a = trunc i16 %x to i1
+	%b = and i1 %a, %y
+	%c = zext i1 %b to i32
+	ret i32 %c
+}
+
+define ptx_device i32 @cvt_pred_i32(i32 %x, i1 %y) {
+; CHECK: cvt.pred.u32 p0, r1;
+; CHECK: ret;
+	%a = trunc i32 %x to i1
+	%b = and i1 %a, %y
+	%c = zext i1 %b to i32
+	ret i32 %c
+}
+
+define ptx_device i32 @cvt_pred_i64(i64 %x, i1 %y) {
+; CHECK: cvt.pred.u64 p0, rd1;
+; CHECK: ret;
+	%a = trunc i64 %x to i1
+	%b = and i1 %a, %y
+	%c = zext i1 %b to i32
+	ret i32 %c
+}
+
+define ptx_device i32 @cvt_pred_f32(float %x, i1 %y) {
+; CHECK: cvt.rni.pred.f32 p0, f1;
+; CHECK: ret;
+	%a = fptoui float %x to i1
+	%b = and i1 %a, %y
+	%c = zext i1 %b to i32
+	ret i32 %c
+}
+
+define ptx_device i32 @cvt_pred_f64(double %x, i1 %y) {
+; CHECK: cvt.rni.pred.f64 p0, fd1;
+; CHECK: ret;
+	%a = fptoui double %x to i1
+	%b = and i1 %a, %y
+	%c = zext i1 %b to i32
+	ret i32 %c
+}
+
+; i16
+
+define ptx_device i16 @cvt_i16_preds(i1 %x) {
+; CHECK: cvt.u16.pred rh0, p1;
+; CHECK: ret;
+	%a = zext i1 %x to i16
+	ret i16 %a
+}
+
+define ptx_device i16 @cvt_i16_i32(i32 %x) {
+; CHECK: cvt.u16.u32 rh0, r1;
+; CHECK: ret;
+	%a = trunc i32 %x to i16
+	ret i16 %a
+}
+
+define ptx_device i16 @cvt_i16_i64(i64 %x) {
+; CHECK: cvt.u16.u64 rh0, rd1;
+; CHECK: ret;
+	%a = trunc i64 %x to i16
+	ret i16 %a
+}
+
+define ptx_device i16 @cvt_i16_f32(float %x) {
+; CHECK: cvt.rni.u16.f32 rh0, f1;
+; CHECK: ret;
+	%a = fptoui float %x to i16
+	ret i16 %a
+}
+
+define ptx_device i16 @cvt_i16_f64(double %x) {
+; CHECK: cvt.rni.u16.f64 rh0, fd1;
+; CHECK: ret;
+	%a = fptoui double %x to i16
+	ret i16 %a
+}
+
+; i32
+
+define ptx_device i32 @cvt_i32_preds(i1 %x) {
+; CHECK: cvt.u32.pred r0, p1;
+; CHECK: ret;
+	%a = zext i1 %x to i32
+	ret i32 %a
+}
+
+define ptx_device i32 @cvt_i32_i16(i16 %x) {
+; CHECK: cvt.u32.u16 r0, rh1;
+; CHECK: ret;
+	%a = zext i16 %x to i32
+	ret i32 %a
+}
+
+define ptx_device i32 @cvt_i32_i64(i64 %x) {
+; CHECK: cvt.u32.u64 r0, rd1;
+; CHECK: ret;
+	%a = trunc i64 %x to i32
+	ret i32 %a
+}
+
+define ptx_device i32 @cvt_i32_f32(float %x) {
+; CHECK: cvt.rni.u32.f32 r0, f1;
+; CHECK: ret;
+	%a = fptoui float %x to i32
+	ret i32 %a
+}
+
+define ptx_device i32 @cvt_i32_f64(double %x) {
+; CHECK: cvt.rni.u32.f64 r0, fd1;
+; CHECK: ret;
+	%a = fptoui double %x to i32
+	ret i32 %a
+}
+
+; i64
+
+define ptx_device i64 @cvt_i64_preds(i1 %x) {
+; CHECK: cvt.u64.pred rd0, p1;
+; CHECK: ret;
+	%a = zext i1 %x to i64
+	ret i64 %a
+}
+
+define ptx_device i64 @cvt_i64_i16(i16 %x) {
+; CHECK: cvt.u64.u16 rd0, rh1;
+; CHECK: ret;
+	%a = zext i16 %x to i64
+	ret i64 %a
+}
+
+define ptx_device i64 @cvt_i64_i32(i32 %x) {
+; CHECK: cvt.u64.u32 rd0, r1;
+; CHECK: ret;
+	%a = zext i32 %x to i64
+	ret i64 %a
+}
+
+define ptx_device i64 @cvt_i64_f32(float %x) {
+; CHECK: cvt.rni.u64.f32 rd0, f1;
+; CHECK: ret;
+	%a = fptoui float %x to i64
+	ret i64 %a
+}
+
+define ptx_device i64 @cvt_i64_f64(double %x) {
+; CHECK: cvt.rni.u64.f64 rd0, fd1;
+; CHECK: ret;
+	%a = fptoui double %x to i64
+	ret i64 %a
+}
+
+; f32
+
+define ptx_device float @cvt_f32_preds(i1 %x) {
+; CHECK: cvt.rn.f32.pred f0, p1;
+; CHECK: ret;
+	%a = uitofp i1 %x to float
+	ret float %a
+}
+
+define ptx_device float @cvt_f32_i16(i16 %x) {
+; CHECK: cvt.rn.f32.u16 f0, rh1;
+; CHECK: ret;
+	%a = uitofp i16 %x to float
+	ret float %a
+}
+
+define ptx_device float @cvt_f32_i32(i32 %x) {
+; CHECK: cvt.rn.f32.u32 f0, r1;
+; CHECK: ret;
+	%a = uitofp i32 %x to float
+	ret float %a
+}
+
+define ptx_device float @cvt_f32_i64(i64 %x) {
+; CHECK: cvt.rn.f32.u64 f0, rd1;
+; CHECK: ret;
+	%a = uitofp i64 %x to float
+	ret float %a
+}
+
+define ptx_device float @cvt_f32_f64(double %x) {
+; CHECK: cvt.rn.f32.f64 f0, fd1;
+; CHECK: ret;
+	%a = fptrunc double %x to float
+	ret float %a
+}
+
+; f64
+
+define ptx_device double @cvt_f64_preds(i1 %x) {
+; CHECK: cvt.rn.f64.pred fd0, p1;
+; CHECK: ret;
+	%a = uitofp i1 %x to double
+	ret double %a
+}
+
+define ptx_device double @cvt_f64_i16(i16 %x) {
+; CHECK: cvt.rn.f64.u16 fd0, rh1;
+; CHECK: ret;
+	%a = uitofp i16 %x to double
+	ret double %a
+}
+
+define ptx_device double @cvt_f64_i32(i32 %x) {
+; CHECK: cvt.rn.f64.u32 fd0, r1;
+; CHECK: ret;
+	%a = uitofp i32 %x to double
+	ret double %a
+}
+
+define ptx_device double @cvt_f64_i64(i64 %x) {
+; CHECK: cvt.rn.f64.u64 fd0, rd1;
+; CHECK: ret;
+	%a = uitofp i64 %x to double
+	ret double %a
+}
+
+define ptx_device double @cvt_f64_f32(float %x) {
+; CHECK: cvt.f64.f32 fd0, f1;
+; CHECK: ret;
+	%a = fpext float %x to double
+	ret double %a
+}
diff --git a/test/CodeGen/PTX/fneg.ll b/test/CodeGen/PTX/fneg.ll
new file mode 100644
index 0000000..22eeda3
--- /dev/null
+++ b/test/CodeGen/PTX/fneg.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -march=ptx32 | FileCheck %s
+
+define ptx_device float @t1_f32(float %x) {
+; CHECK: neg.f32 f0, f1;
+; CHECK-NEXT: ret;
+	%y = fsub float -0.000000e+00, %x
+	ret float %y
+}
+
+define ptx_device double @t1_f64(double %x) {
+; CHECK: neg.f64 fd0, fd1;
+; CHECK-NEXT: ret;
+	%y = fsub double -0.000000e+00, %x
+	ret double %y
+}
diff --git a/test/CodeGen/PTX/mad-disabling.ll b/test/CodeGen/PTX/mad-disabling.ll
new file mode 100644
index 0000000..ad7b341
--- /dev/null
+++ b/test/CodeGen/PTX/mad-disabling.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -march=ptx32 -mattr=+ptx20,+sm20 | grep "mad"
+; RUN: llc < %s -march=ptx32 -mattr=+ptx20,+sm20,+no-fma | grep -v "mad"
+
+define ptx_device float @test_mul_add_f(float %x, float %y, float %z) {
+entry:
+  %a = fmul float %x, %y
+  %b = fadd float %a, %z
+  ret float %b
+}
+
+define ptx_device double @test_mul_add_d(double %x, double %y, double %z) {
+entry:
+  %a = fmul double %x, %y
+  %b = fadd double %a, %z
+  ret double %b
+}
diff --git a/test/CodeGen/PTX/options.ll b/test/CodeGen/PTX/options.ll
index ac33fef..92effa6 100644
--- a/test/CodeGen/PTX/options.ll
+++ b/test/CodeGen/PTX/options.ll
@@ -1,6 +1,7 @@
 ; RUN: llc < %s -march=ptx32 -mattr=ptx20 | grep ".version 2.0"
 ; RUN: llc < %s -march=ptx32 -mattr=ptx21 | grep ".version 2.1"
 ; RUN: llc < %s -march=ptx32 -mattr=ptx22 | grep ".version 2.2"
+; RUN: llc < %s -march=ptx32 -mattr=ptx23 | grep ".version 2.3"
 ; RUN: llc < %s -march=ptx32 -mattr=sm10 | grep ".target sm_10"
 ; RUN: llc < %s -march=ptx32 -mattr=sm13 | grep ".target sm_13"
 ; RUN: llc < %s -march=ptx32 -mattr=sm20 | grep ".target sm_20"
diff --git a/test/CodeGen/PTX/selp.ll b/test/CodeGen/PTX/selp.ll
new file mode 100644
index 0000000..6f1b03e
--- /dev/null
+++ b/test/CodeGen/PTX/selp.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -march=ptx32 | FileCheck %s
+
+define ptx_device i32 @test_selp_i32(i1 %x, i32 %y, i32 %z) {
+; CHECK: selp.u32 r0, r1, r2, p1;
+	%a = select i1 %x, i32 %y, i32 %z
+	ret i32 %a
+}
+
+define ptx_device i64 @test_selp_i64(i1 %x, i64 %y, i64 %z) {
+; CHECK: selp.u64 rd0, rd1, rd2, p1;
+	%a = select i1 %x, i64 %y, i64 %z
+	ret i64 %a
+}
+
+define ptx_device float @test_selp_f32(i1 %x, float %y, float %z) {
+; CHECK: selp.f32 f0, f1, f2, p1;
+	%a = select i1 %x, float %y, float %z
+	ret float %a
+}
+
+define ptx_device double @test_selp_f64(i1 %x, double %y, double %z) {
+; CHECK: selp.f64 fd0, fd1, fd2, p1;
+	%a = select i1 %x, double %y, double %z
+	ret double %a
+}
diff --git a/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll b/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll
index d094509..6b31397 100644
--- a/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll
+++ b/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=ppc32 -mcpu=g5 -mtriple=powerpc-apple-darwin10.0 | FileCheck %s
+; RUN: llc < %s -march=ppc32 -mcpu=g5 -mtriple=powerpc-apple-darwin10.0 -join-physregs | FileCheck %s
 ; ModuleID = 'nn.c'
 target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f128:64:128"
 target triple = "powerpc-apple-darwin11.0"
diff --git a/test/CodeGen/PowerPC/big-endian-formal-args.ll b/test/CodeGen/PowerPC/big-endian-formal-args.ll
index e46e1ec..318ccb0 100644
--- a/test/CodeGen/PowerPC/big-endian-formal-args.ll
+++ b/test/CodeGen/PowerPC/big-endian-formal-args.ll
@@ -1,14 +1,12 @@
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-unknown-linux-gnu | \
-; RUN:   grep {li 6, 3}
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-unknown-linux-gnu | \
-; RUN:   grep {li 4, 2}
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-unknown-linux-gnu | \
-; RUN:   grep {li 3, 0}
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-unknown-linux-gnu | \
-; RUN:   grep {mr 5, 3}
+; RUN: llc < %s -march=ppc32 -mtriple=powerpc-unknown-linux-gnu | FileCheck %s
 
 declare void @bar(i64 %x, i64 %y)
 
+; CHECK: li 4, 2
+; CHECK: li {{[53]}}, 0
+; CHECK: li 6, 3
+; CHECK: mr {{[53]}}, {{[53]}}
+
 define void @foo() {
   call void @bar(i64 2, i64 3)
   ret void
diff --git a/test/CodeGen/PowerPC/indirectbr.ll b/test/CodeGen/PowerPC/indirectbr.ll
index ac56625..29c620e 100644
--- a/test/CodeGen/PowerPC/indirectbr.ll
+++ b/test/CodeGen/PowerPC/indirectbr.ll
@@ -1,5 +1,6 @@
 ; RUN: llc < %s -relocation-model=pic -march=ppc32 -mtriple=powerpc-apple-darwin | FileCheck %s -check-prefix=PIC
 ; RUN: llc < %s -relocation-model=static -march=ppc32 -mtriple=powerpc-apple-darwin | FileCheck %s -check-prefix=STATIC
+; RUN: llc < %s -relocation-model=pic -march=ppc64 -mtriple=powerpc64-apple-darwin | FileCheck %s -check-prefix=PPC64
 
 @nextaddr = global i8* null                       ; <i8**> [#uses=2]
 @C.0.2070 = private constant [5 x i8*] [i8* blockaddress(@foo, %L1), i8* blockaddress(@foo, %L2), i8* blockaddress(@foo, %L3), i8* blockaddress(@foo, %L4), i8* blockaddress(@foo, %L5)] ; <[5 x i8*]*> [#uses=1]
@@ -7,6 +8,7 @@
 define internal i32 @foo(i32 %i) nounwind {
 ; PIC: foo:
 ; STATIC: foo:
+; PPC64: foo:
 entry:
   %0 = load i8** @nextaddr, align 4               ; <i8*> [#uses=2]
   %1 = icmp eq i8* %0, null                       ; <i1> [#uses=1]
@@ -18,6 +20,8 @@ bb2:                                              ; preds = %entry, %bb3
 ; PIC-NEXT: bctr
 ; STATIC: mtctr
 ; STATIC-NEXT: bctr
+; PPC64: mtctr
+; PPC64-NEXT: bctr
   indirectbr i8* %gotovar.4.0, [label %L5, label %L4, label %L3, label %L2, label %L1]
 
 bb3:                                              ; preds = %entry
diff --git a/test/CodeGen/SPARC/2011-01-22-SRet.ll b/test/CodeGen/SPARC/2011-01-22-SRet.ll
index 506d3a8..5393392 100644
--- a/test/CodeGen/SPARC/2011-01-22-SRet.ll
+++ b/test/CodeGen/SPARC/2011-01-22-SRet.ll
@@ -6,7 +6,6 @@ define weak void @make_foo(%struct.foo_t* noalias sret %agg.result, i32 %a, i32
 entry:
 ;CHECK: make_foo
 ;CHECK: ld [%fp+64], {{.+}}
-;CHECK: or {{.+}}, {{.+}}, %i0
 ;CHECK: jmp %i7+12
   %0 = getelementptr inbounds %struct.foo_t* %agg.result, i32 0, i32 0
   store i32 %a, i32* %0, align 4
diff --git a/test/CodeGen/Thumb/2009-08-20-ISelBug.ll b/test/CodeGen/Thumb/2009-08-20-ISelBug.ll
index d6ca0d7..7876557 100644
--- a/test/CodeGen/Thumb/2009-08-20-ISelBug.ll
+++ b/test/CodeGen/Thumb/2009-08-20-ISelBug.ll
@@ -11,7 +11,7 @@
 
 define i32 @t(%struct.asl_file_t* %s, i64 %off, i64* %out) nounwind optsize {
 ; CHECK: t:
-; CHECK: adds r0, #8
+; CHECK: adds {{r[0-7]}}, #8
 entry:
   %val = alloca i64, align 4                      ; <i64*> [#uses=3]
   %0 = icmp eq %struct.asl_file_t* %s, null       ; <i1> [#uses=1]
diff --git a/test/CodeGen/Thumb/2011-05-11-DAGLegalizer.ll b/test/CodeGen/Thumb/2011-05-11-DAGLegalizer.ll
new file mode 100644
index 0000000..ed55bb5
--- /dev/null
+++ b/test/CodeGen/Thumb/2011-05-11-DAGLegalizer.ll
@@ -0,0 +1,60 @@
+; RUN: llc -mtriple=thumbv6-apple-darwin < %s
+; rdar://problem/9416774
+; ModuleID = 'reduced.ll'
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
+target triple = "thumbv7-apple-ios"
+
+%struct.MMMMMMMMMMMM = type { [4 x %struct.RRRRRRRR] }
+%struct.RRRRRRRR = type { [78 x i32] }
+
+@kkkkkk = external constant i8*
+@__PRETTY_FUNCTION__._ZN12CLGll = private unnamed_addr constant [62 x i8] c"static void tttttttttttt::lllllllllllll(const MMMMMMMMMMMM &)\00"
+@.str = private unnamed_addr constant [75 x i8] c"\09GGGGGGGGGGGGGGGGGGGGGGG:,BE:0x%08lx,ALM:0x%08lx,LTO:0x%08lx,CBEE:0x%08lx\0A\00"
+
+define void @_ZN12CLGll(%struct.MMMMMMMMMMMM* %aidData) ssp align 2 {
+entry:
+  %aidData.addr = alloca %struct.MMMMMMMMMMMM*, align 4
+  %agg.tmp = alloca %struct.RRRRRRRR, align 4
+  %agg.tmp4 = alloca %struct.RRRRRRRR, align 4
+  %agg.tmp10 = alloca %struct.RRRRRRRR, align 4
+  %agg.tmp16 = alloca %struct.RRRRRRRR, align 4
+  store %struct.MMMMMMMMMMMM* %aidData, %struct.MMMMMMMMMMMM** %aidData.addr, align 4
+  br label %do.body
+
+do.body:                                          ; preds = %entry
+  %tmp = load i8** @kkkkkk, align 4
+  %tmp1 = load %struct.MMMMMMMMMMMM** %aidData.addr
+  %eph = getelementptr inbounds %struct.MMMMMMMMMMMM* %tmp1, i32 0, i32 0
+  %arrayidx = getelementptr inbounds [4 x %struct.RRRRRRRR]* %eph, i32 0, i32 0
+  %tmp2 = bitcast %struct.RRRRRRRR* %agg.tmp to i8*
+  %tmp3 = bitcast %struct.RRRRRRRR* %arrayidx to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %tmp2, i8* %tmp3, i32 312, i32 4, i1 false)
+  %tmp5 = load %struct.MMMMMMMMMMMM** %aidData.addr
+  %eph6 = getelementptr inbounds %struct.MMMMMMMMMMMM* %tmp5, i32 0, i32 0
+  %arrayidx7 = getelementptr inbounds [4 x %struct.RRRRRRRR]* %eph6, i32 0, i32 1
+  %tmp8 = bitcast %struct.RRRRRRRR* %agg.tmp4 to i8*
+  %tmp9 = bitcast %struct.RRRRRRRR* %arrayidx7 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %tmp8, i8* %tmp9, i32 312, i32 4, i1 false)
+  %tmp11 = load %struct.MMMMMMMMMMMM** %aidData.addr
+  %eph12 = getelementptr inbounds %struct.MMMMMMMMMMMM* %tmp11, i32 0, i32 0
+  %arrayidx13 = getelementptr inbounds [4 x %struct.RRRRRRRR]* %eph12, i32 0, i32 2
+  %tmp14 = bitcast %struct.RRRRRRRR* %agg.tmp10 to i8*
+  %tmp15 = bitcast %struct.RRRRRRRR* %arrayidx13 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %tmp14, i8* %tmp15, i32 312, i32 4, i1 false)
+  %tmp17 = load %struct.MMMMMMMMMMMM** %aidData.addr
+  %eph18 = getelementptr inbounds %struct.MMMMMMMMMMMM* %tmp17, i32 0, i32 0
+  %arrayidx19 = getelementptr inbounds [4 x %struct.RRRRRRRR]* %eph18, i32 0, i32 3
+  %tmp20 = bitcast %struct.RRRRRRRR* %agg.tmp16 to i8*
+  %tmp21 = bitcast %struct.RRRRRRRR* %arrayidx19 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %tmp20, i8* %tmp21, i32 312, i32 4, i1 false)
+  call void (i8*, i32, i8*, i8*, ...)* @CLLoggingLog(i8* %tmp, i32 2, i8* getelementptr inbounds ([62 x i8]* @__PRETTY_FUNCTION__._ZN12CLGll, i32 0, i32 0), i8* getelementptr inbounds ([75 x i8]* @.str, i32 0, i32 0), %struct.RRRRRRRR* byval %agg.tmp, %struct.RRRRRRRR* byval %agg.tmp4, %struct.RRRRRRRR* byval %agg.tmp10, %struct.RRRRRRRR* byval %agg.tmp16)
+  br label %do.end
+
+do.end:                                           ; preds = %do.body
+  ret void
+}
+
+declare void @CLLoggingLog(i8*, i32, i8*, i8*, ...)
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
diff --git a/test/CodeGen/Thumb2/2009-10-15-ITBlockBranch.ll b/test/CodeGen/Thumb2/2009-10-15-ITBlockBranch.ll
index 3594424..9aee910 100644
--- a/test/CodeGen/Thumb2/2009-10-15-ITBlockBranch.ll
+++ b/test/CodeGen/Thumb2/2009-10-15-ITBlockBranch.ll
@@ -12,7 +12,7 @@
 define weak arm_aapcs_vfpcc i32 @_ZNKSs7compareERKSs(%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %this, %"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %__str) {
 ; CHECK: _ZNKSs7compareERKSs:
 ; CHECK:      it  eq
-; CHECK-NEXT: subeq r0, r{{[0-9]+}}, r{{[0-9]+}}
+; CHECK-NEXT: subeq{{(.w)?}} r0, r{{[0-9]+}}, r{{[0-9]+}}
 ; CHECK-NEXT: ldmia.w sp!,
 entry:
   %0 = tail call arm_aapcs_vfpcc  i32 @_ZNKSs4sizeEv(%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %this) ; <i32> [#uses=3]
diff --git a/test/CodeGen/Thumb2/2010-08-10-VarSizedAllocaBug.ll b/test/CodeGen/Thumb2/2010-08-10-VarSizedAllocaBug.ll
index 41f7f29..47d7a9c 100644
--- a/test/CodeGen/Thumb2/2010-08-10-VarSizedAllocaBug.ll
+++ b/test/CodeGen/Thumb2/2010-08-10-VarSizedAllocaBug.ll
@@ -7,8 +7,8 @@ entry:
 ; CHECK: Callee:
 ; CHECK: push
 ; CHECK: mov r4, sp
-; CHECK: sub.w r12, r4, #1000
-; CHECK: mov sp, r12
+; CHECK: sub.w [[R12:r[0-9]+]], r4, #1000
+; CHECK: mov sp, [[R12]]
   %0 = icmp eq i32 %i, 0                          ; <i1> [#uses=1]
   br i1 %0, label %bb2, label %bb
 
diff --git a/test/CodeGen/Thumb2/2011-06-07-TwoAddrEarlyClobber.ll b/test/CodeGen/Thumb2/2011-06-07-TwoAddrEarlyClobber.ll
new file mode 100644
index 0000000..9e6d78e
--- /dev/null
+++ b/test/CodeGen/Thumb2/2011-06-07-TwoAddrEarlyClobber.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
+target triple = "thumbv7-apple-darwin10"
+
+%struct.op = type { %struct.op*, %struct.op*, %struct.op* ()*, i32, i16, i16, i8, i8 }
+
+; CHECK: Perl_ck_sort
+; CHECK: ldr
+; CHECK: mov [[REGISTER:(r[0-9]+)|(lr)]]
+; CHECK: str {{(r[0-9])|(lr)}}, {{\[}}[[REGISTER]]{{\]}}, #24
+
+define void @Perl_ck_sort() nounwind optsize {
+entry:
+  %tmp27 = load %struct.op** undef, align 4
+  switch i16 undef, label %if.end151 [
+    i16 178, label %if.then60
+    i16 177, label %if.then60
+  ]
+
+if.then60:                                        ; preds = %if.then40
+  br i1 undef, label %if.then67, label %if.end95
+
+if.then67:                                        ; preds = %if.then60
+  %op_next71 = getelementptr inbounds %struct.op* %tmp27, i32 0, i32 0
+  store %struct.op* %tmp27, %struct.op** %op_next71, align 4
+  %0 = getelementptr inbounds %struct.op* %tmp27, i32 1, i32 0
+  br label %if.end95
+
+if.end95:                                         ; preds = %if.else92, %if.then67
+  %.pre-phi = phi %struct.op** [ undef, %if.then60 ], [ %0, %if.then67 ]
+  %tmp98 = load %struct.op** %.pre-phi, align 4
+  br label %if.end151
+
+if.end151:                                        ; preds = %if.end100, %if.end, %entry
+  ret void
+}
diff --git a/test/CodeGen/Thumb2/bfi.ll b/test/CodeGen/Thumb2/bfi.ll
index 0e76770..3612e27 100644
--- a/test/CodeGen/Thumb2/bfi.ll
+++ b/test/CodeGen/Thumb2/bfi.ll
@@ -30,9 +30,8 @@ entry:
 define i32 @f3(i32 %A, i32 %B) nounwind readnone optsize {
 entry:
 ; CHECK: f3
-; CHECK: lsrs  r2, r0, #7
-; CHECK: mov r0, r1
-; CHECK: bfi r0, r2, #7, #16
+; CHECK: lsrs {{.*}}, #7
+; CHECK: bfi {{.*}}, #7, #16
   %and = and i32 %A, 8388480                      ; <i32> [#uses=1]
   %and2 = and i32 %B, -8388481                    ; <i32> [#uses=1]
   %or = or i32 %and2, %and                        ; <i32> [#uses=1]
@@ -42,8 +41,8 @@ entry:
 ; rdar://8752056
 define i32 @f4(i32 %a) nounwind {
 ; CHECK: f4
-; CHECK: movw r1, #3137
-; CHECK: bfi r1, r0, #15, #5
+; CHECK: movw [[R1:r[0-9]+]], #3137
+; CHECK: bfi [[R1]], {{.*}}, #15, #5
   %1 = shl i32 %a, 15
   %ins7 = and i32 %1, 1015808
   %ins12 = or i32 %ins7, 3137
diff --git a/test/CodeGen/Thumb2/thumb2-cbnz.ll b/test/CodeGen/Thumb2/thumb2-cbnz.ll
index 10a4985..0992fa8 100644
--- a/test/CodeGen/Thumb2/thumb2-cbnz.ll
+++ b/test/CodeGen/Thumb2/thumb2-cbnz.ll
@@ -3,26 +3,29 @@
 
 declare double @floor(double) nounwind readnone
 
-define void @t(i1 %a, double %b) {
+define void @t(i32 %c, double %b) {
 entry:
-  br i1 %a, label %bb3, label %bb1
+  %cmp1 = icmp ne i32 %c, 0
+  br i1 %cmp1, label %bb3, label %bb1
 
 bb1:                                              ; preds = %entry
   unreachable
 
 bb3:                                              ; preds = %entry
-  br i1 %a, label %bb7, label %bb5
+  %cmp2 = icmp ne i32 %c, 0
+  br i1 %cmp2, label %bb7, label %bb5
 
 bb5:                                              ; preds = %bb3
   unreachable
 
 bb7:                                              ; preds = %bb3
-  br i1 %a, label %bb11, label %bb9
+  %cmp3 = icmp ne i32 %c, 0
+  br i1 %cmp3, label %bb11, label %bb9
 
 bb9:                                              ; preds = %bb7
-; CHECK:      cmp r0, #0
-; CHECK:      cmp r0, #0
-; CHECK-NEXT: cbnz
+; CHECK:      cmp	r0, #0
+; CHECK:      cmp	r0, #0
+; CHECK-NEXT:      cbnz
   %0 = tail call  double @floor(double %b) nounwind readnone ; <double> [#uses=0]
   br label %bb11
 
diff --git a/test/CodeGen/Thumb2/thumb2-cmn.ll b/test/CodeGen/Thumb2/thumb2-cmn.ll
index eeaaa7fb..df221b9 100644
--- a/test/CodeGen/Thumb2/thumb2-cmn.ll
+++ b/test/CodeGen/Thumb2/thumb2-cmn.ll
@@ -1,4 +1,7 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mattr=+thumb2 -join-physregs | FileCheck %s
+
+; These tests implicitly depend on 'movs r0, #0' being rematerialized below the
+; test as 'mov.w r0, #0'. So far, that requires physreg joining.
 
 define i1 @f1(i32 %a, i32 %b) {
     %nb = sub i32 0, %b
diff --git a/test/CodeGen/Thumb2/thumb2-cmp.ll b/test/CodeGen/Thumb2/thumb2-cmp.ll
index 63249f4..da12114 100644
--- a/test/CodeGen/Thumb2/thumb2-cmp.ll
+++ b/test/CodeGen/Thumb2/thumb2-cmp.ll
@@ -1,4 +1,7 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mattr=+thumb2 -join-physregs | FileCheck %s
+
+; These tests implicitly depend on 'movs r0, #0' being rematerialized below the
+; test as 'mov.w r0, #0'. So far, that requires physreg joining.
 
 ; 0x000000bb = 187
 define i1 @f1(i32 %a) {
diff --git a/test/CodeGen/Thumb2/thumb2-cmp2.ll b/test/CodeGen/Thumb2/thumb2-cmp2.ll
index 55c321d..15052e0 100644
--- a/test/CodeGen/Thumb2/thumb2-cmp2.ll
+++ b/test/CodeGen/Thumb2/thumb2-cmp2.ll
@@ -1,4 +1,7 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mattr=+thumb2 -join-physregs | FileCheck %s
+
+; These tests implicitly depend on 'movs r0, #0' being rematerialized below the
+; test as 'mov.w r0, #0'. So far, that requires physreg joining.
 
 define i1 @f1(i32 %a, i32 %b) {
 ; CHECK: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-teq.ll b/test/CodeGen/Thumb2/thumb2-teq.ll
index 69f0383..566408a 100644
--- a/test/CodeGen/Thumb2/thumb2-teq.ll
+++ b/test/CodeGen/Thumb2/thumb2-teq.ll
@@ -1,5 +1,7 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mattr=+thumb2 -join-physregs | FileCheck %s
 
+; These tests implicitly depend on 'movs r0, #0' being rematerialized below the
+; test as 'mov.w r0, #0'. So far, that requires physreg joining.
 
 ; 0x000000bb = 187
 define i1 @f1(i32 %a) {
diff --git a/test/CodeGen/Thumb2/thumb2-teq2.ll b/test/CodeGen/Thumb2/thumb2-teq2.ll
index 0f122f2..cdd3489 100644
--- a/test/CodeGen/Thumb2/thumb2-teq2.ll
+++ b/test/CodeGen/Thumb2/thumb2-teq2.ll
@@ -1,4 +1,7 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mattr=+thumb2 -join-physregs | FileCheck %s
+
+; These tests implicitly depend on 'movs r0, #0' being rematerialized below the
+; tst as 'mov.w r0, #0'. So far, that requires physreg joining.
 
 define i1 @f1(i32 %a, i32 %b) {
 ; CHECK: f1
diff --git a/test/CodeGen/Thumb2/thumb2-tst.ll b/test/CodeGen/Thumb2/thumb2-tst.ll
index d905217..47f553f 100644
--- a/test/CodeGen/Thumb2/thumb2-tst.ll
+++ b/test/CodeGen/Thumb2/thumb2-tst.ll
@@ -1,5 +1,7 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mattr=+thumb2 -join-physregs | FileCheck %s
 
+; These tests implicitly depend on 'movs r0, #0' being rematerialized below the
+; tst as 'mov.w r0, #0'. So far, that requires physreg joining.
 
 ; 0x000000bb = 187
 define i1 @f1(i32 %a) {
diff --git a/test/CodeGen/Thumb2/thumb2-tst2.ll b/test/CodeGen/Thumb2/thumb2-tst2.ll
index db202dd..405b3bb 100644
--- a/test/CodeGen/Thumb2/thumb2-tst2.ll
+++ b/test/CodeGen/Thumb2/thumb2-tst2.ll
@@ -1,4 +1,7 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mattr=+thumb2 -join-physregs | FileCheck %s
+
+; These tests implicitly depend on 'movs r0, #0' being rematerialized below the
+; tst as 'mov.w r0, #0'. So far, that requires physreg joining.
 
 define i1 @f1(i32 %a, i32 %b) {
 ; CHECK: f1:
diff --git a/test/CodeGen/X86/2006-05-22-FPSetEQ.ll b/test/CodeGen/X86/2006-05-22-FPSetEQ.ll
index 35b0159..6c5a4fb 100644
--- a/test/CodeGen/X86/2006-05-22-FPSetEQ.ll
+++ b/test/CodeGen/X86/2006-05-22-FPSetEQ.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 | grep setnp
-; RUN: llc < %s -march=x86 -enable-unsafe-fp-math -enable-no-nans-fp-math | \
+; RUN: llc < %s -march=x86 -mattr=-sse | grep setnp
+; RUN: llc < %s -march=x86 -mattr=-sse -enable-unsafe-fp-math -enable-no-nans-fp-math | \
 ; RUN:   not grep setnp
 
 define i32 @test(float %f) {
diff --git a/test/CodeGen/X86/2008-07-11-SpillerBug.ll b/test/CodeGen/X86/2008-07-11-SpillerBug.ll
deleted file mode 100644
index dee7415..0000000
--- a/test/CodeGen/X86/2008-07-11-SpillerBug.ll
+++ /dev/null
@@ -1,52 +0,0 @@
-; RUN: llc < %s -march=x86 -relocation-model=static -disable-fp-elim -post-RA-scheduler=false -asm-verbose=0 | FileCheck %s
-; PR2536
-
-; CHECK: andl    $65534, %
-; CHECK-NEXT: movl %
-; CHECK-NEXT: movzwl
-
-@g_5 = external global i16		; <i16*> [#uses=2]
-@g_107 = external global i16		; <i16*> [#uses=1]
-@g_229 = external global i32		; <i32*> [#uses=1]
-@g_227 = external global i16		; <i16*> [#uses=1]
-
-define i32 @func_54(i32 %p_55, i16 zeroext  %p_56) nounwind  {
-entry:
-	load i16* @g_5, align 2		; <i16>:0 [#uses=1]
-	zext i16 %0 to i32		; <i32>:1 [#uses=1]
-	%.mask = and i32 %1, 65534		; <i32> [#uses=1]
-	icmp eq i32 %.mask, 0		; <i1>:2 [#uses=1]
-	load i32* @g_229, align 4		; <i32>:3 [#uses=1]
-	load i16* @g_227, align 2		; <i16>:4 [#uses=1]
-	icmp eq i16 %4, 0		; <i1>:5 [#uses=1]
-	load i16* @g_5, align 2		; <i16>:6 [#uses=1]
-	br label %bb
-
-bb:		; preds = %bb7.preheader, %entry
-	%indvar4 = phi i32 [ 0, %entry ], [ %indvar.next5, %bb7.preheader ]		; <i32> [#uses=1]
-	%p_56_addr.1.reg2mem.0 = phi i16 [ %p_56, %entry ], [ %p_56_addr.0, %bb7.preheader ]		; <i16> [#uses=2]
-	br i1 %2, label %bb7.preheader, label %bb5
-
-bb5:		; preds = %bb
-	store i16 %6, i16* @g_107, align 2
-	br label %bb7.preheader
-
-bb7.preheader:		; preds = %bb5, %bb
-	icmp eq i16 %p_56_addr.1.reg2mem.0, 0		; <i1>:7 [#uses=1]
-	%.0 = select i1 %7, i32 1, i32 %3		; <i32> [#uses=1]
-	urem i32 1, %.0		; <i32>:8 [#uses=1]
-	icmp eq i32 %8, 0		; <i1>:9 [#uses=1]
-	%.not = xor i1 %9, true		; <i1> [#uses=1]
-	%.not1 = xor i1 %5, true		; <i1> [#uses=1]
-	%brmerge = or i1 %.not, %.not1		; <i1> [#uses=1]
-	%iftmp.6.0 = select i1 %brmerge, i32 3, i32 0		; <i32> [#uses=1]
-	mul i32 %iftmp.6.0, %3		; <i32>:10 [#uses=1]
-	icmp eq i32 %10, 0		; <i1>:11 [#uses=1]
-	%p_56_addr.0 = select i1 %11, i16 %p_56_addr.1.reg2mem.0, i16 1		; <i16> [#uses=1]
-	%indvar.next5 = add i32 %indvar4, 1		; <i32> [#uses=2]
-	%exitcond6 = icmp eq i32 %indvar.next5, 17		; <i1> [#uses=1]
-	br i1 %exitcond6, label %bb25, label %bb
-
-bb25:		; preds = %bb7.preheader
-	ret i32 1
-}
diff --git a/test/CodeGen/X86/2008-08-31-EH_RETURN32.ll b/test/CodeGen/X86/2008-08-31-EH_RETURN32.ll
index b92c789..1d27fc5 100644
--- a/test/CodeGen/X86/2008-08-31-EH_RETURN32.ll
+++ b/test/CodeGen/X86/2008-08-31-EH_RETURN32.ll
@@ -1,5 +1,5 @@
 ; Check that eh_return & unwind_init were properly lowered
-; RUN: llc < %s | grep %ebp | count 7
+; RUN: llc < %s | grep %ebp | count 9
 ; RUN: llc < %s | grep %ecx | count 5
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
diff --git a/test/CodeGen/X86/2008-08-31-EH_RETURN64.ll b/test/CodeGen/X86/2008-08-31-EH_RETURN64.ll
index 00ab735..d423bfc 100644
--- a/test/CodeGen/X86/2008-08-31-EH_RETURN64.ll
+++ b/test/CodeGen/X86/2008-08-31-EH_RETURN64.ll
@@ -1,5 +1,5 @@
 ; Check that eh_return & unwind_init were properly lowered
-; RUN: llc < %s | grep %rbp | count 5
+; RUN: llc < %s | grep %rbp | count 7
 ; RUN: llc < %s | grep %rcx | count 3
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
diff --git a/test/CodeGen/X86/2008-09-18-inline-asm-2.ll b/test/CodeGen/X86/2008-09-18-inline-asm-2.ll
index 947a1f1..dfd165c 100644
--- a/test/CodeGen/X86/2008-09-18-inline-asm-2.ll
+++ b/test/CodeGen/X86/2008-09-18-inline-asm-2.ll
@@ -1,12 +1,32 @@
-; RUN: llc < %s -march=x86 -regalloc=linearscan | grep "#%ebp %edi %ebx 8(%esi) %eax %dl"
-; RUN: llc < %s -march=x86 -regalloc=fast       | grep "#%ebx %esi %edi 8(%ebp) %eax %dl"
-; RUN: llc < %s -march=x86 -regalloc=basic      | grep "#%ebp %esi %edx 8(%edi) %eax %bl"
-; RUN: llc < %s -march=x86 -regalloc=greedy     | grep "#%edx %edi %ebp 8(%esi) %eax %bl"
+; RUN: llc < %s -march=x86 -regalloc=linearscan | FileCheck %s
+; RUN: llc < %s -march=x86 -regalloc=fast       | FileCheck %s
+; RUN: llc < %s -march=x86 -regalloc=basic      | FileCheck %s
+; RUN: llc < %s -march=x86 -regalloc=greedy     | FileCheck %s
 
-; The 1st, 2nd, 3rd and 5th registers above must all be different.  The registers
+; The 1st, 2nd, 3rd and 5th registers must all be different.  The registers
 ; referenced in the 4th and 6th operands must not be the same as the 1st or 5th
-; operand.  There are many combinations that work; this is what llc puts out now.
-; ModuleID = '<stdin>'
+; operand.
+;
+; CHECK: 1st=[[A1:%...]]
+; CHECK-NOT: [[A1]]
+; CHECK: 2nd=[[A2:%...]]
+; CHECK-NOT: [[A1]]
+; CHECK-NOT: [[A2]]
+; CHECK: 3rd=[[A3:%...]]
+; CHECK-NOT: [[A1]]
+; CHECK-NOT: [[A2]]
+; CHECK-NOT: [[A3]]
+; CHECK: 5th=[[A5:%...]]
+; CHECK-NOT: [[A1]]
+; CHECK-NOT; [[A5]]
+; CHECK: =4th
+
+; The 6th operand is an 8-bit register, and it mustn't alias the 1st and 5th.
+; CHECK: 1%e[[S1:.]]x
+; CHECK: 5%e[[S5:.]]x
+; CHECK-NOT: %[[S1]]
+; CHECK-NOT: %[[S5]]
+
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i386-apple-darwin8"
 	%struct.foo = type { i32, i32, i8* }
@@ -19,7 +39,7 @@ entry:
 	%3 = load i32* %0, align 4		; <i32> [#uses=1]
 	%4 = load i32* %1, align 4		; <i32> [#uses=1]
 	%5 = load i8* %state, align 1		; <i8> [#uses=1]
-	%asmtmp = tail call { i32, i32, i32, i32 } asm sideeffect "#$0 $1 $2 $3 $4 $5", "=&r,=r,=r,=*m,=&q,=*imr,1,2,*m,5,~{dirflag},~{fpsr},~{flags},~{cx}"(i8** %2, i8* %state, i32 %3, i32 %4, i8** %2, i8 %5) nounwind		; <{ i32, i32, i32, i32 }> [#uses=3]
+	%asmtmp = tail call { i32, i32, i32, i32 } asm sideeffect "#1st=$0 $1 2nd=$1 $2 3rd=$2 $4 5th=$4 $3=4th 1$0 1%eXx 5$4 5%eXx 6th=$5", "=&r,=r,=r,=*m,=&q,=*imr,1,2,*m,5,~{dirflag},~{fpsr},~{flags},~{cx}"(i8** %2, i8* %state, i32 %3, i32 %4, i8** %2, i8 %5) nounwind		; <{ i32, i32, i32, i32 }> [#uses=3]
 	%asmresult = extractvalue { i32, i32, i32, i32 } %asmtmp, 0		; <i32> [#uses=1]
 	%asmresult1 = extractvalue { i32, i32, i32, i32 } %asmtmp, 1		; <i32> [#uses=1]
 	store i32 %asmresult1, i32* %0
diff --git a/test/CodeGen/X86/2008-12-19-EarlyClobberBug.ll b/test/CodeGen/X86/2008-12-19-EarlyClobberBug.ll
index 5eba9b9..75e0b8a 100644
--- a/test/CodeGen/X86/2008-12-19-EarlyClobberBug.ll
+++ b/test/CodeGen/X86/2008-12-19-EarlyClobberBug.ll
@@ -4,7 +4,7 @@
 
 ; CHECK:         ## InlineAsm End
 ; CHECK-NEXT: BB0_2:
-; CHECK-NEXT:    movl	%esi, %eax
+; CHECK-NEXT:    {{movl	%esi, %eax|addl	%edi, %esi}}
 
 
 @"\01LC" = internal constant [7 x i8] c"n0=%d\0A\00"		; <[7 x i8]*> [#uses=1]
diff --git a/test/CodeGen/X86/2009-03-13-PHIElimBug.ll b/test/CodeGen/X86/2009-03-13-PHIElimBug.ll
index 2853930..45fc269 100644
--- a/test/CodeGen/X86/2009-03-13-PHIElimBug.ll
+++ b/test/CodeGen/X86/2009-03-13-PHIElimBug.ll
@@ -28,5 +28,5 @@ lpad:		; preds = %cont, %entry
 }
 
 ; CHECK: call{{.*}}f
-; CHECK-NEXT: Ltmp0:
-; CHECK-NEXT: movl %eax, %esi
+; CHECK: movl %eax, %esi
+; CHECK: call{{.*}}g
diff --git a/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll b/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
index 848af82..2fceab6 100644
--- a/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
+++ b/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
@@ -1,8 +1,10 @@
-; RUN: llc -march=x86-64 -O2 < %s | FileCheck %s
-; RUN: llc -march=x86-64 -O2 -regalloc=basic < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-pc-linux -O2 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-pc-linux -O2 -regalloc=basic < %s | FileCheck %s
 ; Test to check .debug_loc support. This test case emits many debug_loc entries.
 
 ; CHECK: Loc expr size
+; CHECK-NEXT: .short
+; CHECK-NEXT: .Ltmp
 ; CHECK-NEXT: DW_OP_reg
 
 %0 = type { double }
diff --git a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
index 6600cc3..7909d27 100644
--- a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
+++ b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
@@ -68,9 +68,15 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 ; CHECK: Ldebug_loc0:
 ; CHECK-NEXT: .quad   Lfunc_begin0
 ; CHECK-NEXT: .quad   [[LABEL]]
-; CHECK-NEXT: .short  1
+; CHECK-NEXT: Lset{{.*}} = Ltmp{{.*}}-Ltmp{{.*}}               ## Loc expr size
+; CHECK-NEXT: .short  Lset{{.*}}
+; CHECK-NEXT: Ltmp{{.*}}:
 ; CHECK-NEXT: .byte   85
+; CHECK-NEXT: Ltmp{{.*}}:
 ; CHECK-NEXT: .quad   [[LABEL]]
 ; CHECK-NEXT: .quad   [[CLOBBER]]
-; CHECK-NEXT: .short  1
+; CHECK-NEXT: Lset{{.*}} = Ltmp{{.*}}-Ltmp{{.*}}               ## Loc expr size
+; CHECK-NEXT: .short  Lset{{.*}}
+; CHECK-NEXT: Ltmp{{.*}}:
 ; CHECK-NEXT: .byte   83
+; CHECK-NEXT: Ltmp{{.*}}:
+\ No newline at end of file
diff --git a/test/CodeGen/X86/2010-06-25-CoalescerSubRegDefDead.ll b/test/CodeGen/X86/2010-06-25-CoalescerSubRegDefDead.ll
index 6db3ce1..bb1db59 100644
--- a/test/CodeGen/X86/2010-06-25-CoalescerSubRegDefDead.ll
+++ b/test/CodeGen/X86/2010-06-25-CoalescerSubRegDefDead.ll
@@ -22,7 +22,7 @@ bb:
 ; it is.
 ;
 ; CHECK: # %bb
-; CHECK: addq $64036, %rdi
+; CHECK: leaq	64036(%rdx), %rdi
 ; CHECK: rep;stosl
 
   %tmp5 = bitcast i32* %tmp4 to i8*
diff --git a/test/CodeGen/Generic/2011-02-12-shuffle.ll b/test/CodeGen/X86/2011-02-12-shuffle.ll
index b4d56d1..b4d56d1 100644
--- a/test/CodeGen/Generic/2011-02-12-shuffle.ll
+++ b/test/CodeGen/X86/2011-02-12-shuffle.ll
diff --git a/test/CodeGen/X86/2011-05-09-loaduse.ll b/test/CodeGen/X86/2011-05-09-loaduse.ll
new file mode 100644
index 0000000..8673d74
--- /dev/null
+++ b/test/CodeGen/X86/2011-05-09-loaduse.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s
+
+;CHECK: test
+;CHECK-not: pshufd
+;CHECK: ret
+define float @test(<4 x float>* %A) nounwind {
+entry:
+  %T = load <4 x float>* %A
+  %R = extractelement <4 x float> %T, i32 3
+  store <4 x float><float 0.0, float 0.0, float 0.0, float 0.0>, <4 x float>* %A
+  ret float %R
+}
+
diff --git a/test/CodeGen/X86/2011-05-26-UnreachableBlockElim.ll b/test/CodeGen/X86/2011-05-26-UnreachableBlockElim.ll
new file mode 100644
index 0000000..0f18f09
--- /dev/null
+++ b/test/CodeGen/X86/2011-05-26-UnreachableBlockElim.ll
@@ -0,0 +1,53 @@
+; RUN: llc < %s -verify-coalescing
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-macosx10.6.0"
+
+%struct.attrib = type { i32, i32 }
+%struct.dfa = type { [80 x i8], i32, %struct.state*, i32, i32, %struct.attrib*, i32, i32 }
+%struct.state = type { i32, [4 x i32] }
+
+@aux_temp = external global %struct.dfa, align 8
+
+declare i64 @llvm.objectsize.i64(i8*, i1) nounwind readnone
+
+declare void @__memset_chk() nounwind
+
+define void @dfa_add_string() nounwind uwtable ssp {
+entry:
+  br label %if.end.i
+
+if.end.i:                                         ; preds = %entry
+  %idxprom.i = add i64 0, 1
+  br i1 undef, label %land.end.thread.i, label %land.end.i
+
+land.end.thread.i:                                ; preds = %if.end.i
+  %0 = call i64 @llvm.objectsize.i64(i8* undef, i1 false) nounwind
+  %cmp1710.i = icmp eq i64 %0, -1
+  br i1 %cmp1710.i, label %cond.false156.i, label %cond.true138.i
+
+land.end.i:                                       ; preds = %if.end.i
+  %1 = call i64 @llvm.objectsize.i64(i8* undef, i1 false) nounwind
+  %cmp17.i = icmp eq i64 %1, -1
+  br i1 %cmp17.i, label %cond.false156.i, label %cond.true138.i
+
+cond.true138.i:                                   ; preds = %for.end.i, %land.end.thread.i
+  call void @__memset_chk() nounwind
+  br label %cond.end166.i
+
+cond.false156.i:                                  ; preds = %for.end.i, %land.end.thread.i
+  %idxprom1114.i = phi i64 [ undef, %land.end.thread.i ], [ %idxprom.i, %land.end.i ]
+  call void @__memset_chk() nounwind
+  br label %cond.end166.i
+
+cond.end166.i:                                    ; preds = %cond.false156.i, %cond.true138.i
+  %idxprom1113.i = phi i64 [ %idxprom1114.i, %cond.false156.i ], [ undef, %cond.true138.i ]
+  %tmp235.i = load %struct.state** getelementptr inbounds (%struct.dfa* @aux_temp, i64 0, i32 2), align 8, !tbaa !0
+  %att.i = getelementptr inbounds %struct.state* %tmp235.i, i64 %idxprom1113.i, i32 0
+  store i32 0, i32* %att.i, align 4, !tbaa !3
+  ret void
+}
+
+!0 = metadata !{metadata !"any pointer", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
+!3 = metadata !{metadata !"int", metadata !1}
diff --git a/test/CodeGen/X86/2011-05-27-CrossClassCoalescing.ll b/test/CodeGen/X86/2011-05-27-CrossClassCoalescing.ll
new file mode 100644
index 0000000..c595bba
--- /dev/null
+++ b/test/CodeGen/X86/2011-05-27-CrossClassCoalescing.ll
@@ -0,0 +1,41 @@
+; RUN: llc < %s -verify-coalescing
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-macosx10.6.0"
+
+@bit_count = external constant [256 x i32], align 16
+
+define fastcc void @unate_intersect() nounwind uwtable ssp {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.inc.i
+  br label %do.body.i
+
+do.body.i:                                        ; preds = %do.body.i, %for.body
+  %exitcond149 = icmp eq i64 undef, undef
+  br i1 %exitcond149, label %land.lhs.true, label %do.body.i
+
+land.lhs.true:                                    ; preds = %do.body.i
+  br label %for.body.i
+
+for.body.i:                                       ; preds = %for.inc.i, %if.then
+  %tmp3524.i = phi i32 [ 0, %land.lhs.true ], [ %tmp351.i, %for.inc.i ]
+  %tmp6.i12 = load i32* undef, align 4
+  br i1 undef, label %for.inc.i, label %if.then.i17
+
+if.then.i17:                                      ; preds = %for.body.i
+  %shr.i14 = lshr i32 %tmp6.i12, 8
+  %and14.i = and i32 %shr.i14, 255
+  %idxprom15.i = zext i32 %and14.i to i64
+  %arrayidx16.i = getelementptr inbounds [256 x i32]* @bit_count, i64 0, i64 %idxprom15.i
+  %tmp17.i15 = load i32* %arrayidx16.i, align 4
+  %add.i = add i32 0, %tmp3524.i
+  %add24.i = add i32 %add.i, %tmp17.i15
+  %add31.i = add i32 %add24.i, 0
+  %add33.i = add i32 %add31.i, 0
+  br label %for.inc.i
+
+for.inc.i:                                        ; preds = %if.then.i17, %for.body.i
+  %tmp351.i = phi i32 [ %add33.i, %if.then.i17 ], [ %tmp3524.i, %for.body.i ]
+  br label %for.body.i
+}
diff --git a/test/CodeGen/X86/2011-05-31-movmsk.ll b/test/CodeGen/X86/2011-05-31-movmsk.ll
new file mode 100644
index 0000000..2b54d5c
--- /dev/null
+++ b/test/CodeGen/X86/2011-05-31-movmsk.ll
@@ -0,0 +1,79 @@
+; RUN: llc -mcpu=core2 < %s | FileCheck %s
+; ModuleID = '<stdin>'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-macosx10.6.6"
+
+%0 = type { double }
+%union.anon = type { float }
+
+define i32 @double_signbit(double %d1) nounwind uwtable readnone ssp {
+entry:
+  %__x.addr.i = alloca double, align 8
+  %__u.i = alloca %0, align 8
+  %0 = bitcast double* %__x.addr.i to i8*
+  %1 = bitcast %0* %__u.i to i8*
+  store double %d1, double* %__x.addr.i, align 8
+  %__f.i = getelementptr inbounds %0* %__u.i, i64 0, i32 0
+  store double %d1, double* %__f.i, align 8
+  %tmp = bitcast double %d1 to i64
+; CHECK-NOT: shr
+; CHECK: movmskpd
+; CHECK-NEXT: and
+  %tmp1 = lshr i64 %tmp, 63
+  %shr.i = trunc i64 %tmp1 to i32
+  ret i32 %shr.i
+}
+
+define i32 @double_add_signbit(double %d1, double %d2) nounwind uwtable readnone ssp {
+entry:
+  %__x.addr.i = alloca double, align 8
+  %__u.i = alloca %0, align 8
+  %add = fadd double %d1, %d2
+  %0 = bitcast double* %__x.addr.i to i8*
+  %1 = bitcast %0* %__u.i to i8*
+  store double %add, double* %__x.addr.i, align 8
+  %__f.i = getelementptr inbounds %0* %__u.i, i64 0, i32 0
+  store double %add, double* %__f.i, align 8
+  %tmp = bitcast double %add to i64
+; CHECK-NOT: shr
+; CHECK: movmskpd
+; CHECK-NEXT: and
+  %tmp1 = lshr i64 %tmp, 63
+  %shr.i = trunc i64 %tmp1 to i32
+  ret i32 %shr.i
+}
+
+define i32 @float_signbit(float %f1) nounwind uwtable readnone ssp {
+entry:
+  %__x.addr.i = alloca float, align 4
+  %__u.i = alloca %union.anon, align 4
+  %0 = bitcast float* %__x.addr.i to i8*
+  %1 = bitcast %union.anon* %__u.i to i8*
+  store float %f1, float* %__x.addr.i, align 4
+  %__f.i = getelementptr inbounds %union.anon* %__u.i, i64 0, i32 0
+  store float %f1, float* %__f.i, align 4
+  %2 = bitcast float %f1 to i32
+; CHECK-NOT: shr
+; CHECK: movmskps
+; CHECK-NEXT: and
+  %shr.i = lshr i32 %2, 31
+  ret i32 %shr.i
+}
+
+define i32 @float_add_signbit(float %f1, float %f2) nounwind uwtable readnone ssp {
+entry:
+  %__x.addr.i = alloca float, align 4
+  %__u.i = alloca %union.anon, align 4
+  %add = fadd float %f1, %f2
+  %0 = bitcast float* %__x.addr.i to i8*
+  %1 = bitcast %union.anon* %__u.i to i8*
+  store float %add, float* %__x.addr.i, align 4
+  %__f.i = getelementptr inbounds %union.anon* %__u.i, i64 0, i32 0
+  store float %add, float* %__f.i, align 4
+  %2 = bitcast float %add to i32
+; CHECK-NOT: shr
+; CHECK: movmskps
+; CHECK-NEXT: and
+  %shr.i = lshr i32 %2, 31
+  ret i32 %shr.i
+}
diff --git a/test/CodeGen/X86/2011-06-01-fildll.ll b/test/CodeGen/X86/2011-06-01-fildll.ll
new file mode 100644
index 0000000..3a0b05f
--- /dev/null
+++ b/test/CodeGen/X86/2011-06-01-fildll.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -march=x86 | FileCheck %s
+; ModuleID = '<stdin>'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
+target triple = "i386-apple-macosx10.6.6"
+
+define float @f(i64* nocapture %x) nounwind readonly ssp {
+entry:
+; CHECK: movl
+; CHECK-NOT: movl
+  %tmp1 = load i64* %x, align 4
+; CHECK: fildll
+  %conv = sitofp i64 %tmp1 to float
+  %add = fadd float %conv, 1.000000e+00
+  ret float %add
+}
diff --git a/test/CodeGen/X86/2011-06-03-x87chain.ll b/test/CodeGen/X86/2011-06-03-x87chain.ll
new file mode 100644
index 0000000..bf7f583
--- /dev/null
+++ b/test/CodeGen/X86/2011-06-03-x87chain.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -march=x86 -mattr=+sse | FileCheck %s
+
+define float @chainfail1(i64* nocapture %a, i64* nocapture %b, i32 %x, i32 %y, float* nocapture %f) nounwind uwtable noinline ssp {
+entry:
+  %tmp1 = load i64* %a, align 8
+; Insure x87 ops are properly chained, order preserved.
+; CHECK: fildll
+  %conv = sitofp i64 %tmp1 to float
+; CHECK: fstps
+  store float %conv, float* %f, align 4
+; CHECK: idivl
+  %div = sdiv i32 %x, %y
+  %conv5 = sext i32 %div to i64
+  store i64 %conv5, i64* %b, align 8
+  ret float %conv
+}
+
+define float @chainfail2(i64* nocapture %a, i64* nocapture %b, i32 %x, i32 %y, float* nocapture %f) nounwind uwtable noinline ssp {
+entry:
+; CHECK: movl $0,
+  store i64 0, i64* %b, align 8
+  %mul = mul nsw i32 %y, %x
+  %sub = add nsw i32 %mul, -1
+  %idxprom = sext i32 %sub to i64
+  %arrayidx = getelementptr inbounds i64* %a, i64 %idxprom
+  %tmp4 = load i64* %arrayidx, align 8
+; CHECK: fildll
+  %conv = sitofp i64 %tmp4 to float
+  store float %conv, float* %f, align 4
+  ret float %conv
+}
diff --git a/test/CodeGen/X86/2011-06-06-fgetsign80bit.ll b/test/CodeGen/X86/2011-06-06-fgetsign80bit.ll
new file mode 100644
index 0000000..d934148
--- /dev/null
+++ b/test/CodeGen/X86/2011-06-06-fgetsign80bit.ll
@@ -0,0 +1,8 @@
+; RUN: llc -march=x86-64 < %s
+define i32 @signbitl(x86_fp80 %x) nounwind uwtable readnone {
+entry:
+  %tmp4 = bitcast x86_fp80 %x to i80
+  %tmp4.lobit = lshr i80 %tmp4, 79
+  %tmp = trunc i80 %tmp4.lobit to i32
+  ret i32 %tmp
+}
diff --git a/test/CodeGen/X86/2011-06-19-QuicksortCoalescerBug.ll b/test/CodeGen/X86/2011-06-19-QuicksortCoalescerBug.ll
new file mode 100644
index 0000000..08178a3
--- /dev/null
+++ b/test/CodeGen/X86/2011-06-19-QuicksortCoalescerBug.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -verify-coalescing
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-macosx10.7.0"
+
+define void @Quicksort(i32* %a, i32 %l, i32 %r) nounwind ssp {
+entry:
+  br label %tailrecurse
+
+tailrecurse:                                      ; preds = %do.cond, %entry
+  %l.tr = phi i32 [ %l, %entry ], [ %i.1, %do.cond ]
+  %r.tr = phi i32 [ %r, %entry ], [ %l.tr, %do.cond ]
+  %idxprom12 = sext i32 %r.tr to i64
+  %arrayidx14 = getelementptr inbounds i32* %a, i64 %idxprom12
+  br label %do.body
+
+do.body:                                          ; preds = %do.cond, %tailrecurse
+  %i.0 = phi i32 [ %l.tr, %tailrecurse ], [ %i.1, %do.cond ]
+  %add7 = add nsw i32 %i.0, 1
+  %cmp = icmp sgt i32 %add7, %r.tr
+  br i1 %cmp, label %do.cond, label %if.then
+
+if.then:                                          ; preds = %do.body
+  store i32 %add7, i32* %arrayidx14, align 4
+  %add16 = add i32 %i.0, 2
+  br label %do.cond
+
+do.cond:                                          ; preds = %do.body, %if.then
+  %i.1 = phi i32 [ %add16, %if.then ], [ %add7, %do.body ]
+  %cmp19 = icmp sgt i32 %i.1, %r.tr
+  br i1 %cmp19, label %tailrecurse, label %do.body
+}
diff --git a/test/CodeGen/X86/9601.ll b/test/CodeGen/X86/9601.ll
new file mode 100644
index 0000000..cd65a03
--- /dev/null
+++ b/test/CodeGen/X86/9601.ll
@@ -0,0 +1,12 @@
+; RUN:  llc < %s -mtriple=x86_64-unknown-linux-gnu
+; PR9601
+; Previously we'd crash trying to put a 32-bit float into a constraint
+; for a normal 'r' register.
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @test() {
+entry:
+  %0 = call float asm sideeffect "xchg $0, $1", "=r,*m,0,~{memory},~{dirflag},~{fpsr},~{flags}"(i32* undef, float 2.000000e+00) nounwind
+  unreachable
+}
diff --git a/test/CodeGen/X86/abi-isel.ll b/test/CodeGen/X86/abi-isel.ll
index 7535e07..5068d29 100644
--- a/test/CodeGen/X86/abi-isel.ll
+++ b/test/CodeGen/X86/abi-isel.ll
@@ -12,17 +12,6 @@
 ; RUN: llc < %s -asm-verbose=0 -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=dynamic-no-pic -code-model=small | FileCheck %s -check-prefix=DARWIN-64-DYNAMIC
 ; RUN: llc < %s -asm-verbose=0 -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=pic -code-model=small | FileCheck %s -check-prefix=DARWIN-64-PIC
 
-; RUN: llc < %s -asm-verbose=0 -regalloc=basic -mtriple=i686-unknown-linux-gnu -march=x86 -relocation-model=static -code-model=small | FileCheck %s -check-prefix=LINUX-32-STATIC
-; RUN: llc < %s -asm-verbose=0 -regalloc=basic -mtriple=i686-unknown-linux-gnu -march=x86 -relocation-model=static -code-model=small | FileCheck %s -check-prefix=LINUX-32-PIC
-; RUN: llc < %s -asm-verbose=0 -regalloc=basic -mtriple=x86_64-unknown-linux-gnu -march=x86-64 -relocation-model=static -code-model=small | FileCheck %s -check-prefix=LINUX-64-STATIC
-; RUN: llc < %s -asm-verbose=0 -regalloc=basic -mtriple=x86_64-unknown-linux-gnu -march=x86-64 -relocation-model=pic -code-model=small | FileCheck %s -check-prefix=LINUX-64-PIC
-; RUN: llc < %s -asm-verbose=0 -regalloc=basic -mtriple=i686-apple-darwin -march=x86 -relocation-model=static -code-model=small | FileCheck %s -check-prefix=DARWIN-32-STATIC
-; RUN: llc < %s -asm-verbose=0 -regalloc=basic -mtriple=i686-apple-darwin -march=x86 -relocation-model=dynamic-no-pic -code-model=small | FileCheck %s -check-prefix=DARWIN-32-DYNAMIC
-; RUN: llc < %s -asm-verbose=0 -regalloc=basic -mtriple=i686-apple-darwin -march=x86 -relocation-model=pic -code-model=small | FileCheck %s -check-prefix=DARWIN-32-PIC
-; RUN: llc < %s -asm-verbose=0 -regalloc=basic -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=static -code-model=small | FileCheck %s -check-prefix=DARWIN-64-STATIC
-; RUN: llc < %s -asm-verbose=0 -regalloc=basic -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=dynamic-no-pic -code-model=small | FileCheck %s -check-prefix=DARWIN-64-DYNAMIC
-; RUN: llc < %s -asm-verbose=0 -regalloc=basic -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=pic -code-model=small | FileCheck %s -check-prefix=DARWIN-64-PIC
-
 @src = external global [131072 x i32]
 @dst = external global [131072 x i32]
 @xsrc = external global [32 x i32]
diff --git a/test/CodeGen/X86/add-of-carry.ll b/test/CodeGen/X86/add-of-carry.ll
index f924ec8..a4abccb 100644
--- a/test/CodeGen/X86/add-of-carry.ll
+++ b/test/CodeGen/X86/add-of-carry.ll
@@ -4,9 +4,9 @@
 define i32 @test1(i32 %sum, i32 %x) nounwind readnone ssp {
 entry:
 ; CHECK: test1:
-; CHECK:	sbbl	%ecx, %ecx
+; CHECK: cmpl %ecx, %eax
 ; CHECK-NOT: addl
-; CHECK: subl	%ecx, %eax
+; CHECK: adcl $0, %eax
   %add4 = add i32 %x, %sum
   %cmp = icmp ult i32 %add4, %x
   %inc = zext i1 %cmp to i32
@@ -18,8 +18,7 @@ entry:
 ; CHECK: test2:
 ; CHECK: movl
 ; CHECK-NEXT: addl
-; CHECK-NEXT: sbbl
-; CHECK-NEXT: subl
+; CHECK-NEXT: adcl $0
 ; CHECK-NEXT: ret
 define i32 @test2(i32 %sum, i32 %x) nounwind readnone ssp {
 entry:
diff --git a/test/CodeGen/X86/add.ll b/test/CodeGen/X86/add.ll
index b95e5b5..7bf527a 100644
--- a/test/CodeGen/X86/add.ll
+++ b/test/CodeGen/X86/add.ll
@@ -1,6 +1,8 @@
 ; RUN: llc < %s -march=x86 | FileCheck %s -check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefix=X64
-; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-linux -join-physregs | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-win32 -join-physregs | FileCheck %s -check-prefix=X64
+
+; Some of these tests depend on -join-physregs to commute instructions.
 
 ; The immediate can be encoded in a smaller way if the
 ; instruction is a sub instead of an add.
diff --git a/test/CodeGen/X86/andimm8.ll b/test/CodeGen/X86/andimm8.ll
index 640237d..a3dc85f 100644
--- a/test/CodeGen/X86/andimm8.ll
+++ b/test/CodeGen/X86/andimm8.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-pc-linux-gnu -show-mc-encoding | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-pc-linux-gnu -show-mc-encoding -join-physregs | FileCheck %s
 
 ; PR8365
 ; CHECK: andl	$-64, %edi              # encoding: [0x83,0xe7,0xc0]
diff --git a/test/CodeGen/X86/basic-promote-integers.ll b/test/CodeGen/X86/basic-promote-integers.ll
new file mode 100644
index 0000000..c80f2b0
--- /dev/null
+++ b/test/CodeGen/X86/basic-promote-integers.ll
@@ -0,0 +1,98 @@
+; Test that vectors are scalarized/lowered correctly
+; (with both legalization methods).
+; RUN: llc -march=x86 -promote-elements < %s
+; RUN: llc -march=x86                   < %s
+
+; A simple test to check copyToParts and copyFromParts.
+
+define <4 x i64> @test_param_0(<4 x i64> %A, <2 x i32> %B, <4 x i8> %C)  {
+   ret <4 x i64> %A
+}
+
+define <2 x i32> @test_param_1(<4 x i64> %A, <2 x i32> %B, <4 x i8> %C)  {
+   ret <2 x i32> %B
+}
+
+define <4 x i8> @test_param_2(<4 x i64> %A, <2 x i32> %B, <4 x i8> %C)  {
+   ret <4 x i8> %C
+}
+
+; Simple tests to check arithmetic and vector operations on types which need to
+; be legalized (no loads/stores to/from memory here).
+
+define <4 x i64> @test_arith_0(<4 x i64> %A, <2 x i32> %B, <4 x i8> %C)  {
+   %K = add <4 x i64> %A, <i64 0, i64 1, i64 3, i64 9>
+   ret <4 x i64> %K
+}
+
+define <2 x i32> @test_arith_1(<4 x i64> %A, <2 x i32> %B, <4 x i8> %C)  {
+   %K = add <2 x i32> %B, <i32 0, i32 1>
+   ret <2 x i32> %K
+}
+
+define <4 x i8> @test_arith_2(<4 x i64> %A, <2 x i32> %B, <4 x i8> %C)  {
+   %K = add <4 x i8> %C, <i8 0, i8 1, i8 3, i8 9>
+   ret <4 x i8> %K
+}
+
+define i8 @test_arith_3(<4 x i64> %A, <2 x i32> %B, <4 x i8> %C)  {
+   %K = add <4 x i8> %C, <i8 0, i8 1, i8 3, i8 9>
+   %Y = extractelement <4 x i8> %K, i32 1
+   ret i8 %Y
+}
+
+define <4 x i8> @test_arith_4(<4 x i64> %A, <2 x i32> %B, <4 x i8> %C)  {
+   %Y = insertelement <4 x i8> %C, i8 1, i32 0
+   ret <4 x i8> %Y
+}
+
+define <4 x i32> @test_arith_5(<4 x i64> %A, <2 x i32> %B, <4 x i32> %C)  {
+   %Y = insertelement <4 x i32> %C, i32 1, i32 0
+   ret <4 x i32> %Y
+}
+
+define <4 x i32> @test_arith_6(<4 x i64> %A, <2 x i32> %B, <4 x i32> %C)  {
+   %F = extractelement <2 x i32> %B, i32 1
+   %Y = insertelement <4 x i32> %C, i32 %F, i32 0
+   ret <4 x i32> %Y
+}
+
+define <4 x i64> @test_arith_7(<4 x i64> %A, <2 x i32> %B, <4 x i32> %C)  {
+   %F = extractelement <2 x i32> %B, i32 1
+   %W = zext i32 %F to i64
+   %Y = insertelement <4 x i64> %A, i64 %W, i32 0
+   ret <4 x i64> %Y
+}
+
+define i64 @test_arith_8(<4 x i64> %A, <2 x i32> %B, <4 x i32> %C)  {
+   %F = extractelement <2 x i32> %B, i32 1
+   %W = zext i32 %F to i64
+   %T = add i64 %W , 11
+   ret i64 %T
+}
+
+define <4 x i64> @test_arith_9(<4 x i64> %A, <2 x i32> %B, <4 x i16> %C)  {
+   %T = add <4 x i16> %C, %C
+   %F0 = extractelement <4 x i16> %T, i32 0
+   %F1 = extractelement <4 x i16> %T, i32 1
+   %W0 = zext i16 %F0 to i64
+   %W1 = zext i16 %F1 to i64
+   %Y0 = insertelement <4 x i64> %A,  i64 %W0, i32 0
+   %Y1 = insertelement <4 x i64> %Y0, i64 %W1, i32 2
+   ret <4 x i64> %Y1
+}
+
+define <4 x i16> @test_arith_10(<4 x i64> %A, <2 x i32> %B, <4 x i32> %C)  {
+   %F = bitcast <2 x i32> %B to <4 x i16>
+   %T = add <4 x i16> %F , <i16 0, i16 1, i16 2, i16 3>
+   ret <4 x i16> %T
+}
+
+
+; Simple tests to check saving/loading from memory
+define <4 x i16> @test_mem_0(<4 x i64> %A, <2 x i32> %B, <4 x i32> %C)  {
+   %F = bitcast <2 x i32> %B to <4 x i16>
+   %T = add <4 x i16> %F , <i16 0, i16 1, i16 2, i16 3>
+   ret <4 x i16> %T
+}
+
diff --git a/test/CodeGen/X86/bool-zext.ll b/test/CodeGen/X86/bool-zext.ll
index d2c30c6..3558376 100644
--- a/test/CodeGen/X86/bool-zext.ll
+++ b/test/CodeGen/X86/bool-zext.ll
@@ -1,8 +1,12 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-pc-win32 | FileCheck %s -check-prefix=WIN64
 
-; CHECK: @bar1
-; CHECK: movzbl
-; CHECK: callq
+; X64: @bar1
+; X64: movzbl
+; X64: jmp
+; WIN64: @bar1
+; WIN64: movzbl
+; WIN64: callq
 define void @bar1(i1 zeroext %v1) nounwind ssp {
 entry:
   %conv = zext i1 %v1 to i32
@@ -10,9 +14,12 @@ entry:
   ret void
 }
 
-; CHECK: @bar2
-; CHECK-NOT: movzbl
-; CHECK: callq
+; X64: @bar2
+; X64-NOT: movzbl
+; X64: jmp
+; WIN64: @bar2
+; WIN64-NOT: movzbl
+; WIN64: callq
 define void @bar2(i8 zeroext %v1) nounwind ssp {
 entry:
   %conv = zext i8 %v1 to i32
@@ -20,11 +27,16 @@ entry:
   ret void
 }
 
-; CHECK: @bar3
-; CHECK: callq
-; CHECK-NOT: movzbl
-; CHECK-NOT: and
-; CHECK: ret
+; X64: @bar3
+; X64: callq
+; X64-NOT: movzbl
+; X64-NOT: and
+; X64: ret
+; WIN64: @bar3
+; WIN64: callq
+; WIN64-NOT: movzbl
+; WIN64-NOT: and
+; WIN64: ret
 define zeroext i1 @bar3() nounwind ssp {
 entry:
   %call = call i1 @foo2() nounwind
diff --git a/test/CodeGen/X86/byval-align.ll b/test/CodeGen/X86/byval-align.ll
new file mode 100644
index 0000000..c62a181
--- /dev/null
+++ b/test/CodeGen/X86/byval-align.ll
@@ -0,0 +1,59 @@
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
+%struct.S = type { i32}
+
+@.str = private constant [10 x i8] c"ptr = %p\0A\00", align 1 ; <[10 x i8]*> [#uses=1]
+@.str1 = private constant [8 x i8] c"Failed \00", align 1 ; <[8 x i8]*> [#uses=1]
+@.str2 = private constant [2 x i8] c"0\00", align 1 ; <[2 x i8]*> [#uses=1]
+@.str3 = private constant [7 x i8] c"test.c\00", align 1 ; <[7 x i8]*> [#uses=1]
+@__PRETTY_FUNCTION__.2067 = internal constant [13 x i8] c"aligned_func\00" ; <[13 x i8]*> [#uses=1]
+
+define void @aligned_func(%struct.S* byval align 64 %obj) nounwind {
+entry:
+  %ptr = alloca i8*                               ; <i8**> [#uses=3]
+  %p = alloca i64                                 ; <i64*> [#uses=3]
+  %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
+  %obj1 = bitcast %struct.S* %obj to i8*          ; <i8*> [#uses=1]
+  store i8* %obj1, i8** %ptr, align 8
+  %0 = load i8** %ptr, align 8                    ; <i8*> [#uses=1]
+  %1 = ptrtoint i8* %0 to i64                     ; <i64> [#uses=1]
+  store i64 %1, i64* %p, align 8
+  %2 = load i8** %ptr, align 8                    ; <i8*> [#uses=1]
+  %3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([10 x i8]* @.str, i64 0, i64 0), i8* %2) nounwind ; <i32> [#uses=0]
+  %4 = load i64* %p, align 8                      ; <i64> [#uses=1]
+  %5 = and i64 %4, 140737488355264                ; <i64> [#uses=1]
+  %6 = load i64* %p, align 8                      ; <i64> [#uses=1]
+  %7 = icmp ne i64 %5, %6                         ; <i1> [#uses=1]
+  br i1 %7, label %bb, label %bb2
+
+bb:                                               ; preds = %entry
+  %8 = call i32 @puts(i8* getelementptr inbounds ([8 x i8]* @.str1, i64 0, i64 0)) nounwind ; <i32> [#uses=0]
+  call void @__assert_fail(i8* getelementptr inbounds ([2 x i8]* @.str2, i64 0, i64 0), i8* getelementptr inbounds ([7 x i8]* @.str3, i64 0, i64 0), i32 18, i8* getelementptr inbounds ([13 x i8]* @__PRETTY_FUNCTION__.2067, i64 0, i64 0)) noreturn nounwind
+  unreachable
+
+bb2:                                              ; preds = %entry
+  br label %return
+
+return:                                           ; preds = %bb2
+  ret void
+}
+
+declare i32 @printf(i8*, ...) nounwind
+
+declare i32 @puts(i8*)
+
+declare void @__assert_fail(i8*, i8*, i32, i8*) noreturn nounwind
+
+define void @main() nounwind {
+entry:
+; CHECK: main
+; CHECK: andq    $-64, %rsp
+  %s1 = alloca %struct.S                          ; <%struct.S*> [#uses=4]
+  %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
+  %0 = getelementptr inbounds %struct.S* %s1, i32 0, i32 0 ; <i32*> [#uses=1]
+  store i32 1, i32* %0, align 4
+  call void @aligned_func(%struct.S* byval align 64 %s1) nounwind
+  br label %return
+
+return:                                           ; preds = %entry
+  ret void
+}
diff --git a/test/CodeGen/X86/byval7.ll b/test/CodeGen/X86/byval7.ll
index 686ed9c..98a26e4 100644
--- a/test/CodeGen/X86/byval7.ll
+++ b/test/CodeGen/X86/byval7.ll
@@ -9,7 +9,6 @@ entry:
 ; CHECK: main:
 ; CHECK: movl $1, (%esp)
 ; CHECK: leal 16(%esp), %edi
-; CHECK: movl $36, %ecx
 ; CHECK: leal 160(%esp), %esi
 ; CHECK: rep;movsl
 	%s = alloca %struct.S		; <%struct.S*> [#uses=2]
diff --git a/test/CodeGen/X86/clz.ll b/test/CodeGen/X86/clz.ll
index 623ac75..d76fab4 100644
--- a/test/CodeGen/X86/clz.ll
+++ b/test/CodeGen/X86/clz.ll
@@ -31,3 +31,18 @@ entry:
 }
 
 declare i16 @llvm.ctlz.i16(i16) nounwind readnone 
+
+; Don't generate the cmovne when the source is known non-zero (and bsr would
+; not set ZF).
+; rdar://9490949
+
+define i32 @t4(i32 %n) nounwind {
+entry:
+; CHECK: t4:
+; CHECK: bsrl
+; CHECK-NOT: cmov
+; CHECK: ret
+  %or = or i32 %n, 1
+  %tmp1 = tail call i32 @llvm.ctlz.i32(i32 %or)
+  ret i32 %tmp1
+}
diff --git a/test/CodeGen/X86/coalescer-commute2.ll b/test/CodeGen/X86/coalescer-commute2.ll
index 7306920..6e5c1cf 100644
--- a/test/CodeGen/X86/coalescer-commute2.ll
+++ b/test/CodeGen/X86/coalescer-commute2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -join-physregs | FileCheck %s
 ; CHECK-NOT:     mov
 ; CHECK:     paddw
 ; CHECK-NOT:     mov
diff --git a/test/CodeGen/X86/dbg-const-int.ll b/test/CodeGen/X86/dbg-const-int.ll
new file mode 100644
index 0000000..bfc96f1
--- /dev/null
+++ b/test/CodeGen/X86/dbg-const-int.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s - | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-macosx10.6.7"
+; Radar 9511391
+
+;CHECK:         .byte   4                       ## DW_AT_const_value
+define i32 @foo() nounwind uwtable readnone optsize ssp {
+entry:
+  tail call void @llvm.dbg.value(metadata !8, i64 0, metadata !6), !dbg !9
+  ret i32 42, !dbg !10
+}
+
+declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+
+!llvm.dbg.cu = !{!0}
+!llvm.dbg.sp = !{!1}
+!llvm.dbg.lv.foo = !{!6}
+
+!0 = metadata !{i32 589841, i32 0, i32 12, metadata !"a.c", metadata !"/private/tmp", metadata !"clang version 3.0 (trunk 132191)", i1 true, i1 true, metadata !"", i32 0} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{i32 589870, i32 0, metadata !2, metadata !"foo", metadata !"foo", metadata !"", metadata !2, i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 0, i1 true, i32 ()* @foo, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{i32 589865, metadata !"a.c", metadata !"/private/tmp", metadata !0} ; [ DW_TAG_file_type ]
+!3 = metadata !{i32 589845, metadata !2, metadata !"", metadata !2, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 589860, metadata !0, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!6 = metadata !{i32 590080, metadata !7, metadata !"i", metadata !2, i32 2, metadata !5, i32 0} ; [ DW_TAG_auto_variable ]
+!7 = metadata !{i32 589835, metadata !1, i32 1, i32 11, metadata !2, i32 0} ; [ DW_TAG_lexical_block ]
+!8 = metadata !{i32 42}
+!9 = metadata !{i32 2, i32 12, metadata !7, null}
+!10 = metadata !{i32 3, i32 2, metadata !7, null}
diff --git a/test/CodeGen/X86/dbg-const.ll b/test/CodeGen/X86/dbg-const.ll
new file mode 100644
index 0000000..5a51eb8
--- /dev/null
+++ b/test/CodeGen/X86/dbg-const.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s - | FileCheck %s
+target triple = "x86_64-apple-darwin10.0.0"
+
+;CHECK:        ## DW_OP_constu
+;CHECK-NEXT:  .byte	42
+define i32 @foobar() nounwind readonly noinline ssp {
+entry:
+  %call = tail call i32 @bar(), !dbg !11
+  tail call void @llvm.dbg.value(metadata !8, i64 0, metadata !6), !dbg !9
+  %call2 = tail call i32 @bar(), !dbg !11
+  tail call void @llvm.dbg.value(metadata !{i32 %call}, i64 0, metadata !6), !dbg !11
+  %add = add nsw i32 %call2, %call, !dbg !12
+  ret i32 %add, !dbg !10
+}
+
+declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare i32 @bar() nounwind readnone
+
+!llvm.dbg.sp = !{!0}
+!llvm.dbg.lv.foobar = !{!6}
+
+!0 = metadata !{i32 524334, i32 0, metadata !1, metadata !"foobar", metadata !"foobar", metadata !"foobar", metadata !1, i32 12, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i32 ()* @foobar}
+!1 = metadata !{i32 524329, metadata !"mu.c", metadata !"/private/tmp", metadata !2}
+!2 = metadata !{i32 524305, i32 0, i32 12, metadata !"mu.c", metadata !"/private/tmp", metadata !"clang version 2.9 (trunk 114183)", i1 true, i1 true, metadata !"", i32 0}
+!3 = metadata !{i32 524309, metadata !1, metadata !"", metadata !1, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null}
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 524324, metadata !1, metadata !"int", metadata !1, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5}
+!6 = metadata !{i32 524544, metadata !7, metadata !"j", metadata !1, i32 15, metadata !5}
+!7 = metadata !{i32 524299, metadata !0, i32 12, i32 52, metadata !1, i32 0}
+!8 = metadata !{i32 42}
+!9 = metadata !{i32 15, i32 12, metadata !7, null}
+!10 = metadata !{i32 23, i32 3, metadata !7, null}
+!11 = metadata !{i32 17, i32 3, metadata !7, null}
+!12 = metadata !{i32 18, i32 3, metadata !7, null}
diff --git a/test/CodeGen/X86/dbg-file-name.ll b/test/CodeGen/X86/dbg-file-name.ll
index e7d5f92..3a849aa 100644
--- a/test/CodeGen/X86/dbg-file-name.ll
+++ b/test/CodeGen/X86/dbg-file-name.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -mtriple x86_64-apple-darwin10.0.0  < %s | FileCheck %s
 
 ; Radar 8884898
-; CHECK: file	1 "/Users/manav/one/two/simple.c"
+; CHECK: file	1 "/Users/manav/one/two{{/|\\\\}}simple.c"
 
 declare i32 @printf(i8*, ...) nounwind
 
diff --git a/test/CodeGen/X86/dbg-merge-loc-entry.ll b/test/CodeGen/X86/dbg-merge-loc-entry.ll
index 76b93dd..afe1729 100644
--- a/test/CodeGen/X86/dbg-merge-loc-entry.ll
+++ b/test/CodeGen/X86/dbg-merge-loc-entry.ll
@@ -6,8 +6,11 @@ target triple = "x86_64-apple-darwin8"
 ;CHECK: Ldebug_loc0:
 ;CHECK-NEXT:	.quad	Lfunc_begin0
 ;CHECK-NEXT:	.quad	L
-;CHECK-NEXT:	.short	1                       ## Loc expr size
+;CHECK-NEXT: Lset{{.*}} = Ltmp{{.*}}-Ltmp{{.*}}          ## Loc expr size
+;CHECK-NEXT:    .short  Lset
+;CHECK-NEXT: Ltmp
 ;CHECK-NEXT:	.byte	85                      ## DW_OP_reg5
+;CHECK-NEXT: Ltmp7
 ;CHECK-NEXT:	.quad	0
 ;CHECK-NEXT:	.quad	0
 
diff --git a/test/CodeGen/X86/dbg-prolog-end.ll b/test/CodeGen/X86/dbg-prolog-end.ll
new file mode 100644
index 0000000..81303bb
--- /dev/null
+++ b/test/CodeGen/X86/dbg-prolog-end.ll
@@ -0,0 +1,55 @@
+; RUN: llc -O0 < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-macosx10.6.7"
+
+;CHECK: .loc	1 2 11 prologue_end
+define i32 @foo(i32 %i) nounwind ssp {
+entry:
+  %i.addr = alloca i32, align 4
+  %j = alloca i32, align 4
+  store i32 %i, i32* %i.addr, align 4
+  call void @llvm.dbg.declare(metadata !{i32* %i.addr}, metadata !7), !dbg !8
+  call void @llvm.dbg.declare(metadata !{i32* %j}, metadata !9), !dbg !11
+  store i32 2, i32* %j, align 4, !dbg !12
+  %tmp = load i32* %j, align 4, !dbg !13
+  %inc = add nsw i32 %tmp, 1, !dbg !13
+  store i32 %inc, i32* %j, align 4, !dbg !13
+  %tmp1 = load i32* %j, align 4, !dbg !14
+  %tmp2 = load i32* %i.addr, align 4, !dbg !14
+  %add = add nsw i32 %tmp1, %tmp2, !dbg !14
+  store i32 %add, i32* %j, align 4, !dbg !14
+  %tmp3 = load i32* %j, align 4, !dbg !15
+  ret i32 %tmp3, !dbg !15
+}
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+define i32 @main() nounwind ssp {
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval
+  %call = call i32 @foo(i32 21), !dbg !16
+  ret i32 %call, !dbg !16
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.dbg.sp = !{!1, !6}
+
+!0 = metadata !{i32 589841, i32 0, i32 12, metadata !"/tmp/a.c", metadata !"/private/tmp", metadata !"clang version 3.0 (trunk 131100)", i1 true, i1 false, metadata !"", i32 0} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{i32 589870, i32 0, metadata !2, metadata !"foo", metadata !"foo", metadata !"", metadata !2, i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, i32 (i32)* @foo, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{i32 589865, metadata !"/tmp/a.c", metadata !"/private/tmp", metadata !0} ; [ DW_TAG_file_type ]
+!3 = metadata !{i32 589845, metadata !2, metadata !"", metadata !2, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 589860, metadata !0, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!6 = metadata !{i32 589870, i32 0, metadata !2, metadata !"main", metadata !"main", metadata !"", metadata !2, i32 7, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 0, i1 false, i32 ()* @main, null, null} ; [ DW_TAG_subprogram ]
+!7 = metadata !{i32 590081, metadata !1, metadata !"i", metadata !2, i32 16777217, metadata !5, i32 0} ; [ DW_TAG_arg_variable ]
+!8 = metadata !{i32 1, i32 13, metadata !1, null}
+!9 = metadata !{i32 590080, metadata !10, metadata !"j", metadata !2, i32 2, metadata !5, i32 0} ; [ DW_TAG_auto_variable ]
+!10 = metadata !{i32 589835, metadata !1, i32 1, i32 16, metadata !2, i32 0} ; [ DW_TAG_lexical_block ]
+!11 = metadata !{i32 2, i32 6, metadata !10, null}
+!12 = metadata !{i32 2, i32 11, metadata !10, null}
+!13 = metadata !{i32 3, i32 2, metadata !10, null}
+!14 = metadata !{i32 4, i32 2, metadata !10, null}
+!15 = metadata !{i32 5, i32 2, metadata !10, null}
+!16 = metadata !{i32 8, i32 2, metadata !17, null}
+!17 = metadata !{i32 589835, metadata !6, i32 7, i32 12, metadata !2, i32 1} ; [ DW_TAG_lexical_block ]
diff --git a/test/CodeGen/X86/dbg-value-dag-combine.ll b/test/CodeGen/X86/dbg-value-dag-combine.ll
new file mode 100644
index 0000000..b115bf4
--- /dev/null
+++ b/test/CodeGen/X86/dbg-value-dag-combine.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin10.0.0"
+; PR 9817
+
+
+declare  <4 x i32> @__amdil_get_global_id_int()
+declare  void @llvm.dbg.value(metadata , i64 , metadata )
+define void @__OpenCL_test_kernel(i32 addrspace(1)* %ip) nounwind {
+entry:
+  call void @llvm.dbg.value(metadata !{i32 addrspace(1)* %ip}, i64 0, metadata
+!7), !dbg !8
+  %0 = call <4 x i32> @__amdil_get_global_id_int() nounwind
+  %1 = extractelement <4 x i32> %0, i32 0
+  call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !9), !dbg !11
+  call void @llvm.dbg.value(metadata !12, i64 0, metadata !13), !dbg !14
+  %tmp2 = load i32 addrspace(1)* %ip, align 4, !dbg !15
+  %tmp3 = add i32 0, %tmp2, !dbg !15
+; CHECK:  ##DEBUG_VALUE: idx <- EAX+0
+  call void @llvm.dbg.value(metadata !{i32 %tmp3}, i64 0, metadata !13), !dbg
+!15
+  %arrayidx = getelementptr i32 addrspace(1)* %ip, i32 %1, !dbg !16
+  store i32 %tmp3, i32 addrspace(1)* %arrayidx, align 4, !dbg !16
+  ret void, !dbg !17
+}
+!llvm.dbg.sp = !{!0}
+
+!0 = metadata !{i32 589870, i32 0, metadata !1, metadata
+!"__OpenCL_test_kernel", metadata !"__OpenCL_test_kernel", metadata
+!"__OpenCL_test_kernel", metadata !1, i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 0, i1 false, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 589865, metadata !"OCL6368.tmp.cl", metadata !"E:\5CUsers\5Cmvillmow.AMD\5CAppData\5CLocal\5CTemp", metadata !2} ; [ DW_TAG_file_type ]
+!2 = metadata !{i32 589841, i32 0, i32 1, metadata !"OCL6368.tmp.cl", metadata !"E:\5CUsers\5Cmvillmow.AMD\5CAppData\5CLocal\5CTemp", metadata !"clc", i1 true, i1 false, metadata !"", i32 0} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{i32 589845, metadata !1, metadata !"", metadata !1, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{null, metadata !5}
+!5 = metadata !{i32 589839, metadata !2, metadata !"", null, i32 0, i64 32, i64 32, i64 0, i32 0, metadata !6} ; [ DW_TAG_pointer_type ]
+!6 = metadata !{i32 589860, metadata !2, metadata !"unsigned int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
+!7 = metadata !{i32 590081, metadata !0, metadata !"ip", metadata !1, i32 1, metadata !5, i32 0} ; [ DW_TAG_arg_variable ]
+!8 = metadata !{i32 1, i32 42, metadata !0, null}
+!9 = metadata !{i32 590080, metadata !10, metadata !"gid", metadata !1, i32 3, metadata !6, i32 0} ; [ DW_TAG_auto_variable ]
+!10 = metadata !{i32 589835, metadata !0, i32 2, i32 1, metadata !1, i32 0} ; [ DW_TAG_lexical_block ]
+!11 = metadata !{i32 3, i32 41, metadata !10, null}
+!12 = metadata !{i32 0}
+!13 = metadata !{i32 590080, metadata !10, metadata !"idx", metadata !1, i32 4, metadata !6, i32 0} ; [ DW_TAG_auto_variable ]
+!14 = metadata !{i32 4, i32 20, metadata !10, null}
+!15 = metadata !{i32 5, i32 15, metadata !10, null}
+!16 = metadata !{i32 6, i32 18, metadata !10, null}
+!17 = metadata !{i32 7, i32 1, metadata !0, null}
+
diff --git a/test/CodeGen/X86/dbg-value-isel.ll b/test/CodeGen/X86/dbg-value-isel.ll
new file mode 100644
index 0000000..d1a9e57
--- /dev/null
+++ b/test/CodeGen/X86/dbg-value-isel.ll
@@ -0,0 +1,102 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin10.0.0"
+; PR 9879
+
+; CHECK: ##DEBUG_VALUE: tid <-
+%0 = type { i8*, i8*, i8*, i8*, i32 }
+
+@sgv = internal addrspace(2) constant [1 x i8] zeroinitializer
+@fgv = internal addrspace(2) constant [1 x i8] zeroinitializer
+@lvgv = internal constant [0 x i8*] zeroinitializer
+@llvm.global.annotations = appending global [1 x %0] [%0 { i8* bitcast (void (i32 addrspace(1)*)* @__OpenCL_nbt02_kernel to i8*), i8* bitcast ([1 x i8] addrspace(2)* @sgv to i8*), i8* bitcast ([1 x i8] addrspace(2)* @fgv to i8*), i8* bitcast ([0 x i8*]* @lvgv to i8*), i32 0 }], section "llvm.metadata"
+
+define void @__OpenCL_nbt02_kernel(i32 addrspace(1)* %ip) nounwind {
+entry:
+  call void @llvm.dbg.value(metadata !{i32 addrspace(1)* %ip}, i64 0, metadata !8), !dbg !9
+  %0 = call <4 x i32> @__amdil_get_local_id_int() nounwind
+  %1 = extractelement <4 x i32> %0, i32 0
+  br label %2
+
+; <label>:2                                       ; preds = %entry
+  %3 = phi i32 [ %1, %entry ]
+  br label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = phi i32 [ %3, %2 ]
+  br label %get_local_id.exit
+
+get_local_id.exit:                                ; preds = %4
+  %6 = phi i32 [ %5, %4 ]
+  call void @llvm.dbg.value(metadata !{i32 %6}, i64 0, metadata !10), !dbg !12
+  %7 = call <4 x i32> @__amdil_get_global_id_int() nounwind
+  %8 = extractelement <4 x i32> %7, i32 0
+  br label %9
+
+; <label>:9                                       ; preds = %get_local_id.exit
+  %10 = phi i32 [ %8, %get_local_id.exit ]
+  br label %11
+
+; <label>:11                                      ; preds = %9
+  %12 = phi i32 [ %10, %9 ]
+  br label %get_global_id.exit
+
+get_global_id.exit:                               ; preds = %11
+  %13 = phi i32 [ %12, %11 ]
+  call void @llvm.dbg.value(metadata !{i32 %13}, i64 0, metadata !13), !dbg !14
+  %14 = call <4 x i32> @__amdil_get_local_size_int() nounwind
+  %15 = extractelement <4 x i32> %14, i32 0
+  br label %16
+
+; <label>:16                                      ; preds = %get_global_id.exit
+  %17 = phi i32 [ %15, %get_global_id.exit ]
+  br label %18
+
+; <label>:18                                      ; preds = %16
+  %19 = phi i32 [ %17, %16 ]
+  br label %get_local_size.exit
+
+get_local_size.exit:                              ; preds = %18
+  %20 = phi i32 [ %19, %18 ]
+  call void @llvm.dbg.value(metadata !{i32 %20}, i64 0, metadata !15), !dbg !16
+  %tmp5 = add i32 %6, %13, !dbg !17
+  %tmp7 = add i32 %tmp5, %20, !dbg !17
+  store i32 %tmp7, i32 addrspace(1)* %ip, align 4, !dbg !17
+  br label %return, !dbg !17
+
+return:                                           ; preds = %get_local_size.exit
+  ret void, !dbg !18
+}
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+declare <4 x i32> @__amdil_get_local_size_int() nounwind
+
+declare <4 x i32> @__amdil_get_local_id_int() nounwind
+
+declare <4 x i32> @__amdil_get_global_id_int() nounwind
+
+declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+
+!llvm.dbg.sp = !{!0}
+
+!0 = metadata !{i32 589870, i32 0, metadata !1, metadata !"__OpenCL_nbt02_kernel", metadata !"__OpenCL_nbt02_kernel", metadata !"__OpenCL_nbt02_kernel", metadata !1, i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 0, i1 false, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 589865, metadata !"OCLlLwTXZ.cl", metadata !"/tmp", metadata !2} ; [ DW_TAG_file_type ]
+!2 = metadata !{i32 589841, i32 0, i32 1, metadata !"OCLlLwTXZ.cl", metadata !"/tmp", metadata !"clc", i1 true, i1 false, metadata !"", i32 0} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{i32 589845, metadata !1, metadata !"", metadata !1, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{null, metadata !5}
+!5 = metadata !{i32 589839, metadata !2, metadata !"", null, i32 0, i64 32, i64 32, i64 0, i32 0, metadata !6} ; [ DW_TAG_pointer_type ]
+!6 = metadata !{i32 589846, metadata !2, metadata !"uint", metadata !1, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !7} ; [ DW_TAG_typedef ]
+!7 = metadata !{i32 589860, metadata !2, metadata !"unsigned int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
+!8 = metadata !{i32 590081, metadata !0, metadata !"ip", metadata !1, i32 1, metadata !5, i32 0} ; [ DW_TAG_arg_variable ]
+!9 = metadata !{i32 1, i32 32, metadata !0, null}
+!10 = metadata !{i32 590080, metadata !11, metadata !"tid", metadata !1, i32 3, metadata !6, i32 0} ; [ DW_TAG_auto_variable ]
+!11 = metadata !{i32 589835, metadata !0, i32 2, i32 1, metadata !1, i32 1} ; [ DW_TAG_lexical_block ]
+!12 = metadata !{i32 5, i32 24, metadata !11, null}
+!13 = metadata !{i32 590080, metadata !11, metadata !"gid", metadata !1, i32 3, metadata !6, i32 0} ; [ DW_TAG_auto_variable ]
+!14 = metadata !{i32 6, i32 25, metadata !11, null}
+!15 = metadata !{i32 590080, metadata !11, metadata !"lsz", metadata !1, i32 3, metadata !6, i32 0} ; [ DW_TAG_auto_variable ]
+!16 = metadata !{i32 7, i32 26, metadata !11, null}
+!17 = metadata !{i32 9, i32 24, metadata !11, null}
+!18 = metadata !{i32 10, i32 1, metadata !0, null}
+
diff --git a/test/CodeGen/X86/dbg-value-range.ll b/test/CodeGen/X86/dbg-value-range.ll
index da49f2d..28d873b 100644
--- a/test/CodeGen/X86/dbg-value-range.ll
+++ b/test/CodeGen/X86/dbg-value-range.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -mtriple=x86_64-apple-darwin10 < %s | FileCheck %s
-; RUN: llc -mtriple=x86_64-apple-darwin10 -regalloc=basic < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-darwin10 -regalloc=basic -join-physregs < %s | FileCheck %s
 
 %struct.a = type { i32 }
 
@@ -53,7 +53,10 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 ;CHECK:Ldebug_loc0:
 ;CHECK-NEXT:	.quad
 ;CHECK-NEXT:	.quad	[[CLOBBER]]
-;CHECK-NEXT:	.short	1
+;CHECK-NEXT: Lset{{.*}} = Ltmp{{.*}}-Ltmp{{.*}}
+;CHECK-NEXT:    .short  Lset
+;CHECK-NEXT: Ltmp
 ;CHECK-NEXT:	.byte	85
+;CHECK-NEXT: Ltmp
 ;CHECK-NEXT:	.quad	0
 ;CHECK-NEXT:	.quad	0
diff --git a/test/CodeGen/X86/div8.ll b/test/CodeGen/X86/div8.ll
new file mode 100644
index 0000000..0825f79
--- /dev/null
+++ b/test/CodeGen/X86/div8.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s | FileCheck %s
+; ModuleID = '8div.c'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-macosx10.6.6"
+
+define signext i8 @test_div(i8 %dividend, i8 %divisor) nounwind ssp {
+entry:
+  %dividend.addr = alloca i8, align 2
+  %divisor.addr = alloca i8, align 1
+  %quotient = alloca i8, align 1
+  store i8 %dividend, i8* %dividend.addr, align 2
+  store i8 %divisor, i8* %divisor.addr, align 1
+  %tmp = load i8* %dividend.addr, align 2
+  %tmp1 = load i8* %divisor.addr, align 1
+; Insist on i8->i32 zero extension, even though divb demands only i16:
+; CHECK: movzbl {{.*}}%eax
+; CHECK: divb
+  %div = udiv i8 %tmp, %tmp1
+  store i8 %div, i8* %quotient, align 1
+  %tmp4 = load i8* %quotient, align 1
+  ret i8 %tmp4
+}
diff --git a/test/CodeGen/X86/eh_frame.ll b/test/CodeGen/X86/eh_frame.ll
new file mode 100644
index 0000000..3b792b2
--- /dev/null
+++ b/test/CodeGen/X86/eh_frame.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -mtriple x86_64-unknown-linux-gnu | FileCheck -check-prefix=STATIC %s
+; RUN: llc < %s -mtriple x86_64-unknown-linux-gnu -relocation-model=pic | FileCheck -check-prefix=PIC %s
+
+@__FRAME_END__ = constant [1 x i32] zeroinitializer, section ".eh_frame"
+
+@foo = external global i32
+@bar1 = constant i8* bitcast (i32* @foo to i8*), section "my_bar1", align 8
+
+
+; STATIC: .section	.eh_frame,"a",@progbits
+; STATIC: .section	my_bar1,"a",@progbits
+
+; PIC:	.section	.eh_frame,"a",@progbits
+; PIC:	.section	my_bar1,"aw",@progbits
diff --git a/test/CodeGen/X86/empty-functions.ll b/test/CodeGen/X86/empty-functions.ll
index b303cd1..874c53a 100644
--- a/test/CodeGen/X86/empty-functions.ll
+++ b/test/CodeGen/X86/empty-functions.ll
@@ -6,10 +6,24 @@ entry:
   unreachable
 }
 ; CHECK-NO-FP:     _func:
-; CHECK-NO-FP-NOT: movq %rsp, %rbp
+; CHECK-NO-FP-NEXT: :
+; CHECK-NO-FP-NEXT: .cfi_startproc
 ; CHECK-NO-FP:     nop
+; CHECK-NO-FP-NEXT: :
+; CHECK-NO-FP-NEXT: .cfi_endproc
 
 ; CHECK-FP:      _func:
-; CHECK-FP:      movq %rsp, %rbp
-; CHECK-FP-NEXT: Ltmp1:
-; CHECK-FP:      nop
+; CHECK-FP-NEXT: :
+; CHECK-FP-NEXT: .cfi_startproc
+; CHECK-FP-NEXT: :
+; CHECK-FP-NEXT: pushq %rbp
+; CHECK-FP-NEXT: :
+; CHECK-FP-NEXT: .cfi_def_cfa_offset 16
+; CHECK-FP-NEXT: :
+; CHECK-FP-NEXT: .cfi_offset %rbp, -16
+; CHECK-FP-NEXT: movq %rsp, %rbp
+; CHECK-FP-NEXT: :
+; CHECK-FP-NEXT: .cfi_def_cfa_register %rbp
+; CHECK-FP-NEXT: nop
+; CHECK-FP-NEXT: :
+; CHECK-FP-NEXT: .cfi_endproc
diff --git a/test/CodeGen/X86/fast-isel-agg-constant.ll b/test/CodeGen/X86/fast-isel-agg-constant.ll
new file mode 100644
index 0000000..ce0dff7
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-agg-constant.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -march=x86-64 -O0 | FileCheck %s
+; Make sure fast-isel doesn't screw up aggregate constants.
+; (Failing out is okay, as long as we don't miscompile.)
+
+%bar = type { i32 }
+
+define i32 @foo()  {
+  %tmp = extractvalue %bar { i32 3 }, 0
+  ret i32 %tmp
+; CHECK: movl $3, %eax
+}
diff --git a/test/CodeGen/X86/fast-isel-call.ll b/test/CodeGen/X86/fast-isel-call.ll
index 5fcdbbb..2fbe4e2 100644
--- a/test/CodeGen/X86/fast-isel-call.ll
+++ b/test/CodeGen/X86/fast-isel-call.ll
@@ -1,6 +1,8 @@
-; RUN: llc < %s -fast-isel -march=x86 | grep and
+; RUN: llc < %s -O0 -fast-isel-abort -march=x86 | FileCheck %s
 
-define i32 @t() nounwind {
+%struct.s = type {i32, i32, i32}
+
+define i32 @test1() nounwind {
 tak:
 	%tmp = call i1 @foo()
 	br i1 %tmp, label %BB1, label %BB2
@@ -8,6 +10,46 @@ BB1:
 	ret i32 1
 BB2:
 	ret i32 0
+; CHECK: test1:
+; CHECK: calll
+; CHECK-NEXT: testb	$1
 }
-
 declare i1 @foo() zeroext nounwind
+
+declare void @foo2(%struct.s* byval)
+
+define void @test2(%struct.s* %d) nounwind {
+  call void @foo2(%struct.s* %d byval)
+  ret void
+; CHECK: test2:
+; CHECK: movl	(%eax)
+; CHECK: movl {{.*}}, (%esp)
+; CHECK: movl	4(%eax)
+; CHECK: movl {{.*}}, 4(%esp)
+; CHECK: movl	8(%eax)
+; CHECK: movl {{.*}}, 8(%esp)
+}
+
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
+
+define void @test3(i8* %a) {
+  call void @llvm.memset.p0i8.i32(i8* %a, i8 0, i32 100, i32 1, i1 false)
+  ret void
+; CHECK: test3:
+; CHECK:   movl	{{.*}}, (%esp)
+; CHECK:   movl	$0, 4(%esp)
+; CHECK:   movl	$100, 8(%esp)
+; CHECK:   calll {{.*}}memset
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+
+define void @test4(i8* %a, i8* %b) {
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a, i8* %b, i32 100, i32 1, i1 false)
+  ret void
+; CHECK: test4:
+; CHECK:   movl	{{.*}}, (%esp)
+; CHECK:   movl	{{.*}}, 4(%esp)
+; CHECK:   movl	$100, 8(%esp)
+; CHECK:   calll {{.*}}memcpy
+}
diff --git a/test/CodeGen/X86/fast-isel-extract.ll b/test/CodeGen/X86/fast-isel-extract.ll
new file mode 100644
index 0000000..f63396e
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-extract.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s -mtriple x86_64-apple-darwin11 -O0 -fast-isel-abort | FileCheck %s
+
+%struct.x = type { i64, i64 }
+%addovf = type { i32, i1 }
+declare %struct.x @f()
+
+define void @test1(i64*) nounwind ssp {
+  %2 = tail call %struct.x @f() nounwind
+  %3 = extractvalue %struct.x %2, 0
+  %4 = add i64 %3, 10
+  store i64 %4, i64* %0
+  ret void
+; CHECK: test1:
+; CHECK: callq _f
+; CHECK-NEXT: addq	$10, %rax
+}
+
+define void @test2(i64*) nounwind ssp {
+  %2 = tail call %struct.x @f() nounwind
+  %3 = extractvalue %struct.x %2, 1
+  %4 = add i64 %3, 10
+  store i64 %4, i64* %0
+  ret void
+; CHECK: test2:
+; CHECK: callq _f
+; CHECK-NEXT: addq	$10, %rdx
+}
+
+declare %addovf @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
+
+define void @test3(i32 %x, i32 %y, i32* %z) {
+  %r = call %addovf @llvm.sadd.with.overflow.i32(i32 %x, i32 %y)
+  %sum = extractvalue %addovf %r, 0
+  %sum3 = mul i32 %sum, 3
+  %bit = extractvalue %addovf %r, 1
+  br i1 %bit, label %then, label %end
+  
+then:
+  store i32 %sum3, i32* %z
+  br label %end
+
+end:
+  ret void
+; CHECK: test3
+; CHECK: addl
+; CHECK: seto %al
+; CHECK: testb $1, %al
+}
diff --git a/test/CodeGen/X86/fast-isel-fneg.ll b/test/CodeGen/X86/fast-isel-fneg.ll
index 5ffd48b..f42a4a2 100644
--- a/test/CodeGen/X86/fast-isel-fneg.ll
+++ b/test/CodeGen/X86/fast-isel-fneg.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -fast-isel -fast-isel-abort -march=x86-64 | FileCheck %s
+; RUN: llc < %s -fast-isel -fast-isel-abort -mtriple=x86_64-apple-darwin10 | FileCheck %s
 ; RUN: llc < %s -fast-isel -march=x86 -mattr=+sse2 | grep xor | count 2
 
 ; CHECK: doo:
diff --git a/test/CodeGen/X86/fast-isel-i1.ll b/test/CodeGen/X86/fast-isel-i1.ll
index 5d572c1..bea18a1 100644
--- a/test/CodeGen/X86/fast-isel-i1.ll
+++ b/test/CodeGen/X86/fast-isel-i1.ll
@@ -1,14 +1,15 @@
-; RUN: llc < %s -march=x86 -fast-isel | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-darwin10 -fast-isel -fast-isel-abort | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -fast-isel -fast-isel-abort | FileCheck %s
 
-declare i64 @test1a(i64)
+declare i32 @test1a(i32)
 
-define i32 @test1(i64 %x) nounwind {
+define i32 @test1(i32 %x) nounwind {
 ; CHECK: test1:
 ; CHECK: andb $1, %
-	%y = add i64 %x, -3
-	%t = call i64 @test1a(i64 %y)
-	%s = mul i64 %t, 77
-	%z = trunc i64 %s to i1
+	%y = add i32 %x, -3
+	%t = call i32 @test1a(i32 %y)
+	%s = mul i32 %t, 77
+	%z = trunc i32 %s to i1
 	br label %next
 
 next:		; preds = %0
diff --git a/test/CodeGen/X86/fast-isel-ret-ext.ll b/test/CodeGen/X86/fast-isel-ret-ext.ll
new file mode 100644
index 0000000..fd768cb
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-ret-ext.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s  -O0 -fast-isel-abort -mtriple i686-apple-darwin10 | FileCheck %s
+; RUN: llc < %s  -O0 -fast-isel-abort -mtriple x86_64-apple-darwin10 | FileCheck %s
+
+define zeroext i8 @test1(i32 %y) nounwind {
+  %conv = trunc i32 %y to i8
+  ret i8 %conv
+  ; CHECK: test1:
+  ; CHECK: movzbl {{.*}}, %eax
+}
+
+define signext i8 @test2(i32 %y) nounwind {
+  %conv = trunc i32 %y to i8
+  ret i8 %conv
+  ; CHECK: test2:
+  ; CHECK: movsbl {{.*}}, %eax
+}
+
+define zeroext i16 @test3(i32 %y) nounwind {
+  %conv = trunc i32 %y to i16
+  ret i16 %conv
+  ; CHECK: test3:
+  ; CHECK: movzwl {{.*}}, %eax
+}
+
+define signext i16 @test4(i32 %y) nounwind {
+  %conv = trunc i32 %y to i16
+  ret i16 %conv
+  ; CHECK: test4:
+  ; CHECK: movswl {{.*}}, %eax
+}
+
+define zeroext i1 @test5(i32 %y) nounwind {
+  %conv = trunc i32 %y to i1
+  ret i1 %conv
+  ; CHECK: test5:
+  ; CHECK: andb $1
+  ; CHECK: movzbl {{.*}}, %eax
+}
diff --git a/test/CodeGen/X86/fast-isel.ll b/test/CodeGen/X86/fast-isel.ll
index 5a1d213..8391860 100644
--- a/test/CodeGen/X86/fast-isel.ll
+++ b/test/CodeGen/X86/fast-isel.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -fast-isel -fast-isel-abort -march=x86 -mattr=sse2
-; RUN: llc < %s -fast-isel -fast-isel-abort -march=x86-64
+; RUN: llc < %s -fast-isel -fast-isel-abort -mtriple=x86_64-apple-darwin10
 
 ; This tests very minimal fast-isel functionality.
 
@@ -27,7 +27,7 @@ exit:
   ret i32* %t8
 }
 
-define double @bar(double* %p, double* %q) nounwind {
+define void @bar(double* %p, double* %q) nounwind {
 entry:
   %r = load double* %p
   %s = load double* %q
@@ -41,7 +41,8 @@ fast:
   br label %exit
 
 exit:
-  ret double %t3
+  store double %t3, double* %q
+  ret void
 }
 
 define i32 @cast() nounwind {
@@ -68,24 +69,28 @@ define i8* @inttoptr_i32(i32 %p) nounwind {
   ret i8* %t
 }
 
-define i8 @trunc_i32_i8(i32 %x) signext nounwind  {
+define void @trunc_i32_i8(i32 %x, i8* %p) nounwind  {
 	%tmp1 = trunc i32 %x to i8
-	ret i8 %tmp1
+	store i8 %tmp1, i8* %p
+	ret void
 }
 
-define i8 @trunc_i16_i8(i16 signext %x) signext nounwind  {
+define void @trunc_i16_i8(i16 signext %x, i8* %p) nounwind  {
 	%tmp1 = trunc i16 %x to i8
-	ret i8 %tmp1
+	store i8 %tmp1, i8* %p
+	ret void
 }
 
-define i8 @shl_i8(i8 %a, i8 %c) nounwind {
-       %tmp = shl i8 %a, %c
-       ret i8 %tmp
+define void @shl_i8(i8 %a, i8 %c, i8* %p) nounwind {
+  %tmp = shl i8 %a, %c
+  store i8 %tmp, i8* %p
+  ret void
 }
 
-define i8 @mul_i8(i8 %a) nounwind {
-       %tmp = mul i8 %a, 17
-       ret i8 %tmp
+define void @mul_i8(i8 %a, i8* %p) nounwind {
+  %tmp = mul i8 %a, 17
+  store i8 %tmp, i8* %p
+  ret void
 }
 
 define void @load_store_i1(i1* %p, i1* %q) nounwind {
diff --git a/test/CodeGen/X86/fold-xmm-zero.ll b/test/CodeGen/X86/fold-xmm-zero.ll
new file mode 100644
index 0000000..b4eeb40
--- /dev/null
+++ b/test/CodeGen/X86/fold-xmm-zero.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -mtriple=i386-apple-macosx10.6.7 -mattr=+sse2 | FileCheck %s
+
+; Simple test to make sure folding for special constants (like float zero)
+; isn't completely broken.
+
+; CHECK: divss	LCPI0
+
+%0 = type { float, float, float, float, float, float, float, float }
+
+define void @f() nounwind ssp {
+entry:
+  %0 = tail call %0 asm sideeffect "foo", "={xmm0},={xmm1},={xmm2},={xmm3},={xmm4},={xmm5},={xmm6},={xmm7},0,1,2,3,4,5,6,7,~{dirflag},~{fpsr},~{flags}"(float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00) nounwind
+  %asmresult = extractvalue %0 %0, 0
+  %asmresult8 = extractvalue %0 %0, 1
+  %asmresult9 = extractvalue %0 %0, 2
+  %asmresult10 = extractvalue %0 %0, 3
+  %asmresult11 = extractvalue %0 %0, 4
+  %asmresult12 = extractvalue %0 %0, 5
+  %asmresult13 = extractvalue %0 %0, 6
+  %asmresult14 = extractvalue %0 %0, 7
+  %div = fdiv float %asmresult, 0.000000e+00
+  %1 = tail call %0 asm sideeffect "bar", "={xmm0},={xmm1},={xmm2},={xmm3},={xmm4},={xmm5},={xmm6},={xmm7},0,1,2,3,4,5,6,7,~{dirflag},~{fpsr},~{flags}"(float %div, float %asmresult8, float %asmresult9, float %asmresult10, float %asmresult11, float %asmresult12, float %asmresult13, float %asmresult14) nounwind
+  %asmresult24 = extractvalue %0 %1, 0
+  %asmresult25 = extractvalue %0 %1, 1
+  %asmresult26 = extractvalue %0 %1, 2
+  %asmresult27 = extractvalue %0 %1, 3
+  %asmresult28 = extractvalue %0 %1, 4
+  %asmresult29 = extractvalue %0 %1, 5
+  %asmresult30 = extractvalue %0 %1, 6
+  %asmresult31 = extractvalue %0 %1, 7
+  %div33 = fdiv float %asmresult24, 0.000000e+00
+  %2 = tail call %0 asm sideeffect "baz", "={xmm0},={xmm1},={xmm2},={xmm3},={xmm4},={xmm5},={xmm6},={xmm7},0,1,2,3,4,5,6,7,~{dirflag},~{fpsr},~{flags}"(float %div33, float %asmresult25, float %asmresult26, float %asmresult27, float %asmresult28, float %asmresult29, float %asmresult30, float %asmresult31) nounwind
+  ret void
+}
diff --git a/test/CodeGen/X86/hidden-vis-pic.ll b/test/CodeGen/X86/hidden-vis-pic.ll
index 217dba6..67be3d0 100644
--- a/test/CodeGen/X86/hidden-vis-pic.ll
+++ b/test/CodeGen/X86/hidden-vis-pic.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -disable-cfi -mtriple=i386-apple-darwin9 -relocation-model=pic -disable-fp-elim -unwind-tables | FileCheck %s
+; RUN: llc < %s -disable-cfi -mtriple=i386-apple-darwin9 -relocation-model=pic -disable-fp-elim | FileCheck %s
 
 
 
@@ -26,7 +26,7 @@ entry:
 
 @.str = private constant [12 x i8] c"hello world\00", align 1 ; <[12 x i8]*> [#uses=1]
 
-define hidden void @func() nounwind ssp {
+define hidden void @func() nounwind ssp uwtable {
 entry:
   %0 = call i32 @puts(i8* getelementptr inbounds ([12 x i8]* @.str, i64 0, i64 0)) nounwind ; <i32> [#uses=0]
   br label %return
@@ -37,7 +37,7 @@ return:                                           ; preds = %entry
 
 declare i32 @puts(i8*)
 
-define hidden i32 @main() nounwind ssp {
+define hidden i32 @main() nounwind ssp uwtable {
 entry:
   %retval = alloca i32                            ; <i32*> [#uses=1]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
diff --git a/test/CodeGen/X86/hoist-common.ll b/test/CodeGen/X86/hoist-common.ll
new file mode 100644
index 0000000..72e17c0
--- /dev/null
+++ b/test/CodeGen/X86/hoist-common.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx  | FileCheck %s
+
+; Common "xorb al, al" instruction in the two successor blocks should be
+; moved to the entry block above the test + je.
+
+; rdar://9145558
+
+define zeroext i1 @t(i32 %c) nounwind ssp {
+entry:
+; CHECK: t:
+; CHECK: xorb %al, %al
+; CHECK: test
+; CHECK: je
+  %tobool = icmp eq i32 %c, 0
+  br i1 %tobool, label %return, label %if.then
+
+if.then:
+; CHECK: callq
+  %call = tail call zeroext i1 (...)* @foo() nounwind
+  br label %return
+
+return:
+; CHECK: ret
+  %retval.0 = phi i1 [ %call, %if.then ], [ false, %entry ]
+  ret i1 %retval.0
+}
+
+declare zeroext i1 @foo(...)
diff --git a/test/CodeGen/X86/inline-asm-error.ll b/test/CodeGen/X86/inline-asm-error.ll
new file mode 100644
index 0000000..29c5ae5
--- /dev/null
+++ b/test/CodeGen/X86/inline-asm-error.ll
@@ -0,0 +1,17 @@
+; RUN: not llc -march x86 -regalloc=fast       < %s 2> %t1
+; RUN: not llc -march x86 -regalloc=basic      < %s 2> %t2
+; RUN: not llc -march x86 -regalloc=greedy     < %s 2> %t3
+; RUN: FileCheck %s < %t1
+; RUN: FileCheck %s < %t2
+; RUN: FileCheck %s < %t3
+
+; The register allocator must fail on this function, and it should print the
+; inline asm in the diagnostic.
+; CHECK: LLVM ERROR: Ran out of registers during register allocation!
+; CHECK: INLINEASM <es:hello world>
+
+define void @f(i32 %x0, i32 %x1, i32 %x2, i32 %x3, i32 %x4, i32 %x5, i32 %x6, i32 %x7, i32 %x8, i32 %x9) nounwind ssp {
+entry:
+  tail call void asm sideeffect "hello world", "r,r,r,r,r,r,r,r,r,r,~{dirflag},~{fpsr},~{flags}"(i32 %x0, i32 %x1, i32 %x2, i32 %x3, i32 %x4, i32 %x5, i32 %x6, i32 %x7, i32 %x8, i32 %x9) nounwind
+  ret void
+}
diff --git a/test/CodeGen/X86/isint.ll b/test/CodeGen/X86/isint.ll
index 507a328..4a98e63 100644
--- a/test/CodeGen/X86/isint.ll
+++ b/test/CodeGen/X86/isint.ll
@@ -1,17 +1,15 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 > %t
-; RUN: not grep cmp %t
-; RUN: not grep xor %t
-; RUN: grep jne %t | count 1
-; RUN: grep jp %t | count 1
-; RUN: grep setnp %t | count 1
-; RUN: grep sete %t | count 1
-; RUN: grep and %t | count 1
-; RUN: grep cvt %t | count 4
+; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
 
 define i32 @isint_return(double %d) nounwind {
+; CHECK-NOT: xor
+; CHECK: cvt
   %i = fptosi double %d to i32
+; CHECK-NEXT: cvt
   %e = sitofp i32 %i to double
+; CHECK: cmpeqsd
   %c = fcmp oeq double %d, %e
+; CHECK-NEXT: movd
+; CHECK-NEXT: andl
   %z = zext i1 %c to i32
   ret i32 %z
 }
@@ -19,9 +17,14 @@ define i32 @isint_return(double %d) nounwind {
 declare void @foo()
 
 define void @isint_branch(double %d) nounwind {
+; CHECK: cvt
   %i = fptosi double %d to i32
+; CHECK-NEXT: cvt
   %e = sitofp i32 %i to double
+; CHECK: ucomisd
   %c = fcmp oeq double %d, %e
+; CHECK-NEXT: jne
+; CHECK-NEXT: jp
   br i1 %c, label %true, label %false
 true:
   call void @foo()
diff --git a/test/CodeGen/X86/lsr-overflow.ll b/test/CodeGen/X86/lsr-overflow.ll
index 5bc4f7e..09c1c07 100644
--- a/test/CodeGen/X86/lsr-overflow.ll
+++ b/test/CodeGen/X86/lsr-overflow.ll
@@ -25,3 +25,21 @@ __ABContainsLabel.exit:
   %cmp = icmp eq i64 %indvar, 9223372036854775807
   ret i1 %cmp
 }
+
+define void @func_37() noreturn nounwind readonly {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc8, %entry
+  %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %for.inc8 ]
+  %sub.i = add i64 undef, %indvar
+  %cmp.i = icmp eq i64 %sub.i, -9223372036854775808
+  br i1 undef, label %for.inc8, label %for.cond4
+
+for.cond4:                                        ; preds = %for.cond4, %for.body
+  br label %for.cond4
+
+for.inc8:                                         ; preds = %for.body
+  %indvar.next = add i64 %indvar, 1
+  br label %for.body
+}
diff --git a/test/CodeGen/X86/movntdq-no-avx.ll b/test/CodeGen/X86/movntdq-no-avx.ll
new file mode 100644
index 0000000..8b7e6ef
--- /dev/null
+++ b/test/CodeGen/X86/movntdq-no-avx.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+; Test that we produce a movntdq, not a vmovntdq
+; CHECK-NOT: vmovntdq
+
+define void @test(<2 x i64>* nocapture %a, <2 x i64> %b) nounwind optsize {
+entry:
+  store <2 x i64> %b, <2 x i64>* %a, align 16, !nontemporal !0
+  ret void
+}
+
+!0 = metadata !{i32 1}
diff --git a/test/CodeGen/X86/nontemporal.ll b/test/CodeGen/X86/nontemporal.ll
new file mode 100644
index 0000000..1d09535
--- /dev/null
+++ b/test/CodeGen/X86/nontemporal.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
+
+define void @f(<4 x float> %A, i8* %B, <2 x double> %C, i32 %D, <2 x i64> %E) {
+; CHECK: movntps
+  %cast = bitcast i8* %B to <4 x float>*
+  store <4 x float> %A, <4 x float>* %cast, align 16, !nontemporal !0
+; CHECK: movntdq
+  %cast1 = bitcast i8* %B to <2 x i64>*
+  store <2 x i64> %E, <2 x i64>* %cast1, align 16, !nontemporal !0
+; CHECK: movntpd
+  %cast2 = bitcast i8* %B to <2 x double>*
+  store <2 x double> %C, <2 x double>* %cast2, align 16, !nontemporal !0
+; CHECK: movnti
+  %cast3 = bitcast i8* %B to i32*
+  store i32 %D, i32* %cast3, align 16, !nontemporal !0
+  ret void
+}
+
+!0 = metadata !{i32 1}
diff --git a/test/CodeGen/X86/optimize-max-3.ll b/test/CodeGen/X86/optimize-max-3.ll
index e35eb70..e42aa9d 100644
--- a/test/CodeGen/X86/optimize-max-3.ll
+++ b/test/CodeGen/X86/optimize-max-3.ll
@@ -41,14 +41,13 @@ for.end:                                          ; preds = %for.body, %entry
 
 ;      CHECK:         jle
 ;  CHECK-NOT:         cmov
-;      CHECK:         xorl    {{%edi, %edi|%ecx, %ecx}}
+;      CHECK:         xorl    {{%edi, %edi|%ecx, %ecx|%eax, %eax}}
 ; CHECK-NEXT:         align
 ; CHECK-NEXT: BB1_2:
-; CHECK-NEXT:         callq
+; CHECK:              callq
 ; CHECK-NEXT:         incl    [[BX:%[a-z0-9]+]]
 ; CHECK-NEXT:         cmpl    [[R14:%[a-z0-9]+]], [[BX]]
-; CHECK-NEXT:         movq    %rax, %r{{di|cx}}
-; CHECK-NEXT:         jl
+; CHECK:              jl
 
 define void @_Z18GenerateStatusPagei(i32 %jobs_to_display) nounwind {
 entry:
diff --git a/test/CodeGen/X86/peep-setb.ll b/test/CodeGen/X86/peep-setb.ll
new file mode 100644
index 0000000..0bab789
--- /dev/null
+++ b/test/CodeGen/X86/peep-setb.ll
@@ -0,0 +1,82 @@
+; RUN: llc -march=x86-64 < %s | FileCheck %s
+
+define i8 @test1(i8 %a, i8 %b) nounwind {
+  %cmp = icmp ult i8 %a, %b
+  %cond = zext i1 %cmp to i8
+  %add = add i8 %cond, %b
+  ret i8 %add
+; CHECK: test1:
+; CHECK: adcb $0
+}
+
+define i32 @test2(i32 %a, i32 %b) nounwind {
+  %cmp = icmp ult i32 %a, %b
+  %cond = zext i1 %cmp to i32
+  %add = add i32 %cond, %b
+  ret i32 %add
+; CHECK: test2:
+; CHECK: adcl $0
+}
+
+define i64 @test3(i64 %a, i64 %b) nounwind {
+  %cmp = icmp ult i64 %a, %b
+  %conv = zext i1 %cmp to i64
+  %add = add i64 %conv, %b
+  ret i64 %add
+; CHECK: test3:
+; CHECK: adcq $0
+}
+
+define i8 @test4(i8 %a, i8 %b) nounwind {
+  %cmp = icmp ult i8 %a, %b
+  %cond = zext i1 %cmp to i8
+  %sub = sub i8 %b, %cond
+  ret i8 %sub
+; CHECK: test4:
+; CHECK: sbbb $0
+}
+
+define i32 @test5(i32 %a, i32 %b) nounwind {
+  %cmp = icmp ult i32 %a, %b
+  %cond = zext i1 %cmp to i32
+  %sub = sub i32 %b, %cond
+  ret i32 %sub
+; CHECK: test5:
+; CHECK: sbbl $0
+}
+
+define i64 @test6(i64 %a, i64 %b) nounwind {
+  %cmp = icmp ult i64 %a, %b
+  %conv = zext i1 %cmp to i64
+  %sub = sub i64 %b, %conv
+  ret i64 %sub
+; CHECK: test6:
+; CHECK: sbbq $0
+}
+
+define i8 @test7(i8 %a, i8 %b) nounwind {
+  %cmp = icmp ult i8 %a, %b
+  %cond = sext i1 %cmp to i8
+  %sub = sub i8 %b, %cond
+  ret i8 %sub
+; CHECK: test7:
+; CHECK: adcb $0
+}
+
+define i32 @test8(i32 %a, i32 %b) nounwind {
+  %cmp = icmp ult i32 %a, %b
+  %cond = sext i1 %cmp to i32
+  %sub = sub i32 %b, %cond
+  ret i32 %sub
+; CHECK: test8:
+; CHECK: adcl $0
+}
+
+define i64 @test9(i64 %a, i64 %b) nounwind {
+  %cmp = icmp ult i64 %a, %b
+  %conv = sext i1 %cmp to i64
+  %sub = sub i64 %b, %conv
+  ret i64 %sub
+; CHECK: test9:
+; CHECK: adcq $0
+}
diff --git a/test/CodeGen/X86/phys_subreg_coalesce-2.ll b/test/CodeGen/X86/phys_subreg_coalesce-2.ll
index 13e804d..02c519f 100644
--- a/test/CodeGen/X86/phys_subreg_coalesce-2.ll
+++ b/test/CodeGen/X86/phys_subreg_coalesce-2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep mov | count 4
+; RUN: llc < %s -march=x86 | FileCheck %s
 ; PR2659
 
 define i32 @binomial(i32 %n, i32 %k) nounwind {
@@ -12,7 +12,8 @@ forcond.preheader:		; preds = %entry
 
 ifthen:		; preds = %entry
 	ret i32 0
-
+; CHECK: forbody
+; CHECK-NOT: mov
 forbody:		; preds = %forbody, %forcond.preheader
 	%indvar = phi i32 [ 0, %forcond.preheader ], [ %divisor.02, %forbody ]		; <i32> [#uses=3]
 	%accumulator.01 = phi i32 [ 1, %forcond.preheader ], [ %div, %forbody ]		; <i32> [#uses=1]
diff --git a/test/CodeGen/X86/phys_subreg_coalesce-3.ll b/test/CodeGen/X86/phys_subreg_coalesce-3.ll
index f23669e..4162015 100644
--- a/test/CodeGen/X86/phys_subreg_coalesce-3.ll
+++ b/test/CodeGen/X86/phys_subreg_coalesce-3.ll
@@ -1,6 +1,11 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin -join-physregs | FileCheck %s
 ; rdar://5571034
 
+; This requires physreg joining, %vreg13 is live everywhere:
+; 304L		%CL<def> = COPY %vreg13:sub_8bit; GR32_ABCD:%vreg13
+; 320L		%vreg15<def> = COPY %vreg19; GR32:%vreg15 GR32_NOSP:%vreg19
+; 336L		%vreg15<def> = SAR32rCL %vreg15, %EFLAGS<imp-def,dead>, %CL<imp-use,kill>; GR32:%vreg15
+
 define void @foo(i32* nocapture %quadrant, i32* nocapture %ptr, i32 %bbSize, i32 %bbStart, i32 %shifts) nounwind ssp {
 ; CHECK: foo:
 entry:
diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll
index bf5229a..d8ed4c0 100644
--- a/test/CodeGen/X86/pmul.ll
+++ b/test/CodeGen/X86/pmul.ll
@@ -1,7 +1,9 @@
-; RUN: llc < %s -march=x86 -mattr=sse41 -stack-alignment=16 > %t
+; RUN: llc < %s -march=x86 -mattr=sse41 -stack-alignment=16 -join-physregs > %t
 ; RUN: grep pmul %t | count 12
 ; RUN: grep mov %t | count 11
 
+; The f() arguments in %xmm0 and %xmm1 cause an extra movdqa without -join-physregs.
+
 define <4 x i32> @a(<4 x i32> %i) nounwind  {
         %A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 >
         ret <4 x i32> %A
diff --git a/test/CodeGen/X86/pr10068.ll b/test/CodeGen/X86/pr10068.ll
new file mode 100644
index 0000000..8829c5d
--- /dev/null
+++ b/test/CodeGen/X86/pr10068.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -march=x86
+
+define void @foobar() {
+entry:
+  %sub.i = trunc i64 undef to i32
+  %shr80.i = ashr i32 %sub.i, 16
+  %add82.i = add nsw i32 %shr80.i, 1
+  %notlhs.i = icmp slt i32 %shr80.i, undef
+  %notrhs.i = icmp sgt i32 %add82.i, -1
+  %or.cond.not.i = and i1 %notrhs.i, %notlhs.i
+  %cmp154.i = icmp slt i32 0, undef
+  %or.cond406.i = and i1 %or.cond.not.i, %cmp154.i
+  %or.cond406.not.i = xor i1 %or.cond406.i, true
+  %or.cond407.i = or i1 undef, %or.cond406.not.i
+  br i1 %or.cond407.i, label %if.then158.i, label %if.end163.i
+
+if.then158.i:
+  ret void
+
+if.end163.i:                                      ; preds = %if.end67.i
+  ret void
+}
diff --git a/test/CodeGen/X86/pr2659.ll b/test/CodeGen/X86/pr2659.ll
index ef0f9ea..5dab5c9 100644
--- a/test/CodeGen/X86/pr2659.ll
+++ b/test/CodeGen/X86/pr2659.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -march=x86 -mtriple=i686-apple-darwin9.4.0 | grep movl | count 4
-; RUN: llc < %s -march=x86 -mtriple=i686-apple-darwin9.4.0 | FileCheck %s
+; RUN: llc < %s -march=x86 -mtriple=i686-apple-darwin9.4.0 -disable-branch-fold | FileCheck %s
 ; PR2659
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
@@ -19,7 +18,11 @@ forcond.preheader:              ; preds = %entry
 ; CHECK-NOT: xorl
 ; CHECK-NOT: movl
 ; CHECK-NOT: LBB
-; CHECK: je
+; CHECK: jne
+
+; There should be no moves required in the for loop body.
+; CHECK: %forbody
+; CHECK-NOT: mov
 
 ifthen:         ; preds = %entry
   ret i32 0
diff --git a/test/CodeGen/X86/pr9127.ll b/test/CodeGen/X86/pr9127.ll
index 9b251f5..ba92c77 100644
--- a/test/CodeGen/X86/pr9127.ll
+++ b/test/CodeGen/X86/pr9127.ll
@@ -10,4 +10,4 @@ entry:
 }
 
 ; test that the load is folded.
-; CHECK: ucomisd	(%{{rdi|rdx}}), %xmm0
+; CHECK: cmpeqsd	(%{{rdi|rdx}}), %xmm0
diff --git a/test/CodeGen/X86/pr9743.ll b/test/CodeGen/X86/pr9743.ll
index 8feccd9..6597c23 100644
--- a/test/CodeGen/X86/pr9743.ll
+++ b/test/CodeGen/X86/pr9743.ll
@@ -9,9 +9,9 @@ define void @f() {
 ; CHECK-NEXT: :
 ; CHECK-NEXT:  .cfi_def_cfa_offset 16
 ; CHECK-NEXT: :
-; CHECK-NEXT:  .cfi_offset 6, -16
+; CHECK-NEXT:  .cfi_offset %rbp, -16
 ; CHECK-NEXT:  movq    %rsp, %rbp
 ; CHECK-NEXT: :
-; CHECK-NEXT:  .cfi_def_cfa_register 6
+; CHECK-NEXT:  .cfi_def_cfa_register %rbp
 ; CHECK-NEXT:  popq    %rbp
 ; CHECK-NEXT:  ret
diff --git a/test/CodeGen/X86/ret-mmx.ll b/test/CodeGen/X86/ret-mmx.ll
index 04b57dd..865e147 100644
--- a/test/CodeGen/X86/ret-mmx.ll
+++ b/test/CodeGen/X86/ret-mmx.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+mmx,+sse2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin11 -mattr=+mmx,+sse2 | FileCheck %s
 ; rdar://6602459
 
 @g_v1di = external global <1 x i64>
@@ -8,19 +8,32 @@ entry:
 	%call = call <1 x i64> @return_v1di()		; <<1 x i64>> [#uses=0]
 	store <1 x i64> %call, <1 x i64>* @g_v1di
         ret void
+; CHECK: t1:
+; CHECK: callq
+; CHECK-NEXT: movq	_g_v1di
+; CHECK-NEXT: movq	%rax,
 }
 
 declare <1 x i64> @return_v1di()
 
 define <1 x i64> @t2() nounwind {
 	ret <1 x i64> <i64 1>
+; CHECK: t2:
+; CHECK: movl	$1
+; CHECK-NEXT: ret
 }
 
 define <2 x i32> @t3() nounwind {
 	ret <2 x i32> <i32 1, i32 0>
+; CHECK: t3:
+; CHECK: movl $1
+; CHECK: movd {{.*}}, %xmm0
 }
 
 define double @t4() nounwind {
 	ret double bitcast (<2 x i32> <i32 1, i32 0> to double)
+; CHECK: t4:
+; CHECK: movl $1
+; CHECK: movd {{.*}}, %xmm0
 }
 
diff --git a/test/CodeGen/X86/setoeq.ll b/test/CodeGen/X86/setoeq.ll
index 4a9c1ba..aa2f0af 100644
--- a/test/CodeGen/X86/setoeq.ll
+++ b/test/CodeGen/X86/setoeq.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -march=x86  | grep set | count 2
-; RUN: llc < %s -march=x86  | grep and
+; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
 
 define zeroext i8 @t(double %x) nounwind readnone {
 entry:
@@ -7,5 +6,16 @@ entry:
 	%1 = sitofp i32 %0 to double		; <double> [#uses=1]
 	%2 = fcmp oeq double %1, %x		; <i1> [#uses=1]
 	%retval12 = zext i1 %2 to i8		; <i8> [#uses=1]
+; CHECK: cmpeqsd
+	ret i8 %retval12
+}
+
+define zeroext i8 @u(double %x) nounwind readnone {
+entry:
+	%0 = fptosi double %x to i32		; <i32> [#uses=1]
+	%1 = sitofp i32 %0 to double		; <double> [#uses=1]
+	%2 = fcmp une double %1, %x		; <i1> [#uses=1]
+	%retval12 = zext i1 %2 to i8		; <i8> [#uses=1]
+; CHECK: cmpneqsd
 	ret i8 %retval12
 }
diff --git a/test/CodeGen/X86/shift-pair.ll b/test/CodeGen/X86/shift-pair.ll
new file mode 100644
index 0000000..24ba1fc
--- /dev/null
+++ b/test/CodeGen/X86/shift-pair.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -march=x86-64 | FileCheck %s
+
+define i64 @test(i64 %A) {
+; CHECK: @test
+; CHECK: shrq $54
+; CHECK: andq $1020
+; CHECK: ret
+    %B = lshr i64 %A, 56
+    %C = shl i64 %B, 2
+    ret i64 %C
+}
diff --git a/test/CodeGen/X86/sibcall.ll b/test/CodeGen/X86/sibcall.ll
index de2a81e8..4a98efb 100644
--- a/test/CodeGen/X86/sibcall.ll
+++ b/test/CodeGen/X86/sibcall.ll
@@ -198,7 +198,7 @@ declare i32 @foo6(i32, i32, %struct.t* byval align 4)
 
 ; rdar://r7717598
 %struct.ns = type { i32, i32 }
-%struct.cp = type { float, float }
+%struct.cp = type { float, float, float, float, float }
 
 define %struct.ns* @t13(%struct.cp* %yy) nounwind ssp {
 ; 32: t13:
@@ -229,7 +229,7 @@ entry:
 ; 64: t14:
 ; 64: movq 32(%rdi)
 ; 64-NOT: movq 16(%rdi)
-; 64: jmpq *16(%rdi)
+; 64: jmpq *16({{%rdi|%rax}})
   %0 = getelementptr inbounds %struct.__block_literal_2* %.block_descriptor, i64 0, i32 5 ; <void ()**> [#uses=1]
   %1 = load void ()** %0, align 8                 ; <void ()*> [#uses=2]
   %2 = bitcast void ()* %1 to %struct.__block_literal_1* ; <%struct.__block_literal_1*> [#uses=1]
diff --git a/test/CodeGen/X86/smul-with-overflow-2.ll b/test/CodeGen/X86/smul-with-overflow-2.ll
deleted file mode 100644
index 7c23adb..0000000
--- a/test/CodeGen/X86/smul-with-overflow-2.ll
+++ /dev/null
@@ -1,20 +0,0 @@
-; RUN: llc < %s -march=x86 | grep mul | count 1
-; RUN: llc < %s -march=x86 | grep add | count 3
-
-define i32 @t1(i32 %a, i32 %b) nounwind readnone {
-entry:
-        %tmp0 = add i32 %b, %a
-	%tmp1 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 %tmp0, i32 2)
-	%tmp2 = extractvalue { i32, i1 } %tmp1, 0
-	ret i32 %tmp2
-}
-
-define i32 @t2(i32 %a, i32 %b) nounwind readnone {
-entry:
-        %tmp0 = add i32 %b, %a
-	%tmp1 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 %tmp0, i32 4)
-	%tmp2 = extractvalue { i32, i1 } %tmp1, 0
-	ret i32 %tmp2
-}
-
-declare { i32, i1 } @llvm.smul.with.overflow.i32(i32, i32) nounwind
diff --git a/test/CodeGen/X86/smul-with-overflow-3.ll b/test/CodeGen/X86/smul-with-overflow-3.ll
deleted file mode 100644
index 49c31f5..0000000
--- a/test/CodeGen/X86/smul-with-overflow-3.ll
+++ /dev/null
@@ -1,23 +0,0 @@
-; RUN: llc < %s -march=x86 | grep {jno} | count 1
-
-@ok = internal constant [4 x i8] c"%d\0A\00"
-@no = internal constant [4 x i8] c"no\0A\00"
-
-define i1 @func1(i32 %v1, i32 %v2) nounwind {
-entry:
-  %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
-  %sum = extractvalue {i32, i1} %t, 0
-  %obit = extractvalue {i32, i1} %t, 1
-  br i1 %obit, label %overflow, label %normal
-
-overflow:
-  %t2 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8]* @no, i32 0, i32 0) ) nounwind
-  ret i1 false
-
-normal:
-  %t1 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8]* @ok, i32 0, i32 0), i32 %sum ) nounwind
-  ret i1 true
-}
-
-declare i32 @printf(i8*, ...) nounwind
-declare {i32, i1} @llvm.smul.with.overflow.i32(i32, i32)
diff --git a/test/CodeGen/X86/smul-with-overflow.ll b/test/CodeGen/X86/smul-with-overflow.ll
index 6d125e4..7c2e247 100644
--- a/test/CodeGen/X86/smul-with-overflow.ll
+++ b/test/CodeGen/X86/smul-with-overflow.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -march=x86 | grep {jo} | count 1
+; RUN: llc < %s -march=x86 | FileCheck %s
 
 @ok = internal constant [4 x i8] c"%d\0A\00"
 @no = internal constant [4 x i8] c"no\0A\00"
 
-define i1 @func1(i32 %v1, i32 %v2) nounwind {
+define i1 @test1(i32 %v1, i32 %v2) nounwind {
 entry:
   %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
   %sum = extractvalue {i32, i1} %t, 0
@@ -17,7 +17,53 @@ normal:
 overflow:
   %t2 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8]* @no, i32 0, i32 0) ) nounwind
   ret i1 false
+; CHECK: test1:
+; CHECK: imull
+; CHECK-NEXT: jo
+}
+
+define i1 @test2(i32 %v1, i32 %v2) nounwind {
+entry:
+  %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
+  %sum = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  br i1 %obit, label %overflow, label %normal
+
+overflow:
+  %t2 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8]* @no, i32 0, i32 0) ) nounwind
+  ret i1 false
+
+normal:
+  %t1 = tail call i32 (i8*, ...)* @printf( i8* getelementptr ([4 x i8]* @ok, i32 0, i32 0), i32 %sum ) nounwind
+  ret i1 true
+; CHECK: test2:
+; CHECK: imull
+; CHECK-NEXT: jno
 }
 
 declare i32 @printf(i8*, ...) nounwind
 declare {i32, i1} @llvm.smul.with.overflow.i32(i32, i32)
+
+define i32 @test3(i32 %a, i32 %b) nounwind readnone {
+entry:
+	%tmp0 = add i32 %b, %a
+	%tmp1 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 %tmp0, i32 2)
+	%tmp2 = extractvalue { i32, i1 } %tmp1, 0
+	ret i32 %tmp2
+; CHECK: test3:
+; CHECK: addl
+; CHECK-NEXT: addl
+; CHECK-NEXT: ret
+}
+
+define i32 @test4(i32 %a, i32 %b) nounwind readnone {
+entry:
+	%tmp0 = add i32 %b, %a
+	%tmp1 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 %tmp0, i32 4)
+	%tmp2 = extractvalue { i32, i1 } %tmp1, 0
+	ret i32 %tmp2
+; CHECK: test4:
+; CHECK: addl
+; CHECK: mull
+; CHECK-NEXT: ret
+}
diff --git a/test/CodeGen/X86/sse-minmax.ll b/test/CodeGen/X86/sse-minmax.ll
index 348121a..ff0af25 100644
--- a/test/CodeGen/X86/sse-minmax.ll
+++ b/test/CodeGen/X86/sse-minmax.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86-64 -asm-verbose=false | FileCheck %s
-; RUN: llc < %s -march=x86-64 -asm-verbose=false -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck -check-prefix=UNSAFE %s
-; RUN: llc < %s -march=x86-64 -asm-verbose=false -enable-no-nans-fp-math | FileCheck -check-prefix=FINITE %s
+; RUN: llc < %s -march=x86-64 -asm-verbose=false -join-physregs | FileCheck %s
+; RUN: llc < %s -march=x86-64 -asm-verbose=false -join-physregs -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck -check-prefix=UNSAFE %s
+; RUN: llc < %s -march=x86-64 -asm-verbose=false -join-physregs -enable-no-nans-fp-math | FileCheck -check-prefix=FINITE %s
 
 ; Some of these patterns can be matched as SSE min or max. Some of
 ; then can be matched provided that the operands are swapped.
@@ -12,6 +12,9 @@
 ; y_ : use -0.0 instead of %y
 ; _inverse : swap the arms of the select.
 
+; Some of these tests depend on -join-physregs commuting instructions to
+; eliminate copies.
+
 ; CHECK:      ogt:
 ; CHECK-NEXT: maxsd %xmm1, %xmm0
 ; CHECK-NEXT: ret
diff --git a/test/CodeGen/X86/sse3.ll b/test/CodeGen/X86/sse3.ll
index 8e72f13..8c2e58d 100644
--- a/test/CodeGen/X86/sse3.ll
+++ b/test/CodeGen/X86/sse3.ll
@@ -62,11 +62,10 @@ define <8 x i16> @t4(<8 x i16> %A, <8 x i16> %B) nounwind {
 	%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 7, i32 2, i32 3, i32 1, i32 5, i32 6, i32 5 >
 	ret <8 x i16> %tmp
 ; X64: t4:
-; X64: 	pextrw	$7, %xmm0, %eax
-; X64: 	pshufhw	$100, %xmm0, %xmm1
-; X64: 	pinsrw	$1, %eax, %xmm1
-; X64: 	pextrw	$1, %xmm0, %eax
-; X64: 	movdqa	%xmm1, %xmm0
+; X64: 	pextrw	$7, [[XMM0:%xmm[0-9]+]], %eax
+; X64: 	pshufhw	$100, [[XMM0]], [[XMM1:%xmm[0-9]+]]
+; X64: 	pinsrw	$1, %eax, [[XMM1]]
+; X64: 	pextrw	$1, [[XMM0]], %eax
 ; X64: 	pinsrw	$4, %eax, %xmm0
 ; X64: 	ret
 }
@@ -251,13 +250,13 @@ entry:
         %tmp9 = shufflevector <16 x i8> %tmp8, <16 x i8> %T0,  <16 x i32> < i32 0, i32 1, i32 2, i32 17,  i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
         ret <16 x i8> %tmp9
 ; X64: 	t16:
-; X64: 		pinsrw	$0, %eax, %xmm1
-; X64: 		pextrw	$8, %xmm0, %eax
-; X64: 		pinsrw	$1, %eax, %xmm1
-; X64: 		pextrw	$1, %xmm1, %ecx
-; X64: 		movd	%xmm1, %edx
-; X64: 		pinsrw	$0, %edx, %xmm1
-; X64: 		pinsrw	$1, %eax, %xmm0
+; X64: 		pinsrw	$0, %eax, [[X1:%xmm[0-9]+]]
+; X64: 		pextrw	$8, [[X0:%xmm[0-9]+]], %eax
+; X64: 		pinsrw	$1, %eax, [[X1]]
+; X64: 		pextrw	$1, [[X1]], %ecx
+; X64: 		movd	[[X1]], %edx
+; X64: 		pinsrw	$0, %edx, %xmm
+; X64: 		pinsrw	$1, %eax, %xmm
 ; X64: 		ret
 }
 
diff --git a/test/CodeGen/X86/sse42.ll b/test/CodeGen/X86/sse42.ll
index 1723909..c787523 100644
--- a/test/CodeGen/X86/sse42.ll
+++ b/test/CodeGen/X86/sse42.ll
@@ -1,38 +1,39 @@
 ; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse42 | FileCheck %s -check-prefix=X32
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse42 | FileCheck %s -check-prefix=X64
 
-declare i32 @llvm.x86.sse42.crc32.8(i32, i8) nounwind
-declare i32 @llvm.x86.sse42.crc32.16(i32, i16) nounwind
-declare i32 @llvm.x86.sse42.crc32.32(i32, i32) nounwind
+declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind
+declare i32 @llvm.x86.sse42.crc32.32.16(i32, i16) nounwind
+declare i32 @llvm.x86.sse42.crc32.32.32(i32, i32) nounwind
 
-define i32 @crc32_8(i32 %a, i8 %b) nounwind {
-  %tmp = call i32 @llvm.x86.sse42.crc32.8(i32 %a, i8 %b)
+define i32 @crc32_32_8(i32 %a, i8 %b) nounwind {
+  %tmp = call i32 @llvm.x86.sse42.crc32.32.8(i32 %a, i8 %b)
   ret i32 %tmp
-; X32: _crc32_8:
+; X32: _crc32_32_8:
 ; X32:     crc32b   8(%esp), %eax
 
-; X64: _crc32_8:
-; X64:     crc32b   %sil, %eax
+; X64: _crc32_32_8:
+; X64:     crc32b   %sil,
 }
 
 
-define i32 @crc32_16(i32 %a, i16 %b) nounwind {
-  %tmp = call i32 @llvm.x86.sse42.crc32.16(i32 %a, i16 %b)
+define i32 @crc32_32_16(i32 %a, i16 %b) nounwind {
+  %tmp = call i32 @llvm.x86.sse42.crc32.32.16(i32 %a, i16 %b)
   ret i32 %tmp
-; X32: _crc32_16:
+; X32: _crc32_32_16:
 ; X32:     crc32w   8(%esp), %eax
 
-; X64: _crc32_16:
-; X64:     crc32w   %si, %eax
+; X64: _crc32_32_16:
+; X64:     crc32w   %si,
 }
 
 
-define i32 @crc32_32(i32 %a, i32 %b) nounwind {
-  %tmp = call i32 @llvm.x86.sse42.crc32.32(i32 %a, i32 %b)
+define i32 @crc32_32_32(i32 %a, i32 %b) nounwind {
+  %tmp = call i32 @llvm.x86.sse42.crc32.32.32(i32 %a, i32 %b)
   ret i32 %tmp
-; X32: _crc32_32:
+; X32: _crc32_32_32:
 ; X32:     crc32l   8(%esp), %eax
 
-; X64: _crc32_32:
-; X64:     crc32l   %esi, %eax
+; X64: _crc32_32_32:
+; X64:     crc32l   %esi,
 }
+
diff --git a/test/CodeGen/X86/sse42_64.ll b/test/CodeGen/X86/sse42_64.ll
new file mode 100644
index 0000000..8b3a69b
--- /dev/null
+++ b/test/CodeGen/X86/sse42_64.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse42 | FileCheck %s -check-prefix=X64
+
+declare i64 @llvm.x86.sse42.crc32.64.8(i64, i8) nounwind
+declare i64 @llvm.x86.sse42.crc32.64.64(i64, i64) nounwind
+
+define i64 @crc32_64_8(i64 %a, i8 %b) nounwind {
+  %tmp = call i64 @llvm.x86.sse42.crc32.64.8(i64 %a, i8 %b)
+  ret i64 %tmp
+
+; X64: _crc32_64_8:
+; X64:     crc32b   %sil,
+}
+
+define i64 @crc32_64_64(i64 %a, i64 %b) nounwind {
+  %tmp = call i64 @llvm.x86.sse42.crc32.64.64(i64 %a, i64 %b)
+  ret i64 %tmp
+
+; X64: _crc32_64_64:
+; X64:     crc32q   %rsi,
+}
+
diff --git a/test/CodeGen/X86/sse_reload_fold.ll b/test/CodeGen/X86/sse_reload_fold.ll
index 02399c4..a57fa58 100644
--- a/test/CodeGen/X86/sse_reload_fold.ll
+++ b/test/CodeGen/X86/sse_reload_fold.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mattr=+64bit,+sse3 -print-failed-fuse-candidates |& FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=+64bit,+sse3 -print-failed-fuse-candidates -regalloc=basic |& FileCheck %s
 ; CHECK: fail
 ; CHECK-NOT: fail
 
@@ -117,7 +117,16 @@ define <2 x double> @d8(<2 x double> %f) {
   ret <2 x double> %t
 }
 
-; This one should fail to fuse.
+; This one should fail to fuse, but -regalloc=greedy isn't even trying. Instead
+; it produces:
+;   callq	test_vd
+;   movapd	(%rsp), %xmm1           # 16-byte Reload
+;   hsubpd	%xmm0, %xmm1
+;   movapd	%xmm1, %xmm0
+;   addq	$24, %rsp
+;   ret
+; RABasic still tries to fold this one.
+
 define <2 x double> @z0(<2 x double> %f) {
   %y = call <2 x double> @test_vd(<2 x double> %f)
   %t = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %f, <2 x double> %y)
diff --git a/test/CodeGen/X86/tail-opts.ll b/test/CodeGen/X86/tail-opts.ll
index 77710ad..d6c16ca 100644
--- a/test/CodeGen/X86/tail-opts.ll
+++ b/test/CodeGen/X86/tail-opts.ll
@@ -412,9 +412,9 @@ return:
 ; can fall-through into the ret and the other side has to branch anyway.
 
 ; CHECK: TESTE:
-; CHECK: imulq
-; CHECK-NEXT: LBB8_2:
-; CHECK-NEXT: ret
+; CHECK: ret
+; CHECK-NOT: ret
+; CHECK: size TESTE
 
 define i64 @TESTE(i64 %parami, i64 %paraml) nounwind readnone {
 entry:
diff --git a/test/CodeGen/X86/tailcallstack64.ll b/test/CodeGen/X86/tailcallstack64.ll
index 060ce0f..c18c7aa 100644
--- a/test/CodeGen/X86/tailcallstack64.ll
+++ b/test/CodeGen/X86/tailcallstack64.ll
@@ -2,7 +2,7 @@
 ; RUN: llc < %s -tailcallopt -mtriple=x86_64-win32 -post-RA-scheduler=true | FileCheck %s
 
 ; FIXME: Redundant unused stack allocation could be eliminated.
-; CHECK: subq  ${{24|72}}, %rsp
+; CHECK: subq  ${{24|72|80}}, %rsp
 
 ; Check that lowered arguments on the stack do not overwrite each other.
 ; Add %in1 %p1 to a different temporary register (%eax).
diff --git a/test/CodeGen/X86/umul-with-overflow.ll b/test/CodeGen/X86/umul-with-overflow.ll
index c997661..84fcbc7 100644
--- a/test/CodeGen/X86/umul-with-overflow.ll
+++ b/test/CodeGen/X86/umul-with-overflow.ll
@@ -12,3 +12,27 @@ define i1 @a(i32 %x) zeroext nounwind {
 ; CHECK: movzbl	%al, %eax
 ; CHECK: ret
 }
+
+define i32 @test2(i32 %a, i32 %b) nounwind readnone {
+entry:
+	%tmp0 = add i32 %b, %a
+	%tmp1 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %tmp0, i32 2)
+	%tmp2 = extractvalue { i32, i1 } %tmp1, 0
+	ret i32 %tmp2
+; CHECK: test2:
+; CHECK: addl
+; CHECK-NEXT: addl
+; CHECK-NEXT: ret
+}
+
+define i32 @test3(i32 %a, i32 %b) nounwind readnone {
+entry:
+	%tmp0 = add i32 %b, %a
+	%tmp1 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %tmp0, i32 4)
+	%tmp2 = extractvalue { i32, i1 } %tmp1, 0
+	ret i32 %tmp2
+; CHECK: test3:
+; CHECK: addl
+; CHECK: mull
+; CHECK-NEXT: ret
+}
diff --git a/test/CodeGen/X86/use-add-flags.ll b/test/CodeGen/X86/use-add-flags.ll
index 8fbbd39..a0448ec 100644
--- a/test/CodeGen/X86/use-add-flags.ll
+++ b/test/CodeGen/X86/use-add-flags.ll
@@ -7,10 +7,10 @@
 ; Use the flags on the add.
 
 ; CHECK: test1:
-;      CHECK: addl    (%r[[A0:di|cx]]), {{%esi|%edx}}
-; CHECK-NEXT: movl    {{%edx|%r8d}}, %eax
-; CHECK-NEXT: cmovnsl {{%ecx|%r9d}}, %eax
-; CHECK-NEXT: ret
+;     CHECK: addl
+; CHECK-NOT: test
+;     CHECK: cmovnsl
+;     CHECK: ret
 
 define i32 @test1(i32* %x, i32 %y, i32 %a, i32 %b) nounwind {
 	%tmp2 = load i32* %x, align 4		; <i32> [#uses=1]
@@ -42,7 +42,7 @@ false:
 ; Do use the flags result of the and here, since the and has another use.
 
 ; CHECK: test3:
-;      CHECK: andl    $16, %e[[A0]]
+;      CHECK: andl    $16, %e
 ; CHECK-NEXT: jne
 
 define void @test3(i32 %x) nounwind {
diff --git a/test/CodeGen/X86/vararg_tailcall.ll b/test/CodeGen/X86/vararg_tailcall.ll
new file mode 100644
index 0000000..73d80eb
--- /dev/null
+++ b/test/CodeGen/X86/vararg_tailcall.ll
@@ -0,0 +1,98 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-pc-win32 | FileCheck %s -check-prefix=WIN64
+
+@.str = private unnamed_addr constant [5 x i8] c"%ld\0A\00"
+@sel = external global i8*
+@sel3 = external global i8*
+@sel4 = external global i8*
+@sel5 = external global i8*
+@sel6 = external global i8*
+@sel7 = external global i8*
+
+; X64: @foo
+; X64: jmp
+; WIN64: @foo
+; WIN64: callq
+define void @foo(i64 %arg) nounwind optsize ssp noredzone {
+entry:
+  %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8]* @.str, i64 0, i64 0), i64 %arg) nounwind optsize noredzone
+  ret void
+}
+
+declare i32 @printf(i8*, ...) optsize noredzone
+
+; X64: @bar
+; X64: jmp
+; WIN64: @bar
+; WIN64: jmp
+define void @bar(i64 %arg) nounwind optsize ssp noredzone {
+entry:
+  tail call void @bar2(i8* getelementptr inbounds ([5 x i8]* @.str, i64 0, i64 0), i64 %arg) nounwind optsize noredzone
+  ret void
+}
+
+declare void @bar2(i8*, i64) optsize noredzone
+
+; X64: @foo2
+; X64: jmp
+; WIN64: @foo2
+; WIN64: callq
+define i8* @foo2(i8* %arg) nounwind optsize ssp noredzone {
+entry:
+  %tmp1 = load i8** @sel, align 8, !tbaa !0
+  %call = tail call i8* (i8*, i8*, ...)* @x2(i8* %arg, i8* %tmp1) nounwind optsize noredzone
+  ret i8* %call
+}
+
+declare i8* @x2(i8*, i8*, ...) optsize noredzone
+
+; X64: @foo6
+; X64: jmp
+; WIN64: @foo6
+; WIN64: callq
+define i8* @foo6(i8* %arg1, i8* %arg2) nounwind optsize ssp noredzone {
+entry:
+  %tmp2 = load i8** @sel3, align 8, !tbaa !0
+  %tmp3 = load i8** @sel4, align 8, !tbaa !0
+  %tmp4 = load i8** @sel5, align 8, !tbaa !0
+  %tmp5 = load i8** @sel6, align 8, !tbaa !0
+  %call = tail call i8* (i8*, i8*, i8*, ...)* @x3(i8* %arg1, i8* %arg2, i8* %tmp2, i8* %tmp3, i8* %tmp4, i8* %tmp5) nounwind optsize noredzone
+  ret i8* %call
+}
+
+declare i8* @x3(i8*, i8*, i8*, ...) optsize noredzone
+
+; X64: @foo7
+; X64: callq
+; WIN64: @foo7
+; WIN64: callq
+define i8* @foo7(i8* %arg1, i8* %arg2) nounwind optsize ssp noredzone {
+entry:
+  %tmp2 = load i8** @sel3, align 8, !tbaa !0
+  %tmp3 = load i8** @sel4, align 8, !tbaa !0
+  %tmp4 = load i8** @sel5, align 8, !tbaa !0
+  %tmp5 = load i8** @sel6, align 8, !tbaa !0
+  %tmp6 = load i8** @sel7, align 8, !tbaa !0
+  %call = tail call i8* (i8*, i8*, i8*, i8*, i8*, i8*, i8*, ...)* @x7(i8* %arg1, i8* %arg2, i8* %tmp2, i8* %tmp3, i8* %tmp4, i8* %tmp5, i8* %tmp6) nounwind optsize noredzone
+  ret i8* %call
+}
+
+declare i8* @x7(i8*, i8*, i8*, i8*, i8*, i8*, i8*, ...) optsize noredzone
+
+; X64: @foo8
+; X64: callq
+; WIN64: @foo8
+; WIN64: callq
+define i8* @foo8(i8* %arg1, i8* %arg2) nounwind optsize ssp noredzone {
+entry:
+  %tmp2 = load i8** @sel3, align 8, !tbaa !0
+  %tmp3 = load i8** @sel4, align 8, !tbaa !0
+  %tmp4 = load i8** @sel5, align 8, !tbaa !0
+  %tmp5 = load i8** @sel6, align 8, !tbaa !0
+  %call = tail call i8* (i8*, i8*, i8*, ...)* @x3(i8* %arg1, i8* %arg2, i8* %tmp2, i8* %tmp3, i8* %tmp4, i8* %tmp5, i32 48879, i32 48879) nounwind optsize noredzone
+  ret i8* %call
+}
+
+!0 = metadata !{metadata !"any pointer", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/test/CodeGen/X86/vec_extract-sse4.ll b/test/CodeGen/X86/vec_extract-sse4.ll
index dab5dd1..f487654 100644
--- a/test/CodeGen/X86/vec_extract-sse4.ll
+++ b/test/CodeGen/X86/vec_extract-sse4.ll
@@ -1,8 +1,8 @@
-; RUN: llc < %s -march=x86 -mattr=+sse41 -o %t
-; RUN: grep extractps   %t | count 1
-; RUN: grep pextrd      %t | count 1
+; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse41 -o %t
+; RUN: not grep extractps   %t
+; RUN: not grep pextrd      %t
 ; RUN: not grep pshufd  %t
-; RUN: not grep movss   %t
+; RUN: grep movss   %t | count 2
 
 define void @t1(float* %R, <4 x float>* %P1) nounwind {
 	%X = load <4 x float>* %P1
diff --git a/test/CodeGen/X86/vec_extract.ll b/test/CodeGen/X86/vec_extract.ll
index b013730..2c8796b 100644
--- a/test/CodeGen/X86/vec_extract.ll
+++ b/test/CodeGen/X86/vec_extract.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2,-sse41 -o %t
-; RUN: grep movss    %t | count 3
+; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse2,-sse41 -o %t
+; RUN: grep movss    %t | count 4
 ; RUN: grep movhlps  %t | count 1
-; RUN: grep pshufd   %t | count 1
+; RUN: not grep pshufd   %t 
 ; RUN: grep unpckhpd %t | count 1
 
 define void @test1(<4 x float>* %F, float* %f) nounwind {
diff --git a/test/CodeGen/X86/vec_shuffle-16.ll b/test/CodeGen/X86/vec_shuffle-16.ll
index 2ee87fe..06f38ed 100644
--- a/test/CodeGen/X86/vec_shuffle-16.ll
+++ b/test/CodeGen/X86/vec_shuffle-16.ll
@@ -1,8 +1,9 @@
 ; RUN: llc < %s -march=x86 -mattr=+sse,-sse2 -mtriple=i386-apple-darwin | FileCheck %s -check-prefix=sse
 ; RUN: llc < %s -march=x86 -mattr=+sse2 -mtriple=i386-apple-darwin | FileCheck %s -check-prefix=sse2
 
+; sse:  t1:
+; sse2: t1:
 define <4 x float> @t1(<4 x float> %a, <4 x float> %b) nounwind  {
-; sse: movaps
 ; sse: shufps
 ; sse2: pshufd
 ; sse2-NEXT: ret
@@ -10,6 +11,8 @@ define <4 x float> @t1(<4 x float> %a, <4 x float> %b) nounwind  {
         ret <4 x float> %tmp1
 }
 
+; sse:  t2:
+; sse2: t2:
 define <4 x float> @t2(<4 x float> %A, <4 x float> %B) nounwind {
 ; sse: shufps
 ; sse2: pshufd
@@ -18,8 +21,9 @@ define <4 x float> @t2(<4 x float> %A, <4 x float> %B) nounwind {
 	ret <4 x float> %tmp
 }
 
+; sse:  t3:
+; sse2: t3:
 define <4 x float> @t3(<4 x float> %A, <4 x float> %B) nounwind {
-; sse: movaps
 ; sse: shufps
 ; sse2: pshufd
 ; sse2-NEXT: ret
@@ -27,7 +31,10 @@ define <4 x float> @t3(<4 x float> %A, <4 x float> %B) nounwind {
 	ret <4 x float> %tmp
 }
 
+; sse:  t4:
+; sse2: t4:
 define <4 x float> @t4(<4 x float> %A, <4 x float> %B) nounwind {
+
 ; sse: shufps
 ; sse2: pshufd
 ; sse2-NEXT: ret
diff --git a/test/CodeGen/X86/vec_uint_to_fp.ll b/test/CodeGen/X86/vec_uint_to_fp.ll
index 39e7d71..fe7fa2f 100644
--- a/test/CodeGen/X86/vec_uint_to_fp.ll
+++ b/test/CodeGen/X86/vec_uint_to_fp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=sandybridge | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=corei7-avx | FileCheck %s
 
 ; Test that we are not lowering uinttofp to scalars
 define <4 x float> @test1(<4 x i32> %A) nounwind {
diff --git a/test/CodeGen/X86/visibility2.ll b/test/CodeGen/X86/visibility2.ll
new file mode 100644
index 0000000..72ea733
--- /dev/null
+++ b/test/CodeGen/X86/visibility2.ll
@@ -0,0 +1,18 @@
+; This test case ensures that when the visibility of a global declaration is 
+; emitted they are not treated as definitions.  Test case for r132825.
+; Fixes <rdar://problem/9429892>.
+;
+; RUN: llc -mtriple=x86_64-apple-darwin %s -o - | FileCheck %s
+
+@foo_private_extern_str = external hidden global i8*
+
+define void @foo1() nounwind ssp {
+entry:
+  %tmp = load i8** @foo_private_extern_str, align 8
+  call void @foo3(i8* %tmp)
+  ret void
+}
+
+declare void @foo3(i8*)
+
+; CHECK-NOT: .private_extern
diff --git a/test/CodeGen/X86/widen_load-0.ll b/test/CodeGen/X86/widen_load-0.ll
index 82c8252..c91627c 100644
--- a/test/CodeGen/X86/widen_load-0.ll
+++ b/test/CodeGen/X86/widen_load-0.ll
@@ -4,15 +4,15 @@
 
 ; Both loads should happen before either store.
 
-; CHECK: movl  (%rdi), %eax
-; CHECK: movl  (%rsi), %ecx
-; CHECK: movl  %ecx, (%rdi)
-; CHECK: movl  %eax, (%rsi)
+; CHECK: movl  (%rdi), %[[R1:...]]
+; CHECK: movl  (%rsi), %[[R2:...]]
+; CHECK: movl  %[[R2]], (%rdi)
+; CHECK: movl  %[[R1]], (%rsi)
 
-; WIN64: movl  (%rcx), %eax
-; WIN64: movl  (%rdx), %esi
-; WIN64: movl  %esi, (%rcx)
-; WIN64: movl  %eax, (%rdx)
+; WIN64: movl  (%rcx), %[[R1:...]]
+; WIN64: movl  (%rdx), %[[R2:...]]
+; WIN64: movl  %[[R2]], (%rcx)
+; WIN64: movl  %[[R1]], (%rdx)
 
 define void @short2_int_swap(<2 x i16>* nocapture %b, i32* nocapture %c) nounwind {
 entry:
diff --git a/test/CodeGen/X86/win64_alloca_dynalloca.ll b/test/CodeGen/X86/win64_alloca_dynalloca.ll
index cbd38da..e39d007 100644
--- a/test/CodeGen/X86/win64_alloca_dynalloca.ll
+++ b/test/CodeGen/X86/win64_alloca_dynalloca.ll
@@ -1,9 +1,12 @@
-; RUN: llc < %s -mtriple=x86_64-mingw32     | FileCheck %s -check-prefix=M64
-; RUN: llc < %s -mtriple=x86_64-win32       | FileCheck %s -check-prefix=W64
-; RUN: llc < %s -mtriple=x86_64-win32-macho | FileCheck %s -check-prefix=EFI
+; RUN: llc < %s -join-physregs -mtriple=x86_64-mingw32     | FileCheck %s -check-prefix=M64
+; RUN: llc < %s -join-physregs -mtriple=x86_64-win32       | FileCheck %s -check-prefix=W64
+; RUN: llc < %s -join-physregs -mtriple=x86_64-win32-macho | FileCheck %s -check-prefix=EFI
 ; PR8777
 ; PR8778
 
+; Passing the same value in two registers creates a false interference that
+; only -join-physregs resolves. It could also be handled by a parallel copy.
+
 define i64 @foo(i64 %n, i64 %x) nounwind {
 entry:
 
@@ -40,9 +43,9 @@ entry:
 ; W64: subq  %rax, %rsp
 ; W64: movq  %rsp, %rax
 
-; EFI: leaq  15(%rcx), [[R1:%r..]]
+; EFI: leaq  15(%rcx), [[R1:%r.*]]
 ; EFI: andq  $-16, [[R1]]
-; EFI: movq  %rsp, [[R64:%r..]]
+; EFI: movq  %rsp, [[R64:%r.*]]
 ; EFI: subq  [[R1]], [[R64]]
 ; EFI: movq  [[R64]], %rsp
 
diff --git a/test/CodeGen/X86/x86-64-and-mask.ll b/test/CodeGen/X86/x86-64-and-mask.ll
index 2465f23..07ccb23 100644
--- a/test/CodeGen/X86/x86-64-and-mask.ll
+++ b/test/CodeGen/X86/x86-64-and-mask.ll
@@ -39,7 +39,7 @@ define void @ccc(i64 %x) nounwind {
 
 ; This requires a mov and a 64-bit and.
 ; CHECK: ddd:
-; CHECK: movabsq $4294967296, %rax
+; CHECK: movabsq $4294967296, %r
 ; CHECK: andq %rax, %rdi
 
 define void @ddd(i64 %x) nounwind {
diff --git a/test/CodeGen/X86/x86-64-extend-shift.ll b/test/CodeGen/X86/x86-64-extend-shift.ll
index 6852785..6ebaeee 100644
--- a/test/CodeGen/X86/x86-64-extend-shift.ll
+++ b/test/CodeGen/X86/x86-64-extend-shift.ll
@@ -2,7 +2,7 @@
 ; Formerly there were two shifts.
 
 define i64 @baz(i32 %A) nounwind {
-; CHECK:  shlq  $49, %rax
+; CHECK:  shlq  $49, %r
         %tmp1 = shl i32 %A, 17
         %tmp2 = zext i32 %tmp1 to i64
         %tmp3 = shl i64 %tmp2, 32
diff --git a/test/CodeGen/X86/x86-shifts.ll b/test/CodeGen/X86/x86-shifts.ll
new file mode 100644
index 0000000..fdf68f9
--- /dev/null
+++ b/test/CodeGen/X86/x86-shifts.ll
@@ -0,0 +1,142 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
+
+; Splat patterns below
+
+
+define <4 x i32> @shl4(<4 x i32> %A) nounwind {
+entry:
+; CHECK:      shl4
+; CHECK:      pslld
+; CHECK-NEXT: pslld
+  %B = shl <4 x i32> %A,  < i32 2, i32 2, i32 2, i32 2>
+  %C = shl <4 x i32> %A,  < i32 1, i32 1, i32 1, i32 1>
+  %K = xor <4 x i32> %B, %C
+  ret <4 x i32> %K
+}
+
+define <4 x i32> @shr4(<4 x i32> %A) nounwind {
+entry:
+; CHECK:      shr4
+; CHECK:      psrld
+; CHECK-NEXT: psrld
+  %B = lshr <4 x i32> %A,  < i32 2, i32 2, i32 2, i32 2>
+  %C = lshr <4 x i32> %A,  < i32 1, i32 1, i32 1, i32 1>
+  %K = xor <4 x i32> %B, %C
+  ret <4 x i32> %K
+}
+
+define <4 x i32> @sra4(<4 x i32> %A) nounwind {
+entry:
+; CHECK:      sra4
+; CHECK:      psrad
+; CHECK-NEXT: psrad
+  %B = ashr <4 x i32> %A,  < i32 2, i32 2, i32 2, i32 2>
+  %C = ashr <4 x i32> %A,  < i32 1, i32 1, i32 1, i32 1>
+  %K = xor <4 x i32> %B, %C
+  ret <4 x i32> %K
+}
+
+define <2 x i64> @shl2(<2 x i64> %A) nounwind {
+entry:
+; CHECK:      shl2
+; CHECK:      psllq
+; CHECK-NEXT: psllq
+  %B = shl <2 x i64> %A,  < i64 2, i64 2>
+  %C = shl <2 x i64> %A,  < i64 9, i64 9>
+  %K = xor <2 x i64> %B, %C
+  ret <2 x i64> %K
+}
+
+define <2 x i64> @shr2(<2 x i64> %A) nounwind {
+entry:
+; CHECK:      shr2
+; CHECK:      psrlq
+; CHECK-NEXT: psrlq
+  %B = lshr <2 x i64> %A,  < i64 8, i64 8>
+  %C = lshr <2 x i64> %A,  < i64 1, i64 1>
+  %K = xor <2 x i64> %B, %C
+  ret <2 x i64> %K
+}
+
+
+define <8 x i16> @shl8(<8 x i16> %A) nounwind {
+entry:
+; CHECK:      shl8
+; CHECK:      psllw
+; CHECK-NEXT: psllw
+  %B = shl <8 x i16> %A,  < i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+  %C = shl <8 x i16> %A,  < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %K = xor <8 x i16> %B, %C
+  ret <8 x i16> %K
+}
+
+define <8 x i16> @shr8(<8 x i16> %A) nounwind {
+entry:
+; CHECK:      shr8
+; CHECK:      psrlw
+; CHECK-NEXT: psrlw
+  %B = lshr <8 x i16> %A,  < i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+  %C = lshr <8 x i16> %A,  < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %K = xor <8 x i16> %B, %C
+  ret <8 x i16> %K
+}
+
+define <8 x i16> @sra8(<8 x i16> %A) nounwind {
+entry:
+; CHECK:      sra8
+; CHECK:      psraw
+; CHECK-NEXT: psraw
+  %B = ashr <8 x i16> %A,  < i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+  %C = ashr <8 x i16> %A,  < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %K = xor <8 x i16> %B, %C
+  ret <8 x i16> %K
+}
+
+; non splat test
+
+
+define <8 x i16> @sll8_nosplat(<8 x i16> %A) nounwind {
+entry:
+; CHECK: sll8_nosplat
+; CHECK-NOT: psll
+; CHECK-NOT: psll
+  %B = shl <8 x i16> %A,  < i16 1, i16 2, i16 3, i16 6, i16 2, i16 2, i16 2, i16 2>
+  %C = shl <8 x i16> %A,  < i16 9, i16 7, i16 5, i16 1, i16 4, i16 1, i16 1, i16 1>
+  %K = xor <8 x i16> %B, %C
+  ret <8 x i16> %K
+}
+
+
+define <2 x i64> @shr2_nosplat(<2 x i64> %A) nounwind {
+entry:
+; CHECK: shr2_nosplat
+; CHECK-NOT:  psrlq
+; CHECK-NOT:  psrlq
+  %B = lshr <2 x i64> %A,  < i64 8, i64 1>
+  %C = lshr <2 x i64> %A,  < i64 1, i64 0>
+  %K = xor <2 x i64> %B, %C
+  ret <2 x i64> %K
+}
+
+
+; Other shifts
+
+define <2 x i32> @shl2_other(<2 x i32> %A) nounwind {
+entry:
+; CHECK: shl2_other
+; CHECK-not:      psllq
+  %B = shl <2 x i32> %A,  < i32 2, i32 2>
+  %C = shl <2 x i32> %A,  < i32 9, i32 9>
+  %K = xor <2 x i32> %B, %C
+  ret <2 x i32> %K
+}
+
+define <2 x i32> @shr2_other(<2 x i32> %A) nounwind {
+entry:
+; CHECK: shr2_other
+; CHECK-NOT:      psrlq
+  %B = lshr <2 x i32> %A,  < i32 8, i32 8>
+  %C = lshr <2 x i32> %A,  < i32 1, i32 1>
+  %K = xor <2 x i32> %B, %C
+  ret <2 x i32> %K
+}
diff --git a/test/CodeGen/X86/xor.ll b/test/CodeGen/X86/xor.ll
index b90d81a..178c59d 100644
--- a/test/CodeGen/X86/xor.ll
+++ b/test/CodeGen/X86/xor.ll
@@ -29,9 +29,8 @@ entry:
         ret i32 %tmp4
         
 ; X64: test3:
-; X64:	notl	[[A1:%esi|%edx]]
-; X64:	andl	[[A0:%edi|%ecx]], [[A1]]
-; X64:	movl	[[A1]], %eax
+; X64:	notl
+; X64:	andl
 ; X64:	shrl	%eax
 ; X64:	ret
 
@@ -139,7 +138,7 @@ entry:
   %t2 = add i32 %t1, -1
   ret i32 %t2
 ; X64: test8:
-; X64:   notl %eax
+; X64:   notl {{%eax|%edi|%ecx}}
 ; X32: test8:
 ; X32:   notl %eax
 }
diff --git a/test/CodeGen/XCore/bitrev.ll b/test/CodeGen/XCore/bitrev.ll
deleted file mode 100644
index 09202d3..0000000
--- a/test/CodeGen/XCore/bitrev.ll
+++ /dev/null
@@ -1,8 +0,0 @@
-; RUN: llc < %s -march=xcore > %t1.s
-; RUN: grep bitrev %t1.s | count 1 
-declare i32 @llvm.xcore.bitrev(i32)
-
-define i32 @test(i32 %val) {
-	%result = call i32 @llvm.xcore.bitrev(i32 %val)
-	ret i32 %result
-}
diff --git a/test/CodeGen/XCore/misc-intrinsics.ll b/test/CodeGen/XCore/misc-intrinsics.ll
new file mode 100644
index 0000000..f504a2e
--- /dev/null
+++ b/test/CodeGen/XCore/misc-intrinsics.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -march=xcore | FileCheck %s
+%0 = type { i32, i32 }
+
+declare i32 @llvm.xcore.bitrev(i32)
+declare i32 @llvm.xcore.crc32(i32, i32, i32)
+declare %0 @llvm.xcore.crc8(i32, i32, i32)
+
+define i32 @bitrev(i32 %val) {
+; CHECK: bitrev:
+; CHECK: bitrev r0, r0
+	%result = call i32 @llvm.xcore.bitrev(i32 %val)
+	ret i32 %result
+}
+
+define i32 @crc32(i32 %crc, i32 %data, i32 %poly) {
+; CHECK: crc32:
+; CHECK: crc32 r0, r1, r2
+	%result = call i32 @llvm.xcore.crc32(i32 %crc, i32 %data, i32 %poly)
+	ret i32 %result
+}
+
+define %0 @crc8(i32 %crc, i32 %data, i32 %poly) {
+; CHECK: crc8:
+; CHECK: crc8 r0, r1, r1, r2
+	%result = call %0 @llvm.xcore.crc8(i32 %crc, i32 %data, i32 %poly)
+	ret %0 %result
+}
diff --git a/test/CodeGen/XCore/mul64.ll b/test/CodeGen/XCore/mul64.ll
index 77c6b42..3d373b1 100644
--- a/test/CodeGen/XCore/mul64.ll
+++ b/test/CodeGen/XCore/mul64.ll
@@ -9,7 +9,7 @@ entry:
 }
 ; CHECK: umul_lohi:
 ; CHECK: ldc [[REG:r[0-9]+]], 0
-; CHECK-NEXT: lmul r1, r0, r1, r0, [[REG]], [[REG]]
+; CHECK-NEXT: lmul {{.*}}, [[REG]], [[REG]]
 ; CHECK-NEXT: retsp 0
 
 define i64 @smul_lohi(i32 %a, i32 %b) {
@@ -23,9 +23,7 @@ entry:
 ; CHECK: ldc
 ; CHECK-NEXT: mov
 ; CHECK-NEXT: maccs
-; CHECK-NEXT: mov r0,
-; CHECK-NEXT: mov r1,
-; CHECK-NEXT: retsp 0
+; CHECK: retsp 0
 
 define i64 @mul64(i64 %a, i64 %b) {
 entry:
@@ -37,7 +35,6 @@ entry:
 ; CHECK-NEXT: lmul
 ; CHECK-NEXT: mul
 ; CHECK-NEXT: lmul
-; CHECK-NEXT: mov r0,
 
 define i64 @mul64_2(i64 %a, i32 %b) {
 entry:
@@ -50,4 +47,4 @@ entry:
 ; CHECK-NEXT: lmul
 ; CHECK-NEXT: mul
 ; CHECK-NEXT: add r1,
-; CHECK-NEXT: retsp 0
+; CHECK: retsp 0
diff --git a/test/DebugInfo/X86/debug_frame.ll b/test/DebugInfo/X86/debug_frame.ll
new file mode 100644
index 0000000..d273d73
--- /dev/null
+++ b/test/DebugInfo/X86/debug_frame.ll
@@ -0,0 +1,18 @@
+; RUN: llc %s -mtriple=i686-pc-linux-gnu -o - | FileCheck %s
+
+; Test that we produce a .debug_frame, not an .eh_frame
+
+; CHECK: .cfi_sections .debug_frame
+
+define void @f() nounwind {
+entry:
+  ret void
+}
+
+!llvm.dbg.sp = !{!0}
+
+!0 = metadata !{i32 589870, i32 0, metadata !1, metadata !"f", metadata !"f", metadata !"", metadata !1, i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, void ()* @f, null, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 589865, metadata !"/home/espindola/llvm/test.c", metadata !"/home/espindola/llvm/build", metadata !2} ; [ DW_TAG_file_type ]
+!2 = metadata !{i32 589841, i32 0, i32 12, metadata !"/home/espindola/llvm/test.c", metadata !"/home/espindola/llvm/build", metadata !"clang version 3.0 ()", i1 true, i1 true, metadata !"", i32 0} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{i32 589845, metadata !1, metadata !"", metadata !1, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{null}
diff --git a/test/DebugInfo/X86/dg.exp b/test/DebugInfo/X86/dg.exp
new file mode 100644
index 0000000..7b7bd4e
--- /dev/null
+++ b/test/DebugInfo/X86/dg.exp
@@ -0,0 +1,5 @@
+load_lib llvm.exp
+
+if { [llvm_supports_target X86] } {
+  RunLLVMTests [lsort [glob -nocomplain $srcdir/$subdir/*.{ll}]]
+}
diff --git a/test/DebugInfo/X86/eh_symbol.ll b/test/DebugInfo/X86/eh_symbol.ll
new file mode 100644
index 0000000..a87afed
--- /dev/null
+++ b/test/DebugInfo/X86/eh_symbol.ll
@@ -0,0 +1,18 @@
+; RUN: llc -mtriple=i386-apple-macosx -disable-cfi %s -o - | FileCheck %s
+
+; test that we don't produce foo.eh symbols is a debug_frame section.
+; CHECK-NOT: .globl	_f.eh
+
+define i32 @f() nounwind readnone optsize {
+entry:
+  ret i32 42
+}
+
+!llvm.dbg.sp = !{!0}
+
+!0 = metadata !{i32 589870, i32 0, metadata !1, metadata !"f", metadata !"f", metadata !"", metadata !1, i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 ()* @f, null, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 589865, metadata !"/home/espindola/llvm/test.c", metadata !"/home/espindola/tmpfs/build", metadata !2} ; [ DW_TAG_file_type ]
+!2 = metadata !{i32 589841, i32 0, i32 12, metadata !"/home/espindola/llvm/test.c", metadata !"/home/espindola/tmpfs/build", metadata !"clang version 3.0 ()", i1 true, i1 true, metadata !"", i32 0} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{i32 589845, metadata !1, metadata !"", metadata !1, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 589860, metadata !2, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
diff --git a/test/DebugInfo/X86/stmt-list.ll b/test/DebugInfo/X86/stmt-list.ll
new file mode 100644
index 0000000..145649b
--- /dev/null
+++ b/test/DebugInfo/X86/stmt-list.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mtriple x86_64-pc-linux-gnu < %s | FileCheck %s
+
+; CHECK:      .section        .debug_line,"",@progbits
+; CHECK-NEXT: .Lsection_line:
+
+; CHECK:      .long   .Lsection_line          # DW_AT_stmt_list
+
+define void @f() {
+entry:
+  ret void
+}
+
+!llvm.dbg.sp = !{!0}
+
+!0 = metadata !{i32 589870, i32 0, metadata !1, metadata !"f", metadata !"f", metadata !"", metadata !1, i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, void ()* @f, null, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 589865, metadata !"test2.c", metadata !"/home/espindola/llvm", metadata !2} ; [ DW_TAG_file_type ]
+!2 = metadata !{i32 589841, i32 0, i32 12, metadata !"test2.c", metadata !"/home/espindola/llvm", metadata !"clang version 3.0 ()", i1 true, i1 true, metadata !"", i32 0} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{i32 589845, metadata !1, metadata !"", metadata !1, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{null}
diff --git a/test/DebugInfo/X86/subreg.ll b/test/DebugInfo/X86/subreg.ll
new file mode 100644
index 0000000..1c4456f
--- /dev/null
+++ b/test/DebugInfo/X86/subreg.ll
@@ -0,0 +1,27 @@
+; RUN: llc %s -mtriple=x86_64-pc-linux-gnu -O0 -o - | FileCheck %s
+
+; We are testing that a value in a 16 bit register gets reported as
+; being in its superregister.
+; FIXME: There should be a DW_OP_bit_piece too.
+
+; CHECK: .byte   80                      # DW_OP_reg0
+
+define i16 @f(i16 signext %zzz) nounwind {
+entry:
+  call void @llvm.dbg.value(metadata !{i16 %zzz}, i64 0, metadata !0)
+  %conv = sext i16 %zzz to i32, !dbg !7
+  %conv1 = trunc i32 %conv to i16
+  ret i16 %conv1
+}
+
+declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+
+!0 = metadata !{i32 590081, metadata !1, metadata !"zzz", metadata !2, i32 16777219, metadata !6, i32 0} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{i32 589870, i32 0, metadata !2, metadata !"f", metadata !"f", metadata !"", metadata !2, i32 3, metadata !4, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, i16 (i16)* @f, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{i32 589865, metadata !"/home/espindola/llvm/test.c", metadata !"/home/espindola/tmpfs/build", metadata !3} ; [ DW_TAG_file_type ]
+!3 = metadata !{i32 589841, i32 0, i32 12, metadata !"/home/espindola/llvm/test.c", metadata !"/home/espindola/tmpfs/build", metadata !"clang version 3.0 ()", i1 true, i1 false, metadata !"", i32 0} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{i32 589845, metadata !2, metadata !"", metadata !2, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !5, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!5 = metadata !{null}
+!6 = metadata !{i32 589860, metadata !3, metadata !"short", null, i32 0, i64 16, i64 16, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!7 = metadata !{i32 4, i32 22, metadata !8, null}
+!8 = metadata !{i32 589835, metadata !1, i32 3, i32 19, metadata !2, i32 0} ; [ DW_TAG_lexical_block ]
diff --git a/test/DebugInfo/pr9951.ll b/test/DebugInfo/pr9951.ll
new file mode 100644
index 0000000..7716cd7
--- /dev/null
+++ b/test/DebugInfo/pr9951.ll
@@ -0,0 +1,23 @@
+; RUN: llc -mtriple x86_64-apple-darwin10.0.0 -disable-cfi %s -o - | FileCheck %s
+
+define i32 @f() nounwind {
+entry:
+  ret i32 42
+}
+
+!llvm.dbg.sp = !{!0}
+
+!0 = metadata !{i32 589870, i32 0, metadata !1, metadata !"f", metadata !"f", metadata !"", metadata !1, i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, i32 ()* @f, null, null} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 589865, metadata !"/home/espindola/llvm/test.c", metadata !"/home/espindola/llvm/build-rust2", metadata !2} ; [ DW_TAG_file_type ]
+!2 = metadata !{i32 589841, i32 0, i32 12, metadata !"/home/espindola/llvm/test.c", metadata !"/home/espindola/llvm/build-rust2", metadata !"clang version 3.0 ()", i1 true, i1 false, metadata !"", i32 0} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{i32 589845, metadata !1, metadata !"", metadata !1, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 589860, metadata !2, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+
+
+; CHECK:      _f:                                     ## @f
+; CHECK-NEXT: Ltmp0:
+
+; CHECK:      Ltmp9 = (Ltmp3-Ltmp2)-0
+; CHECK-NEXT:	.long	Ltmp9
+; CHECK-NEXT:	.quad	Ltmp0
diff --git a/test/FrontendC++/2006-11-06-StackTrace.cpp b/test/FrontendC++/2006-11-06-StackTrace.cpp
index b79c0bf..2813c36 100644
--- a/test/FrontendC++/2006-11-06-StackTrace.cpp
+++ b/test/FrontendC++/2006-11-06-StackTrace.cpp
@@ -1,7 +1,7 @@
 // This is a regression test on debug info to make sure that we can get a
 // meaningful stack trace from a C++ program.
 // RUN: %llvmgcc -S -O0 -g %s -o - | \
-// RUN:    llc --disable-fp-elim -o %t.s -O0 -relocation-model=pic
+// RUN:    llc --disable-cfi --disable-fp-elim -o %t.s -O0 -relocation-model=pic
 // RUN: %compile_c %t.s -o %t.o
 // RUN: %link %t.o -o %t.exe
 // RUN: echo {break DeepStack::deepest\nrun 17\nwhere\n} > %t.in 
diff --git a/test/FrontendC++/2006-11-30-Pubnames.cpp b/test/FrontendC++/2006-11-30-Pubnames.cpp
index 239d3f5..fc7beeb 100644
--- a/test/FrontendC++/2006-11-30-Pubnames.cpp
+++ b/test/FrontendC++/2006-11-30-Pubnames.cpp
@@ -1,7 +1,7 @@
 // This is a regression test on debug info to make sure that we can access 
 // qualified global names.
 // RUN: %llvmgcc -S -O0 -g %s -o - | \
-// RUN:   llc --disable-fp-elim -o %t.s -O0
+// RUN:   llc -disable-cfi --disable-fp-elim -o %t.s -O0
 // RUN: %compile_c %t.s -o %t.o
 // RUN: %link %t.o -o %t.exe
 // RUN: %llvmdsymutil %t.exe 
diff --git a/test/FrontendC++/2009-04-21-DtorNames-dbg.cpp b/test/FrontendC++/2009-04-21-DtorNames-dbg.cpp
index e3616da..da09c0b 100644
--- a/test/FrontendC++/2009-04-21-DtorNames-dbg.cpp
+++ b/test/FrontendC++/2009-04-21-DtorNames-dbg.cpp
@@ -1,4 +1,4 @@
-// RUN: %llvmgcc -S -g %s -o - | llc -O0 -o %t.s
+// RUN: %llvmgcc -S -g %s -o - | llc --disable-cfi -O0 -o %t.s
 // RUN: %compile_c %t.s -o %t.o
 // PR4025
 
diff --git a/test/FrontendC++/2010-08-31-ByValArg.cpp b/test/FrontendC++/2010-08-31-ByValArg.cpp
index be0d354..4ccaabd 100644
--- a/test/FrontendC++/2010-08-31-ByValArg.cpp
+++ b/test/FrontendC++/2010-08-31-ByValArg.cpp
@@ -1,7 +1,7 @@
 // This regression test checks byval arguments' debug info.
 // Radar 8367011
 // RUN: %llvmgcc -S -O0 -g %s -o - | \
-// RUN:    llc --disable-fp-elim -o %t.s -O0 -relocation-model=pic
+// RUN:    llc --disable-cfi --disable-fp-elim -o %t.s -O0 -relocation-model=pic
 // RUN: %compile_c %t.s -o %t.o
 // RUN: %link %t.o -o %t.exe
 // RUN: echo {break get\nrun\np missing_arg.b} > %t.in 
diff --git a/test/FrontendC/2008-07-29-EHLabel.ll b/test/FrontendC/2008-07-29-EHLabel.ll
index 7577bc9..186eafa 100644
--- a/test/FrontendC/2008-07-29-EHLabel.ll
+++ b/test/FrontendC/2008-07-29-EHLabel.ll
@@ -1,4 +1,4 @@
-; RUN: llc %s -o - | %llvmgcc -xassembler -c -o /dev/null -
+; RUN: llc -disable-cfi %s -o - | %llvmgcc -xassembler -c -o /dev/null -
 ; PR2609
 	%struct..0._11 = type { i32 }
 	%struct..1__pthread_mutex_s = type { i32, i32, i32, i32, i32, %struct..0._11 }
diff --git a/test/FrontendC/2009-02-17-BitField-dbg.c b/test/FrontendC/2009-02-17-BitField-dbg.c
index 80ccc4a..88d2cbb 100644
--- a/test/FrontendC/2009-02-17-BitField-dbg.c
+++ b/test/FrontendC/2009-02-17-BitField-dbg.c
@@ -1,6 +1,6 @@
 // Check bitfields.
 // RUN: %llvmgcc -S -O0 -g %s -o - | \
-// RUN: llc --disable-fp-elim -o 2009-02-17-BitField-dbg.s
+// RUN: llc -disable-cfi --disable-fp-elim -o 2009-02-17-BitField-dbg.s
 // RUN: %compile_c 2009-02-17-BitField-dbg.s -o 2009-02-17-BitField-dbg.o
 // RUN: echo {ptype mystruct} > %t2
 // RUN: gdb -q -batch -n -x %t2 2009-02-17-BitField-dbg.o | \
diff --git a/test/FrontendC/2010-01-05-LinkageName.c b/test/FrontendC/2010-01-05-LinkageName.c
index 9c1a215..279df03 100644
--- a/test/FrontendC/2010-01-05-LinkageName.c
+++ b/test/FrontendC/2010-01-05-LinkageName.c
@@ -1,4 +1,4 @@
-// RUN: %llvmgcc -O2 -S -g %s -o - | llc -o 2010-01-05-LinkageName.s -O0 
+// RUN: %llvmgcc -O2 -S -g %s -o - | llc -disable-cfi -o 2010-01-05-LinkageName.s -O0 
 // RUN: %compile_c 2010-01-05-LinkageName.s -o 2010-01-05-LinkageName.s
 
 struct tm {};
diff --git a/test/FrontendC/2010-01-14-StaticVariable.c b/test/FrontendC/2010-01-14-StaticVariable.c
index 80dd4d4..0635900 100644
--- a/test/FrontendC/2010-01-14-StaticVariable.c
+++ b/test/FrontendC/2010-01-14-StaticVariable.c
@@ -1,7 +1,7 @@
 // This is a regression test on debug info to make sure that llvm emitted
 // debug info does not crash gdb.
 // RUN: %llvmgcc -S -O0 -g %s -o - | \
-// RUN:    llc --disable-fp-elim -o %t.s -O0 -relocation-model=pic
+// RUN:    llc -disable-cfi --disable-fp-elim -o %t.s -O0 -relocation-model=pic
 // RUN: %compile_c %t.s -o %t.o
 // RUN: echo {quit\n} > %t.in 
 // RUN: gdb -q -batch -n -x %t.in %t.o > /dev/null
diff --git a/test/FrontendC/2010-02-16-DbgVarScope.c b/test/FrontendC/2010-02-16-DbgVarScope.c
index 1d912d0f..24910ad 100644
--- a/test/FrontendC/2010-02-16-DbgVarScope.c
+++ b/test/FrontendC/2010-02-16-DbgVarScope.c
@@ -1,5 +1,5 @@
 // RUN: %llvmgcc -S -O0 -g %s -o - | \
-// RUN:    llc --disable-fp-elim -o %t.s -O0 -relocation-model=pic
+// RUN:    llc -disable-cfi --disable-fp-elim -o %t.s -O0 -relocation-model=pic
 // RUN: %compile_c %t.s -o %t.o
 // RUN: %link %t.o -o %t.exe
 // RUN: echo {break 24\nrun\np loc\n} > %t.in 
diff --git a/test/FrontendC/2010-05-18-asmsched.c b/test/FrontendC/2010-05-18-asmsched.c
index 33b8770..ca7625f 100644
--- a/test/FrontendC/2010-05-18-asmsched.c
+++ b/test/FrontendC/2010-05-18-asmsched.c
@@ -3,8 +3,9 @@
 
 void foo(int x, int y) {
 // CHECK: bar
-// CHECK: movq  %r9, %r10
-// CHECK: movq  %rdi, %r9
+// CHECK-NOT: {{, %r9$}}
+// CHECK: movq  %r9,
+// CHECK: movq  {{.*}}, %r9
 // CHECK: bar
   register int lr9 asm("r9") = x;
   register int lr10 asm("r10") = y;
diff --git a/test/FrontendC/ARM/dg.exp b/test/FrontendC/ARM/dg.exp
new file mode 100644
index 0000000..df7d49e
--- /dev/null
+++ b/test/FrontendC/ARM/dg.exp
@@ -0,0 +1,5 @@
+load_lib llvm.exp
+
+if { [llvm_supports_target ARM] && [llvm_gcc_supports c] } {
+    RunLLVMTests [lsort [glob -nocomplain $srcdir/$subdir/*.{ll,c,cpp,s}]]
+}
diff --git a/test/FrontendC/ARM/inline-asm-multichar.c b/test/FrontendC/ARM/inline-asm-multichar.c
new file mode 100644
index 0000000..7e2eeef
--- /dev/null
+++ b/test/FrontendC/ARM/inline-asm-multichar.c
@@ -0,0 +1,11 @@
+// RUN: %llvmgcc -S -march=armv7a %s 
+
+// XFAIL: *
+// XTARGET: arm
+
+int t1() {
+  static float k = 1.0f;
+CHECK: call void asm sideeffect "flds s15, $0 \0A", "*^Uv,~{s15}"
+  __asm__ volatile ("flds s15, %[k] \n" :: [k] "Uv,m" (k) : "s15");
+  return 0;
+}
diff --git a/test/FrontendC/pr4349.c b/test/FrontendC/pr4349.c
index 24acd9c..49c89e2 100644
--- a/test/FrontendC/pr4349.c
+++ b/test/FrontendC/pr4349.c
@@ -16,22 +16,22 @@ struct svar
 {
     void *ptr;
 };
-// CHECK: @svars1 = unnamed_addr global [1 x %struct.svar] [%struct.svar { i8* bitcast (%struct.cpu* @cpu to i8*) }]
+// CHECK: @svars1 = global [1 x %struct.svar] [%struct.svar { i8* bitcast (%struct.cpu* @cpu to i8*) }]
 struct svar svars1[] =
 {
     { &((cpu.pc).w[0]) }
 };
-// CHECK: @svars2 = unnamed_addr global [1 x %struct.svar] [%struct.svar { i8* getelementptr ([2 x i8]* bitcast (%struct.cpu* @cpu to [2 x i8]*), i{{[0-9]+}} 0, i{{[0-9]+}} 1) }]
+// CHECK: @svars2 = global [1 x %struct.svar] [%struct.svar { i8* getelementptr ([2 x i8]* bitcast (%struct.cpu* @cpu to [2 x i8]*), i{{[0-9]+}} 0, i{{[0-9]+}} 1) }]
 struct svar svars2[] =
 {
     { &((cpu.pc).b[0][1]) }
 };
-// CHECK: @svars3 = unnamed_addr global [1 x %struct.svar] [%struct.svar { i8* bitcast (i16* getelementptr ([2 x i16]* bitcast (%struct.cpu* @cpu to [2 x i16]*), i{{[0-9]+}} 0, i{{[0-9]+}} 1) to i8*) }]
+// CHECK: @svars3 = global [1 x %struct.svar] [%struct.svar { i8* bitcast (i16* getelementptr ([2 x i16]* bitcast (%struct.cpu* @cpu to [2 x i16]*), i{{[0-9]+}} 0, i{{[0-9]+}} 1) to i8*) }]
 struct svar svars3[] =
 {
     { &((cpu.pc).w[1]) }
 };
-// CHECK: @svars4 = unnamed_addr global [1 x %struct.svar] [%struct.svar { i8* getelementptr ([2 x [2 x i8]]* bitcast (%struct.cpu* @cpu to [2 x [2 x i8]]*), i{{[0-9]+}} 0, i{{[0-9]+}} 1, i{{[0-9]+}} 1) }]
+// CHECK: @svars4 = global [1 x %struct.svar] [%struct.svar { i8* getelementptr ([2 x [2 x i8]]* bitcast (%struct.cpu* @cpu to [2 x [2 x i8]]*), i{{[0-9]+}} 0, i{{[0-9]+}} 1, i{{[0-9]+}} 1) }]
 struct svar svars4[] =
 {
     { &((cpu.pc).b[1][1]) }
diff --git a/test/FrontendC/struct-matching-constraint.c b/test/FrontendC/struct-matching-constraint.c
new file mode 100644
index 0000000..d002cdd
--- /dev/null
+++ b/test/FrontendC/struct-matching-constraint.c
@@ -0,0 +1,19 @@
+// RUN: %llvmgcc -S -march=armv7a %s 
+
+// XFAIL: *
+// XTARGET: arm
+
+typedef struct __simd128_uint16_t
+{
+  __neon_uint16x8_t val;
+} uint16x8_t;
+
+void b(uint16x8_t sat, uint16x8_t luma)
+{
+  __asm__("vmov.16 %1, %0   \n\t"
+                                           "vtrn.16 %0, %1   \n\t"
+   :"=w"(luma), "=w"(sat)
+   :"0"(luma)
+   );
+
+}
diff --git a/test/FrontendObjC/2009-08-17-DebugInfo.m b/test/FrontendObjC/2009-08-17-DebugInfo.m
index 8ed7c24..825bbd7 100644
--- a/test/FrontendObjC/2009-08-17-DebugInfo.m
+++ b/test/FrontendObjC/2009-08-17-DebugInfo.m
@@ -1,6 +1,6 @@
 // This is a regression test on debug info to make sure that we can set a
 // breakpoint on a objective message.
-// RUN: %llvmgcc -S -O0 -g %s -o - | llc -o %t.s -O0
+// RUN: %llvmgcc -S -O0 -g %s -o - | llc -disable-cfi -o %t.s -O0
 // RUN: %compile_c %t.s -o %t.o
 // RUN: %link %t.o -o %t.exe -framework Foundation
 // RUN: echo {break randomFunc\n} > %t.in 
diff --git a/test/MC/ARM/arm_instructions.s b/test/MC/ARM/arm_instructions.s
index 50a2b70..f789441 100644
--- a/test/MC/ARM/arm_instructions.s
+++ b/test/MC/ARM/arm_instructions.s
@@ -270,6 +270,9 @@
 @ CHECK: msr  cpsr_fc, r0 @ encoding: [0x00,0xf0,0x29,0xe1]
         msr  cpsr_fc, r0
 
+@ CHECK: msr  cpsr_fc, r0 @ encoding: [0x00,0xf0,0x29,0xe1]
+        msr  cpsr_all, r0
+
 @ CHECK: msr  cpsr_fsx, r0 @ encoding: [0x00,0xf0,0x2e,0xe1]
         msr  cpsr_fsx, r0
 
@@ -309,3 +312,6 @@
 @ CHECK: ldrexd  r0, r1, [r0] @ encoding: [0x9f,0x0f,0xb0,0xe1]
         ldrexd  r0, r1, [r0]
 
+@ CHECK: ssat16  r0, #7, r0 @ encoding: [0x30,0x0f,0xa6,0xe6]
+        ssat16  r0, #7, r0
+
diff --git a/test/MC/ARM/elf-movt.s b/test/MC/ARM/elf-movt.s
index 0fe7c50..18061f5 100644
--- a/test/MC/ARM/elf-movt.s
+++ b/test/MC/ARM/elf-movt.s
@@ -9,10 +9,10 @@
 barf:                                   @ @barf
 @ BB#0:                                 @ %entry
 	movw	r0, :lower16:GOT-(.LPC0_2+8)
-	movt	r0, :upper16:GOT-(.LPC0_2+16)
+	movt	r0, :upper16:GOT-(.LPC0_2+8)
 .LPC0_2:
 @ ASM:          movw    r0, :lower16:(GOT-(.LPC0_2+8))
-@ ASM-NEXT:     movt    r0, :upper16:(GOT-(.LPC0_2+16))
+@ ASM-NEXT:     movt    r0, :upper16:(GOT-(.LPC0_2+8))
 
 @@ make sure that the text section fixups are sane too
 @ OBJ:                 '.text'
@@ -25,7 +25,7 @@ barf:                                   @ @barf
 @ OBJ-NEXT:            'sh_info', 0x00000000
 @ OBJ-NEXT:            'sh_addralign', 0x00000004
 @ OBJ-NEXT:            'sh_entsize', 0x00000000
-@ OBJ-NEXT:            '_section_data', 'f00f0fe3 ff0f4fe3'
+@ OBJ-NEXT:            '_section_data', 'f00f0fe3 f40f4fe3'
 
 @ OBJ:              Relocation 0x00000000
 @ OBJ-NEXT:         'r_offset', 0x00000000
diff --git a/test/MC/ARM/elf-thumbfunc-reloc.ll b/test/MC/ARM/elf-thumbfunc-reloc.ll
new file mode 100644
index 0000000..6fce4038
--- /dev/null
+++ b/test/MC/ARM/elf-thumbfunc-reloc.ll
@@ -0,0 +1,37 @@
+; RUN: llc %s -mtriple=thumbv7-linux-gnueabi -relocation-model=pic \
+; RUN: -filetype=obj -o - | elf-dump --dump-section-data | \
+; RUN: FileCheck %s
+
+; FIXME: This file needs to be in .s form!
+; We wanna test relocatable thumb function call,
+; but ARMAsmParser cannot handle "bl foo(PLT)" yet
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:32-n32"
+target triple = "thumbv7-none--gnueabi"
+
+define void @foo() nounwind {
+entry:
+  ret void
+}
+
+define void @bar() nounwind {
+entry:
+  call void @foo()
+  ret void
+}
+
+
+; make sure that bl 0 <foo> (fff7feff) is correctly encoded
+; CHECK: '_section_data', '70470000 2de90048 fff7feff bde80088'
+
+;  Offset     Info    Type            Sym.Value  Sym. Name
+; 00000008  0000070a R_ARM_THM_CALL    00000001   foo
+; CHECK:           Relocation 0x00000000 
+; CHECK-NEXT:      'r_offset', 0x00000008
+; CHECK-NEXT:      'r_sym', 0x00000007
+; CHECK-NEXT:      'r_type', 0x0000000a
+
+; make sure foo is thumb function: bit 0 = 1
+; CHECK:           Symbol 0x00000007
+; CHECK-NEXT:      'foo'
+; CHECK-NEXT:      'st_value', 0x00000001
diff --git a/test/MC/ARM/elf-thumbfunc.s b/test/MC/ARM/elf-thumbfunc.s
new file mode 100644
index 0000000..a1b3c31
--- /dev/null
+++ b/test/MC/ARM/elf-thumbfunc.s
@@ -0,0 +1,20 @@
+@@ test st_value bit 0 of thumb function
+@ RUN: llvm-mc %s -triple=thumbv7-linux-gnueabi -filetype=obj -o - | \
+@ RUN: elf-dump  | FileCheck %s
+	.syntax unified
+	.text
+	.globl	foo
+	.align	2
+	.type	foo,%function
+	.code	16
+	.thumb_func
+foo:
+	bx	lr
+
+@@ make sure foo is thumb function: bit 0 = 1 (st_value)
+@CHECK:           Symbol 0x00000004
+@CHECK-NEXT:      'st_name', 0x00000001
+@CHECK-NEXT:      'st_value', 0x00000001
+@CHECK-NEXT:      'st_size', 0x00000000
+@CHECK-NEXT:      'st_bind', 0x00000001
+@CHECK-NEXT:      'st_type', 0x00000002
diff --git a/test/MC/ARM/simple-encoding.ll b/test/MC/ARM/simple-encoding.ll
index 7b581b3..3322803 100644
--- a/test/MC/ARM/simple-encoding.ll
+++ b/test/MC/ARM/simple-encoding.ll
@@ -1,4 +1,4 @@
-;RUN: llc -mtriple=armv7-apple-darwin -show-mc-encoding -disable-cgp-branch-opts < %s | FileCheck %s
+;RUN: llc -mtriple=armv7-apple-darwin -show-mc-encoding -disable-cgp-branch-opts -join-physregs < %s | FileCheck %s
 
 
 ;FIXME: Once the ARM integrated assembler is up and going, these sorts of tests
diff --git a/test/MC/ARM/thumb.s b/test/MC/ARM/thumb.s
index 342a390..55d9789 100644
--- a/test/MC/ARM/thumb.s
+++ b/test/MC/ARM/thumb.s
@@ -12,6 +12,8 @@
 
 @ CHECK: blx	r9                   @ encoding: [0xc8,0x47]
 	blx	r9
+@ CHECK: blx	r10                     @ encoding: [0xd0,0x47]
+  blx r10
 
 @ CHECK: rev	r2, r3               @ encoding: [0x1a,0xba]
 @ CHECK: rev16	r3, r4               @ encoding: [0x63,0xba]
@@ -68,3 +70,6 @@
 
 @ CHECK: cpsie aif @ encoding: [0x67,0xb6]
         cpsie aif
+
+@ CHECK: mov  r0, pc @ encoding: [0x78,0x46]
+        mov  r0, pc
diff --git a/test/MC/ARM/thumb2.s b/test/MC/ARM/thumb2.s
index 5342b90..4e9d4e1 100644
--- a/test/MC/ARM/thumb2.s
+++ b/test/MC/ARM/thumb2.s
@@ -300,3 +300,5 @@
   ldrex  r0, [r0]
 @ CHECK: ldrexd  r0, r1, [r0] @ encoding: [0xd0,0xe8,0x7f,0x01]
   ldrexd  r0, r1, [r0]
+@ CHECK: ssat16  r0, #7, r0 @ encoding: [0x20,0xf3,0x06,0x00]
+  ssat16  r0, #7, r0
diff --git a/test/MC/ARM/xscale-attributes.ll b/test/MC/ARM/xscale-attributes.ll
new file mode 100644
index 0000000..e576278
--- /dev/null
+++ b/test/MC/ARM/xscale-attributes.ll
@@ -0,0 +1,31 @@
+; RUN: llc %s -mtriple=thumbv5-linux-gnueabi -mcpu=xscale -o - | \
+; RUN: FileCheck -check-prefix=ASM %s
+
+; RUN: llc %s -mtriple=thumbv5-linux-gnueabi -filetype=obj \
+; RUN: -mcpu=xscale -o - | elf-dump --dump-section-data | \
+; RUN: FileCheck -check-prefix=OBJ %s
+
+; FIXME: The OBJ test should be a .s to .o test and the ASM test should
+; be moved to test/CodeGen/ARM.
+
+define void @foo() nounwind {
+entry:
+  ret void
+}
+
+; ASM:           .eabi_attribute 6, 5
+; ASM-NEXT:      .eabi_attribute 8, 1
+; ASM-NEXT:      .eabi_attribute 9, 1
+
+; OBJ:           Section 0x00000004
+; OBJ-NEXT:      'sh_name', 0x0000000c
+; OBJ-NEXT:      'sh_type', 0x70000003
+; OBJ-NEXT:	   'sh_flags', 0x00000000
+; OBJ-NEXT:	   'sh_addr', 0x00000000
+; OBJ-NEXT:	   'sh_offset', 0x00000038
+; OBJ-NEXT:	   'sh_size', 0x00000020
+; OBJ-NEXT:	   'sh_link', 0x00000000
+; OBJ-NEXT:	   'sh_info', 0x00000000
+; OBJ-NEXT:	   'sh_addralign', 0x00000001
+; OBJ-NEXT:	   'sh_entsize', 0x00000000
+; OBJ-NEXT:      '_section_data', '411f0000 00616561 62690001 15000000 06050801 09011401 15011703 18011901'
diff --git a/test/MC/AsmParser/directive_seh.s b/test/MC/AsmParser/directive_seh.s
new file mode 100644
index 0000000..98fc606
--- /dev/null
+++ b/test/MC/AsmParser/directive_seh.s
@@ -0,0 +1,48 @@
+# RUN: llvm-mc -triple x86_64-pc-win32 %s | FileCheck %s
+
+# CHECK: .seh_proc func
+# CHECK: .seh_pushframe @code
+# CHECK: .seh_stackalloc 24
+# CHECK: .seh_savereg 6, 16
+# CHECK: .seh_savexmm 8, 0
+# CHECK: .seh_pushreg 3
+# CHECK: .seh_setframe 3, 0
+# CHECK: .seh_endprologue
+# CHECK: .seh_handler __C_specific_handler, @except
+# CHECK-NOT: .section{{.*}}.xdata
+# CHECK: .seh_handlerdata
+# CHECK: .text
+# CHECK: .seh_startchained
+# CHECK: .seh_endprologue
+# CHECK: .seh_endchained
+# CHECK: .seh_endproc
+
+    .text
+    .globl func
+    .def func; .scl 2; .type 32; .endef
+    .seh_proc func
+func:
+    .seh_pushframe @code
+    subq $24, %rsp
+    .seh_stackalloc 24
+    movq %rsi, 16(%rsp)
+    .seh_savereg %rsi, 16
+    movups %xmm8, (%rsp)
+    .seh_savexmm %xmm8, 0
+    pushq %rbx
+    .seh_pushreg 3
+    mov %rsp, %rbx
+    .seh_setframe 3, 0
+    .seh_endprologue
+    .seh_handler __C_specific_handler, @except
+    .seh_handlerdata
+    .long 0
+    .text
+    .seh_startchained
+    .seh_endprologue
+    .seh_endchained
+    lea (%rbx), %rsp
+    pop %rbx
+    addq $24, %rsp
+    ret
+    .seh_endproc
diff --git a/test/MC/AsmParser/directive_values.s b/test/MC/AsmParser/directive_values.s
index 98259bd..6c79c38 100644
--- a/test/MC/AsmParser/directive_values.s
+++ b/test/MC/AsmParser/directive_values.s
@@ -56,3 +56,10 @@ TEST6:
 # CHECK:        .byte   35
 # CHECK:        .byte   9
 # CHECK:        .byte   10
+
+TEST7:
+        .byte 1, 2, 3, 4
+# CHECK:        .byte   1
+# CHECK-NEXT:   .byte   2
+# CHECK-NEXT:   .byte   3
+# CHECK-NEXT:   .byte   4
diff --git a/test/MC/AsmParser/macro-args.s b/test/MC/AsmParser/macro-args.s
new file mode 100644
index 0000000..808b6eb
--- /dev/null
+++ b/test/MC/AsmParser/macro-args.s
@@ -0,0 +1,10 @@
+// RUN: llvm-mc -triple x86_64-apple-darwin10 %s | FileCheck %s
+
+.macro GET   var,re2g
+    movl   \var@GOTOFF(%ebx),\re2g
+.endm
+
+
+GET    is_sse, %eax
+
+// CHECK: movl	is_sse@GOTOFF(%ebx), %eax
diff --git a/test/MC/COFF/seh-section.s b/test/MC/COFF/seh-section.s
new file mode 100644
index 0000000..802cba5
--- /dev/null
+++ b/test/MC/COFF/seh-section.s
@@ -0,0 +1,37 @@
+// This test ensures that, if the section containing a function has a suffix
+// (e.g. .text$foo), its unwind info section also has a suffix (.xdata$foo).
+// RUN: llvm-mc -filetype=obj -triple x86_64-pc-win32 %s | coff-dump.py | FileCheck %s
+// XFAIL: *
+
+// CHECK:      Name                 = .xdata$foo
+// CHECK-NEXT: VirtualSize
+// CHECK-NEXT: VirtualAddress
+// CHECK-NEXT: SizeOfRawData        = 8
+// CHECK-NEXT: PointerToRawData
+// CHECK-NEXT: PointerToRelocations
+// CHECK-NEXT: PointerToLineNumbers
+// CHECK-NEXT: NumberOfRelocations  = 0
+// CHECK-NEXT: NumberOfLineNumbers  = 0
+// CHECK-NEXT: Charateristics
+// CHECK-NEXT:   IMAGE_SCN_CNT_INITIALIZED_DATA
+// CHECK-NEXT:   IMAGE_SCN_ALIGN_4BYTES
+// CHECK-NEXT:   IMAGE_SCN_MEM_READ
+// CHECK-NEXT:   IMAGE_SCN_MEM_WRITE
+// CHECK-NEXT: SectionData
+// CHECK-NEXT:   01 05 02 00 05 50 04 02
+
+    .section .text$foo,"x"
+    .globl foo
+    .def foo; .scl 2; .type 32; .endef
+    .seh_proc foo
+foo:
+    subq $8, %rsp
+    .seh_stackalloc 8
+    pushq %rbp
+    .seh_pushreg %rbp
+    .seh_endprologue
+    popq %rbp
+    addq $8, %rsp
+    ret
+    .seh_endproc
+
diff --git a/test/MC/COFF/seh.s b/test/MC/COFF/seh.s
new file mode 100644
index 0000000..8cafcb3
--- /dev/null
+++ b/test/MC/COFF/seh.s
@@ -0,0 +1,60 @@
+// This test checks that the SEH directives emit the correct unwind data.
+// RUN: llvm-mc -triple x86_64-pc-win32 -filetype=obj %s | coff-dump.py | FileCheck %s
+
+// CHECK:      Name                 = .xdata
+// CHECK-NEXT: VirtualSize
+// CHECK-NEXT: VirtualAddress
+// CHECK-NEXT: SizeOfRawData        = 52
+// CHECK-NEXT: PointerToRawData
+// CHECK-NEXT: PointerToRelocations
+// CHECK-NEXT: PointerToLineNumbers
+// CHECK-NEXT: NumberOfRelocations  = 4
+// CHECK-NEXT: NumberOfLineNumbers  = 0
+// CHECK-NEXT: Charateristics
+// CHECK-NEXT:   IMAGE_SCN_CNT_INITIALIZED_DATA
+// CHECK-NEXT:   IMAGE_SCN_ALIGN_4BYTES
+// CHECK-NEXT:   IMAGE_SCN_MEM_READ
+// CHECK-NEXT:   IMAGE_SCN_MEM_WRITE
+// CHECK-NEXT: SectionData
+// CHECK-NEXT:   09 12 08 03 00 03 0F 30 - 0E 88 00 00 09 64 02 00
+// CHECK-NEXT:   04 22 00 1A 00 00 00 00 - 00 00 00 00 21 00 00 00
+// CHECK-NEXT:   00 00 00 00 1B 00 00 00 - 00 00 00 00 01 00 00 00
+// CHECK-NEXT:   00 00 00 00
+
+    .text
+    .globl func
+    .def func; .scl 2; .type 32; .endef
+    .seh_proc func
+func:
+    .seh_pushframe @code
+    subq $24, %rsp
+    .seh_stackalloc 24
+    movq %rsi, 16(%rsp)
+    .seh_savereg %rsi, 16
+    movups %xmm8, (%rsp)
+    .seh_savexmm %xmm8, 0
+    pushq %rbx
+    .seh_pushreg 3
+    mov %rsp, %rbx
+    .seh_setframe 3, 0
+    .seh_endprologue
+    .seh_handler __C_specific_handler, @except
+    .seh_handlerdata
+    .long 0
+    .text
+    .seh_startchained
+    .seh_endprologue
+    .seh_endchained
+    lea (%rbx), %rsp
+    pop %rbx
+    addq $24, %rsp
+    ret
+    .seh_endproc
+
+// Test emission of small functions.
+    .globl smallFunc
+    .def smallFunc; .scl 2; .type 32; .endef
+    .seh_proc smallFunc
+smallFunc:
+    ret
+    .seh_endproc
diff --git a/test/MC/Disassembler/ARM/arm-tests.txt b/test/MC/Disassembler/ARM/arm-tests.txt
index ade2952..ca072c7 100644
--- a/test/MC/Disassembler/ARM/arm-tests.txt
+++ b/test/MC/Disassembler/ARM/arm-tests.txt
@@ -164,6 +164,9 @@
 # CHECK: bx r12
 0x1c 0xff 0x2f 0xe1
 
+# CHECK: bxeq r5
+0x15 0xff 0x2f 0x01
+
 # CHECK:	uqadd16mi	r6, r11, r8
 0x18 0x60 0x6b 0x46
 
diff --git a/test/MC/Disassembler/ARM/thumb-tests.txt b/test/MC/Disassembler/ARM/thumb-tests.txt
index 774dbe4..895a5bb 100644
--- a/test/MC/Disassembler/ARM/thumb-tests.txt
+++ b/test/MC/Disassembler/ARM/thumb-tests.txt
@@ -262,3 +262,6 @@
 
 # CHECK:	nop.w
 0xaf 0xf3 0x00 0x80
+
+# CHECK:	bne	#24
+0x0c 0xd1
diff --git a/test/MC/ELF/cfi-offset.s b/test/MC/ELF/cfi-offset.s
index 963a76c..f54dec0 100644
--- a/test/MC/ELF/cfi-offset.s
+++ b/test/MC/ELF/cfi-offset.s
@@ -3,7 +3,7 @@
 f:
 	.cfi_startproc
         nop
-	.cfi_offset %ebp, -16
+	.cfi_offset %rbp, -16
         nop
 	.cfi_endproc
 
diff --git a/test/MC/ELF/cfi-sections.s b/test/MC/ELF/cfi-sections.s
new file mode 100644
index 0000000..a73f3a9
--- /dev/null
+++ b/test/MC/ELF/cfi-sections.s
@@ -0,0 +1,38 @@
+// RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - | elf-dump  --dump-section-data | FileCheck -check-prefix=ELF_64 %s
+// RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu %s -o - | elf-dump  --dump-section-data | FileCheck -check-prefix=ELF_32 %s
+
+.cfi_sections .debug_frame
+
+f1:
+        .cfi_startproc
+        nop
+        .cfi_endproc
+
+f2:
+        .cfi_startproc
+        nop
+        .cfi_endproc
+
+// ELF_64:      (('sh_name', 0x00000011) # '.debug_frame'
+// ELF_64-NEXT:  ('sh_type', 0x00000001)
+// ELF_64-NEXT:  ('sh_flags', 0x00000000)
+// ELF_64-NEXT:  ('sh_addr', 0x00000000)
+// ELF_64-NEXT:  ('sh_offset', 0x00000048)
+// ELF_64-NEXT:  ('sh_size', 0x00000048)
+// ELF_64-NEXT:  ('sh_link', 0x00000000)
+// ELF_64-NEXT:  ('sh_info', 0x00000000)
+// ELF_64-NEXT:  ('sh_addralign', 0x00000008)
+// ELF_64-NEXT:  ('sh_entsize', 0x00000000)
+// ELF_64-NEXT:  ('_section_data', '14000000 ffffffff 01000178 100c0708 90010000 00000000 14000000 00000000 00000000 00000000 01000000 00000000 14000000 00000000 00000000 00000000 01000000 00000000')
+
+// ELF_32:      (('sh_name', 0x00000010) # '.debug_frame'
+// ELF_32-NEXT:  ('sh_type', 0x00000001)
+// ELF_32-NEXT:  ('sh_flags', 0x00000000)
+// ELF_32-NEXT:  ('sh_addr', 0x00000000)
+// ELF_32-NEXT:  ('sh_offset', 0x00000038)
+// ELF_32-NEXT:  ('sh_size', 0x00000034)
+// ELF_32-NEXT:  ('sh_link', 0x00000000)
+// ELF_32-NEXT:  ('sh_info', 0x00000000)
+// ELF_32-NEXT:  ('sh_addralign', 0x00000004)
+// ELF_32-NEXT:  ('sh_entsize', 0x00000000)
+// ELF_32-NEXT:  ('_section_data', '10000000 ffffffff 0100017c 080c0404 88010000 0c000000 00000000 00000000 01000000 0c000000 00000000 01000000 01000000')
diff --git a/test/MC/ELF/relocation-386.s b/test/MC/ELF/relocation-386.s
index f7b20b5..25f3450 100644
--- a/test/MC/ELF/relocation-386.s
+++ b/test/MC/ELF/relocation-386.s
@@ -153,6 +153,13 @@
 // CHECK-NEXT:  ('r_sym',
 // CHECK-NEXT:  ('r_type', 0x00000001)
 // CHECK-NEXT: ),
+// Relocation 24 (foo@GOTTPOFF(%edx)) is of type R_386_TLS_IE_32 and uses the
+// symbol
+// CHECK-NEXT: Relocation 0x00000018
+// CHECK-NEXT: (('r_offset', 0x0000008e)
+// CHECK-NEXT:  ('r_sym', 0x0000000d)
+// CHECK-NEXT:  ('r_type', 0x00000021)
+// CHECK-NEXT: ),
 
 // Section 4 is bss
 // CHECK:      # Section 0x00000004
@@ -217,6 +224,7 @@ bar2:
         movl zed@TPOFF(%eax), %eax
         movl zed@DTPOFF(%eax), %eax
         pushl $bar
+        addl foo@GOTTPOFF(%edx), %eax
 
         .section        zedsec,"awT",@progbits
 zed:
diff --git a/test/MC/ELF/relocation.s b/test/MC/ELF/relocation.s
index 4df09e1..2760232 100644
--- a/test/MC/ELF/relocation.s
+++ b/test/MC/ELF/relocation.s
@@ -17,6 +17,7 @@ bar:
         pushq    $bar
         movq	foo(%rip), %rdx
         leaq    foo-bar(%r14),%r14
+        addq	$bar,%rax         # R_X86_64_32S
 
 
 // CHECK:  # Section 0x00000001
@@ -106,6 +107,12 @@ bar:
 // CHECK-NEXT:  ('r_type', 0x00000002)
 // CHECK-NEXT:  ('r_addend', 0x0000005c)
 
+// CHECK: # Relocation 0x0000000e
+// CHECK-NEXT: (('r_offset', 0x00000063)
+// CHECK-NEXT:  ('r_sym', 0x00000002)
+// CHECK-NEXT:  ('r_type', 0x0000000b)
+// CHECK-NEXT:  ('r_addend', 0x00000000)
+
 // CHECK:   # Symbol 0x00000002
 // CHECK: (('st_name', 0x00000000) # ''
 // CHECK:  ('st_bind', 0x00000000)
diff --git a/test/MC/MachO/debug_frame.s b/test/MC/MachO/debug_frame.s
new file mode 100644
index 0000000..47264ef
--- /dev/null
+++ b/test/MC/MachO/debug_frame.s
@@ -0,0 +1,38 @@
+// RUN: llvm-mc -triple i386-apple-darwin %s -filetype=obj -o - | macho-dump | FileCheck %s
+
+// Check that we don't produce a relocation for the CIE pointer and therefore
+// we have only one relocation in __debug_frame.
+
+	.section	__TEXT,__text,regular,pure_instructions
+	.globl	_f
+	.align	4, 0x90
+_f:                                     ## @f
+Ltmp0:
+	.cfi_startproc
+## BB#0:                                ## %entry
+	movl	$42, %eax
+	ret
+Ltmp1:
+	.cfi_endproc
+Leh_func_end0:
+
+	.cfi_sections .debug_frame
+Ltext_end:
+
+// CHECK:       (('section_name', '__debug_frame\x00\x00\x00')
+// CHECK-NEXT:   ('segment_name', '__DWARF\x00\x00\x00\x00\x00\x00\x00\x00\x00')
+// CHECK-NEXT:   ('address', 8)
+// CHECK-NEXT:   ('size', 36)
+// CHECK-NEXT:   ('offset', 332)
+// CHECK-NEXT:   ('alignment', 2)
+// CHECK-NEXT:   ('reloc_offset', 368)
+// CHECK-NEXT:   ('num_reloc', 1)
+// CHECK-NEXT:   ('flags', 0x2000000)
+// CHECK-NEXT:   ('reserved1', 0)
+// CHECK-NEXT:   ('reserved2', 0)
+// CHECK-NEXT:  ),
+// CHECK-NEXT: ('_relocations', [
+// CHECK-NEXT:   # Relocation 0
+// CHECK-NEXT:   (('word-0', 0x1c),
+// CHECK-NEXT:    ('word-1', 0x4000001)),
+// CHECK-NEXT: ])
diff --git a/test/MC/X86/x86-32-coverage.s b/test/MC/X86/x86-32-coverage.s
index 4ac7efd..d2dd78d 100644
--- a/test/MC/X86/x86-32-coverage.s
+++ b/test/MC/X86/x86-32-coverage.s
@@ -19570,3 +19570,8 @@
 
 // CHECK: 	aeskeygenassist	$125, (%edx,%eax,4), %xmm2
                 aeskeygenassist $125, (%edx,%eax,4), %xmm2
+
+// CHECK:   blendvps	(%rax), %xmm1   # encoding: [0x66,0x0f,0x38,0x14,0x08]
+            blendvps (%rax), %xmm1
+// CHECK:   blendvps	%xmm2, %xmm1    # encoding: [0x66,0x0f,0x38,0x14,0xca]
+            blendvps %xmm2, %xmm1
diff --git a/test/MC/X86/x86-32.s b/test/MC/X86/x86-32.s
index 28900bb..6017880 100644
--- a/test/MC/X86/x86-32.s
+++ b/test/MC/X86/x86-32.s
@@ -934,3 +934,15 @@ pshufw $90, %mm4, %mm0
 // CHECK: strl
 // CHECK: encoding: [0x0f,0x00,0xc8]
 	str %eax
+
+
+// PR9378
+// CHECK: fsubp
+// CHECK: encoding: [0xde,0xe1]
+fsubp %st,%st(1)
+
+// PR9164
+// CHECK: fsubp	%st(2)
+// CHECK: encoding: [0xde,0xe2]
+fsubp   %st, %st(2)
+
diff --git a/test/Other/X86/dg.exp b/test/Other/X86/dg.exp
new file mode 100644
index 0000000..7b7bd4e
--- /dev/null
+++ b/test/Other/X86/dg.exp
@@ -0,0 +1,5 @@
+load_lib llvm.exp
+
+if { [llvm_supports_target X86] } {
+  RunLLVMTests [lsort [glob -nocomplain $srcdir/$subdir/*.{ll}]]
+}
diff --git a/test/Other/inline-asm-newline-terminator.ll b/test/Other/X86/inline-asm-newline-terminator.ll
index af93cc0..af93cc0 100644
--- a/test/Other/inline-asm-newline-terminator.ll
+++ b/test/Other/X86/inline-asm-newline-terminator.ll
diff --git a/test/TableGen/SetTheory.td b/test/TableGen/SetTheory.td
new file mode 100644
index 0000000..e0abc63
--- /dev/null
+++ b/test/TableGen/SetTheory.td
@@ -0,0 +1,167 @@
+// Test evaluation of set operations in dags.
+// RUN: tblgen -print-sets %s | FileCheck %s
+// XFAIL: vg_leak
+//
+// The -print-sets driver configures a primitive SetTheory instance that
+// understands these sets:
+
+class Set<dag d> {
+  dag Elements = d;
+}
+
+// It prints all Set instances and their ordered set interpretation.
+
+// Define some elements.
+def a;
+def b;
+def c;
+def d;
+
+// The 'add' operator evaluates and concatenates its arguments.
+def add;
+def S0a : Set<(add)>;
+def S0b : Set<(add a)>;
+def S0c : Set<(add a, b)>;
+def S0d : Set<(add b, a)>;
+def S0e : Set<(add a, a)>;
+def S0f : Set<(add a, a, b, a, c, b, d, a)>;
+def S0g : Set<(add b, a, b)>;
+// CHECK: S0a = [ ]
+// CHECK: S0b = [ a ]
+// CHECK: S0c = [ a b ]
+// CHECK: S0d = [ b a ]
+// CHECK: S0e = [ a ]
+// CHECK: S0f = [ a b c d ]
+// CHECK: S0g = [ b a ]
+
+// Defs of Set class expand into their elements.
+// Mixed sets and elements are flattened.
+def S1a : Set<(add S0a)>;
+def S1b : Set<(add S0a, S0a)>;
+def S1c : Set<(add S0d, S0f)>;
+def S1d : Set<(add d, S0d, S0f)>;
+// CHECK: S1a = [ ]
+// CHECK: S1b = [ ]
+// CHECK: S1c = [ b a c d ]
+// CHECK: S1d = [ d b a c ]
+
+// The 'sub' operator returns the first argument with the following arguments
+// removed.
+def sub;
+def S2a : Set<(sub S1a, S1c)>;
+def S2b : Set<(sub S1c, S1d)>;
+def S2c : Set<(sub S1c, b)>;
+def S2d : Set<(sub S1c, S0c)>;
+def S2e : Set<(sub S1c, S2d)>;
+// CHECK: S2a = [ ]
+// CHECK: S2b = [ ]
+// CHECK: S2c = [ a c d ]
+// CHECK: S2d = [ c d ]
+// CHECK: S2e = [ b a ]
+
+// The 'and' operator intersects two sets. The result has the same order as the
+// first argument.
+def and;
+def S3a : Set<(and S2d, S2e)>;
+def S3b : Set<(and S2d, S1d)>;
+// CHECK: S3a = [ ]
+// CHECK: S3b = [ c d ]
+
+// The 'shl' operator removes the first N elements.
+def shl;
+def S4a : Set<(shl S0f, 0)>;
+def S4b : Set<(shl S0f, 1)>;
+def S4c : Set<(shl S0f, 3)>;
+def S4d : Set<(shl S0f, 4)>;
+def S4e : Set<(shl S0f, 5)>;
+// CHECK: S4a = [ a b c d ]
+// CHECK: S4b = [ b c d ]
+// CHECK: S4c = [ d ]
+// CHECK: S4d = [ ]
+// CHECK: S4e = [ ]
+
+// The 'trunc' operator truncates after the first N elements.
+def trunc;
+def S5a : Set<(trunc S0f, 0)>;
+def S5b : Set<(trunc S0f, 1)>;
+def S5c : Set<(trunc S0f, 3)>;
+def S5d : Set<(trunc S0f, 4)>;
+def S5e : Set<(trunc S0f, 5)>;
+// CHECK: S5a = [ ]
+// CHECK: S5b = [ a ]
+// CHECK: S5c = [ a b c ]
+// CHECK: S5d = [ a b c d ]
+// CHECK: S5e = [ a b c d ]
+
+// The 'rotl' operator rotates left, but also accepts a negative shift.
+def rotl;
+def S6a : Set<(rotl S0f, 0)>;
+def S6b : Set<(rotl S0f, 1)>;
+def S6c : Set<(rotl S0f, 3)>;
+def S6d : Set<(rotl S0f, 4)>;
+def S6e : Set<(rotl S0f, 5)>;
+def S6f : Set<(rotl S0f, -1)>;
+def S6g : Set<(rotl S0f, -4)>;
+def S6h : Set<(rotl S0f, -5)>;
+// CHECK: S6a = [ a b c d ]
+// CHECK: S6b = [ b c d a ]
+// CHECK: S6c = [ d a b c ]
+// CHECK: S6d = [ a b c d ]
+// CHECK: S6e = [ b c d a ]
+// CHECK: S6f = [ d a b c ]
+// CHECK: S6g = [ a b c d ]
+// CHECK: S6h = [ d a b c ]
+
+// The 'rotr' operator rotates right, but also accepts a negative shift.
+def rotr;
+def S7a : Set<(rotr S0f, 0)>;
+def S7b : Set<(rotr S0f, 1)>;
+def S7c : Set<(rotr S0f, 3)>;
+def S7d : Set<(rotr S0f, 4)>;
+def S7e : Set<(rotr S0f, 5)>;
+def S7f : Set<(rotr S0f, -1)>;
+def S7g : Set<(rotr S0f, -4)>;
+def S7h : Set<(rotr S0f, -5)>;
+// CHECK: S7a = [ a b c d ]
+// CHECK: S7b = [ d a b c ]
+// CHECK: S7c = [ b c d a ]
+// CHECK: S7d = [ a b c d ]
+// CHECK: S7e = [ d a b c ]
+// CHECK: S7f = [ b c d a ]
+// CHECK: S7g = [ a b c d ]
+// CHECK: S7h = [ b c d a ]
+
+// The 'decimate' operator picks every N'th element.
+def decimate;
+def e0;
+def e1;
+def e2;
+def e3;
+def e4;
+def e5;
+def e6;
+def e7;
+def e8;
+def e9;
+def E : Set<(add e0, e1, e2, e3, e4, e5, e6, e7, e8, e9)>;
+def S8a : Set<(decimate E, 3)>;
+def S8b : Set<(decimate E, 9)>;
+def S8c : Set<(decimate E, 10)>;
+def S8d : Set<(decimate (rotl E, 1), 2)>;
+def S8e : Set<(add (decimate E, 2), (decimate (rotl E, 1), 2))>;
+// CHECK: S8a = [ e0 e3 e6 e9 ]
+// CHECK: S8b = [ e0 e9 ]
+// CHECK: S8c = [ e0 ]
+// CHECK: S8d = [ e1 e3 e5 e7 e9 ]
+// CHECK: S8e = [ e0 e2 e4 e6 e8 e1 e3 e5 e7 e9 ]
+
+// The 'sequence' operator finds a sequence of records from their name.
+def sequence;
+def S9a : Set<(sequence "e%u", 3, 7)>;
+def S9b : Set<(sequence "e%u", 7, 3)>;
+def S9c : Set<(sequence "e%u", 0, 0)>;
+def S9d : Set<(sequence "S%ua", 7, 9)>;
+// CHECK: S9a = [ e3 e4 e5 e6 e7 ]
+// CHECK: S9b = [ e7 e6 e5 e4 e3 ]
+// CHECK: S9c = [ e0 ]
+// CHECK: S9d = [ a b c d e0 e3 e6 e9 e4 e5 e7 ]
diff --git a/test/Transforms/ConstProp/calls.ll b/test/Transforms/ConstProp/calls.ll
index 82d7324..3b6010a 100644
--- a/test/Transforms/ConstProp/calls.ll
+++ b/test/Transforms/ConstProp/calls.ll
@@ -7,6 +7,7 @@ declare double @sin(double)
 declare double @tan(double)
 
 declare double @sqrt(double)
+declare double @exp2(double)
 
 define double @T() {
 ; CHECK: @T
@@ -19,7 +20,11 @@ define double @T() {
   %b = fadd double %a, %C
   %D = call double @sqrt(double 4.000000e+00)
   %c = fadd double %b, %D
-  ret double %c
+
+  ; PR9315
+  %E = call double @exp2(double 4.0)
+  %d = fadd double %c, %E 
+  ret double %d
 }
 
 define i1 @test_sse_cvt() nounwind readnone {
diff --git a/test/Transforms/DeadArgElim/2008-06-23-DeadAfterLive.ll b/test/Transforms/DeadArgElim/2008-06-23-DeadAfterLive.ll
index adfd019..858c935 100644
--- a/test/Transforms/DeadArgElim/2008-06-23-DeadAfterLive.ll
+++ b/test/Transforms/DeadArgElim/2008-06-23-DeadAfterLive.ll
@@ -2,7 +2,7 @@
 ; RUN: cat %t | grep 123
 
 ; This test tries to catch wrongful removal of return values for a specific case
-; that was break llvm-gcc builds.
+; that was breaking llvm-gcc builds.
 
 ; This function has a live return value, it is used by @alive.
 define internal i32 @test5() {
diff --git a/test/Transforms/DeadStoreElimination/simple.ll b/test/Transforms/DeadStoreElimination/simple.ll
index a61eac9..23576dad 100644
--- a/test/Transforms/DeadStoreElimination/simple.ll
+++ b/test/Transforms/DeadStoreElimination/simple.ll
@@ -236,3 +236,20 @@ define void @test18(i8* %P, i8* %Q, i8* %R) nounwind ssp {
 ; CHECK-NEXT: call void @llvm.memcpy
 ; CHECK-NEXT: ret
 }
+
+
+; The store here is not dead because the byval call reads it.
+declare void @test19f({i32}* byval align 4 %P)
+
+define void @test19({i32} * nocapture byval align 4 %arg5) nounwind ssp {
+bb:
+  %tmp7 = getelementptr inbounds {i32}* %arg5, i32 0, i32 0
+  store i32 912, i32* %tmp7
+  call void @test19f({i32}* byval align 4 %arg5)
+  ret void
+
+; CHECK: @test19(
+; CHECK: store i32 912
+; CHECK: call void @test19f
+}
+
diff --git a/test/Transforms/GVN/2007-07-26-InterlockingLoops.ll b/test/Transforms/GVN/2007-07-26-InterlockingLoops.ll
index 14cb91b..a1cc008 100644
--- a/test/Transforms/GVN/2007-07-26-InterlockingLoops.ll
+++ b/test/Transforms/GVN/2007-07-26-InterlockingLoops.ll
@@ -4,8 +4,11 @@
 
 define i32 @NextRootMove(i32 %wtm) {
 entry:
+        %A = alloca i32*
 	%tmp17618 = load i32** getelementptr ([65 x i32*]* @last, i32 0, i32 1), align 4
+        store i32* %tmp17618, i32** %A
 ; CHECK: entry:
+; CHECK-NEXT: alloca i32
 ; CHECK-NEXT: %tmp17618 = load
 ; CHECK-NOT: load
 ; CHECK-NOT: phi
@@ -16,6 +19,7 @@ cond_true116:
 
 cond_true128:
 	%tmp17625 = load i32** getelementptr ([65 x i32*]* @last, i32 0, i32 1), align 4
+        store i32* %tmp17625, i32** %A
 	br i1 false, label %bb98.backedge, label %return.loopexit
 
 bb98.backedge:
@@ -23,6 +27,7 @@ bb98.backedge:
 
 cond_true145:
 	%tmp17631 = load i32** getelementptr ([65 x i32*]* @last, i32 0, i32 1), align 4
+        store i32* %tmp17631, i32** %A
 	br i1 false, label %bb98.backedge, label %return.loopexit
 
 return.loopexit:
diff --git a/test/Transforms/GVN/2008-07-02-Unreachable.ll b/test/Transforms/GVN/2008-07-02-Unreachable.ll
index be69cfc..407940b 100644
--- a/test/Transforms/GVN/2008-07-02-Unreachable.ll
+++ b/test/Transforms/GVN/2008-07-02-Unreachable.ll
@@ -5,6 +5,7 @@
 
 define i8 @func_1() nounwind  {
 entry:
+  %A = alloca i8
 	br i1 false, label %ifelse, label %ifthen
 
 ifthen:		; preds = %entry
@@ -12,6 +13,7 @@ ifthen:		; preds = %entry
 
 ifelse:		; preds = %entry
 	%tmp3 = load i8* @g_3		; <i8> [#uses=0]
+        store i8 %tmp3, i8* %A
 	br label %forcond.thread
 
 forcond.thread:		; preds = %ifelse
diff --git a/test/Transforms/GVN/2011-06-01-NonLocalMemdepMiscompile.ll b/test/Transforms/GVN/2011-06-01-NonLocalMemdepMiscompile.ll
new file mode 100644
index 0000000..f24e956
--- /dev/null
+++ b/test/Transforms/GVN/2011-06-01-NonLocalMemdepMiscompile.ll
@@ -0,0 +1,61 @@
+; RUN: opt < %s -basicaa -gvn -S | FileCheck %s
+; This test is checking that (a) this doesn't crash, and (b) we don't
+; conclude the value of %tmp17 is available in bb1.bb15_crit_edge.
+; rdar://9429882
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-macosx10.7.0"
+
+define i1 @rb_intern() nounwind ssp {
+; CHECK: @rb_intern
+
+bb:
+  %tmp = alloca i8*, align 8
+  store i8* null, i8** %tmp, align 8
+  store i8 undef, i8* null, align 536870912
+  br label %bb1
+
+bb1:
+  br i1 undef, label %bb3, label %bb15
+
+; CHECK: bb1:
+; CHECK: %tmp16 = phi i8* [ getelementptr (i8* null, i64 undef), %bb10 ], [ null, %bb ]
+
+; CHECK: bb1.bb15_crit_edge:
+; CHECK: %tmp17.pre = load i8* %tmp16, align 1
+
+bb3:
+  call void @isalnum()
+  br i1 undef, label %bb10, label %bb5
+
+bb5:
+  br i1 undef, label %bb10, label %bb6
+
+bb6:
+  %tmp7 = load i8** %tmp, align 8
+  %tmp8 = load i8* %tmp7, align 1
+  %tmp9 = zext i8 %tmp8 to i64
+  br i1 undef, label %bb15, label %bb10
+
+bb10:
+  %tmp11 = load i8** %tmp, align 8
+  %tmp12 = load i8* %tmp11, align 1
+  %tmp13 = zext i8 %tmp12 to i64
+  %tmp14 = getelementptr inbounds i8* null, i64 undef
+  store i8* %tmp14, i8** %tmp, align 8
+  br label %bb1
+
+bb15:
+  %tmp16 = load i8** %tmp, align 8
+  %tmp17 = load i8* %tmp16, align 1
+  %tmp18 = icmp eq i8 %tmp17, 0
+  br label %bb19
+
+; CHECK: bb15:
+; CHECK: %tmp17 = phi i8 [ %tmp17.pre, %bb1.bb15_crit_edge ], [ %tmp8, %bb6 ]
+
+bb19:                                             ; preds = %bb15
+  ret i1 %tmp18
+}
+
+declare void @isalnum() nounwind inlinehint ssp
diff --git a/test/Transforms/GVN/crash.ll b/test/Transforms/GVN/crash.ll
index 4a3aa1c..31eae25 100644
--- a/test/Transforms/GVN/crash.ll
+++ b/test/Transforms/GVN/crash.ll
@@ -151,3 +151,15 @@ dead:
 dead2:
   ret i32 %A
 }
+
+
+; PR9841
+define fastcc i8 @test5(i8* %P) nounwind {
+entry:
+  %0 = load i8* %P, align 2
+
+  %Q = getelementptr i8* %P, i32 1
+  %1 = load i8* %Q, align 1
+  ret i8 %1
+}
+
diff --git a/test/Transforms/GVN/mixed.ll b/test/Transforms/GVN/mixed.ll
deleted file mode 100644
index 6bfada2..0000000
--- a/test/Transforms/GVN/mixed.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; RUN: opt < %s -basicaa -gvn -S | not grep DEADLOAD
-; RUN: opt < %s -basicaa -gvn -S | not grep DEADGEP
-
-define i32 @main(i32** %p) {
-block1:
-	%z1 = load i32** %p
-	%z2 = getelementptr i32* %z1, i32 0
-	%z3 = load i32* %z2
-	%DEADLOAD = load i32** %p
-	%DEADGEP = getelementptr i32* %DEADLOAD, i32 0
-	%DEADLOAD2 = load i32* %DEADGEP
-	ret i32 %DEADLOAD2
-}
diff --git a/test/Transforms/GVN/phi-translate-partial-alias.ll b/test/Transforms/GVN/phi-translate-partial-alias.ll
new file mode 100644
index 0000000..47bec41
--- /dev/null
+++ b/test/Transforms/GVN/phi-translate-partial-alias.ll
@@ -0,0 +1,27 @@
+; RUN: opt -basicaa -gvn -S < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-f128:128:128-n8:16:32:64"
+
+; GVN shouldn't PRE the load around the loop backedge because it's
+; not actually redundant around the loop backedge, despite appearances
+; if phi-translation is ignored.
+
+; CHECK: define void @test0(i8* %begin)
+; CHECK: loop:
+; CHECK:   %l0 = load i8* %phi
+; CHECK:   call void @bar(i8 %l0)
+; CHECK:   %l1 = load i8* %phi
+define void @test0(i8* %begin) {
+entry:
+  br label %loop
+
+loop:
+  %phi = phi i8* [ %begin, %entry ], [ %next, %loop ]
+  %l0 = load i8* %phi
+  call void @bar(i8 %l0)
+  %l1 = load i8* %phi
+  %next = getelementptr inbounds i8* %phi, i8 %l1
+  br label %loop
+}
+
+declare void @bar(i8)
diff --git a/test/Transforms/GVN/preserve-tbaa.ll b/test/Transforms/GVN/preserve-tbaa.ll
index 2fcfc47..a936755 100644
--- a/test/Transforms/GVN/preserve-tbaa.ll
+++ b/test/Transforms/GVN/preserve-tbaa.ll
@@ -5,9 +5,9 @@ target datalayout = "e-p:64:64:64"
 ; GVN should preserve the TBAA tag on loads when doing PRE.
 
 ; CHECK: @test
-; CHECK: %tmp33.pre = load i16* undef, align 2, !tbaa !0
+; CHECK: %tmp33.pre = load i16* %P, align 2, !tbaa !0
 ; CHECK: br label %for.body
-define void @test() nounwind {
+define void @test(i16 *%P, i16* %Q) nounwind {
 entry:
   br i1 undef, label %bb.nph, label %for.end
 
@@ -15,8 +15,10 @@ bb.nph:                                           ; preds = %entry
   br label %for.body
 
 for.body:                                         ; preds = %for.body, %bb.nph
-  %tmp33 = load i16* undef, align 2, !tbaa !0
-  store i16 undef, i16* undef, align 2, !tbaa !0
+  %tmp33 = load i16* %P, align 2, !tbaa !0
+  store i16 %tmp33, i16* %Q
+
+  store i16 0, i16* %P, align 2, !tbaa !0
   br i1 false, label %for.end, label %for.body
 
 for.end:                                          ; preds = %for.body, %entry
diff --git a/test/Transforms/GVN/rle.ll b/test/Transforms/GVN/rle.ll
index 4ff5bec..28b1fc7 100644
--- a/test/Transforms/GVN/rle.ll
+++ b/test/Transforms/GVN/rle.ll
@@ -360,8 +360,11 @@ Cont:
 
 define i32 @chained_load(i32** %p) {
 block1:
+  %A = alloca i32*
+
   %z = load i32** %p
-	br i1 true, label %block2, label %block3
+  store i32* %z, i32** %A
+  br i1 true, label %block2, label %block3
 
 block2:
  %a = load i32** %p
@@ -559,14 +562,14 @@ entry:
   %add = add nsw i32 %tmp2, %conv
   ret i32 %add
 
-; CHECK: @load_load_partial_alias
-; CHECK: load i32*
-; CHECK-NOT: load
-; CHECK: lshr i32 {{.*}}, 8
-; CHECK-NOT: load
-; CHECK: trunc i32 {{.*}} to i8
-; CHECK-NOT: load
-; CHECK: ret i32
+; TEMPORARILYDISABLED: @load_load_partial_alias
+; TEMPORARILYDISABLED: load i32*
+; TEMPORARILYDISABLED-NOT: load
+; TEMPORARILYDISABLED: lshr i32 {{.*}}, 8
+; TEMPORARILYDISABLED-NOT: load
+; TEMPORARILYDISABLED: trunc i32 {{.*}} to i8
+; TEMPORARILYDISABLED-NOT: load
+; TEMPORARILYDISABLED: ret i32
 }
 
 
@@ -586,10 +589,10 @@ land.lhs.true:                                    ; preds = %entry
 
 if.end:
   ret i32 52
-; CHECK: @load_load_partial_alias_cross_block
-; CHECK: land.lhs.true:
-; CHECK-NOT: load i8
-; CHECK: ret i32 %conv6
+; TEMPORARILY_DISABLED: @load_load_partial_alias_cross_block
+; TEMPORARILY_DISABLED: land.lhs.true:
+; TEMPORARILY_DISABLED-NOT: load i8
+; TEMPORARILY_DISABLED: ret i32 %conv6
 }
 
 
diff --git a/test/Transforms/GlobalOpt/crash.ll b/test/Transforms/GlobalOpt/crash.ll
index 9da5a5e..366a874 100644
--- a/test/Transforms/GlobalOpt/crash.ll
+++ b/test/Transforms/GlobalOpt/crash.ll
@@ -64,3 +64,17 @@ define void @memset_with_strange_user() ssp {
   ret void
 }
 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
+
+
+; PR9856
+@g_52 = internal global i32** null, align 8
+@g_90 = external global i32*, align 8
+
+define void @icmp_user_of_stored_once() nounwind ssp {
+entry:
+  %tmp4 = load i32*** @g_52, align 8
+  store i32** @g_90, i32*** @g_52
+  %cmp17 = icmp ne i32*** undef, @g_52
+  ret void
+}
+
diff --git a/test/Transforms/GlobalOpt/memset-null.ll b/test/Transforms/GlobalOpt/memset-null.ll
new file mode 100644
index 0000000..01534025
--- /dev/null
+++ b/test/Transforms/GlobalOpt/memset-null.ll
@@ -0,0 +1,29 @@
+; RUN: opt -globalopt %s -S -o - | FileCheck %s
+; PR10047
+
+%0 = type { i32, void ()* }
+%struct.A = type { [100 x i32] }
+
+; CHECK: @a
+@a = global %struct.A zeroinitializer, align 4
+@llvm.global_ctors = appending global [2 x %0] [%0 { i32 65535, void ()* @_GLOBAL__I_a }, %0 { i32 65535, void ()* @_GLOBAL__I_b }]
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
+
+; CHECK-NOT: GLOBAL__I_a
+define internal void @_GLOBAL__I_a() nounwind {
+entry:
+  tail call void @llvm.memset.p0i8.i64(i8* bitcast (%struct.A* @a to i8*), i8 0, i64 400, i32 4, i1 false) nounwind
+  ret void
+}
+
+%struct.X = type { i8 }
+@y = global i8* null, align 8
+@x = global %struct.X zeroinitializer, align 1
+
+define internal void @_GLOBAL__I_b() nounwind {
+entry:
+  %tmp.i.i.i = load i8** @y, align 8
+  tail call void @llvm.memset.p0i8.i64(i8* %tmp.i.i.i, i8 0, i64 10, i32 1, i1 false) nounwind
+  ret void
+}
diff --git a/test/Transforms/IndVarSimplify/elim-extend.ll b/test/Transforms/IndVarSimplify/elim-extend.ll
new file mode 100644
index 0000000..0367e11
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/elim-extend.ll
@@ -0,0 +1,153 @@
+; RUN: opt < %s -indvars -disable-iv-rewrite -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+; IV with constant start, preinc and postinc sign extends, with and without NSW.
+; IV rewrite only removes one sext. WidenIVs removes all three.
+define void @postincConstIV(i8* %base, i32 %limit) nounwind {
+entry:
+  br label %loop
+; CHECK: loop:
+; CHECK-NOT: sext
+; CHECK: exit:
+loop:
+  %iv = phi i32 [ %postiv, %loop ], [ 0, %entry ]
+  %ivnsw = phi i32 [ %postivnsw, %loop ], [ 0, %entry ]
+  %preofs = sext i32 %iv to i64
+  %preadr = getelementptr i8* %base, i64 %preofs
+  store i8 0, i8* %preadr
+  %postiv = add i32 %iv, 1
+  %postofs = sext i32 %postiv to i64
+  %postadr = getelementptr i8* %base, i64 %postofs
+  store i8 0, i8* %postadr
+  %postivnsw = add nsw i32 %ivnsw, 1
+  %postofsnsw = sext i32 %postivnsw to i64
+  %postadrnsw = getelementptr i8* %base, i64 %postofsnsw
+  store i8 0, i8* %postadrnsw
+  %cond = icmp sgt i32 %limit, %iv
+  br i1 %cond, label %loop, label %exit
+exit:
+  br label %return
+return:
+  ret void
+}
+
+; IV with nonconstant start, preinc and postinc sign extends,
+; with and without NSW.
+; As with postincConstIV, WidenIVs removes all three sexts.
+define void @postincVarIV(i8* %base, i32 %init, i32 %limit) nounwind {
+entry:
+  %precond = icmp sgt i32 %limit, %init
+  br i1 %precond, label %loop, label %return
+; CHECK: loop:
+; CHECK-NOT: sext
+; CHECK: exit:
+loop:
+  %iv = phi i32 [ %postiv, %loop ], [ %init, %entry ]
+  %ivnsw = phi i32 [ %postivnsw, %loop ], [ %init, %entry ]
+  %preofs = sext i32 %iv to i64
+  %preadr = getelementptr i8* %base, i64 %preofs
+  store i8 0, i8* %preadr
+  %postiv = add i32 %iv, 1
+  %postofs = sext i32 %postiv to i64
+  %postadr = getelementptr i8* %base, i64 %postofs
+  store i8 0, i8* %postadr
+  %postivnsw = add nsw i32 %ivnsw, 1
+  %postofsnsw = sext i32 %postivnsw to i64
+  %postadrnsw = getelementptr i8* %base, i64 %postofsnsw
+  store i8 0, i8* %postadrnsw
+  %cond = icmp sgt i32 %limit, %postiv
+  br i1 %cond, label %loop, label %exit
+exit:
+  br label %return
+return:
+  ret void
+}
+
+; Test sign extend elimination in the inner and outer loop.
+; %outercount is straightforward to widen, besides being in an outer loop.
+; %innercount is currently blocked by lcssa, so is not widened.
+; %inneriv can be widened only after proving it has no signed-overflow
+;   based on the loop test.
+define void @nestedIV(i8* %address, i32 %limit) nounwind {
+entry:
+  %limitdec = add i32 %limit, -1
+  br label %outerloop
+
+; CHECK: outerloop:
+;
+; Eliminate %ofs1 after widening outercount.
+; CHECK-NOT: sext
+; CHECK: getelementptr
+;
+; IV rewriting hoists a gep into this block. We don't like that.
+; CHECK-NOT: getelementptr
+outerloop:
+  %outercount   = phi i32 [ %outerpostcount, %outermerge ], [ 0, %entry ]
+  %innercount = phi i32 [ %innercount.merge, %outermerge ], [ 0, %entry ]
+
+  %outercountdec = add i32 %outercount, -1
+  %ofs1 = sext i32 %outercountdec to i64
+  %adr1 = getelementptr i8* %address, i64 %ofs1
+  store i8 0, i8* %adr1
+
+  br label %innerpreheader
+
+innerpreheader:
+  %innerprecmp = icmp sgt i32 %limitdec, %innercount
+  br i1 %innerprecmp, label %innerloop, label %outermerge
+
+; CHECK: innerloop:
+;
+; Eliminate %ofs2 after widening inneriv.
+; Eliminate %ofs3 after normalizing sext(innerpostiv)
+; CHECK-NOT: sext
+; CHECK: getelementptr
+;
+; FIXME: We should check that indvars does not increase the number of
+; IVs in this loop. sext elimination plus LFTR currently results in 2 final
+; IVs. Waiting to remove LFTR.
+innerloop:
+  %inneriv = phi i32 [ %innerpostiv, %innerloop ], [ %innercount, %innerpreheader ]
+  %innerpostiv = add i32 %inneriv, 1
+
+  %ofs2 = sext i32 %inneriv to i64
+  %adr2 = getelementptr i8* %address, i64 %ofs2
+  store i8 0, i8* %adr2
+
+  %ofs3 = sext i32 %innerpostiv to i64
+  %adr3 = getelementptr i8* %address, i64 %ofs3
+  store i8 0, i8* %adr3
+
+  %innercmp = icmp sgt i32 %limitdec, %innerpostiv
+  br i1 %innercmp, label %innerloop, label %innerexit
+
+innerexit:
+  %innercount.lcssa = phi i32 [ %innerpostiv, %innerloop ]
+  br label %outermerge
+
+; CHECK: outermerge:
+;
+; Eliminate %ofs4 after widening outercount
+; CHECK-NOT: sext
+; CHECK: getelementptr
+;
+; TODO: Eliminate %ofs5 after removing lcssa
+outermerge:
+  %innercount.merge = phi i32 [ %innercount.lcssa, %innerexit ], [ %innercount, %innerpreheader ]
+
+  %ofs4 = sext i32 %outercount to i64
+  %adr4 = getelementptr i8* %address, i64 %ofs4
+  store i8 0, i8* %adr4
+
+  %ofs5 = sext i32 %innercount.merge to i64
+  %adr5 = getelementptr i8* %address, i64 %ofs5
+  store i8 0, i8* %adr5
+
+  %outerpostcount = add i32 %outercount, 1
+  %tmp47 = icmp slt i32 %outerpostcount, %limit
+  br i1 %tmp47, label %outerloop, label %return
+
+return:
+  ret void
+}
diff --git a/test/Transforms/IndVarSimplify/iv-sext.ll b/test/Transforms/IndVarSimplify/iv-sext.ll
index 3e90873..6c7a627 100644
--- a/test/Transforms/IndVarSimplify/iv-sext.ll
+++ b/test/Transforms/IndVarSimplify/iv-sext.ll
@@ -1,6 +1,4 @@
-; RUN: opt < %s -indvars -S > %t
-; RUN: grep {= sext} %t | count 4
-; RUN: grep {phi i64} %t | count 2
+; RUN: opt < %s -indvars -S | FileCheck %s
 
 ; Indvars should be able to promote the hiPart induction variable in the
 ; inner loop to i64.
@@ -18,6 +16,9 @@ bb.nph22:		; preds = %entry
 	%tmp3 = add i32 %bandEdgeIndex, -1		; <i32> [#uses=2]
 	br label %bb
 
+; CHECK: bb:
+; CHECK: phi i64
+; CHECK-NOT: phi i64
 bb:		; preds = %bb8, %bb.nph22
 	%distERBhi.121 = phi float [ %distERBhi.2.lcssa, %bb8 ], [ 0.000000e+00, %bb.nph22 ]		; <float> [#uses=2]
 	%distERBlo.120 = phi float [ %distERBlo.0.lcssa, %bb8 ], [ 0.000000e+00, %bb.nph22 ]		; <float> [#uses=2]
@@ -28,6 +29,7 @@ bb:		; preds = %bb8, %bb.nph22
 	%tmp4 = icmp sgt i32 %part.016, 0		; <i1> [#uses=1]
 	br i1 %tmp4, label %bb1, label %bb3.preheader
 
+; CHECK: bb1:
 bb1:		; preds = %bb
 	%tmp5 = add i32 %part.016, -1		; <i32> [#uses=1]
 	%tmp6 = sext i32 %tmp5 to i64		; <i64> [#uses=1]
@@ -86,7 +88,10 @@ bb5.preheader:		; preds = %bb3.bb5.preheader_crit_edge, %bb3.preheader
 
 bb.nph12:		; preds = %bb5.preheader
 	br label %bb4
-
+; CHECK: bb4:
+; CHECK: phi i64
+; CHECK-NOT: phi i64
+; CHECK-NOT: sext
 bb4:		; preds = %bb5, %bb.nph12
 	%distERBhi.29 = phi float [ %tmp30, %bb5 ], [ %distERBhi.0.ph, %bb.nph12 ]		; <float> [#uses=1]
 	%hiPart.08 = phi i32 [ %tmp31, %bb5 ], [ %hiPart.119, %bb.nph12 ]		; <i32> [#uses=2]
@@ -102,6 +107,7 @@ bb4:		; preds = %bb5, %bb.nph12
 	%tmp35 = fadd float %tmp34, %peakCount.27		; <float> [#uses=2]
 	br label %bb5
 
+; CHECK: bb5:
 bb5:		; preds = %bb4
 	%.not = fcmp olt float %tmp30, 2.500000e+00		; <i1> [#uses=1]
 	%tmp36 = icmp sgt i32 %tmp3, %tmp31		; <i1> [#uses=1]
diff --git a/test/Transforms/IndVarSimplify/iv-zext.ll b/test/Transforms/IndVarSimplify/iv-zext.ll
index 80a77b6..00018ec 100644
--- a/test/Transforms/IndVarSimplify/iv-zext.ll
+++ b/test/Transforms/IndVarSimplify/iv-zext.ll
@@ -1,6 +1,6 @@
-; RUN: opt < %s -indvars -S > %t
-; RUN: not grep and %t
-; RUN: not grep zext %t
+; RUN: opt < %s -indvars -S | FileCheck %s
+; CHECK-NOT: and
+; CHECK-NOT: zext
 
 target datalayout = "-p:64:64:64-n:32:64"
 
diff --git a/test/Transforms/IndVarSimplify/no-iv-rewrite.ll b/test/Transforms/IndVarSimplify/no-iv-rewrite.ll
new file mode 100644
index 0000000..c35feef
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/no-iv-rewrite.ll
@@ -0,0 +1,123 @@
+; RUN: opt < %s -indvars -disable-iv-rewrite -S | FileCheck %s
+;
+; Make sure that indvars isn't inserting canonical IVs.
+; This is kinda hard to do until linear function test replacement is removed.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+define i32 @sum(i32* %arr, i32 %n) nounwind {
+entry:
+  %precond = icmp slt i32 0, %n
+  br i1 %precond, label %ph, label %return
+
+ph:
+  br label %loop
+
+; CHECK: loop:
+;
+; We should only have 2 IVs.
+; CHECK: phi
+; CHECK: phi
+; CHECK-NOT: phi
+;
+; sext should be eliminated while preserving gep inboundsness.
+; CHECK-NOT: sext
+; CHECK: getelementptr inbounds
+loop:
+  %i.02 = phi i32 [ 0, %ph ], [ %iinc, %loop ]
+  %s.01 = phi i32 [ 0, %ph ], [ %sinc, %loop ]
+  %ofs = sext i32 %i.02 to i64
+  %adr = getelementptr inbounds i32* %arr, i64 %ofs
+  %val = load i32* %adr
+  %sinc = add nsw i32 %s.01, %val
+  %iinc = add nsw i32 %i.02, 1
+  %cond = icmp slt i32 %iinc, %n
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  %s.lcssa = phi i32 [ %sinc, %loop ]
+  br label %return
+
+return:
+  %s.0.lcssa = phi i32 [ %s.lcssa, %exit ], [ 0, %entry ]
+  ret i32 %s.0.lcssa
+}
+
+define i64 @suml(i32* %arr, i32 %n) nounwind {
+entry:
+  %precond = icmp slt i32 0, %n
+  br i1 %precond, label %ph, label %return
+
+ph:
+  br label %loop
+
+; CHECK: loop:
+;
+; We should only have 2 IVs.
+; CHECK: phi
+; CHECK: phi
+; CHECK-NOT: phi
+;
+; %ofs sext should be eliminated while preserving gep inboundsness.
+; CHECK-NOT: sext
+; CHECK: getelementptr inbounds
+; %vall sext should obviously not be eliminated
+; CHECK: sext
+loop:
+  %i.02 = phi i32 [ 0, %ph ], [ %iinc, %loop ]
+  %s.01 = phi i64 [ 0, %ph ], [ %sinc, %loop ]
+  %ofs = sext i32 %i.02 to i64
+  %adr = getelementptr inbounds i32* %arr, i64 %ofs
+  %val = load i32* %adr
+  %vall = sext i32 %val to i64
+  %sinc = add nsw i64 %s.01, %vall
+  %iinc = add nsw i32 %i.02, 1
+  %cond = icmp slt i32 %iinc, %n
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  %s.lcssa = phi i64 [ %sinc, %loop ]
+  br label %return
+
+return:
+  %s.0.lcssa = phi i64 [ %s.lcssa, %exit ], [ 0, %entry ]
+  ret i64 %s.0.lcssa
+}
+
+define void @outofbounds(i32* %first, i32* %last, i32 %idx) nounwind {
+  %precond = icmp ne i32* %first, %last
+  br i1 %precond, label %ph, label %return
+
+; CHECK: ph:
+; It's not indvars' job to perform LICM on %ofs
+; CHECK-NOT: sext
+ph:
+  br label %loop
+
+; CHECK: loop:
+;
+; Preserve exactly one pointer type IV.
+; CHECK: phi i32*
+; CHECK-NOT: phi
+;
+; Don't create any extra adds.
+; CHECK-NOT: add
+;
+; Preserve gep inboundsness, and don't factor it.
+; CHECK: getelementptr inbounds i32* %ptriv, i32 1
+; CHECK-NOT: add
+loop:
+  %ptriv = phi i32* [ %first, %ph ], [ %ptrpost, %loop ]
+  %ofs = sext i32 %idx to i64
+  %adr = getelementptr inbounds i32* %ptriv, i64 %ofs
+  store i32 3, i32* %adr
+  %ptrpost = getelementptr inbounds i32* %ptriv, i32 1
+  %cond = icmp ne i32* %ptrpost, %last
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  br label %return
+
+return:
+  ret void
+}
diff --git a/test/Transforms/IndVarSimplify/phi-uses-value-multiple-times.ll b/test/Transforms/IndVarSimplify/phi-uses-value-multiple-times.ll
index 34d432b..52c9e5c 100644
--- a/test/Transforms/IndVarSimplify/phi-uses-value-multiple-times.ll
+++ b/test/Transforms/IndVarSimplify/phi-uses-value-multiple-times.ll
@@ -1,4 +1,8 @@
-; RUN: opt < %s -indvars
+; RUN: opt < %s -indvars -disable-output -stats -info-output-file - | FileCheck %s
+; Check that IndVarSimplify is not creating unnecessary canonical IVs
+; that will never be used.
+; CHECK-NOT: indvars
+
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 
 @ue = external global i64
diff --git a/test/Transforms/Inline/array_merge.ll b/test/Transforms/Inline/array_merge.ll
index 0d176b8..b2eafeb 100644
--- a/test/Transforms/Inline/array_merge.ll
+++ b/test/Transforms/Inline/array_merge.ll
@@ -19,7 +19,7 @@ entry:
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: %A.i = alloca
 ; CHECK-NEXT: %B.i = alloca
-; CHECK-NEXT: call void
+; CHECK-NOT: alloca
   call void @foo() nounwind
   call void @foo() nounwind
   ret void
diff --git a/test/Transforms/Inline/inline_invoke.ll b/test/Transforms/Inline/inline_invoke.ll
new file mode 100644
index 0000000..2a1b883
--- /dev/null
+++ b/test/Transforms/Inline/inline_invoke.ll
@@ -0,0 +1,336 @@
+; RUN: opt < %s -inline -S | FileCheck %s
+
+; Test that the inliner correctly handles inlining into invoke sites
+; by appending selectors and forwarding _Unwind_Resume directly to the
+; enclosing landing pad.
+
+;; Test 0 - basic functionality.
+
+%struct.A = type { i8 }
+
+@_ZTIi = external constant i8*
+
+declare void @_ZN1AC1Ev(%struct.A*)
+
+declare void @_ZN1AD1Ev(%struct.A*)
+
+declare void @use(i32) nounwind
+
+declare void @opaque()
+
+declare i8* @llvm.eh.exception() nounwind readonly
+
+declare i32 @llvm.eh.selector(i8*, i8*, ...) nounwind
+
+declare i32 @llvm.eh.typeid.for(i8*) nounwind
+
+declare void @llvm.eh.resume(i8*, i32)
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @__cxa_end_catch()
+
+declare void @_ZSt9terminatev()
+
+define internal void @test0_in() alwaysinline uwtable ssp {
+entry:
+  %a = alloca %struct.A, align 1
+  %b = alloca %struct.A, align 1
+  call void @_ZN1AC1Ev(%struct.A* %a)
+  invoke void @_ZN1AC1Ev(%struct.A* %b)
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:
+  invoke void @_ZN1AD1Ev(%struct.A* %b)
+          to label %invoke.cont1 unwind label %lpad
+
+invoke.cont1:
+  call void @_ZN1AD1Ev(%struct.A* %a)
+  ret void
+
+lpad:
+  %exn = call i8* @llvm.eh.exception() nounwind
+  %eh.selector = call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* %exn, i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*), i32 0) nounwind
+  invoke void @_ZN1AD1Ev(%struct.A* %a)
+          to label %invoke.cont2 unwind label %terminate.lpad
+
+invoke.cont2:
+  call void @llvm.eh.resume(i8* %exn, i32 %eh.selector) noreturn
+  unreachable
+
+terminate.lpad:
+  %exn3 = call i8* @llvm.eh.exception() nounwind
+  %eh.selector4 = call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* %exn3, i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*), i8* null) nounwind
+  call void @_ZSt9terminatev() noreturn nounwind
+  unreachable
+}
+
+define void @test0_out() uwtable ssp {
+entry:
+  invoke void @test0_in()
+          to label %ret unwind label %lpad
+
+ret:
+  ret void
+
+lpad:                                             ; preds = %entry
+  %exn = call i8* @llvm.eh.exception() nounwind
+  %eh.selector = call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* %exn, i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*), i8* bitcast (i8** @_ZTIi to i8*)) nounwind
+  %0 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*)) nounwind
+  %1 = icmp eq i32 %eh.selector, %0
+  br i1 %1, label %catch, label %eh.resume
+
+catch:
+  %ignored = call i8* @__cxa_begin_catch(i8* %exn) nounwind
+  call void @__cxa_end_catch() nounwind
+  br label %ret
+
+eh.resume:
+  call void @llvm.eh.resume(i8* %exn, i32 %eh.selector) noreturn
+  unreachable
+}
+
+; CHECK:    define void @test0_out()
+; CHECK:      [[A:%.*]] = alloca %struct.A,
+; CHECK:      [[B:%.*]] = alloca %struct.A,
+; CHECK:      invoke void @_ZN1AC1Ev(%struct.A* [[A]])
+; CHECK:      invoke void @_ZN1AC1Ev(%struct.A* [[B]])
+; CHECK:      invoke void @_ZN1AD1Ev(%struct.A* [[B]])
+; CHECK:      invoke void @_ZN1AD1Ev(%struct.A* [[A]])
+; CHECK:      call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* {{%.*}}, i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*), i32 0, i8* bitcast (i8** @_ZTIi to i8*))
+; CHECK-NEXT: invoke void @_ZN1AD1Ev(%struct.A* [[A]])
+; CHECK-NEXT:   to label %[[LBL:[^\s]+]] unwind
+; CHECK: [[LBL]]:
+; CHECK-NEXT: br label %[[LPAD:[^\s]+]]
+; CHECK:      ret void
+; CHECK:      call i8* @llvm.eh.exception()
+; CHECK-NEXT: call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* {{%.*}}, i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*), i8* bitcast (i8** @_ZTIi to i8*))
+; CHECK-NEXT: br label %[[LPAD]]
+; CHECK: [[LPAD]]:
+; CHECK-NEXT: phi i8* [
+; CHECK-NEXT: phi i32 [
+; CHECK-NEXT: call i32 @llvm.eh.typeid.for(
+
+
+;; Test 1 - Correctly handle phis in outer landing pads.
+
+define void @test1_out() uwtable ssp {
+entry:
+  invoke void @test0_in()
+          to label %cont unwind label %lpad
+
+cont:
+  invoke void @test0_in()
+          to label %ret unwind label %lpad
+
+ret:
+  ret void
+
+lpad:
+  %x = phi i32 [ 0, %entry ], [ 1, %cont ]
+  %y = phi i32 [ 1, %entry ], [ 4, %cont ]
+  %exn = call i8* @llvm.eh.exception() nounwind
+  %eh.selector = call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* %exn, i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*), i8* bitcast (i8** @_ZTIi to i8*)) nounwind
+  %0 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*)) nounwind
+  %1 = icmp eq i32 %eh.selector, %0
+  br i1 %1, label %catch, label %eh.resume
+
+catch:
+  %ignored = call i8* @__cxa_begin_catch(i8* %exn) nounwind
+  call void @use(i32 %x)
+  call void @use(i32 %y)
+  call void @__cxa_end_catch() nounwind
+  br label %ret
+
+eh.resume:
+  call void @llvm.eh.resume(i8* %exn, i32 %eh.selector) noreturn
+  unreachable
+}
+
+; CHECK:    define void @test1_out()
+; CHECK:      [[A2:%.*]] = alloca %struct.A,
+; CHECK:      [[B2:%.*]] = alloca %struct.A,
+; CHECK:      [[A1:%.*]] = alloca %struct.A,
+; CHECK:      [[B1:%.*]] = alloca %struct.A,
+; CHECK:      invoke void @_ZN1AC1Ev(%struct.A* [[A1]])
+; CHECK-NEXT:   unwind label %[[LPAD:[^\s]+]]
+; CHECK:      invoke void @_ZN1AC1Ev(%struct.A* [[B1]])
+; CHECK-NEXT:   unwind label %[[LPAD1:[^\s]+]]
+; CHECK:      invoke void @_ZN1AD1Ev(%struct.A* [[B1]])
+; CHECK-NEXT:   unwind label %[[LPAD1]]
+; CHECK:      invoke void @_ZN1AD1Ev(%struct.A* [[A1]])
+; CHECK-NEXT:   unwind label %[[LPAD]]
+
+; Inner landing pad from first inlining.
+; CHECK:    [[LPAD1]]:
+; CHECK-NEXT: [[EXN1:%.*]] = call i8* @llvm.eh.exception()
+; CHECK-NEXT: [[SEL1:%.*]] = call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* [[EXN1]], i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*), i32 0, i8* bitcast (i8** @_ZTIi to i8*))
+; CHECK-NEXT: invoke void @_ZN1AD1Ev(%struct.A* [[A1]])
+; CHECK-NEXT:   to label %[[RESUME1:[^\s]+]] unwind
+; CHECK: [[RESUME1]]:
+; CHECK-NEXT: br label %[[LPAD_JOIN1:[^\s]+]]
+
+; CHECK:      invoke void @_ZN1AC1Ev(%struct.A* [[A2]])
+; CHECK-NEXT:   unwind label %[[LPAD]]
+; CHECK:      invoke void @_ZN1AC1Ev(%struct.A* [[B2]])
+; CHECK-NEXT:   unwind label %[[LPAD2:[^\s]+]]
+; CHECK:      invoke void @_ZN1AD1Ev(%struct.A* [[B2]])
+; CHECK-NEXT:   unwind label %[[LPAD2]]
+; CHECK:      invoke void @_ZN1AD1Ev(%struct.A* [[A2]])
+; CHECK-NEXT:   unwind label %[[LPAD]]
+
+; Inner landing pad from second inlining.
+; CHECK:    [[LPAD2]]:
+; CHECK-NEXT: [[EXN2:%.*]] = call i8* @llvm.eh.exception()
+; CHECK-NEXT: [[SEL2:%.*]] = call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* [[EXN2]], i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*), i32 0, i8* bitcast (i8** @_ZTIi to i8*))
+; CHECK-NEXT: invoke void @_ZN1AD1Ev(%struct.A* [[A2]])
+; CHECK-NEXT:   to label %[[RESUME2:[^\s]+]] unwind
+; CHECK: [[RESUME2]]:
+; CHECK-NEXT: br label %[[LPAD_JOIN2:[^\s]+]]
+
+; CHECK:      ret void
+
+; CHECK:    [[LPAD]]:
+; CHECK-NEXT: [[X:%.*]] = phi i32 [ 0, %entry ], [ 0, {{%.*}} ], [ 1, %cont ], [ 1, {{%.*}} ]
+; CHECK-NEXT: [[Y:%.*]] = phi i32 [ 1, %entry ], [ 1, {{%.*}} ], [ 4, %cont ], [ 4, {{%.*}} ]
+; CHECK-NEXT: [[EXN:%.*]] = call i8* @llvm.eh.exception()
+; CHECK-NEXT: [[SEL:%.*]] = call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* [[EXN]], i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*), i8* bitcast (i8** @_ZTIi to i8*))
+; CHECK-NEXT: br label %[[LPAD_JOIN2]]
+
+; CHECK: [[LPAD_JOIN2]]:
+; CHECK-NEXT: [[XJ2:%.*]] = phi i32 [ [[X]], %[[LPAD]] ], [ 1, %[[RESUME2]] ]
+; CHECK-NEXT: [[YJ2:%.*]] = phi i32 [ [[Y]], %[[LPAD]] ], [ 4, %[[RESUME2]] ]
+; CHECK-NEXT: [[EXNJ2:%.*]] = phi i8* [ [[EXN]], %[[LPAD]] ], [ [[EXN2]], %[[RESUME2]] ]
+; CHECK-NEXT: [[SELJ2:%.*]] = phi i32 [ [[SEL]], %[[LPAD]] ], [ [[SEL2]], %[[RESUME2]] ]
+; CHECK-NEXT: br label %[[LPAD_JOIN1]]
+
+; CHECK: [[LPAD_JOIN1]]:
+; CHECK-NEXT: [[XJ1:%.*]] = phi i32 [ [[XJ2]], %[[LPAD_JOIN2]] ], [ 0, %[[RESUME1]] ]
+; CHECK-NEXT: [[YJ1:%.*]] = phi i32 [ [[YJ2]], %[[LPAD_JOIN2]] ], [ 1, %[[RESUME1]] ]
+; CHECK-NEXT: [[EXNJ1:%.*]] = phi i8* [ [[EXNJ2]], %[[LPAD_JOIN2]] ], [ [[EXN1]], %[[RESUME1]] ]
+; CHECK-NEXT: [[SELJ1:%.*]] = phi i32 [ [[SELJ2]], %[[LPAD_JOIN2]] ], [ [[SEL1]], %[[RESUME1]] ]
+; CHECK-NEXT: [[T:%.*]] = call i32 @llvm.eh.typeid.for(
+; CHECK-NEXT: icmp eq i32 [[SELJ1]], [[T]]
+
+; CHECK:      call void @use(i32 [[XJ1]])
+; CHECK:      call void @use(i32 [[YJ1]])
+
+; CHECK:      call void @llvm.eh.resume(i8* [[EXNJ1]], i32 [[SELJ1]])
+
+
+;; Test 2 - Don't make invalid IR for inlines into landing pads without eh.exception calls
+define void @test2_out() uwtable ssp {
+entry:
+  invoke void @test0_in()
+          to label %ret unwind label %lpad
+
+ret:
+  ret void
+
+lpad:
+  call void @_ZSt9terminatev()
+  unreachable
+}
+
+; CHECK: define void @test2_out()
+; CHECK:      [[A:%.*]] = alloca %struct.A,
+; CHECK:      [[B:%.*]] = alloca %struct.A,
+; CHECK:      invoke void @_ZN1AC1Ev(%struct.A* [[A]])
+; CHECK-NEXT:   unwind label %[[LPAD:[^\s]+]]
+; CHECK:      invoke void @_ZN1AC1Ev(%struct.A* [[B]])
+; CHECK-NEXT:   unwind label %[[LPAD2:[^\s]+]]
+; CHECK:      invoke void @_ZN1AD1Ev(%struct.A* [[B]])
+; CHECK-NEXT:   unwind label %[[LPAD2]]
+; CHECK:      invoke void @_ZN1AD1Ev(%struct.A* [[A]])
+; CHECK-NEXT:   unwind label %[[LPAD]]
+
+
+;; Test 3 - Deal correctly with split unwind edges.
+define void @test3_out() uwtable ssp {
+entry:
+  invoke void @test0_in()
+          to label %ret unwind label %lpad
+
+ret:
+  ret void
+
+lpad:
+  br label %lpad.cont
+
+lpad.cont:
+  %exn = call i8* @llvm.eh.exception() nounwind
+  %eh.selector = call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* %exn, i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*), i8* bitcast (i8** @_ZTIi to i8*)) nounwind
+  call void @_ZSt9terminatev()
+  unreachable
+}
+
+; CHECK: define void @test3_out()
+; CHECK:      call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* {{%.*}}, i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*), i32 0, i8* bitcast (i8** @_ZTIi to i8*))
+; CHECK-NEXT: invoke void @_ZN1AD1Ev(
+; CHECK-NEXT:   to label %[[L:[^\s]+]] unwind
+; CHECK:    [[L]]:
+; CHECK-NEXT: br label %[[JOIN:[^\s]+]]
+; CHECK:    [[JOIN]]:
+; CHECK-NEXT: phi
+; CHECK-NEXT: phi
+; CHECK-NEXT: br label %lpad.cont
+; CHECK:    lpad.cont:
+; CHECK-NEXT: call void @_ZSt9terminatev()
+
+
+;; Test 4 - Split unwind edges with a dominance problem
+define void @test4_out() uwtable ssp {
+entry:
+  invoke void @test0_in()
+          to label %cont unwind label %lpad.crit
+
+cont:
+  invoke void @opaque()
+          to label %ret unwind label %lpad
+
+ret:
+  ret void
+
+lpad.crit:
+  call void @opaque() nounwind
+  br label %lpad
+
+lpad:
+  %phi = phi i32 [ 0, %lpad.crit ], [ 1, %cont ]
+  %exn = call i8* @llvm.eh.exception() nounwind
+  %eh.selector = call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* %exn, i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*), i8* bitcast (i8** @_ZTIi to i8*)) nounwind
+  call void @use(i32 %phi)
+  call void @_ZSt9terminatev()
+  unreachable
+}
+
+; CHECK: define void @test4_out()
+; CHECK:      call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* {{%.*}}, i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*), i32 0, i8* bitcast (i8** @_ZTIi to i8*))
+; CHECK-NEXT: invoke void @_ZN1AD1Ev(
+; CHECK-NEXT:   to label %[[L:[^\s]+]] unwind
+; CHECK:    [[L]]:
+; CHECK-NEXT: br label %[[JOIN:[^\s]+]]
+; CHECK:      invoke void @opaque()
+; CHECK-NEXT:                  unwind label %lpad
+; CHECK:    lpad.crit:
+; CHECK-NEXT: call i8* @llvm.eh.exception()
+; CHECK-NEXT: call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* %4, i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*), i8* bitcast (i8** @_ZTIi to i8*))
+; CHECK-NEXT: br label %[[JOIN]]
+; CHECK:    [[JOIN]]:
+; CHECK-NEXT: phi i8*
+; CHECK-NEXT: phi i32
+; CHECK-NEXT: call void @opaque() nounwind
+; CHECK-NEXT: br label %[[FIX:[^\s]+]]
+; CHECK:    lpad:
+; CHECK-NEXT: [[T0:%.*]] = phi i32 [ 1, %cont ]
+; CHECK-NEXT: call i8* @llvm.eh.exception() nounwind
+; CHECK-NEXT: call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* %exn, i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*), i8* bitcast (i8** @_ZTIi to i8*))
+; CHECK-NEXT: br label %[[FIX]]
+; CHECK:    [[FIX]]:
+; CHECK-NEXT: [[T1:%.*]] = phi i32 [ [[T0]], %lpad ], [ 0, %[[JOIN]] ]
+; CHECK-NEXT: phi i8*
+; CHECK-NEXT: phi i32
+; CHECK-NEXT: call void @use(i32 [[T1]])
+; CHECK-NEXT: call void @_ZSt9terminatev()
diff --git a/test/Transforms/Inline/lifetime.ll b/test/Transforms/Inline/lifetime.ll
new file mode 100644
index 0000000..a95c836
--- /dev/null
+++ b/test/Transforms/Inline/lifetime.ll
@@ -0,0 +1,78 @@
+; RUN: opt -inline %s -S -o - | FileCheck %s
+
+declare void @llvm.lifetime.start(i64, i8*)
+declare void @llvm.lifetime.end(i64, i8*)
+
+define void @helper_both_markers() {
+  %a = alloca i8
+  call void @llvm.lifetime.start(i64 1, i8* %a)
+  call void @llvm.lifetime.end(i64 1, i8* %a)
+  ret void
+}
+
+define void @test_both_markers() {
+; CHECK: @test_both_markers
+; CHECK: llvm.lifetime.start(i64 1
+; CHECK-NEXT: llvm.lifetime.end(i64 1
+  call void @helper_both_markers()
+; CHECK-NEXT: llvm.lifetime.start(i64 1
+; CHECK-NEXT: llvm.lifetime.end(i64 1
+  call void @helper_both_markers()
+; CHECK-NEXT: ret void
+  ret void
+}
+
+;; Without this, the inliner will simplify out @test_no_marker before adding
+;; any lifetime markers.
+declare void @use(i8* %a)
+
+define void @helper_no_markers() {
+  %a = alloca i8
+  call void @use(i8* %a)
+  ret void
+}
+
+;; We can't use CHECK-NEXT because there's an extra call void @use in between.
+;; Instead, we use CHECK-NOT to verify that there are no other lifetime calls.
+define void @test_no_marker() {
+; CHECK: @test_no_marker
+; CHECK-NOT: lifetime
+; CHECK: llvm.lifetime.start(i64 -1
+; CHECK-NOT: lifetime
+; CHECK: llvm.lifetime.end(i64 -1
+  call void @helper_no_markers()
+; CHECK-NOT: lifetime
+; CHECK: llvm.lifetime.start(i64 -1
+; CHECK-NOT: lifetime
+; CHECK: llvm.lifetime.end(i64 -1
+  call void @helper_no_markers()
+; CHECK-NOT: lifetime
+; CHECK: ret void
+  ret void
+}
+
+define void @helper_two_casts() {
+  %a = alloca i32
+  %b = bitcast i32* %a to i8*
+  call void @llvm.lifetime.start(i64 4, i8* %b)
+  %c = bitcast i32* %a to i8*
+  call void @llvm.lifetime.end(i64 4, i8* %c)
+  ret void
+}
+
+define void @test_two_casts() {
+; CHECK: @test_two_casts
+; CHECK-NOT: lifetime
+; CHECK: llvm.lifetime.start(i64 4
+; CHECK-NOT: lifetime
+; CHECK: llvm.lifetime.end(i64 4
+  call void @helper_two_casts()
+; CHECK-NOT: lifetime
+; CHECK: llvm.lifetime.start(i64 4
+; CHECK-NOT: lifetime
+; CHECK: llvm.lifetime.end(i64 4
+  call void @helper_two_casts()
+; CHECK-NOT: lifetime
+; CHECK: ret void
+  ret void
+}
diff --git a/test/Transforms/InstCombine/2011-05-02-VectorBoolean.ll b/test/Transforms/InstCombine/2011-05-02-VectorBoolean.ll
new file mode 100644
index 0000000..02b64e3
--- /dev/null
+++ b/test/Transforms/InstCombine/2011-05-02-VectorBoolean.ll
@@ -0,0 +1,15 @@
+; RUN: opt < %s -instcombine
+; PR9579
+
+define <2 x i16> @entry(<2 x i16> %a) nounwind {
+entry:
+  %a.addr = alloca <2 x i16>, align 4
+  %.compoundliteral = alloca <2 x i16>, align 4
+  store <2 x i16> %a, <2 x i16>* %a.addr, align 4
+  %tmp = load <2 x i16>* %a.addr, align 4
+  store <2 x i16> zeroinitializer, <2 x i16>* %.compoundliteral
+  %tmp1 = load <2 x i16>* %.compoundliteral
+  %cmp = icmp uge <2 x i16> %tmp, %tmp1
+  %sext = sext <2 x i1> %cmp to <2 x i16>
+  ret <2 x i16> %sext
+}
diff --git a/test/Transforms/InstCombine/2011-05-13-InBoundsGEP.ll b/test/Transforms/InstCombine/2011-05-13-InBoundsGEP.ll
new file mode 100644
index 0000000..fba7239
--- /dev/null
+++ b/test/Transforms/InstCombine/2011-05-13-InBoundsGEP.ll
@@ -0,0 +1,21 @@
+; RUN: opt < %s -S -instcombine | FileCheck %s
+; rdar://problem/9267970
+; ideally this test will run on a 32-bit host
+; must not discard GEPs that might overflow at runtime (aren't inbounds)
+
+define i32 @main(i32 %argc) {
+entry:
+    %tmp1 = add i32 %argc, -2
+    %tmp2 = add i32 %argc, 1879048192
+    %p = alloca i8
+; CHECK: getelementptr
+    %p1 = getelementptr i8* %p, i32 %tmp1
+; CHECK: getelementptr
+    %p2 = getelementptr i8* %p, i32 %tmp2
+    %cmp = icmp ult i8* %p1, %p2
+    br i1 %cmp, label %bbtrue, label %bbfalse
+bbtrue:          ; preds = %entry
+    ret i32 -1
+bbfalse:         ; preds = %entry
+    ret i32 0
+}
diff --git a/test/Transforms/InstCombine/2011-05-28-swapmulsub.ll b/test/Transforms/InstCombine/2011-05-28-swapmulsub.ll
new file mode 100644
index 0000000..b096d1f
--- /dev/null
+++ b/test/Transforms/InstCombine/2011-05-28-swapmulsub.ll
@@ -0,0 +1,57 @@
+; ModuleID = 'test1.c'
+; RUN: opt -S -instcombine < %s | FileCheck %s
+target triple = "x86_64-apple-macosx10.6.6"
+
+define zeroext i16 @foo1(i32 %on_off) nounwind uwtable ssp {
+entry:
+  %on_off.addr = alloca i32, align 4
+  %a = alloca i32, align 4
+  store i32 %on_off, i32* %on_off.addr, align 4
+  %tmp = load i32* %on_off.addr, align 4
+  %sub = sub i32 1, %tmp
+; CHECK-NOT: mul i32
+  %mul = mul i32 %sub, -2
+; CHECK: shl
+; CHECK-NEXT: add
+  store i32 %mul, i32* %a, align 4
+  %tmp1 = load i32* %a, align 4
+  %conv = trunc i32 %tmp1 to i16
+  ret i16 %conv
+}
+
+define zeroext i16 @foo2(i32 %on_off, i32 %q) nounwind uwtable ssp {
+entry:
+  %on_off.addr = alloca i32, align 4
+  %q.addr = alloca i32, align 4
+  %a = alloca i32, align 4
+  store i32 %on_off, i32* %on_off.addr, align 4
+  store i32 %q, i32* %q.addr, align 4
+  %tmp = load i32* %q.addr, align 4
+  %tmp1 = load i32* %on_off.addr, align 4
+  %sub = sub i32 %tmp, %tmp1
+; CHECK-NOT: mul i32
+  %mul = mul i32 %sub, -4
+; CHECK: sub i32
+; CHECK-NEXT: shl
+  store i32 %mul, i32* %a, align 4
+  %tmp2 = load i32* %a, align 4
+  %conv = trunc i32 %tmp2 to i16
+  ret i16 %conv
+}
+
+define zeroext i16 @foo3(i32 %on_off) nounwind uwtable ssp {
+entry:
+  %on_off.addr = alloca i32, align 4
+  %a = alloca i32, align 4
+  store i32 %on_off, i32* %on_off.addr, align 4
+  %tmp = load i32* %on_off.addr, align 4
+  %sub = sub i32 7, %tmp
+; CHECK-NOT: mul i32
+  %mul = mul i32 %sub, -4
+; CHECK: shl
+; CHECK-NEXT: add
+  store i32 %mul, i32* %a, align 4
+  %tmp1 = load i32* %a, align 4
+  %conv = trunc i32 %tmp1 to i16
+  ret i16 %conv
+}
diff --git a/test/Transforms/InstCombine/call.ll b/test/Transforms/InstCombine/call.ll
index 2ef8dc0..d084873 100644
--- a/test/Transforms/InstCombine/call.ll
+++ b/test/Transforms/InstCombine/call.ll
@@ -53,8 +53,8 @@ define i8 @test4a() {
 define i32 @test4() {
         %X = call i32 bitcast (i8 ()* @test4a to i32 ()*)( )            ; <i32> [#uses=1]
         ret i32 %X
-; CHECK: %X1 = call i8 @test4a()
-; CHECK: %tmp = zext i8 %X1 to i32
+; CHECK: %X = call i8 @test4a()
+; CHECK: %tmp = zext i8 %X to i32
 ; CHECK: ret i32 %tmp
 }
 
@@ -77,8 +77,8 @@ declare i32 @test6a(i32)
 define i32 @test6() {
         %X = call i32 bitcast (i32 (i32)* @test6a to i32 ()*)( )
         ret i32 %X
-; CHECK: %X1 = call i32 @test6a(i32 0)
-; CHECK: ret i32 %X1
+; CHECK: %X = call i32 @test6a(i32 0)
+; CHECK: ret i32 %X
 }
 
 
diff --git a/test/Transforms/InstCombine/div.ll b/test/Transforms/InstCombine/div.ll
index 2e24f19..8a0897b 100644
--- a/test/Transforms/InstCombine/div.ll
+++ b/test/Transforms/InstCombine/div.ll
@@ -118,3 +118,17 @@ define i32 @test14(i8 %x) nounwind {
 ; CHECK: @test14
 ; CHECK-NEXT: ret i32 0
 }
+
+; PR9814
+define i32 @test15(i32 %a, i32 %b) nounwind {
+  %shl = shl i32 1, %b
+  %div = lshr i32 %shl, 2
+  %div2 = udiv i32 %a, %div
+  ret i32 %div2
+; CHECK: @test15
+; CHECK-NEXT: add i32 %b, -2
+; CHECK-NEXT: lshr i32 %a, 
+; CHECK-NEXT: ret i32
+}
+
+
diff --git a/test/Transforms/InstCombine/exact.ll b/test/Transforms/InstCombine/exact.ll
index 58f8b5d..14741e3 100644
--- a/test/Transforms/InstCombine/exact.ll
+++ b/test/Transforms/InstCombine/exact.ll
@@ -96,6 +96,22 @@ define i1 @ashr_icmp2(i64 %X) nounwind {
  ret i1 %Z
 }
 
+; PR9998
+; Make sure we don't transform the ashr here into an sdiv
+; CHECK: @pr9998
+; CHECK: = and i32 %V, 1
+; CHECK: %Z = icmp ne
+; CHECK: ret i1 %Z
+define i1 @pr9998(i32 %V) nounwind {
+entry:
+  %W = shl i32 %V, 31
+  %X = ashr exact i32 %W, 31
+  %Y = sext i32 %X to i64
+  %Z = icmp ugt i64 %Y, 7297771788697658747
+  ret i1 %Z
+}
+
+
 ; CHECK: @udiv_icmp1
 ; CHECK: icmp ne i64 %X, 0
 define i1 @udiv_icmp1(i64 %X) nounwind {
diff --git a/test/Transforms/InstCombine/icmp.ll b/test/Transforms/InstCombine/icmp.ll
index 099540a..1237ade 100644
--- a/test/Transforms/InstCombine/icmp.ll
+++ b/test/Transforms/InstCombine/icmp.ll
@@ -510,3 +510,14 @@ define i1 @test52(i32 %x1) nounwind {
   ret i1 %A
 }
 
+; PR9838
+; CHECK: @test53
+; CHECK-NEXT: ashr exact
+; CHECK-NEXT: ashr
+; CHECK-NEXT: icmp
+define i1 @test53(i32 %a, i32 %b) nounwind {
+ %x = ashr exact i32 %a, 30
+ %y = ashr i32 %b, 30
+ %z = icmp eq i32 %x, %y
+ ret i1 %z
+}
diff --git a/test/Transforms/InstCombine/intrinsics.ll b/test/Transforms/InstCombine/intrinsics.ll
index 332cd46..107f313 100644
--- a/test/Transforms/InstCombine/intrinsics.ll
+++ b/test/Transforms/InstCombine/intrinsics.ll
@@ -30,9 +30,9 @@ define i8 @uaddtest2(i8 %A, i8 %B, i1* %overflowPtr) {
 ; CHECK: @uaddtest2
 ; CHECK-NEXT: %and.A = and i8 %A, 127
 ; CHECK-NEXT: %and.B = and i8 %B, 127
-; CHECK-NEXT: %1 = add nuw i8 %and.A, %and.B
+; CHECK-NEXT: %x = add nuw i8 %and.A, %and.B
 ; CHECK-NEXT: store i1 false, i1* %overflowPtr
-; CHECK-NEXT: ret i8 %1
+; CHECK-NEXT: ret i8 %x
 }
 
 define i8 @uaddtest3(i8 %A, i8 %B, i1* %overflowPtr) {
@@ -46,9 +46,9 @@ define i8 @uaddtest3(i8 %A, i8 %B, i1* %overflowPtr) {
 ; CHECK: @uaddtest3
 ; CHECK-NEXT: %or.A = or i8 %A, -128
 ; CHECK-NEXT: %or.B = or i8 %B, -128
-; CHECK-NEXT: %1 = add i8 %or.A, %or.B
+; CHECK-NEXT: %x = add i8 %or.A, %or.B
 ; CHECK-NEXT: store i1 true, i1* %overflowPtr
-; CHECK-NEXT: ret i8 %1
+; CHECK-NEXT: ret i8 %x
 }
 
 define i8 @uaddtest4(i8 %A, i1* %overflowPtr) {
diff --git a/test/Transforms/InstCombine/or.ll b/test/Transforms/InstCombine/or.ll
index 94a5732..c0bb28d 100644
--- a/test/Transforms/InstCombine/or.ll
+++ b/test/Transforms/InstCombine/or.ll
@@ -332,8 +332,8 @@ define i64 @test31(i64 %A) nounwind readnone ssp noredzone {
   %F = or i64 %D, %E
   ret i64 %F
 ; CHECK: @test31
-; CHECK-NEXT: %E1 = and i64 %A, 4294908984
-; CHECK-NEXT: %F = or i64 %E1, 32962
+; CHECK-NEXT: %E = and i64 %A, 4294908984
+; CHECK-NEXT: %F = or i64 %E, 32962
 ; CHECK-NEXT: ret i64 %F
 }
 
diff --git a/test/Transforms/InstCombine/select.ll b/test/Transforms/InstCombine/select.ll
index 3925907..4ca9bd2 100644
--- a/test/Transforms/InstCombine/select.ll
+++ b/test/Transforms/InstCombine/select.ll
@@ -749,3 +749,53 @@ define i1 @test55(i1 %X, i32 %Y, i32 %Z) {
 ; CHECK: icmp eq
 ; CHECK: ret i1
 }
+
+define i32 @test56(i16 %x) nounwind {
+  %tobool = icmp eq i16 %x, 0
+  %conv = zext i16 %x to i32
+  %cond = select i1 %tobool, i32 0, i32 %conv
+  ret i32 %cond
+; CHECK: @test56
+; CHECK-NEXT: zext
+; CHECK-NEXT: ret
+}
+
+define i32 @test57(i32 %x, i32 %y) nounwind {
+  %and = and i32 %x, %y
+  %tobool = icmp eq i32 %x, 0
+  %.and = select i1 %tobool, i32 0, i32 %and
+  ret i32 %.and
+; CHECK: @test57
+; CHECK-NEXT: and i32 %x, %y
+; CHECK-NEXT: ret
+}
+
+define i32 @test58(i16 %x) nounwind {
+  %tobool = icmp ne i16 %x, 1
+  %conv = zext i16 %x to i32
+  %cond = select i1 %tobool, i32 %conv, i32 1
+  ret i32 %cond
+; CHECK: @test58
+; CHECK-NEXT: zext
+; CHECK-NEXT: ret
+}
+
+define i32 @test59(i32 %x, i32 %y) nounwind {
+  %and = and i32 %x, %y
+  %tobool = icmp ne i32 %x, %y
+  %.and = select i1 %tobool, i32 %and, i32 %y
+  ret i32 %.and
+; CHECK: @test59
+; CHECK-NEXT: and i32 %x, %y
+; CHECK-NEXT: ret
+}
+
+define i1 @test60(i32 %x, i1* %y) nounwind {
+  %cmp = icmp eq i32 %x, 0
+  %load = load i1* %y, align 1
+  %cmp1 = icmp slt i32 %x, 1
+  %sel = select i1 %cmp, i1 %load, i1 %cmp1
+  ret i1 %sel
+; CHECK: @test60
+; CHECK: select
+}
diff --git a/test/Transforms/InstCombine/shift.ll b/test/Transforms/InstCombine/shift.ll
index bded68a..d9ac9cb 100644
--- a/test/Transforms/InstCombine/shift.ll
+++ b/test/Transforms/InstCombine/shift.ll
@@ -506,3 +506,24 @@ define i32 @test41(i32 %a, i32 %b) nounwind {
 ; CHECK-NEXT: shl i32 8, %b
 ; CHECK-NEXT: ret i32
 }
+
+define i32 @test42(i32 %a, i32 %b) nounwind {
+  %div = lshr i32 4096, %b    ; must be exact otherwise we'd divide by zero
+  %div2 = udiv i32 %a, %div
+  ret i32 %div2
+; CHECK: @test42
+; CHECK-NEXT: lshr exact i32 4096, %b
+}
+
+define i32 @test43(i32 %a, i32 %b) nounwind {
+  %div = shl i32 4096, %b    ; must be exact otherwise we'd divide by zero
+  %div2 = udiv i32 %a, %div
+  ret i32 %div2
+; CHECK: @test43
+; CHECK-NEXT: add i32 %b, 12
+; CHECK-NEXT: lshr
+; CHECK-NEXT: ret
+}
+
+
+
diff --git a/test/Transforms/InstCombine/vec_demanded_elts.ll b/test/Transforms/InstCombine/vec_demanded_elts.ll
index 9f308aa..e0188fe 100644
--- a/test/Transforms/InstCombine/vec_demanded_elts.ll
+++ b/test/Transforms/InstCombine/vec_demanded_elts.ll
@@ -136,3 +136,19 @@ declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>)
 declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>)
 declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>)
 declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>)
+
+; <rdar://problem/6945110>
+define <4 x i32> @kernel3_vertical(<4 x i16> * %src, <8 x i16> * %foo) nounwind {
+entry:
+	%tmp = load <4 x i16>* %src
+	%tmp1 = load <8 x i16>* %foo
+; CHECK: %tmp2 = shufflevector
+	%tmp2 = shufflevector <4 x i16> %tmp, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; pmovzxwd ignores the upper 64-bits of its input; -instcombine should remove this shuffle:
+; CHECK-NOT: shufflevector
+	%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: pmovzxwd
+	%0 = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %tmp3)
+	ret <4 x i32> %0
+}
+declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone
diff --git a/test/Transforms/InstCombine/x86-crc32-demanded.ll b/test/Transforms/InstCombine/x86-crc32-demanded.ll
new file mode 100644
index 0000000..878b97d
--- /dev/null
+++ b/test/Transforms/InstCombine/x86-crc32-demanded.ll
@@ -0,0 +1,17 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; crc32 with 64-bit destination zeros high 32-bit.
+; rdar://9467055
+
+define i64 @test() nounwind {
+entry:
+; CHECK: test
+; CHECK: tail call i64 @llvm.x86.sse42.crc32.64.64
+; CHECK-NOT: and
+; CHECK: ret
+  %0 = tail call i64 @llvm.x86.sse42.crc32.64.64(i64 0, i64 4) nounwind
+  %1 = and i64 %0, 4294967295
+  ret i64 %1
+}
+
+declare i64 @llvm.x86.sse42.crc32.64.64(i64, i64) nounwind readnone
diff --git a/test/Transforms/InstSimplify/maxmin.ll b/test/Transforms/InstSimplify/maxmin.ll
new file mode 100644
index 0000000..e921214
--- /dev/null
+++ b/test/Transforms/InstSimplify/maxmin.ll
@@ -0,0 +1,269 @@
+; RUN: opt < %s -instsimplify -S | FileCheck %s
+
+define i1 @max1(i32 %x, i32 %y) {
+; CHECK: @max1
+  %c = icmp sgt i32 %x, %y
+  %m = select i1 %c, i32 %x, i32 %y
+  %r = icmp slt i32 %m, %x
+  ret i1 %r
+; CHECK: ret i1 false
+}
+
+define i1 @max2(i32 %x, i32 %y) {
+; CHECK: @max2
+  %c = icmp sge i32 %x, %y
+  %m = select i1 %c, i32 %x, i32 %y
+  %r = icmp sge i32 %m, %x
+  ret i1 %r
+; CHECK: ret i1 true
+}
+
+define i1 @max3(i32 %x, i32 %y) {
+; CHECK: @max3
+  %c = icmp ugt i32 %x, %y
+  %m = select i1 %c, i32 %x, i32 %y
+  %r = icmp ult i32 %m, %x
+  ret i1 %r
+; CHECK: ret i1 false
+}
+
+define i1 @max4(i32 %x, i32 %y) {
+; CHECK: @max4
+  %c = icmp uge i32 %x, %y
+  %m = select i1 %c, i32 %x, i32 %y
+  %r = icmp uge i32 %m, %x
+  ret i1 %r
+; CHECK: ret i1 true
+}
+
+define i1 @max5(i32 %x, i32 %y) {
+; CHECK: @max5
+  %c = icmp sgt i32 %x, %y
+  %m = select i1 %c, i32 %x, i32 %y
+  %r = icmp sgt i32 %x, %m
+  ret i1 %r
+; CHECK: ret i1 false
+}
+
+define i1 @max6(i32 %x, i32 %y) {
+; CHECK: @max6
+  %c = icmp sge i32 %x, %y
+  %m = select i1 %c, i32 %x, i32 %y
+  %r = icmp sle i32 %x, %m
+  ret i1 %r
+; CHECK: ret i1 true
+}
+
+define i1 @max7(i32 %x, i32 %y) {
+; CHECK: @max7
+  %c = icmp ugt i32 %x, %y
+  %m = select i1 %c, i32 %x, i32 %y
+  %r = icmp ugt i32 %x, %m
+  ret i1 %r
+; CHECK: ret i1 false
+}
+
+define i1 @max8(i32 %x, i32 %y) {
+; CHECK: @max8
+  %c = icmp uge i32 %x, %y
+  %m = select i1 %c, i32 %x, i32 %y
+  %r = icmp ule i32 %x, %m
+  ret i1 %r
+; CHECK: ret i1 true
+}
+
+define i1 @min1(i32 %x, i32 %y) {
+; CHECK: @min1
+  %c = icmp sgt i32 %x, %y
+  %m = select i1 %c, i32 %y, i32 %x
+  %r = icmp sgt i32 %m, %x
+  ret i1 %r
+; CHECK: ret i1 false
+}
+
+define i1 @min2(i32 %x, i32 %y) {
+; CHECK: @min2
+  %c = icmp sge i32 %x, %y
+  %m = select i1 %c, i32 %y, i32 %x
+  %r = icmp sle i32 %m, %x
+  ret i1 %r
+; CHECK: ret i1 true
+}
+
+define i1 @min3(i32 %x, i32 %y) {
+; CHECK: @min3
+  %c = icmp ugt i32 %x, %y
+  %m = select i1 %c, i32 %y, i32 %x
+  %r = icmp ugt i32 %m, %x
+  ret i1 %r
+; CHECK: ret i1 false
+}
+
+define i1 @min4(i32 %x, i32 %y) {
+; CHECK: @min4
+  %c = icmp uge i32 %x, %y
+  %m = select i1 %c, i32 %y, i32 %x
+  %r = icmp ule i32 %m, %x
+  ret i1 %r
+; CHECK: ret i1 true
+}
+
+define i1 @min5(i32 %x, i32 %y) {
+; CHECK: @min5
+  %c = icmp sgt i32 %x, %y
+  %m = select i1 %c, i32 %y, i32 %x
+  %r = icmp slt i32 %x, %m
+  ret i1 %r
+; CHECK: ret i1 false
+}
+
+define i1 @min6(i32 %x, i32 %y) {
+; CHECK: @min6
+  %c = icmp sge i32 %x, %y
+  %m = select i1 %c, i32 %y, i32 %x
+  %r = icmp sge i32 %x, %m
+  ret i1 %r
+; CHECK: ret i1 true
+}
+
+define i1 @min7(i32 %x, i32 %y) {
+; CHECK: @min7
+  %c = icmp ugt i32 %x, %y
+  %m = select i1 %c, i32 %y, i32 %x
+  %r = icmp ult i32 %x, %m
+  ret i1 %r
+; CHECK: ret i1 false
+}
+
+define i1 @min8(i32 %x, i32 %y) {
+; CHECK: @min8
+  %c = icmp uge i32 %x, %y
+  %m = select i1 %c, i32 %y, i32 %x
+  %r = icmp uge i32 %x, %m
+  ret i1 %r
+; CHECK: ret i1 true
+}
+
+define i1 @maxmin1(i32 %x, i32 %y, i32 %z) {
+; CHECK: @maxmin1
+  %c1 = icmp sge i32 %x, %y
+  %max = select i1 %c1, i32 %x, i32 %y
+  %c2 = icmp sge i32 %x, %z
+  %min = select i1 %c2, i32 %z, i32 %x
+  %c = icmp sge i32 %max, %min
+  ret i1 %c
+; CHECK: ret i1 true
+}
+
+define i1 @maxmin2(i32 %x, i32 %y, i32 %z) {
+; CHECK: @maxmin2
+  %c1 = icmp sge i32 %x, %y
+  %max = select i1 %c1, i32 %x, i32 %y
+  %c2 = icmp sge i32 %x, %z
+  %min = select i1 %c2, i32 %z, i32 %x
+  %c = icmp sgt i32 %min, %max
+  ret i1 %c
+; CHECK: ret i1 false
+}
+
+define i1 @maxmin3(i32 %x, i32 %y, i32 %z) {
+; CHECK: @maxmin3
+  %c1 = icmp sge i32 %x, %y
+  %max = select i1 %c1, i32 %x, i32 %y
+  %c2 = icmp sge i32 %x, %z
+  %min = select i1 %c2, i32 %z, i32 %x
+  %c = icmp sle i32 %min, %max
+  ret i1 %c
+; CHECK: ret i1 true
+}
+
+define i1 @maxmin4(i32 %x, i32 %y, i32 %z) {
+; CHECK: @maxmin4
+  %c1 = icmp sge i32 %x, %y
+  %max = select i1 %c1, i32 %x, i32 %y
+  %c2 = icmp sge i32 %x, %z
+  %min = select i1 %c2, i32 %z, i32 %x
+  %c = icmp slt i32 %max, %min
+  ret i1 %c
+; CHECK: ret i1 false
+}
+
+define i1 @maxmin5(i32 %x, i32 %y, i32 %z) {
+; CHECK: @maxmin5
+  %c1 = icmp uge i32 %x, %y
+  %max = select i1 %c1, i32 %x, i32 %y
+  %c2 = icmp uge i32 %x, %z
+  %min = select i1 %c2, i32 %z, i32 %x
+  %c = icmp uge i32 %max, %min
+  ret i1 %c
+; CHECK: ret i1 true
+}
+
+define i1 @maxmin6(i32 %x, i32 %y, i32 %z) {
+; CHECK: @maxmin6
+  %c1 = icmp uge i32 %x, %y
+  %max = select i1 %c1, i32 %x, i32 %y
+  %c2 = icmp uge i32 %x, %z
+  %min = select i1 %c2, i32 %z, i32 %x
+  %c = icmp ugt i32 %min, %max
+  ret i1 %c
+; CHECK: ret i1 false
+}
+
+define i1 @maxmin7(i32 %x, i32 %y, i32 %z) {
+; CHECK: @maxmin7
+  %c1 = icmp uge i32 %x, %y
+  %max = select i1 %c1, i32 %x, i32 %y
+  %c2 = icmp uge i32 %x, %z
+  %min = select i1 %c2, i32 %z, i32 %x
+  %c = icmp ule i32 %min, %max
+  ret i1 %c
+; CHECK: ret i1 true
+}
+
+define i1 @maxmin8(i32 %x, i32 %y, i32 %z) {
+; CHECK: @maxmin8
+  %c1 = icmp uge i32 %x, %y
+  %max = select i1 %c1, i32 %x, i32 %y
+  %c2 = icmp uge i32 %x, %z
+  %min = select i1 %c2, i32 %z, i32 %x
+  %c = icmp ult i32 %max, %min
+  ret i1 %c
+; CHECK: ret i1 false
+}
+
+define i1 @eqcmp1(i32 %x, i32 %y) {
+; CHECK: @eqcmp1
+  %c = icmp sge i32 %x, %y
+  %max = select i1 %c, i32 %x, i32 %y
+  %r = icmp eq i32 %max, %x
+  ret i1 %r
+; CHECK: ret i1 %c
+}
+
+define i1 @eqcmp2(i32 %x, i32 %y) {
+; CHECK: @eqcmp2
+  %c = icmp sge i32 %x, %y
+  %max = select i1 %c, i32 %x, i32 %y
+  %r = icmp eq i32 %x, %max
+  ret i1 %r
+; CHECK: ret i1 %c
+}
+
+define i1 @eqcmp3(i32 %x, i32 %y) {
+; CHECK: @eqcmp3
+  %c = icmp uge i32 %x, %y
+  %max = select i1 %c, i32 %x, i32 %y
+  %r = icmp eq i32 %max, %x
+  ret i1 %r
+; CHECK: ret i1 %c
+}
+
+define i1 @eqcmp4(i32 %x, i32 %y) {
+; CHECK: @eqcmp4
+  %c = icmp uge i32 %x, %y
+  %max = select i1 %c, i32 %x, i32 %y
+  %r = icmp eq i32 %x, %max
+  ret i1 %r
+; CHECK: ret i1 %c
+}
diff --git a/test/Transforms/LoopIdiom/basic.ll b/test/Transforms/LoopIdiom/basic.ll
index 485114c..9695418 100644
--- a/test/Transforms/LoopIdiom/basic.ll
+++ b/test/Transforms/LoopIdiom/basic.ll
@@ -347,3 +347,40 @@ for.end:                                          ; preds = %for.body
 ; CHECK-NOT: store
 ; CHECK: ret void
 }
+
+
+
+; PR9815 - This is a partial overlap case that cannot be safely transformed
+; into a memcpy.
+@g_50 = global [7 x i32] [i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0], align 16
+
+define i32 @test14() nounwind {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc, %for.body.lr.ph
+  %tmp5 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %add = add nsw i32 %tmp5, 4
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds [7 x i32]* @g_50, i32 0, i64 %idxprom
+  %tmp2 = load i32* %arrayidx, align 4
+  %add4 = add nsw i32 %tmp5, 5
+  %idxprom5 = sext i32 %add4 to i64
+  %arrayidx6 = getelementptr inbounds [7 x i32]* @g_50, i32 0, i64 %idxprom5
+  store i32 %tmp2, i32* %arrayidx6, align 4
+  %inc = add nsw i32 %tmp5, 1
+  %cmp = icmp slt i32 %inc, 2
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.inc
+  %tmp8 = load i32* getelementptr inbounds ([7 x i32]* @g_50, i32 0, i64 6), align 4
+  ret i32 %tmp8
+; CHECK: @test14
+; CHECK: for.body:
+; CHECK: load i32
+; CHECK: store i32
+; CHECK: br i1 %cmp
+
+}
+
+
diff --git a/test/Transforms/LoopStrengthReduce/2009-11-10-LSRCrash.ll b/test/Transforms/LoopStrengthReduce/X86/2009-11-10-LSRCrash.ll
index 4032a59..4032a59 100644
--- a/test/Transforms/LoopStrengthReduce/2009-11-10-LSRCrash.ll
+++ b/test/Transforms/LoopStrengthReduce/X86/2009-11-10-LSRCrash.ll
diff --git a/test/Transforms/LoopStrengthReduce/X86/dg.exp b/test/Transforms/LoopStrengthReduce/X86/dg.exp
new file mode 100644
index 0000000..7b7bd4e
--- /dev/null
+++ b/test/Transforms/LoopStrengthReduce/X86/dg.exp
@@ -0,0 +1,5 @@
+load_lib llvm.exp
+
+if { [llvm_supports_target X86] } {
+  RunLLVMTests [lsort [glob -nocomplain $srcdir/$subdir/*.{ll}]]
+}
diff --git a/test/Transforms/LoopStrengthReduce/post-inc-icmpzero.ll b/test/Transforms/LoopStrengthReduce/post-inc-icmpzero.ll
new file mode 100644
index 0000000..294c090
--- /dev/null
+++ b/test/Transforms/LoopStrengthReduce/post-inc-icmpzero.ll
@@ -0,0 +1,91 @@
+; RUN: opt -loop-reduce -S < %s | FileCheck %s
+; PR9939
+
+; LSR should property handle the post-inc offset when folding the
+; non-IV operand of an icmp into the IV.
+
+; CHECK:   %tmp2 = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
+; CHECK:   %tmp3 = lshr i64 %tmp2, 1
+; CHECK:   %tmp4 = mul i64 %tmp3, 2
+; CHECK:   br label %for.body
+; CHECK: for.body:
+; CHECK:   %lsr.iv5 = phi i64 [ %lsr.iv.next, %for.body ], [ %tmp4, %for.body.lr.ph ]
+; CHECK:   %lsr.iv.next = add i64 %lsr.iv5, -2
+; CHECK:   %lsr.iv.next6 = inttoptr i64 %lsr.iv.next to i16*
+; CHECK:   %cmp27 = icmp eq i16* %lsr.iv.next6, null
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.Vector2 = type { i16*, [64 x i16], i32 }
+
+@.str = private unnamed_addr constant [37 x i8] c"0123456789abcdefghijklmnopqrstuvwxyz\00"
+
+define void @_Z15IntegerToStringjjR7Vector2(i32 %i, i32 %radix, %struct.Vector2* nocapture %result) nounwind noinline {
+entry:
+  %buffer = alloca [33 x i16], align 16
+  %add.ptr = getelementptr inbounds [33 x i16]* %buffer, i64 0, i64 33
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+  %0 = phi i64 [ %indvar.next44, %do.body ], [ 0, %entry ]
+  %i.addr.0 = phi i32 [ %div, %do.body ], [ %i, %entry ]
+  %tmp51 = sub i64 32, %0
+  %incdec.ptr = getelementptr [33 x i16]* %buffer, i64 0, i64 %tmp51
+  %rem = urem i32 %i.addr.0, 10
+  %div = udiv i32 %i.addr.0, 10
+  %idxprom = zext i32 %rem to i64
+  %arrayidx = getelementptr inbounds [37 x i8]* @.str, i64 0, i64 %idxprom
+  %tmp5 = load i8* %arrayidx, align 1
+  %conv = sext i8 %tmp5 to i16
+  store i16 %conv, i16* %incdec.ptr, align 2
+  %1 = icmp ugt i32 %i.addr.0, 9
+  %indvar.next44 = add i64 %0, 1
+  br i1 %1, label %do.body, label %do.end
+
+do.end:                                           ; preds = %do.body
+  %xap.0 = inttoptr i64 %0 to i1*
+  %cap.0 = ptrtoint i1* %xap.0 to i64
+  %sub.ptr.lhs.cast = ptrtoint i16* %add.ptr to i64
+  %sub.ptr.rhs.cast = ptrtoint i16* %incdec.ptr to i64
+  %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
+  %sub.ptr.div39 = lshr exact i64 %sub.ptr.sub, 1
+  %conv11 = trunc i64 %sub.ptr.div39 to i32
+  %mLength = getelementptr inbounds %struct.Vector2* %result, i64 0, i32 2
+  %idx.ext21 = bitcast i64 %sub.ptr.div39 to i64
+  %incdec.ptr.sum = add i64 %idx.ext21, -1
+  %cp.0.sum = sub i64 %incdec.ptr.sum, %0
+  %add.ptr22 = getelementptr [33 x i16]* %buffer, i64 1, i64 %cp.0.sum
+  %cmp2740 = icmp eq i64 %idx.ext21, 0
+  br i1 %cmp2740, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %do.end
+  %tmp16 = load i32* %mLength, align 4
+  %mBegin = getelementptr inbounds %struct.Vector2* %result, i64 0, i32 0
+  %tmp14 = load i16** %mBegin, align 8
+  %tmp48 = zext i32 %tmp16 to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvar = phi i64 [ 0, %for.body.lr.ph ], [ %indvar.next, %for.body ]
+  %tmp46 = add i64 %tmp51, %indvar
+  %p.042 = getelementptr [33 x i16]* %buffer, i64 0, i64 %tmp46
+  %tmp47 = sub i64 %indvar, %0
+  %incdec.ptr32 = getelementptr [33 x i16]* %buffer, i64 1, i64 %tmp47
+  %tmp49 = add i64 %tmp48, %indvar
+  %dst.041 = getelementptr i16* %tmp14, i64 %tmp49
+  %tmp29 = load i16* %p.042, align 2
+  store i16 %tmp29, i16* %dst.041, align 2
+  %cmp27 = icmp eq i16* %incdec.ptr32, %add.ptr22
+  %indvar.next = add i64 %indvar, 1
+  br i1 %cmp27, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %do.end
+  %tmp38 = load i32* %mLength, align 4
+  %add = add i32 %tmp38, %conv11
+  store i32 %add, i32* %mLength, align 4
+  ret void
+}
diff --git a/test/Transforms/LoopUnswitch/2011-06-02-CritSwitch.ll b/test/Transforms/LoopUnswitch/2011-06-02-CritSwitch.ll
new file mode 100644
index 0000000..61c54dd
--- /dev/null
+++ b/test/Transforms/LoopUnswitch/2011-06-02-CritSwitch.ll
@@ -0,0 +1,28 @@
+; RUN: opt -loop-unswitch -disable-output
+; PR10031
+
+define i32 @test(i32 %command) {
+entry:
+  br label %tailrecurse
+
+tailrecurse:                                      ; preds = %if.then14, %tailrecurse, %entry
+  br i1 undef, label %if.then, label %tailrecurse
+
+if.then:                                          ; preds = %tailrecurse
+  switch i32 %command, label %sw.bb [
+    i32 2, label %land.lhs.true
+    i32 0, label %land.lhs.true
+  ]
+
+land.lhs.true:                                    ; preds = %if.then, %if.then
+  br i1 undef, label %sw.bb, label %if.then14
+
+if.then14:                                        ; preds = %land.lhs.true
+  switch i32 %command, label %tailrecurse [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb
+  ]
+
+sw.bb:                                            ; preds = %if.then14
+  unreachable
+}
diff --git a/test/Transforms/MemCpyOpt/2011-06-02-CallSlotOverwritten.ll b/test/Transforms/MemCpyOpt/2011-06-02-CallSlotOverwritten.ll
new file mode 100644
index 0000000..132966e
--- /dev/null
+++ b/test/Transforms/MemCpyOpt/2011-06-02-CallSlotOverwritten.ll
@@ -0,0 +1,36 @@
+; RUN: opt < %s -basicaa -memcpyopt -S | FileCheck %s
+; PR10067
+; Make sure the call+copy isn't optimized in such a way that
+; %ret ends up with the wrong value.
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
+target triple = "i386-apple-darwin10"
+
+%struct1 = type { i32, i32 }
+%struct2 = type { %struct1, i8* }
+
+declare void @bar(%struct1* nocapture sret %agg.result) nounwind
+
+define i32 @foo() nounwind {
+  %x = alloca %struct1, align 8
+  %y = alloca %struct2, align 8
+  call void @bar(%struct1* sret %x) nounwind
+; CHECK: call void @bar(%struct1* sret %x)
+
+  %gepn1 = getelementptr inbounds %struct2* %y, i32 0, i32 0, i32 0
+  store i32 0, i32* %gepn1, align 8
+  %gepn2 = getelementptr inbounds %struct2* %y, i32 0, i32 0, i32 1
+  store i32 0, i32* %gepn2, align 4
+
+  %bit1 = bitcast %struct1* %x to i64*
+  %bit2 = bitcast %struct2* %y to i64*
+  %load = load i64* %bit1, align 8
+  store i64 %load, i64* %bit2, align 8
+
+; CHECK: %load = load i64* %bit1, align 8
+; CHECK: store i64 %load, i64* %bit2, align 8
+
+  %gep1 = getelementptr %struct2* %y, i32 0, i32 0, i32 0
+  %ret = load i32* %gep1
+  ret i32 %ret
+}
diff --git a/test/Transforms/MemCpyOpt/memcpy.ll b/test/Transforms/MemCpyOpt/memcpy.ll
index b387d32..5c6a94c 100644
--- a/test/Transforms/MemCpyOpt/memcpy.ll
+++ b/test/Transforms/MemCpyOpt/memcpy.ll
@@ -109,3 +109,23 @@ define void @test6(i8 *%P) {
 ; CHECK-NEXT: ret void
 }
 
+
+; PR9794 - Should forward memcpy into byval argument even though the memcpy
+; isn't itself 8 byte aligned.
+%struct.p = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }
+
+define i32 @test7(%struct.p* nocapture byval align 8 %q) nounwind ssp {
+entry:
+  %agg.tmp = alloca %struct.p, align 4
+  %tmp = bitcast %struct.p* %agg.tmp to i8*
+  %tmp1 = bitcast %struct.p* %q to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp, i8* %tmp1, i64 48, i32 4, i1 false)
+  %call = call i32 @g(%struct.p* byval align 8 %agg.tmp) nounwind
+  ret i32 %call
+; CHECK: @test7
+; CHECK: call i32 @g(%struct.p* byval align 8 %q) nounwind
+}
+
+declare i32 @g(%struct.p* byval align 8)
+
+
diff --git a/test/Transforms/PhaseOrdering/basic.ll b/test/Transforms/PhaseOrdering/basic.ll
index c66e150..e5b2ba4 100644
--- a/test/Transforms/PhaseOrdering/basic.ll
+++ b/test/Transforms/PhaseOrdering/basic.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -O3 -S %s | FileCheck %s
+; XFAIL: *
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-macosx10.6.7"
diff --git a/test/Transforms/PhaseOrdering/dg.exp b/test/Transforms/PhaseOrdering/dg.exp
new file mode 100644
index 0000000..f200589
--- /dev/null
+++ b/test/Transforms/PhaseOrdering/dg.exp
@@ -0,0 +1,3 @@
+load_lib llvm.exp
+
+RunLLVMTests [lsort [glob -nocomplain $srcdir/$subdir/*.{ll,c,cpp}]]
diff --git a/test/Transforms/ScalarRepl/2011-05-06-CapturedAlloca.ll b/test/Transforms/ScalarRepl/2011-05-06-CapturedAlloca.ll
new file mode 100644
index 0000000..816cb60
--- /dev/null
+++ b/test/Transforms/ScalarRepl/2011-05-06-CapturedAlloca.ll
@@ -0,0 +1,26 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; PR9820
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+@func_1.l_10 = internal unnamed_addr constant [4 x i32] [i32 1, i32 0, i32 0, i32 0], align 16
+
+define i32* @noop(i32* %p_29) nounwind readnone {
+entry:
+  ret i32* %p_29
+}
+
+define i32 @main() nounwind {
+entry:
+  %l_10 = alloca [4 x i32], align 16
+  %tmp = bitcast [4 x i32]* %l_10 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp, i8* bitcast ([4 x i32]* @func_1.l_10 to i8*), i64 16, i32 16, i1 false)
+; CHECK: call void @llvm.memcpy
+  %arrayidx = getelementptr inbounds [4 x i32]* %l_10, i64 0, i64 0
+  %call = call i32* @noop(i32* %arrayidx)
+  store i32 0, i32* %call
+  ret i32 0
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
diff --git a/test/Transforms/ScalarRepl/2011-06-08-VectorExtractValue.ll b/test/Transforms/ScalarRepl/2011-06-08-VectorExtractValue.ll
new file mode 100644
index 0000000..32e67fb
--- /dev/null
+++ b/test/Transforms/ScalarRepl/2011-06-08-VectorExtractValue.ll
@@ -0,0 +1,62 @@
+; RUN: opt < %s -S -scalarrepl | FileCheck %s
+; RUN: opt < %s -S -scalarrepl-ssa | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-macosx10.7.0"
+
+%0 = type { <2 x float>, float }
+%struct.PointC3 = type { %struct.array }
+%struct.Point_3 = type { %struct.PointC3 }
+%struct.array = type { [3 x float], [4 x i8] }
+
+; CHECK: main
+; CHECK-NOT: alloca
+; CHECK: extractelement <2 x float> zeroinitializer
+
+define void @main() uwtable ssp {
+entry:
+  %ref.tmp2 = alloca %0, align 16
+  %tmpcast = bitcast %0* %ref.tmp2 to %struct.Point_3*
+  %0 = getelementptr %0* %ref.tmp2, i64 0, i32 0
+  store <2 x float> zeroinitializer, <2 x float>* %0, align 16
+  %1 = getelementptr inbounds %struct.Point_3* %tmpcast, i64 0, i32 0
+  %base.i.i.i = getelementptr inbounds %struct.PointC3* %1, i64 0, i32 0
+  %arrayidx.i.i.i.i = getelementptr inbounds %struct.array* %base.i.i.i, i64 0, i32 0, i64 0
+  %tmp5.i.i = load float* %arrayidx.i.i.i.i, align 4
+  ret void
+}
+
+; CHECK: test1
+; CHECK-NOT: alloca
+; CHECK: extractelement <2 x float> zeroinitializer
+
+define void @test1() uwtable ssp {
+entry:
+  %ref.tmp2 = alloca {<2 x float>, float}, align 16
+  %tmpcast = bitcast {<2 x float>, float}* %ref.tmp2 to float*
+  %0 = getelementptr {<2 x float>, float}* %ref.tmp2, i64 0, i32 0
+  store <2 x float> zeroinitializer, <2 x float>* %0, align 16
+  %tmp5.i.i = load float* %tmpcast, align 4
+  ret void
+}
+
+; CHECK: test2
+; CHECK-NOT: alloca
+; CHECK: and i128
+; CHECK: or i128
+; CHECK: trunc i128
+; CHECK-NOT: insertelement
+; CHECK-NOT: extractelement
+
+define float @test2() uwtable ssp {
+entry:
+  %ref.tmp2 = alloca {<2 x float>, float}, align 16
+  %tmpcast = bitcast {<2 x float>, float}* %ref.tmp2 to float*
+  %tmpcast2 = getelementptr {<2 x float>, float}* %ref.tmp2, i64 0, i32 1
+  %0 = getelementptr {<2 x float>, float}* %ref.tmp2, i64 0, i32 0
+  store <2 x float> zeroinitializer, <2 x float>* %0, align 16
+  store float 1.0, float* %tmpcast2, align 4
+  %r1 = load float* %tmpcast, align 4
+  %r2 = load float* %tmpcast2, align 4
+  %r = fadd float %r1, %r2
+  ret float %r
+}
diff --git a/test/Transforms/ScalarRepl/debuginfo-preserved.ll b/test/Transforms/ScalarRepl/debuginfo-preserved.ll
new file mode 100644
index 0000000..c149134
--- /dev/null
+++ b/test/Transforms/ScalarRepl/debuginfo-preserved.ll
@@ -0,0 +1,61 @@
+; RUN: opt < %s -scalarrepl -S | FileCheck %s
+; RUN: opt < %s -scalarrepl-ssa -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-macosx10.6.0"
+
+; CHECK: f
+; CHECK-NOT: llvm.dbg.declare
+; CHECK: llvm.dbg.value
+; CHECK: llvm.dbg.value
+; CHECK: llvm.dbg.value
+; CHECK: llvm.dbg.value
+; CHECK: llvm.dbg.value
+
+define i32 @f(i32 %a, i32 %b) nounwind ssp {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca i32, align 4
+  %c = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !6), !dbg !7
+  store i32 %b, i32* %b.addr, align 4
+  call void @llvm.dbg.declare(metadata !{i32* %b.addr}, metadata !8), !dbg !9
+  call void @llvm.dbg.declare(metadata !{i32* %c}, metadata !10), !dbg !12
+  %tmp = load i32* %a.addr, align 4, !dbg !13
+  store i32 %tmp, i32* %c, align 4, !dbg !13
+  %tmp1 = load i32* %a.addr, align 4, !dbg !14
+  %tmp2 = load i32* %b.addr, align 4, !dbg !14
+  %add = add nsw i32 %tmp1, %tmp2, !dbg !14
+  store i32 %add, i32* %a.addr, align 4, !dbg !14
+  %tmp3 = load i32* %c, align 4, !dbg !15
+  %tmp4 = load i32* %b.addr, align 4, !dbg !15
+  %sub = sub nsw i32 %tmp3, %tmp4, !dbg !15
+  store i32 %sub, i32* %b.addr, align 4, !dbg !15
+  %tmp5 = load i32* %a.addr, align 4, !dbg !16
+  %tmp6 = load i32* %b.addr, align 4, !dbg !16
+  %add7 = add nsw i32 %tmp5, %tmp6, !dbg !16
+  ret i32 %add7, !dbg !16
+}
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+!llvm.dbg.cu = !{!0}
+!llvm.dbg.sp = !{!1}
+
+!0 = metadata !{i32 589841, i32 0, i32 12, metadata !"/d/j/debug-test.c", metadata !"/Volumes/Data/b", metadata !"clang version 3.0 (trunk 131941)", i1 true, i1 false, metadata !"", i32 0} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{i32 589870, i32 0, metadata !2, metadata !"f", metadata !"f", metadata !"", metadata !2, i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, i32 (i32, i32)* @f, null, null} ; [ DW_TAG_subprogram ]
+!2 = metadata !{i32 589865, metadata !"/d/j/debug-test.c", metadata !"/Volumes/Data/b", metadata !0} ; [ DW_TAG_file_type ]
+!3 = metadata !{i32 589845, metadata !2, metadata !"", metadata !2, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 589860, metadata !0, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!6 = metadata !{i32 590081, metadata !1, metadata !"a", metadata !2, i32 16777217, metadata !5, i32 0} ; [ DW_TAG_arg_variable ]
+!7 = metadata !{i32 1, i32 11, metadata !1, null}
+!8 = metadata !{i32 590081, metadata !1, metadata !"b", metadata !2, i32 33554433, metadata !5, i32 0} ; [ DW_TAG_arg_variable ]
+!9 = metadata !{i32 1, i32 18, metadata !1, null}
+!10 = metadata !{i32 590080, metadata !11, metadata !"c", metadata !2, i32 2, metadata !5, i32 0} ; [ DW_TAG_auto_variable ]
+!11 = metadata !{i32 589835, metadata !1, i32 1, i32 21, metadata !2, i32 0} ; [ DW_TAG_lexical_block ]
+!12 = metadata !{i32 2, i32 9, metadata !11, null}
+!13 = metadata !{i32 2, i32 14, metadata !11, null}
+!14 = metadata !{i32 3, i32 5, metadata !11, null}
+!15 = metadata !{i32 4, i32 5, metadata !11, null}
+!16 = metadata !{i32 5, i32 5, metadata !11, null}
diff --git a/test/Transforms/ScalarRepl/debuginfo.ll b/test/Transforms/ScalarRepl/debuginfo.ll
index 6b8422c..ae2c6cc 100644
--- a/test/Transforms/ScalarRepl/debuginfo.ll
+++ b/test/Transforms/ScalarRepl/debuginfo.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -scalarrepl -S | not grep alloca
+; RUN: opt < %s -scalarrepl-ssa -S | not grep alloca
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
 	%llvm.dbg.anchor.type = type { i32, i32 }
 	%llvm.dbg.basictype.type = type { i32, { }*, i8*, { }*, i32, i64, i64, i64, i32, i32 }
diff --git a/test/Transforms/SimplifyCFG/PR9946.ll b/test/Transforms/SimplifyCFG/PR9946.ll
new file mode 100644
index 0000000..4a61b84
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/PR9946.ll
@@ -0,0 +1,18 @@
+; RUN: opt  %s -simplifycfg -disable-output
+
+@foo = external constant i32
+
+define i32 @f() {
+entry:
+  br i1 icmp eq (i64 and (i64 ptrtoint (i32* @foo to i64), i64 15), i64 0), label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  br label %return
+
+if.end:                                           ; preds = %entry
+  br label %return
+
+return:                                           ; preds = %if.end, %if.then
+  %storemerge = phi i32 [ 1, %if.end ], [ 0, %if.then ]
+  ret i32 %storemerge
+}
diff --git a/test/Transforms/SimplifyCFG/dce-cond-after-folding-terminator.ll b/test/Transforms/SimplifyCFG/dce-cond-after-folding-terminator.ll
new file mode 100644
index 0000000..3996efd
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/dce-cond-after-folding-terminator.ll
@@ -0,0 +1,52 @@
+; RUN: opt -S <%s -simplifycfg | FileCheck %s
+
+define void @test_br(i32 %x) {
+entry:
+; CHECK: @test_br
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret void
+  %cmp = icmp eq i32 %x, 10
+  br i1 %cmp, label %if.then, label %if.then
+
+if.then:                                          ; preds = %entry
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+define void @test_switch(i32 %x) nounwind {
+entry:
+; CHECK: @test_switch
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret void
+  %rem = srem i32 %x, 3
+  switch i32 %rem, label %sw.bb [
+    i32 1, label %sw.bb
+    i32 10, label %sw.bb
+  ]
+
+sw.bb:                                            ; preds = %sw.default, %entry, %entry
+  br label %sw.epilog
+
+sw.epilog:                                        ; preds = %sw.bb
+  ret void
+}
+
+define void @test_indirectbr(i32 %x) {
+entry:
+; CHECK: @test_indirectbr
+; CHECK-NEXT: entry:
+; Ideally this should now check:
+;   CHK-NEXT: ret void
+; But that doesn't happen yet. Instead:
+; CHECK-NEXT: br label %L1
+
+  %label = bitcast i8* blockaddress(@test_indirectbr, %L1) to i8*
+  indirectbr i8* %label, [label %L1, label %L2]
+
+L1:                                               ; preds = %entry
+  ret void
+L2:                                               ; preds = %entry
+  ret void
+}
diff --git a/test/Transforms/SimplifyCFG/indirectbr.ll b/test/Transforms/SimplifyCFG/indirectbr.ll
index 7fb4def..7853e9a 100644
--- a/test/Transforms/SimplifyCFG/indirectbr.ll
+++ b/test/Transforms/SimplifyCFG/indirectbr.ll
@@ -180,3 +180,72 @@ L3:
 ; before SimplifyCFG even looks at the indirectbr.
   indirectbr i8* %anchor, [label %L1, label %L2]
 }
+
+; PR10072
+
+@xblkx.bbs = internal unnamed_addr constant [9 x i8*] [i8* blockaddress(@indbrtest7, %xblkx.begin), i8* blockaddress(@indbrtest7, %xblkx.begin3), i8* blockaddress(@indbrtest7, %xblkx.begin4), i8* blockaddress(@indbrtest7, %xblkx.begin5), i8* blockaddress(@indbrtest7, %xblkx.begin6), i8* blockaddress(@indbrtest7, %xblkx.begin7), i8* blockaddress(@indbrtest7, %xblkx.begin8), i8* blockaddress(@indbrtest7, %xblkx.begin9), i8* blockaddress(@indbrtest7, %xblkx.end)]
+
+define void @indbrtest7() {
+escape-string.top:
+  %xval202x = call i32 @xfunc5x()
+  br label %xlab5x
+
+xlab8x:                                           ; preds = %xlab5x
+  %xvaluex = call i32 @xselectorx()
+  %xblkx.x = getelementptr [9 x i8*]* @xblkx.bbs, i32 0, i32 %xvaluex
+  %xblkx.load = load i8** %xblkx.x
+  indirectbr i8* %xblkx.load, [label %xblkx.begin, label %xblkx.begin3, label %xblkx.begin4, label %xblkx.begin5, label %xblkx.begin6, label %xblkx.begin7, label %xblkx.begin8, label %xblkx.begin9, label %xblkx.end]
+
+xblkx.begin:
+  br label %xblkx.end
+
+xblkx.begin3:
+  br label %xblkx.end
+
+xblkx.begin4:
+  br label %xblkx.end
+
+xblkx.begin5:
+  br label %xblkx.end
+
+xblkx.begin6:
+  br label %xblkx.end
+
+xblkx.begin7:
+  br label %xblkx.end
+
+xblkx.begin8:
+  br label %xblkx.end
+
+xblkx.begin9:
+  br label %xblkx.end
+
+xblkx.end:
+  %yes.0 = phi i1 [ false, %xblkx.begin ], [ true, %xlab8x ], [ false, %xblkx.begin9 ], [ false, %xblkx.begin8 ], [ false, %xblkx.begin7 ], [ false, %xblkx.begin6 ], [ false, %xblkx.begin5 ], [ true, %xblkx.begin4 ], [ false, %xblkx.begin3 ]
+  br i1 %yes.0, label %v2j, label %xlab17x
+
+v2j:
+; CHECK: %xunusedx = call i32 @xactionx()
+  %xunusedx = call i32 @xactionx()
+  br label %xlab4x
+
+xlab17x:
+  br label %xlab4x
+
+xlab4x:
+  %incr19 = add i32 %xval704x.0, 1
+  br label %xlab5x
+
+xlab5x:
+  %xval704x.0 = phi i32 [ 0, %escape-string.top ], [ %incr19, %xlab4x ]
+  %xval10x = icmp ult i32 %xval704x.0, %xval202x
+  br i1 %xval10x, label %xlab8x, label %xlab9x
+
+xlab9x:
+  ret void
+}
+
+declare i32 @xfunc5x()
+declare i8 @xfunc7x()
+declare i32 @xselectorx()
+declare i32 @xactionx()
diff --git a/test/Transforms/SimplifyCFG/switch-masked-bits.ll b/test/Transforms/SimplifyCFG/switch-masked-bits.ll
new file mode 100644
index 0000000..fc83ec2
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/switch-masked-bits.ll
@@ -0,0 +1,38 @@
+; RUN: opt -S -simplifycfg < %s | FileCheck %s
+
+define i32 @test1(i32 %x) nounwind {
+  %i = shl i32 %x, 1
+  switch i32 %i, label %a [
+    i32 21, label %b
+    i32 24, label %c
+  ]
+
+a:
+  ret i32 0
+b:
+  ret i32 3
+c:
+  ret i32 5
+; CHECK: @test1
+; CHECK: %cond = icmp eq i32 %i, 24
+; CHECK: %merge = select i1 %cond, i32 5, i32 0
+; CHECK: ret i32 %merge
+}
+
+
+define i32 @test2(i32 %x) nounwind {
+  %i = shl i32 %x, 1
+  switch i32 %i, label %a [
+    i32 21, label %b
+    i32 23, label %c
+  ]
+
+a:
+  ret i32 0
+b:
+  ret i32 3
+c:
+  ret i32 5
+; CHECK: @test2
+; CHECK: ret i32 0
+}
diff --git a/test/Transforms/TailCallElim/setjmp.ll b/test/Transforms/TailCallElim/setjmp.ll
new file mode 100644
index 0000000..7b7fe56
--- /dev/null
+++ b/test/Transforms/TailCallElim/setjmp.ll
@@ -0,0 +1,16 @@
+; RUN: opt < %s -tailcallelim -S | FileCheck %s
+
+; Test that we don't tail call in a functions that calls setjmp.
+
+; CHECK-NOT: tail call void @bar()
+
+define void @foo(i32* %x) {
+bb:
+  %tmp75 = tail call i32 @setjmp(i32* %x)
+  call void @bar()
+  ret void
+}
+
+declare i32 @setjmp(i32*)
+
+declare void @bar()
diff --git a/test/Transforms/TailDup/X86/dg.exp b/test/Transforms/TailDup/X86/dg.exp
new file mode 100644
index 0000000..7b7bd4e
--- /dev/null
+++ b/test/Transforms/TailDup/X86/dg.exp
@@ -0,0 +1,5 @@
+load_lib llvm.exp
+
+if { [llvm_supports_target X86] } {
+  RunLLVMTests [lsort [glob -nocomplain $srcdir/$subdir/*.{ll}]]
+}
diff --git a/test/Transforms/TailDup/if-tail-dup.ll b/test/Transforms/TailDup/X86/if-tail-dup.ll
index 2e4f5be..2e4f5be 100644
--- a/test/Transforms/TailDup/if-tail-dup.ll
+++ b/test/Transforms/TailDup/X86/if-tail-dup.ll
diff --git a/tools/bugpoint/ExecutionDriver.cpp b/tools/bugpoint/ExecutionDriver.cpp
index 9be9dfd..77c01ac 100644
--- a/tools/bugpoint/ExecutionDriver.cpp
+++ b/tools/bugpoint/ExecutionDriver.cpp
@@ -475,7 +475,7 @@ bool BugDriver::createReferenceFile(Module *M, const std::string &Filename) {
 /// diffProgram - This method executes the specified module and diffs the
 /// output against the file specified by ReferenceOutputFile.  If the output
 /// is different, 1 is returned.  If there is a problem with the code
-/// generator (e.g., llc crashes), this will return -1 and set Error.
+/// generator (e.g., llc crashes), this will set ErrMsg.
 ///
 bool BugDriver::diffProgram(const Module *Program,
                             const std::string &BitcodeFile,
diff --git a/tools/bugpoint/Miscompilation.cpp b/tools/bugpoint/Miscompilation.cpp
index a9db38f..1834fe1 100644
--- a/tools/bugpoint/Miscompilation.cpp
+++ b/tools/bugpoint/Miscompilation.cpp
@@ -624,9 +624,10 @@ DebugAMiscompilation(BugDriver &BD,
   if (!BugpointIsInterrupted)
     ReduceMiscompilingFunctions(BD, TestFn).reduceList(MiscompiledFunctions,
                                                        Error);
-  if (!Error.empty())
+  if (!Error.empty()) {
+    errs() << "\n***Cannot reduce functions: ";
     return MiscompiledFunctions;
-
+  }
   outs() << "\n*** The following function"
          << (MiscompiledFunctions.size() == 1 ? " is" : "s are")
          << " being miscompiled: ";
diff --git a/tools/bugpoint/OptimizerDriver.cpp b/tools/bugpoint/OptimizerDriver.cpp
index c6be271..336c83d 100644
--- a/tools/bugpoint/OptimizerDriver.cpp
+++ b/tools/bugpoint/OptimizerDriver.cpp
@@ -223,7 +223,7 @@ bool BugDriver::runPasses(Module *Program,
       if (result == -1)
         outs() << "Execute failed: " << ErrMsg << "\n";
       else
-        outs() << "Crashed with signal #" << abs(result) << "\n";
+        outs() << "Crashed: " << ErrMsg << "\n";
     }
     if (result & 0x01000000)
       outs() << "Dumped core\n";
diff --git a/tools/bugpoint/ToolRunner.cpp b/tools/bugpoint/ToolRunner.cpp
index 6c46ef1..0d98262 100644
--- a/tools/bugpoint/ToolRunner.cpp
+++ b/tools/bugpoint/ToolRunner.cpp
@@ -854,9 +854,18 @@ int GCC::ExecuteProgram(const std::string &ProgramFile,
 
   if (RemoteClientPath.isEmpty()) {
     DEBUG(errs() << "<run locally>");
-    return RunProgramWithTimeout(OutputBinary, &ProgramArgs[0],
+    int ExitCode = RunProgramWithTimeout(OutputBinary, &ProgramArgs[0],
         sys::Path(InputFile), sys::Path(OutputFile), sys::Path(OutputFile),
         Timeout, MemoryLimit, Error);
+    // Treat a signal (usually SIGSEGV) or timeout as part of the program output
+    // so that crash-causing miscompilation is handled seamlessly.
+    if (ExitCode < -1) {
+      std::ofstream outFile(OutputFile.c_str(), std::ios_base::app);
+      outFile << *Error << '\n';
+      outFile.close();
+      Error->clear();
+    }
+    return ExitCode;
   } else {
     outs() << "<run remotely>"; outs().flush();
     return RunProgramRemotelyWithTimeout(sys::Path(RemoteClientPath),
diff --git a/tools/bugpoint/bugpoint.cpp b/tools/bugpoint/bugpoint.cpp
index f9c9e18..e25414f 100644
--- a/tools/bugpoint/bugpoint.cpp
+++ b/tools/bugpoint/bugpoint.cpp
@@ -22,7 +22,7 @@
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/PluginLoader.h"
 #include "llvm/Support/PrettyStackTrace.h"
-#include "llvm/Support/StandardPasses.h"
+#include "llvm/Support/PassManagerBuilder.h"
 #include "llvm/Support/Process.h"
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/Valgrind.h"
@@ -69,6 +69,18 @@ static cl::opt<bool>
 StandardLinkOpts("std-link-opts", 
                  cl::desc("Include the standard link time optimizations"));
 
+static cl::opt<bool>
+OptLevelO1("O1",
+           cl::desc("Optimization level 1. Similar to llvm-gcc -O1"));
+
+static cl::opt<bool>
+OptLevelO2("O2",
+           cl::desc("Optimization level 2. Similar to llvm-gcc -O2"));
+
+static cl::opt<bool>
+OptLevelO3("O3",
+           cl::desc("Optimization level 3. Similar to llvm-gcc -O3"));
+
 static cl::opt<std::string>
 OverrideTriple("mtriple", cl::desc("Override target triple for module"));
 
@@ -83,10 +95,10 @@ static void BugpointInterruptFunction() {
 
 // Hack to capture a pass list.
 namespace {
-  class AddToDriver : public PassManager {
+  class AddToDriver : public FunctionPassManager {
     BugDriver &D;
   public:
-    AddToDriver(BugDriver &_D) : D(_D) {}
+    AddToDriver(BugDriver &_D) : FunctionPassManager(0), D(_D) {}
     
     virtual void add(Pass *P) {
       const void *ID = P->getPassID();
@@ -146,20 +158,32 @@ int main(int argc, char **argv) {
   
   AddToDriver PM(D);
   if (StandardCompileOpts) {
-    createStandardModulePasses(&PM, 3,
-                               /*OptimizeSize=*/ false,
-                               /*UnitAtATime=*/ true,
-                               /*UnrollLoops=*/ true,
-                               /*SimplifyLibCalls=*/ true,
-                               /*HaveExceptions=*/ true,
-                               createFunctionInliningPass());
+    PassManagerBuilder Builder;
+    Builder.OptLevel = 3;
+    Builder.Inliner = createFunctionInliningPass();
+    Builder.populateModulePassManager(PM);
   }
       
-  if (StandardLinkOpts)
-    createStandardLTOPasses(&PM, /*Internalize=*/true,
-                            /*RunInliner=*/true,
-                            /*VerifyEach=*/false);
+  if (StandardLinkOpts) {
+    PassManagerBuilder Builder;
+    Builder.populateLTOPassManager(PM, /*Internalize=*/true,
+                                   /*RunInliner=*/true);
+  }
 
+  if (OptLevelO1 || OptLevelO2 || OptLevelO3) {
+    PassManagerBuilder Builder;
+    if (OptLevelO1)
+      Builder.Inliner = createAlwaysInlinerPass();
+    else if (OptLevelO2)
+      Builder.Inliner = createFunctionInliningPass(225);
+    else
+      Builder.Inliner = createFunctionInliningPass(275);
+
+    // Note that although clang/llvm-gcc use two separate passmanagers
+    // here, it shouldn't normally make a difference.
+    Builder.populateFunctionPassManager(PM);
+    Builder.populateModulePassManager(PM);
+  }
 
   for (std::vector<const PassInfo*>::iterator I = PassList.begin(),
          E = PassList.end();
diff --git a/tools/gold/CMakeLists.txt b/tools/gold/CMakeLists.txt
index d8633e6..eb4b6e6 100644
--- a/tools/gold/CMakeLists.txt
+++ b/tools/gold/CMakeLists.txt
@@ -1,7 +1,9 @@
-set(LLVM_BINUTILS_INCDIR "/usr/include" CACHE PATH
+set(LLVM_BINUTILS_INCDIR "" CACHE PATH
   "PATH to binutils/include containing plugin-api.h for gold plugin.")
 
-if( NOT EXISTS "${LLVM_BINUTILS_INCDIR}/plugin-api.h" )
+if( NOT LLVM_BINUTILS_INCDIR )
+  # Nothing to say.
+elseif( NOT EXISTS "${LLVM_BINUTILS_INCDIR}/plugin-api.h" )
   message(STATUS "plugin-api.h not found. gold plugin excluded from the build.")
 else()
   include_directories( ${LLVM_BINUTILS_INCDIR} )
diff --git a/tools/gold/Makefile b/tools/gold/Makefile
index 66a0271..759406f 100644
--- a/tools/gold/Makefile
+++ b/tools/gold/Makefile
@@ -22,10 +22,10 @@ SHARED_LIBRARY = 1
 LOADABLE_MODULE = 1
 
 LINK_COMPONENTS := support
-LIBS += -llto
 
 # Because off_t is used in the public API, the largefile parts are required for
 # ABI compatibility.
-CXXFLAGS+=-I$(BINUTILS_INCDIR) -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -lLTO
+CXXFLAGS+=-I$(BINUTILS_INCDIR) -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64
+CXXFLAGS+=$(SharedLibDir)/$(SharedPrefix)LTO$(SHLIBEXT)
 
 include $(LEVEL)/Makefile.common
diff --git a/tools/gold/gold-plugin.cpp b/tools/gold/gold-plugin.cpp
index dd66eae..9e43bef 100644
--- a/tools/gold/gold-plugin.cpp
+++ b/tools/gold/gold-plugin.cpp
@@ -408,7 +408,6 @@ static ld_plugin_status all_symbols_read_hook(void) {
     if (options::generate_bc_file == options::BC_ONLY)
       exit(0);
   }
-  size_t bufsize = 0;
   const char *objPath;
   if (lto_codegen_compile_to_file(code_gen, &objPath)) {
     (*message)(LDPL_ERROR, "Could not produce a combined object file\n");
diff --git a/tools/llc/llc.cpp b/tools/llc/llc.cpp
index a36b6d7..162d6c8 100644
--- a/tools/llc/llc.cpp
+++ b/tools/llc/llc.cpp
@@ -107,11 +107,6 @@ DisableRedZone("disable-red-zone",
   cl::desc("Do not emit code that uses the red zone."),
   cl::init(false));
 
-static cl::opt<bool>
-NoImplicitFloats("no-implicit-float",
-  cl::desc("Don't generate implicit floating point instructions (x86-only)"),
-  cl::init(false));
-
 // GetFileNameRoot - Helper function to get the basename of a filename.
 static inline std::string
 GetFileNameRoot(const std::string &InputFilename) {
diff --git a/tools/llvm-ld/Optimize.cpp b/tools/llvm-ld/Optimize.cpp
index ef4502b..ca6a477 100644
--- a/tools/llvm-ld/Optimize.cpp
+++ b/tools/llvm-ld/Optimize.cpp
@@ -12,9 +12,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Module.h"
-#include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Support/StandardPasses.h"
+#include "llvm/Support/PassManagerBuilder.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/DynamicLibrary.h"
 #include "llvm/Target/TargetData.h"
@@ -71,11 +70,10 @@ static inline void addPass(PassManager &PM, Pass *P) {
 }
 
 namespace llvm {
-
 /// Optimize - Perform link time optimizations. This will run the scalar
 /// optimizations, any loaded plugin-optimization modules, and then the
 /// inter-procedural optimizations if applicable.
-void Optimize(Module* M) {
+void Optimize(Module *M) {
 
   // Instantiate the pass manager to organize the passes.
   PassManager Passes;
@@ -88,8 +86,8 @@ void Optimize(Module* M) {
   addPass(Passes, new TargetData(M));
 
   if (!DisableOptimizations)
-    createStandardLTOPasses(&Passes, !DisableInternalize, !DisableInline,
-                            VerifyEach);
+    PassManagerBuilder().populateLTOPassManager(Passes, !DisableInternalize,
+                                                !DisableInline);
 
   // If the -s or -S command line options were specified, strip the symbols out
   // of the resulting program to make it smaller.  -s and -S are GNU ld options
diff --git a/tools/llvm-mc/Disassembler.cpp b/tools/llvm-mc/Disassembler.cpp
index cdd1892..93b9723 100644
--- a/tools/llvm-mc/Disassembler.cpp
+++ b/tools/llvm-mc/Disassembler.cpp
@@ -39,7 +39,7 @@ private:
   const ByteArrayTy &Bytes;
 public:
   VectorMemoryObject(const ByteArrayTy &bytes) : Bytes(bytes) {}
-  
+
   uint64_t getBase() const { return 0; }
   uint64_t getExtent() const { return Bytes.size(); }
 
@@ -57,15 +57,15 @@ static bool PrintInsts(const MCDisassembler &DisAsm,
                        SourceMgr &SM, raw_ostream &Out) {
   // Wrap the vector in a MemoryObject.
   VectorMemoryObject memoryObject(Bytes);
-  
+
   // Disassemble it to strings.
   uint64_t Size;
   uint64_t Index;
-  
+
   for (Index = 0; Index < Bytes.size(); Index += Size) {
     MCInst Inst;
-    
-    if (DisAsm.getInstruction(Inst, Size, memoryObject, Index, 
+
+    if (DisAsm.getInstruction(Inst, Size, memoryObject, Index,
                                /*REMOVE*/ nulls())) {
       Printer.printInst(&Inst, Out);
       Out << "\n";
@@ -76,12 +76,12 @@ static bool PrintInsts(const MCDisassembler &DisAsm,
         Size = 1; // skip illegible bytes
     }
   }
-  
+
   return false;
 }
 
-static bool ByteArrayFromString(ByteArrayTy &ByteArray, 
-                                StringRef &Str, 
+static bool ByteArrayFromString(ByteArrayTy &ByteArray,
+                                StringRef &Str,
                                 SourceMgr &SM) {
   while (!Str.empty()) {
     // Strip horizontal whitespace.
@@ -89,7 +89,7 @@ static bool ByteArrayFromString(ByteArrayTy &ByteArray,
       Str = Str.substr(Pos);
       continue;
     }
-    
+
     // If this is the end of a line or start of a comment, remove the rest of
     // the line.
     if (Str[0] == '\n' || Str[0] == '#') {
@@ -104,11 +104,11 @@ static bool ByteArrayFromString(ByteArrayTy &ByteArray,
       }
       continue;
     }
-    
+
     // Get the current token.
     size_t Next = Str.find_first_of(" \t\n\r#");
     StringRef Value = Str.substr(0, Next);
-    
+
     // Convert to a byte and add to the byte vector.
     unsigned ByteVal;
     if (Value.getAsInteger(0, ByteVal) || ByteVal > 255) {
@@ -119,11 +119,11 @@ static bool ByteArrayFromString(ByteArrayTy &ByteArray,
       ByteArray.clear();
       continue;
     }
-    
+
     ByteArray.push_back(std::make_pair((unsigned char)ByteVal, Value.data()));
     Str = Str.substr(Next);
   }
-  
+
   return false;
 }
 
@@ -133,18 +133,18 @@ int Disassembler::disassemble(const Target &T,  TargetMachine &TM,
                               raw_ostream &Out) {
   // Set up disassembler.
   OwningPtr<const MCAsmInfo> AsmInfo(T.createAsmInfo(Triple));
-  
+
   if (!AsmInfo) {
     errs() << "error: no assembly info for target " << Triple << "\n";
     return -1;
   }
-  
+
   OwningPtr<const MCDisassembler> DisAsm(T.createMCDisassembler());
   if (!DisAsm) {
     errs() << "error: no disassembler for target " << Triple << "\n";
     return -1;
   }
-  
+
   int AsmPrinterVariant = AsmInfo->getAssemblerDialect();
   OwningPtr<MCInstPrinter> IP(T.createMCInstPrinter(TM, AsmPrinterVariant,
                                                     *AsmInfo));
@@ -152,67 +152,67 @@ int Disassembler::disassemble(const Target &T,  TargetMachine &TM,
     errs() << "error: no instruction printer for target " << Triple << '\n';
     return -1;
   }
-  
+
   bool ErrorOccurred = false;
-  
+
   SourceMgr SM;
   SM.AddNewSourceBuffer(&Buffer, SMLoc());
-  
+
   // Convert the input to a vector for disassembly.
   ByteArrayTy ByteArray;
   StringRef Str = Buffer.getBuffer();
-  
+
   ErrorOccurred |= ByteArrayFromString(ByteArray, Str, SM);
-  
+
   if (!ByteArray.empty())
     ErrorOccurred |= PrintInsts(*DisAsm, *IP, ByteArray, SM, Out);
-    
+
   return ErrorOccurred;
 }
 
 static int byteArrayReader(uint8_t *B, uint64_t A, void *Arg) {
   ByteArrayTy &ByteArray = *((ByteArrayTy*)Arg);
-  
+
   if (A >= ByteArray.size())
     return -1;
-  
+
   *B = ByteArray[A].first;
-  
+
   return 0;
 }
 
 static int verboseEvaluator(uint64_t *V, unsigned R, void *Arg) {
   EDDisassembler &disassembler = *(EDDisassembler *)((void **)Arg)[0];
   raw_ostream &Out = *(raw_ostream *)((void **)Arg)[1];
-  
+
   if (const char *regName = disassembler.nameWithRegisterID(R))
     Out << "[" << regName << "/" << R << "]";
-  
+
   if (disassembler.registerIsStackPointer(R))
     Out << "(sp)";
   if (disassembler.registerIsProgramCounter(R))
     Out << "(pc)";
-  
+
   *V = 0;
   return 0;
 }
 
-int Disassembler::disassembleEnhanced(const std::string &TS, 
+int Disassembler::disassembleEnhanced(const std::string &TS,
                                       MemoryBuffer &Buffer,
                                       raw_ostream &Out) {
   ByteArrayTy ByteArray;
   StringRef Str = Buffer.getBuffer();
   SourceMgr SM;
-  
+
   SM.AddNewSourceBuffer(&Buffer, SMLoc());
-  
+
   if (ByteArrayFromString(ByteArray, Str, SM)) {
     return -1;
   }
-  
+
   Triple T(TS);
   EDDisassembler::AssemblySyntax AS;
-  
+
   switch (T.getArch()) {
   default:
     errs() << "error: no default assembly syntax for " << TS.c_str() << "\n";
@@ -226,53 +226,53 @@ int Disassembler::disassembleEnhanced(const std::string &TS,
     AS = EDDisassembler::kEDAssemblySyntaxX86ATT;
     break;
   }
-  
+
   EDDisassembler::initialize();
   OwningPtr<EDDisassembler>
     disassembler(EDDisassembler::getDisassembler(TS.c_str(), AS));
-  
+
   if (disassembler == 0) {
     errs() << "error: couldn't get disassembler for " << TS << '\n';
     return -1;
   }
-  
+
   while (ByteArray.size()) {
     OwningPtr<EDInst>
       inst(disassembler->createInst(byteArrayReader, 0, &ByteArray));
-  
+
     if (inst == 0) {
       errs() << "error: Didn't get an instruction\n";
       return -1;
     }
 
     ByteArray.erase (ByteArray.begin(), ByteArray.begin() + inst->byteSize());
-    
+
     unsigned numTokens = inst->numTokens();
     if ((int)numTokens < 0) {
       errs() << "error: couldn't count the instruction's tokens\n";
       return -1;
     }
-    
+
     for (unsigned tokenIndex = 0; tokenIndex != numTokens; ++tokenIndex) {
       EDToken *token;
-      
+
       if (inst->getToken(token, tokenIndex)) {
         errs() << "error: Couldn't get token\n";
         return -1;
       }
-      
+
       const char *buf;
       if (token->getString(buf)) {
         errs() << "error: Couldn't get string for token\n";
         return -1;
       }
-      
+
       Out << '[';
       int operandIndex = token->operandID();
-      
+
       if (operandIndex >= 0)
         Out << operandIndex << "-";
-      
+
       switch (token->type()) {
       default: Out << "?"; break;
       case EDToken::kTokenWhitespace: Out << "w"; break;
@@ -281,9 +281,9 @@ int Disassembler::disassembleEnhanced(const std::string &TS,
       case EDToken::kTokenLiteral: Out << "l"; break;
       case EDToken::kTokenRegister: Out << "r"; break;
       }
-      
+
       Out << ":" << buf;
-    
+
       if (token->type() == EDToken::kTokenLiteral) {
         Out << "=";
         if (token->literalSign())
@@ -303,33 +303,34 @@ int Disassembler::disassembleEnhanced(const std::string &TS,
         }
         Out << "r" << regID;
       }
-      
+
       Out << "]";
     }
-    
+
     Out << " ";
-      
+
     if (inst->isBranch())
       Out << "<br> ";
     if (inst->isMove())
       Out << "<mov> ";
-    
+
     unsigned numOperands = inst->numOperands();
-    
+
     if ((int)numOperands < 0) {
       errs() << "error: Couldn't count operands\n";
       return -1;
     }
-    
-    for (unsigned operandIndex = 0; operandIndex != numOperands; ++operandIndex) {
+
+    for (unsigned operandIndex = 0; operandIndex != numOperands;
+         ++operandIndex) {
       Out << operandIndex << ":";
-      
+
       EDOperand *operand;
       if (inst->getOperand(operand, operandIndex)) {
         errs() << "error: couldn't get operand\n";
         return -1;
       }
-      
+
       uint64_t evaluatedResult;
       void *Arg[] = { disassembler.get(), &Out };
       if (operand->evaluate(evaluatedResult, verboseEvaluator, Arg)) {
@@ -338,10 +339,10 @@ int Disassembler::disassembleEnhanced(const std::string &TS,
       }
       Out << "=" << evaluatedResult << " ";
     }
-    
+
     Out << '\n';
   }
-  
+
   return 0;
 }
 
diff --git a/tools/llvm-mc/Disassembler.h b/tools/llvm-mc/Disassembler.h
index aaf77b5..d738ee7 100644
--- a/tools/llvm-mc/Disassembler.h
+++ b/tools/llvm-mc/Disassembler.h
@@ -31,12 +31,12 @@ public:
                          const std::string &tripleString,
                          MemoryBuffer &buffer,
                          raw_ostream &Out);
-  
+
   static int disassembleEnhanced(const std::string &tripleString,
                                  MemoryBuffer &buffer,
                                  raw_ostream &Out);
 };
-  
+
 } // namespace llvm
 
 #endif
diff --git a/tools/llvm-mc/llvm-mc.cpp b/tools/llvm-mc/llvm-mc.cpp
index 24cc263..eb23a1a 100644
--- a/tools/llvm-mc/llvm-mc.cpp
+++ b/tools/llvm-mc/llvm-mc.cpp
@@ -183,10 +183,10 @@ static int AsLexInput(const char *ProgName) {
   MemoryBuffer *Buffer = BufferPtr.take();
 
   SourceMgr SrcMgr;
-  
+
   // Tell SrcMgr about this buffer, which is what TGParser will pick up.
   SrcMgr.AddNewSourceBuffer(Buffer, SMLoc());
-  
+
   // Record the location of the include directories so that the lexer can find
   // it later.
   SrcMgr.setIncludeDirs(IncludeDirs);
@@ -279,7 +279,7 @@ static int AsLexInput(const char *ProgName) {
 
   // Keep output if no errors.
   if (Error == 0) Out->keep();
- 
+
   return Error;
 }
 
@@ -294,20 +294,20 @@ static int AssembleInput(const char *ProgName) {
     return 1;
   }
   MemoryBuffer *Buffer = BufferPtr.take();
-  
+
   SourceMgr SrcMgr;
-  
+
   // Tell SrcMgr about this buffer, which is what the parser will pick up.
   SrcMgr.AddNewSourceBuffer(Buffer, SMLoc());
-  
+
   // Record the location of the include directories so that the lexer can find
   // it later.
   SrcMgr.setIncludeDirs(IncludeDirs);
-  
-  
+
+
   llvm::OwningPtr<MCAsmInfo> MAI(TheTarget->createAsmInfo(TripleName));
   assert(MAI && "Unable to create target asm info!");
-  
+
   // Package up features to be passed to target/subtarget
   std::string FeaturesStr;
   if (MCPU.size()) {
@@ -378,7 +378,7 @@ static int AssembleInput(const char *ProgName) {
                                                    *Str.get(), *MAI));
   OwningPtr<TargetAsmParser> TAP(TheTarget->createAsmParser(*Parser, *TM));
   if (!TAP) {
-    errs() << ProgName 
+    errs() << ProgName
            << ": error: this target does not support assembly parsing.\n";
     return 1;
   }
@@ -404,7 +404,7 @@ static int DisassembleInput(const char *ProgName, bool Enhanced) {
     errs() << ProgName << ": " << ec.message() << '\n';
     return 1;
   }
-  
+
   OwningPtr<tool_output_file> Out(GetOutputStream());
   if (!Out)
     return 1;
@@ -459,7 +459,7 @@ int main(int argc, char **argv) {
   llvm::InitializeAllAsmPrinters();
   llvm::InitializeAllAsmParsers();
   llvm::InitializeAllDisassemblers();
-  
+
   cl::ParseCommandLineOptions(argc, argv, "llvm machine code playground\n");
   TripleName = Triple::normalize(TripleName);
 
@@ -474,7 +474,7 @@ int main(int argc, char **argv) {
   case AC_EDisassemble:
     return DisassembleInput(argv[0], true);
   }
-  
+
   return 0;
 }
 
diff --git a/tools/llvmc/src/Base.td.in b/tools/llvmc/src/Base.td.in
index 527913c..84e39e7 100644
--- a/tools/llvmc/src/Base.td.in
+++ b/tools/llvmc/src/Base.td.in
@@ -301,7 +301,7 @@ def llc : Tool<
 [(in_language "llvm-bitcode", "llvm-assembler"),
  (out_language "assembler"),
  (output_suffix "s"),
- (command "llc"),
+ (command "llc -disable-cfi"),
  (actions (case
           (switch_on "S"), (stop_compilation),
           (switch_on "O0"), (forward "O0"),
diff --git a/tools/lto/LTOCodeGenerator.cpp b/tools/lto/LTOCodeGenerator.cpp
index d95f354..3abd641 100644
--- a/tools/lto/LTOCodeGenerator.cpp
+++ b/tools/lto/LTOCodeGenerator.cpp
@@ -14,7 +14,6 @@
 
 #include "LTOModule.h"
 #include "LTOCodeGenerator.h"
-
 #include "llvm/Constants.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/Linker.h"
@@ -37,7 +36,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/StandardPasses.h"
+#include "llvm/Support/PassManagerBuilder.h"
 #include "llvm/Support/SystemUtils.h"
 #include "llvm/Support/ToolOutputFile.h"
 #include "llvm/Support/Host.h"
@@ -355,9 +354,8 @@ void LTOCodeGenerator::applyScopeRestrictions() {
 }
 
 /// Optimize merged modules using various IPO passes
-bool LTOCodeGenerator::generateObjectFile(raw_ostream& out,
-                                          std::string& errMsg)
-{
+bool LTOCodeGenerator::generateObjectFile(raw_ostream &out,
+                                          std::string &errMsg) {
     if ( this->determineTarget(errMsg) ) 
         return true;
 
@@ -380,13 +378,13 @@ bool LTOCodeGenerator::generateObjectFile(raw_ostream& out,
     // Add an appropriate TargetData instance for this module...
     passes.add(new TargetData(*_target->getTargetData()));
     
-    createStandardLTOPasses(&passes, /*Internalize=*/ false, !DisableInline,
-                            /*VerifyEach=*/ false);
+    PassManagerBuilder().populateLTOPassManager(passes, /*Internalize=*/ false,
+                                                !DisableInline);
 
     // Make sure everything is still good.
     passes.add(createVerifierPass());
 
-    FunctionPassManager* codeGenPasses = new FunctionPassManager(mergedModule);
+    FunctionPassManager *codeGenPasses = new FunctionPassManager(mergedModule);
 
     codeGenPasses->add(new TargetData(*_target->getTargetData()));
 
diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp
index 25474c4..aa375c5 100644
--- a/tools/opt/opt.cpp
+++ b/tools/opt/opt.cpp
@@ -35,7 +35,7 @@
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/PluginLoader.h"
 #include "llvm/Support/PrettyStackTrace.h"
-#include "llvm/Support/StandardPasses.h"
+#include "llvm/Support/PassManagerBuilder.h"
 #include "llvm/Support/SystemUtils.h"
 #include "llvm/Support/ToolOutputFile.h"
 #include "llvm/LinkAllPasses.h"
@@ -387,10 +387,12 @@ struct BreakpointPrinter : public ModulePass {
     AU.setPreservesAll();
   }
 };
+ 
+} // anonymous namespace
 
 char BreakpointPrinter::ID = 0;
 
-inline void addPass(PassManagerBase &PM, Pass *P) {
+static inline void addPass(PassManagerBase &PM, Pass *P) {
   // Add the pass to the pass manager...
   PM.add(P);
 
@@ -403,31 +405,30 @@ inline void addPass(PassManagerBase &PM, Pass *P) {
 /// duplicates llvm-gcc behaviour.
 ///
 /// OptLevel - Optimization Level
-void AddOptimizationPasses(PassManagerBase &MPM, PassManagerBase &FPM,
-                           unsigned OptLevel) {
-  createStandardFunctionPasses(&FPM, OptLevel);
+static void AddOptimizationPasses(PassManagerBase &MPM,FunctionPassManager &FPM,
+                                  unsigned OptLevel) {
+  PassManagerBuilder Builder;
+  Builder.OptLevel = OptLevel;
 
-  llvm::Pass *InliningPass = 0;
   if (DisableInline) {
     // No inlining pass
-  } else if (OptLevel) {
+  } else if (OptLevel > 1) {
     unsigned Threshold = 225;
     if (OptLevel > 2)
       Threshold = 275;
-    InliningPass = createFunctionInliningPass(Threshold);
+    Builder.Inliner = createFunctionInliningPass(Threshold);
   } else {
-    InliningPass = createAlwaysInlinerPass();
+    Builder.Inliner = createAlwaysInlinerPass();
   }
-  createStandardModulePasses(&MPM, OptLevel,
-                             /*OptimizeSize=*/ false,
-                             UnitAtATime,
-                             /*UnrollLoops=*/ OptLevel > 1,
-                             !DisableSimplifyLibCalls,
-                             /*HaveExceptions=*/ true,
-                             InliningPass);
+  Builder.DisableUnitAtATime = !UnitAtATime;
+  Builder.DisableUnrollLoops = OptLevel == 0;
+  Builder.DisableSimplifyLibCalls = DisableSimplifyLibCalls;
+  
+  Builder.populateFunctionPassManager(FPM);
+  Builder.populateModulePassManager(MPM);
 }
 
-void AddStandardCompilePasses(PassManagerBase &PM) {
+static void AddStandardCompilePasses(PassManagerBase &PM) {
   PM.add(createVerifierPass());                  // Verify that input is correct
 
   addPass(PM, createLowerSetJmpPass());          // Lower llvm.setjmp/.longjmp
@@ -438,19 +439,16 @@ void AddStandardCompilePasses(PassManagerBase &PM) {
 
   if (DisableOptimizations) return;
 
-  llvm::Pass *InliningPass = !DisableInline ? createFunctionInliningPass() : 0;
-
   // -std-compile-opts adds the same module passes as -O3.
-  createStandardModulePasses(&PM, 3,
-                             /*OptimizeSize=*/ false,
-                             /*UnitAtATime=*/ true,
-                             /*UnrollLoops=*/ true,
-                             !DisableSimplifyLibCalls,
-                             /*HaveExceptions=*/ true,
-                             InliningPass);
+  PassManagerBuilder Builder;
+  if (!DisableInline)
+    Builder.Inliner = createFunctionInliningPass();
+  Builder.OptLevel = 3;
+  Builder.DisableSimplifyLibCalls = DisableSimplifyLibCalls;
+  Builder.populateModulePassManager(PM);
 }
 
-void AddStandardLinkPasses(PassManagerBase &PM) {
+static void AddStandardLinkPasses(PassManagerBase &PM) {
   PM.add(createVerifierPass());                  // Verify that input is correct
 
   // If the -strip-debug command line option was specified, do it.
@@ -459,13 +457,11 @@ void AddStandardLinkPasses(PassManagerBase &PM) {
 
   if (DisableOptimizations) return;
 
-  createStandardLTOPasses(&PM, /*Internalize=*/ !DisableInternalize,
-                          /*RunInliner=*/ !DisableInline,
-                          /*VerifyEach=*/ VerifyEach);
+  PassManagerBuilder Builder;
+  Builder.populateLTOPassManager(PM, /*Internalize=*/ !DisableInternalize,
+                                 /*RunInliner=*/ !DisableInline);
 }
 
-} // anonymous namespace
-
 
 //===----------------------------------------------------------------------===//
 // main for opt
@@ -566,9 +562,9 @@ int main(int argc, char **argv) {
   if (TD)
     Passes.add(TD);
 
-  OwningPtr<PassManager> FPasses;
+  OwningPtr<FunctionPassManager> FPasses;
   if (OptLevelO1 || OptLevelO2 || OptLevelO3) {
-    FPasses.reset(new PassManager());
+    FPasses.reset(new FunctionPassManager(M.get()));
     if (TD)
       FPasses->add(new TargetData(*TD));
   }
@@ -686,8 +682,12 @@ int main(int argc, char **argv) {
   if (OptLevelO3)
     AddOptimizationPasses(Passes, *FPasses, 3);
 
-  if (OptLevelO1 || OptLevelO2 || OptLevelO3)
-    FPasses->run(*M.get());
+  if (OptLevelO1 || OptLevelO2 || OptLevelO3) {
+    FPasses->doInitialization();
+    for (Module::iterator F = M->begin(), E = M->end(); F != E; ++F)
+      FPasses->run(*F);
+    FPasses->doFinalization();
+  }
 
   // Check that the module is well formed on completion of optimization
   if (!NoVerify && !VerifyEach)
diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt
index da4a652..fcf6109 100644
--- a/unittests/CMakeLists.txt
+++ b/unittests/CMakeLists.txt
@@ -20,7 +20,7 @@ set_target_properties(UnitTests PROPERTIES FOLDER "Tests")
 
 include_directories(${LLVM_MAIN_SRC_DIR}/utils/unittest/googletest/include)
 add_definitions(-DGTEST_HAS_RTTI=0)
-if( CMAKE_COMPILER_IS_GNUCXX )
+if( LLVM_COMPILER_IS_GCC_COMPATIBLE )
   llvm_replace_compiler_option(CMAKE_CXX_FLAGS "-frtti" "-fno-rtti")
 elseif( MSVC )
   llvm_replace_compiler_option(CMAKE_CXX_FLAGS "/GR" "/GR-")
diff --git a/unittests/Support/IRBuilderTest.cpp b/unittests/Support/IRBuilderTest.cpp
new file mode 100644
index 0000000..5d635ae
--- /dev/null
+++ b/unittests/Support/IRBuilderTest.cpp
@@ -0,0 +1,70 @@
+//===- llvm/unittest/Support/IRBuilderTest.cpp - IRBuilder tests ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/IRBuilder.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/Function.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Module.h"
+#include "llvm/ADT/OwningPtr.h"
+
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+class IRBuilderTest : public testing::Test {
+protected:
+  virtual void SetUp() {
+    M.reset(new Module("MyModule", getGlobalContext()));
+    FunctionType *FTy = FunctionType::get(Type::getVoidTy(getGlobalContext()),
+                                          /*isVarArg=*/false);
+    Function *F = Function::Create(FTy, Function::ExternalLinkage, "", M.get());
+    BB = BasicBlock::Create(getGlobalContext(), "", F);
+  }
+
+  virtual void TearDown() {
+    BB = 0;
+    M.reset();
+  }
+
+  OwningPtr<Module> M;
+  BasicBlock *BB;
+};
+
+TEST_F(IRBuilderTest, Lifetime) {
+  IRBuilder<> Builder(BB);
+  AllocaInst *Var1 = Builder.CreateAlloca(Builder.getInt8Ty());
+  AllocaInst *Var2 = Builder.CreateAlloca(Builder.getInt32Ty());
+  AllocaInst *Var3 = Builder.CreateAlloca(Builder.getInt8Ty(),
+                                          Builder.getInt32(123));
+
+  CallInst *Start1 = Builder.CreateLifetimeStart(Var1);
+  CallInst *Start2 = Builder.CreateLifetimeStart(Var2);
+  CallInst *Start3 = Builder.CreateLifetimeStart(Var3, Builder.getInt64(100));
+
+  EXPECT_EQ(Start1->getArgOperand(0), Builder.getInt64(-1));
+  EXPECT_EQ(Start2->getArgOperand(0), Builder.getInt64(-1));
+  EXPECT_EQ(Start3->getArgOperand(0), Builder.getInt64(100));
+
+  EXPECT_EQ(Start1->getArgOperand(1), Var1);
+  EXPECT_NE(Start2->getArgOperand(1), Var2);
+  EXPECT_EQ(Start3->getArgOperand(1), Var3);
+
+  Value *End1 = Builder.CreateLifetimeEnd(Var1);
+  Builder.CreateLifetimeEnd(Var2);
+  Builder.CreateLifetimeEnd(Var3);
+
+  IntrinsicInst *II_Start1 = dyn_cast<IntrinsicInst>(Start1);
+  IntrinsicInst *II_End1 = dyn_cast<IntrinsicInst>(End1);
+  ASSERT_TRUE(II_Start1 != NULL);
+  EXPECT_EQ(II_Start1->getIntrinsicID(), Intrinsic::lifetime_start);
+  ASSERT_TRUE(II_End1 != NULL);
+  EXPECT_EQ(II_End1->getIntrinsicID(), Intrinsic::lifetime_end);
+}
diff --git a/unittests/VMCore/InstructionsTest.cpp b/unittests/VMCore/InstructionsTest.cpp
index 8edcce4..9624b81 100644
--- a/unittests/VMCore/InstructionsTest.cpp
+++ b/unittests/VMCore/InstructionsTest.cpp
@@ -114,11 +114,19 @@ TEST(InstructionsTest, CastInst) {
   const Type* Int8Ty = Type::getInt8Ty(C);
   const Type* Int64Ty = Type::getInt64Ty(C);
   const Type* V8x8Ty = VectorType::get(Int8Ty, 8);
+  const Type* V8x64Ty = VectorType::get(Int64Ty, 8);
   const Type* X86MMXTy = Type::getX86_MMXTy(C);
 
+  const Constant* c8 = Constant::getNullValue(V8x8Ty);
+  const Constant* c64 = Constant::getNullValue(V8x64Ty);
+
   EXPECT_TRUE(CastInst::isCastable(V8x8Ty, X86MMXTy));
   EXPECT_TRUE(CastInst::isCastable(X86MMXTy, V8x8Ty));
   EXPECT_FALSE(CastInst::isCastable(Int64Ty, X86MMXTy));
+  EXPECT_TRUE(CastInst::isCastable(V8x64Ty, V8x8Ty));
+  EXPECT_TRUE(CastInst::isCastable(V8x8Ty, V8x64Ty));
+  EXPECT_EQ(CastInst::getCastOpcode(c64, true, V8x8Ty, true), CastInst::Trunc);
+  EXPECT_EQ(CastInst::getCastOpcode(c8, true, V8x64Ty, true), CastInst::SExt);
 }
 
 }  // end anonymous namespace
diff --git a/utils/TableGen/AsmMatcherEmitter.cpp b/utils/TableGen/AsmMatcherEmitter.cpp
index 1d14037..bbe4349 100644
--- a/utils/TableGen/AsmMatcherEmitter.cpp
+++ b/utils/TableGen/AsmMatcherEmitter.cpp
@@ -2321,7 +2321,7 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
   OS << "    for (unsigned i = 0; i != " << MaxNumOperands << "; ++i) {\n";
   OS << "      if (i + 1 >= Operands.size()) {\n";
   OS << "        OperandsValid = (it->Classes[i] == " <<"InvalidMatchClass);\n";
-  OS << "        break;";
+  OS << "        break;\n";
   OS << "      }\n";
   OS << "      if (ValidateOperandClass(Operands[i+1], it->Classes[i]))\n";
   OS << "        continue;\n";
diff --git a/utils/TableGen/AsmWriterEmitter.cpp b/utils/TableGen/AsmWriterEmitter.cpp
index 2b1a4cc..e6deb69 100644
--- a/utils/TableGen/AsmWriterEmitter.cpp
+++ b/utils/TableGen/AsmWriterEmitter.cpp
@@ -670,8 +670,8 @@ public:
 
     for (std::map<StringRef, unsigned>::iterator
            I = OpMap.begin(), E = OpMap.end(); I != E; ++I)
-      O.indent(6) << "OpMap[\"" << I->first << "\"] = "
-                  << I->second << ";\n";
+      O.indent(6) << "OpMap.push_back(std::make_pair(\"" << I->first << "\", "
+                  << I->second << "));\n";
 
     O.indent(6) << "break;\n";
     O.indent(4) << '}';
@@ -754,6 +754,20 @@ static void EmitComputeAvailableFeatures(AsmWriterInfo &Info,
   O << "}\n\n";
 }
 
+static void EmitGetMapOperandNumber(raw_ostream &O) {
+  O << "static unsigned getMapOperandNumber("
+    << "const SmallVectorImpl<std::pair<StringRef, unsigned> > &OpMap,\n";
+  O << "                                    StringRef Name) {\n";
+  O << "  for (SmallVectorImpl<std::pair<StringRef, unsigned> >::"
+    << "const_iterator\n";
+  O << "         I = OpMap.begin(), E = OpMap.end(); I != E; ++I)\n";
+  O << "    if (I->first == Name)\n";
+  O << "      return I->second;\n";
+  O << "  assert(false && \"Operand not in map!\");\n";
+  O << "  return 0;\n";
+  O << "}\n\n";
+}
+
 void AsmWriterEmitter::EmitRegIsInRegClass(raw_ostream &O) {
   CodeGenTarget Target(Records);
 
@@ -934,9 +948,12 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
   EmitSubtargetFeatureFlagEnumeration(AWI, O);
   EmitComputeAvailableFeatures(AWI, AsmWriter, Target, O);
 
-  O << "bool " << Target.getName() << ClassName
-    << "::printAliasInstr(const " << MachineInstrClassName
-    << " *MI, raw_ostream &OS) {\n";
+  std::string Header;
+  raw_string_ostream HeaderO(Header);
+
+  HeaderO << "bool " << Target.getName() << ClassName
+          << "::printAliasInstr(const " << MachineInstrClassName
+          << " *MI, raw_ostream &OS) {\n";
 
   std::string Cases;
   raw_string_ostream CasesO(Cases);
@@ -979,14 +996,18 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
   }
 
   if (CasesO.str().empty() || !isMC) {
+    O << HeaderO.str();
     O << "  return false;\n";
     O << "}\n\n";
     O << "#endif // PRINT_ALIAS_INSTR\n";
     return;
   }
 
+  EmitGetMapOperandNumber(O);
+
+  O << HeaderO.str();
   O.indent(2) << "StringRef AsmString;\n";
-  O.indent(2) << "std::map<StringRef, unsigned> OpMap;\n";
+  O.indent(2) << "SmallVector<std::pair<StringRef, unsigned>, 4> OpMap;\n";
   if (NeedAvailableFeatures)
     O.indent(2) << "unsigned AvailableFeatures = getAvailableFeatures();\n\n";
   O.indent(2) << "switch (MI->getOpcode()) {\n";
@@ -1012,7 +1033,7 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
   O << "                *I == '_'))\n";
   O << "          ++I;\n";
   O << "        StringRef Name(Start, I - Start);\n";
-  O << "        printOperand(MI, OpMap[Name], OS);\n";
+  O << "        printOperand(MI, getMapOperandNumber(OpMap, Name), OS);\n";
   O << "      } else {\n";
   O << "        OS << *I++;\n";
   O << "      }\n";
diff --git a/utils/TableGen/CMakeLists.txt b/utils/TableGen/CMakeLists.txt
index 514b191..a24c921 100644
--- a/utils/TableGen/CMakeLists.txt
+++ b/utils/TableGen/CMakeLists.txt
@@ -16,6 +16,7 @@ add_llvm_utility(tblgen
   CodeEmitterGen.cpp
   CodeGenDAGPatterns.cpp
   CodeGenInstruction.cpp
+  CodeGenRegisters.cpp
   CodeGenTarget.cpp
   DAGISelEmitter.cpp
   DAGISelMatcherEmitter.cpp
@@ -34,6 +35,7 @@ add_llvm_utility(tblgen
   OptParserEmitter.cpp
   Record.cpp
   RegisterInfoEmitter.cpp
+  SetTheory.cpp
   StringMatcher.cpp
   SubtargetEmitter.cpp
   TGLexer.cpp
diff --git a/utils/TableGen/ClangDiagnosticsEmitter.cpp b/utils/TableGen/ClangDiagnosticsEmitter.cpp
index 0f4b606..acaa1f8 100644
--- a/utils/TableGen/ClangDiagnosticsEmitter.cpp
+++ b/utils/TableGen/ClangDiagnosticsEmitter.cpp
@@ -162,7 +162,7 @@ void ClangDiagsDefsEmitter::run(raw_ostream &OS) {
       OS << ", \"";
       OS.write_escaped(DI->getDef()->getValueAsString("GroupName")) << '"';
     } else {
-      OS << ", 0";
+      OS << ", \"\"";
     }
 
     // SFINAE bit
@@ -275,7 +275,9 @@ void ClangDiagGroupsEmitter::run(raw_ostream &OS) {
   for (std::map<std::string, GroupInfo>::iterator
        I = DiagsInGroup.begin(), E = DiagsInGroup.end(); I != E; ++I) {
     // Group option string.
-    OS << "  { \"";
+    OS << "  { ";
+    OS << I->first.size() << ", ";
+    OS << "\"";
     OS.write_escaped(I->first) << "\","
                                << std::string(MaxLen-I->first.size()+1, ' ');
     
diff --git a/utils/TableGen/CodeGenIntrinsics.h b/utils/TableGen/CodeGenIntrinsics.h
index 3208c0d..3f6ba61 100644
--- a/utils/TableGen/CodeGenIntrinsics.h
+++ b/utils/TableGen/CodeGenIntrinsics.h
@@ -69,6 +69,9 @@ namespace llvm {
 
     /// isCommutative - True if the intrinsic is commutative.
     bool isCommutative;
+
+    /// canThrow - True if the intrinsic can throw.
+    bool canThrow;
     
     enum ArgAttribute {
       NoCapture
diff --git a/utils/TableGen/CodeGenRegisters.cpp b/utils/TableGen/CodeGenRegisters.cpp
new file mode 100644
index 0000000..a4504e4
--- /dev/null
+++ b/utils/TableGen/CodeGenRegisters.cpp
@@ -0,0 +1,312 @@
+//===- CodeGenRegisters.cpp - Register and RegisterClass Info -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines structures to encapsulate information gleaned from the
+// target register and register class definitions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CodeGenRegisters.h"
+#include "CodeGenTarget.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+//                              CodeGenRegister
+//===----------------------------------------------------------------------===//
+
+CodeGenRegister::CodeGenRegister(Record *R, unsigned Enum)
+  : TheDef(R),
+    EnumValue(Enum),
+    CostPerUse(R->getValueAsInt("CostPerUse")),
+    SubRegsComplete(false)
+{}
+
+const std::string &CodeGenRegister::getName() const {
+  return TheDef->getName();
+}
+
+namespace {
+  struct Orphan {
+    CodeGenRegister *SubReg;
+    Record *First, *Second;
+    Orphan(CodeGenRegister *r, Record *a, Record *b)
+      : SubReg(r), First(a), Second(b) {}
+  };
+}
+
+const CodeGenRegister::SubRegMap &
+CodeGenRegister::getSubRegs(CodeGenRegBank &RegBank) {
+  // Only compute this map once.
+  if (SubRegsComplete)
+    return SubRegs;
+  SubRegsComplete = true;
+
+  std::vector<Record*> SubList = TheDef->getValueAsListOfDefs("SubRegs");
+  std::vector<Record*> Indices = TheDef->getValueAsListOfDefs("SubRegIndices");
+  if (SubList.size() != Indices.size())
+    throw TGError(TheDef->getLoc(), "Register " + getName() +
+                  " SubRegIndices doesn't match SubRegs");
+
+  // First insert the direct subregs and make sure they are fully indexed.
+  for (unsigned i = 0, e = SubList.size(); i != e; ++i) {
+    CodeGenRegister *SR = RegBank.getReg(SubList[i]);
+    if (!SubRegs.insert(std::make_pair(Indices[i], SR)).second)
+      throw TGError(TheDef->getLoc(), "SubRegIndex " + Indices[i]->getName() +
+                    " appears twice in Register " + getName());
+  }
+
+  // Keep track of inherited subregs and how they can be reached.
+  SmallVector<Orphan, 8> Orphans;
+
+  // Clone inherited subregs and place duplicate entries on Orphans.
+  // Here the order is important - earlier subregs take precedence.
+  for (unsigned i = 0, e = SubList.size(); i != e; ++i) {
+    CodeGenRegister *SR = RegBank.getReg(SubList[i]);
+    const SubRegMap &Map = SR->getSubRegs(RegBank);
+    for (SubRegMap::const_iterator SI = Map.begin(), SE = Map.end(); SI != SE;
+         ++SI)
+      if (!SubRegs.insert(*SI).second)
+        Orphans.push_back(Orphan(SI->second, Indices[i], SI->first));
+  }
+
+  // Process the composites.
+  ListInit *Comps = TheDef->getValueAsListInit("CompositeIndices");
+  for (unsigned i = 0, e = Comps->size(); i != e; ++i) {
+    DagInit *Pat = dynamic_cast<DagInit*>(Comps->getElement(i));
+    if (!Pat)
+      throw TGError(TheDef->getLoc(), "Invalid dag '" +
+                    Comps->getElement(i)->getAsString() +
+                    "' in CompositeIndices");
+    DefInit *BaseIdxInit = dynamic_cast<DefInit*>(Pat->getOperator());
+    if (!BaseIdxInit || !BaseIdxInit->getDef()->isSubClassOf("SubRegIndex"))
+      throw TGError(TheDef->getLoc(), "Invalid SubClassIndex in " +
+                    Pat->getAsString());
+
+    // Resolve list of subreg indices into R2.
+    CodeGenRegister *R2 = this;
+    for (DagInit::const_arg_iterator di = Pat->arg_begin(),
+         de = Pat->arg_end(); di != de; ++di) {
+      DefInit *IdxInit = dynamic_cast<DefInit*>(*di);
+      if (!IdxInit || !IdxInit->getDef()->isSubClassOf("SubRegIndex"))
+        throw TGError(TheDef->getLoc(), "Invalid SubClassIndex in " +
+                      Pat->getAsString());
+      const SubRegMap &R2Subs = R2->getSubRegs(RegBank);
+      SubRegMap::const_iterator ni = R2Subs.find(IdxInit->getDef());
+      if (ni == R2Subs.end())
+        throw TGError(TheDef->getLoc(), "Composite " + Pat->getAsString() +
+                      " refers to bad index in " + R2->getName());
+      R2 = ni->second;
+    }
+
+    // Insert composite index. Allow overriding inherited indices etc.
+    SubRegs[BaseIdxInit->getDef()] = R2;
+
+    // R2 is no longer an orphan.
+    for (unsigned j = 0, je = Orphans.size(); j != je; ++j)
+      if (Orphans[j].SubReg == R2)
+          Orphans[j].SubReg = 0;
+  }
+
+  // Now Orphans contains the inherited subregisters without a direct index.
+  // Create inferred indexes for all missing entries.
+  for (unsigned i = 0, e = Orphans.size(); i != e; ++i) {
+    Orphan &O = Orphans[i];
+    if (!O.SubReg)
+      continue;
+    SubRegs[RegBank.getCompositeSubRegIndex(O.First, O.Second, true)] =
+      O.SubReg;
+  }
+  return SubRegs;
+}
+
+//===----------------------------------------------------------------------===//
+//                            CodeGenRegisterClass
+//===----------------------------------------------------------------------===//
+
+CodeGenRegisterClass::CodeGenRegisterClass(Record *R) : TheDef(R) {
+  // Rename anonymous register classes.
+  if (R->getName().size() > 9 && R->getName()[9] == '.') {
+    static unsigned AnonCounter = 0;
+    R->setName("AnonRegClass_"+utostr(AnonCounter++));
+  }
+
+  std::vector<Record*> TypeList = R->getValueAsListOfDefs("RegTypes");
+  for (unsigned i = 0, e = TypeList.size(); i != e; ++i) {
+    Record *Type = TypeList[i];
+    if (!Type->isSubClassOf("ValueType"))
+      throw "RegTypes list member '" + Type->getName() +
+        "' does not derive from the ValueType class!";
+    VTs.push_back(getValueType(Type));
+  }
+  assert(!VTs.empty() && "RegisterClass must contain at least one ValueType!");
+
+  std::vector<Record*> RegList = R->getValueAsListOfDefs("MemberList");
+  for (unsigned i = 0, e = RegList.size(); i != e; ++i) {
+    Record *Reg = RegList[i];
+    if (!Reg->isSubClassOf("Register"))
+      throw "Register Class member '" + Reg->getName() +
+            "' does not derive from the Register class!";
+    Elements.push_back(Reg);
+  }
+
+  // SubRegClasses is a list<dag> containing (RC, subregindex, ...) dags.
+  ListInit *SRC = R->getValueAsListInit("SubRegClasses");
+  for (ListInit::const_iterator i = SRC->begin(), e = SRC->end(); i != e; ++i) {
+    DagInit *DAG = dynamic_cast<DagInit*>(*i);
+    if (!DAG) throw "SubRegClasses must contain DAGs";
+    DefInit *DAGOp = dynamic_cast<DefInit*>(DAG->getOperator());
+    Record *RCRec;
+    if (!DAGOp || !(RCRec = DAGOp->getDef())->isSubClassOf("RegisterClass"))
+      throw "Operator '" + DAG->getOperator()->getAsString() +
+        "' in SubRegClasses is not a RegisterClass";
+    // Iterate over args, all SubRegIndex instances.
+    for (DagInit::const_arg_iterator ai = DAG->arg_begin(), ae = DAG->arg_end();
+         ai != ae; ++ai) {
+      DefInit *Idx = dynamic_cast<DefInit*>(*ai);
+      Record *IdxRec;
+      if (!Idx || !(IdxRec = Idx->getDef())->isSubClassOf("SubRegIndex"))
+        throw "Argument '" + (*ai)->getAsString() +
+          "' in SubRegClasses is not a SubRegIndex";
+      if (!SubRegClasses.insert(std::make_pair(IdxRec, RCRec)).second)
+        throw "SubRegIndex '" + IdxRec->getName() + "' mentioned twice";
+    }
+  }
+
+  // Allow targets to override the size in bits of the RegisterClass.
+  unsigned Size = R->getValueAsInt("Size");
+
+  Namespace = R->getValueAsString("Namespace");
+  SpillSize = Size ? Size : EVT(VTs[0]).getSizeInBits();
+  SpillAlignment = R->getValueAsInt("Alignment");
+  CopyCost = R->getValueAsInt("CopyCost");
+  Allocatable = R->getValueAsBit("isAllocatable");
+  MethodBodies = R->getValueAsCode("MethodBodies");
+  MethodProtos = R->getValueAsCode("MethodProtos");
+}
+
+const std::string &CodeGenRegisterClass::getName() const {
+  return TheDef->getName();
+}
+
+//===----------------------------------------------------------------------===//
+//                               CodeGenRegBank
+//===----------------------------------------------------------------------===//
+
+CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records) : Records(Records) {
+  // Read in the user-defined (named) sub-register indices.
+  // More indices will be synthesized later.
+  SubRegIndices = Records.getAllDerivedDefinitions("SubRegIndex");
+  std::sort(SubRegIndices.begin(), SubRegIndices.end(), LessRecord());
+  NumNamedIndices = SubRegIndices.size();
+
+  // Read in the register definitions.
+  std::vector<Record*> Regs = Records.getAllDerivedDefinitions("Register");
+  std::sort(Regs.begin(), Regs.end(), LessRecord());
+  Registers.reserve(Regs.size());
+  // Assign the enumeration values.
+  for (unsigned i = 0, e = Regs.size(); i != e; ++i)
+    Registers.push_back(CodeGenRegister(Regs[i], i + 1));
+}
+
+CodeGenRegister *CodeGenRegBank::getReg(Record *Def) {
+  if (Def2Reg.empty())
+    for (unsigned i = 0, e = Registers.size(); i != e; ++i)
+      Def2Reg[Registers[i].TheDef] = &Registers[i];
+
+  if (CodeGenRegister *Reg = Def2Reg[Def])
+    return Reg;
+
+  throw TGError(Def->getLoc(), "Not a known Register!");
+}
+
+Record *CodeGenRegBank::getCompositeSubRegIndex(Record *A, Record *B,
+                                                bool create) {
+  // Look for an existing entry.
+  Record *&Comp = Composite[std::make_pair(A, B)];
+  if (Comp || !create)
+    return Comp;
+
+  // None exists, synthesize one.
+  std::string Name = A->getName() + "_then_" + B->getName();
+  Comp = new Record(Name, SMLoc(), Records);
+  Records.addDef(Comp);
+  SubRegIndices.push_back(Comp);
+  return Comp;
+}
+
+unsigned CodeGenRegBank::getSubRegIndexNo(Record *idx) {
+  std::vector<Record*>::const_iterator i =
+    std::find(SubRegIndices.begin(), SubRegIndices.end(), idx);
+  assert(i != SubRegIndices.end() && "Not a SubRegIndex");
+  return (i - SubRegIndices.begin()) + 1;
+}
+
+void CodeGenRegBank::computeComposites() {
+  // Precompute all sub-register maps. This will create Composite entries for
+  // all inferred sub-register indices.
+  for (unsigned i = 0, e = Registers.size(); i != e; ++i)
+    Registers[i].getSubRegs(*this);
+
+  for (unsigned i = 0, e = Registers.size(); i != e; ++i) {
+    CodeGenRegister *Reg1 = &Registers[i];
+    const CodeGenRegister::SubRegMap &SRM1 = Reg1->getSubRegs(*this);
+    for (CodeGenRegister::SubRegMap::const_iterator i1 = SRM1.begin(),
+         e1 = SRM1.end(); i1 != e1; ++i1) {
+      Record *Idx1 = i1->first;
+      CodeGenRegister *Reg2 = i1->second;
+      // Ignore identity compositions.
+      if (Reg1 == Reg2)
+        continue;
+      const CodeGenRegister::SubRegMap &SRM2 = Reg2->getSubRegs(*this);
+      // Try composing Idx1 with another SubRegIndex.
+      for (CodeGenRegister::SubRegMap::const_iterator i2 = SRM2.begin(),
+           e2 = SRM2.end(); i2 != e2; ++i2) {
+        std::pair<Record*, Record*> IdxPair(Idx1, i2->first);
+        CodeGenRegister *Reg3 = i2->second;
+        // Ignore identity compositions.
+        if (Reg2 == Reg3)
+          continue;
+        // OK Reg1:IdxPair == Reg3. Find the index with Reg:Idx == Reg3.
+        for (CodeGenRegister::SubRegMap::const_iterator i1d = SRM1.begin(),
+             e1d = SRM1.end(); i1d != e1d; ++i1d) {
+          if (i1d->second == Reg3) {
+            std::pair<CompositeMap::iterator, bool> Ins =
+              Composite.insert(std::make_pair(IdxPair, i1d->first));
+            // Conflicting composition? Emit a warning but allow it.
+            if (!Ins.second && Ins.first->second != i1d->first) {
+              errs() << "Warning: SubRegIndex " << getQualifiedName(Idx1)
+                     << " and " << getQualifiedName(IdxPair.second)
+                     << " compose ambiguously as "
+                     << getQualifiedName(Ins.first->second) << " or "
+                     << getQualifiedName(i1d->first) << "\n";
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // We don't care about the difference between (Idx1, Idx2) -> Idx2 and invalid
+  // compositions, so remove any mappings of that form.
+  for (CompositeMap::iterator i = Composite.begin(), e = Composite.end();
+       i != e;) {
+    CompositeMap::iterator j = i;
+    ++i;
+    if (j->first.second == j->second)
+      Composite.erase(j);
+  }
+}
+
+void CodeGenRegBank::computeDerivedInfo() {
+  computeComposites();
+}
+
diff --git a/utils/TableGen/CodeGenRegisters.h b/utils/TableGen/CodeGenRegisters.h
index 39b92c5..09341f0 100644
--- a/utils/TableGen/CodeGenRegisters.h
+++ b/utils/TableGen/CodeGenRegisters.h
@@ -15,23 +15,43 @@
 #ifndef CODEGEN_REGISTERS_H
 #define CODEGEN_REGISTERS_H
 
+#include "Record.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/ADT/DenseMap.h"
+#include <cstdlib>
+#include <map>
 #include <string>
-#include <vector>
 #include <set>
-#include <cstdlib>
+#include <vector>
 
 namespace llvm {
-  class Record;
+  class CodeGenRegBank;
 
   /// CodeGenRegister - Represents a register definition.
   struct CodeGenRegister {
     Record *TheDef;
-    const std::string &getName() const;
     unsigned EnumValue;
     unsigned CostPerUse;
-    CodeGenRegister(Record *R);
+
+    // Map SubRegIndex -> Register.
+    typedef std::map<Record*, CodeGenRegister*, LessRecord> SubRegMap;
+
+    CodeGenRegister(Record *R, unsigned Enum);
+
+    const std::string &getName() const;
+
+    // Get a map of sub-registers computed lazily.
+    // This includes unique entries for all sub-sub-registers.
+    const SubRegMap &getSubRegs(CodeGenRegBank&);
+
+    const SubRegMap &getSubRegs() const {
+      assert(SubRegsComplete && "Must precompute sub-registers");
+      return SubRegs;
+    }
+
+  private:
+    bool SubRegsComplete;
+    SubRegMap SubRegs;
   };
 
 
@@ -43,6 +63,7 @@ namespace llvm {
     unsigned SpillSize;
     unsigned SpillAlignment;
     int CopyCost;
+    bool Allocatable;
     // Map SubRegIndex -> RegisterClass
     DenseMap<Record*,Record*> SubRegClasses;
     std::string MethodProtos, MethodBodies;
@@ -97,6 +118,47 @@ namespace llvm {
 
     CodeGenRegisterClass(Record *R);
   };
+
+  // CodeGenRegBank - Represent a target's registers and the relations between
+  // them.
+  class CodeGenRegBank {
+    RecordKeeper &Records;
+    std::vector<Record*> SubRegIndices;
+    unsigned NumNamedIndices;
+    std::vector<CodeGenRegister> Registers;
+    DenseMap<Record*, CodeGenRegister*> Def2Reg;
+
+    // Composite SubRegIndex instances.
+    // Map (SubRegIndex, SubRegIndex) -> SubRegIndex.
+    typedef DenseMap<std::pair<Record*, Record*>, Record*> CompositeMap;
+    CompositeMap Composite;
+
+    // Populate the Composite map from sub-register relationships.
+    void computeComposites();
+
+  public:
+    CodeGenRegBank(RecordKeeper&);
+
+    // Sub-register indices. The first NumNamedIndices are defined by the user
+    // in the .td files. The rest are synthesized such that all sub-registers
+    // have a unique name.
+    const std::vector<Record*> &getSubRegIndices() { return SubRegIndices; }
+    unsigned getNumNamedIndices() { return NumNamedIndices; }
+
+    // Map a SubRegIndex Record to its enum value.
+    unsigned getSubRegIndexNo(Record *idx);
+
+    // Find or create a sub-register index representing the A+B composition.
+    Record *getCompositeSubRegIndex(Record *A, Record *B, bool create = false);
+
+    const std::vector<CodeGenRegister> &getRegisters() { return Registers; }
+
+    // Find a register from its Record def.
+    CodeGenRegister *getReg(Record*);
+
+    // Computed derived records such as missing sub-register indices.
+    void computeDerivedInfo();
+  };
 }
 
 #endif
diff --git a/utils/TableGen/CodeGenTarget.cpp b/utils/TableGen/CodeGenTarget.cpp
index 57f7fdc..a0c64ff 100644
--- a/utils/TableGen/CodeGenTarget.cpp
+++ b/utils/TableGen/CodeGenTarget.cpp
@@ -98,17 +98,18 @@ std::string llvm::getEnumName(MVT::SimpleValueType T) {
 /// namespace qualifier if the record contains one.
 ///
 std::string llvm::getQualifiedName(const Record *R) {
-  std::string Namespace = R->getValueAsString("Namespace");
+  std::string Namespace;
+  if (R->getValue("Namespace"))
+     Namespace = R->getValueAsString("Namespace");
   if (Namespace.empty()) return R->getName();
   return Namespace + "::" + R->getName();
 }
 
 
-
-
 /// getTarget - Return the current instance of the Target class.
 ///
-CodeGenTarget::CodeGenTarget(RecordKeeper &records) : Records(records) {
+CodeGenTarget::CodeGenTarget(RecordKeeper &records)
+  : Records(records), RegBank(0) {
   std::vector<Record*> Targets = Records.getAllDerivedDefinitions("Target");
   if (Targets.size() == 0)
     throw std::string("ERROR: No 'Target' subclasses defined!");
@@ -156,30 +157,10 @@ Record *CodeGenTarget::getAsmWriter() const {
   return LI[AsmWriterNum];
 }
 
-void CodeGenTarget::ReadRegisters() const {
-  std::vector<Record*> Regs = Records.getAllDerivedDefinitions("Register");
-  if (Regs.empty())
-    throw std::string("No 'Register' subclasses defined!");
-  std::sort(Regs.begin(), Regs.end(), LessRecord());
-
-  Registers.reserve(Regs.size());
-  Registers.assign(Regs.begin(), Regs.end());
-  // Assign the enumeration values.
-  for (unsigned i = 0, e = Registers.size(); i != e; ++i)
-    Registers[i].EnumValue = i + 1;
-}
-
-CodeGenRegister::CodeGenRegister(Record *R) : TheDef(R) {
-  CostPerUse = R->getValueAsInt("CostPerUse");
-}
-
-const std::string &CodeGenRegister::getName() const {
-  return TheDef->getName();
-}
-
-void CodeGenTarget::ReadSubRegIndices() const {
-  SubRegIndices = Records.getAllDerivedDefinitions("SubRegIndex");
-  std::sort(SubRegIndices.begin(), SubRegIndices.end(), LessRecord());
+CodeGenRegBank &CodeGenTarget::getRegBank() const {
+  if (!RegBank)
+    RegBank = new CodeGenRegBank(Records);
+  return *RegBank;
 }
 
 void CodeGenTarget::ReadRegisterClasses() const {
@@ -195,7 +176,7 @@ void CodeGenTarget::ReadRegisterClasses() const {
 /// getRegisterByName - If there is a register with the specific AsmName,
 /// return it.
 const CodeGenRegister *CodeGenTarget::getRegisterByName(StringRef Name) const {
-  const std::vector<CodeGenRegister> &Regs = getRegisters();
+  const std::vector<CodeGenRegister> &Regs = getRegBank().getRegisters();
   for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
     const CodeGenRegister &Reg = Regs[i];
     if (Reg.TheDef->getValueAsString("AsmName") == Name)
@@ -226,70 +207,6 @@ getRegisterVTs(Record *R) const {
 }
 
 
-CodeGenRegisterClass::CodeGenRegisterClass(Record *R) : TheDef(R) {
-  // Rename anonymous register classes.
-  if (R->getName().size() > 9 && R->getName()[9] == '.') {
-    static unsigned AnonCounter = 0;
-    R->setName("AnonRegClass_"+utostr(AnonCounter++));
-  }
-
-  std::vector<Record*> TypeList = R->getValueAsListOfDefs("RegTypes");
-  for (unsigned i = 0, e = TypeList.size(); i != e; ++i) {
-    Record *Type = TypeList[i];
-    if (!Type->isSubClassOf("ValueType"))
-      throw "RegTypes list member '" + Type->getName() +
-        "' does not derive from the ValueType class!";
-    VTs.push_back(getValueType(Type));
-  }
-  assert(!VTs.empty() && "RegisterClass must contain at least one ValueType!");
-
-  std::vector<Record*> RegList = R->getValueAsListOfDefs("MemberList");
-  for (unsigned i = 0, e = RegList.size(); i != e; ++i) {
-    Record *Reg = RegList[i];
-    if (!Reg->isSubClassOf("Register"))
-      throw "Register Class member '" + Reg->getName() +
-            "' does not derive from the Register class!";
-    Elements.push_back(Reg);
-  }
-
-  // SubRegClasses is a list<dag> containing (RC, subregindex, ...) dags.
-  ListInit *SRC = R->getValueAsListInit("SubRegClasses");
-  for (ListInit::const_iterator i = SRC->begin(), e = SRC->end(); i != e; ++i) {
-    DagInit *DAG = dynamic_cast<DagInit*>(*i);
-    if (!DAG) throw "SubRegClasses must contain DAGs";
-    DefInit *DAGOp = dynamic_cast<DefInit*>(DAG->getOperator());
-    Record *RCRec;
-    if (!DAGOp || !(RCRec = DAGOp->getDef())->isSubClassOf("RegisterClass"))
-      throw "Operator '" + DAG->getOperator()->getAsString() +
-        "' in SubRegClasses is not a RegisterClass";
-    // Iterate over args, all SubRegIndex instances.
-    for (DagInit::const_arg_iterator ai = DAG->arg_begin(), ae = DAG->arg_end();
-         ai != ae; ++ai) {
-      DefInit *Idx = dynamic_cast<DefInit*>(*ai);
-      Record *IdxRec;
-      if (!Idx || !(IdxRec = Idx->getDef())->isSubClassOf("SubRegIndex"))
-        throw "Argument '" + (*ai)->getAsString() +
-          "' in SubRegClasses is not a SubRegIndex";
-      if (!SubRegClasses.insert(std::make_pair(IdxRec, RCRec)).second)
-        throw "SubRegIndex '" + IdxRec->getName() + "' mentioned twice";
-    }
-  }
-
-  // Allow targets to override the size in bits of the RegisterClass.
-  unsigned Size = R->getValueAsInt("Size");
-
-  Namespace = R->getValueAsString("Namespace");
-  SpillSize = Size ? Size : EVT(VTs[0]).getSizeInBits();
-  SpillAlignment = R->getValueAsInt("Alignment");
-  CopyCost = R->getValueAsInt("CopyCost");
-  MethodBodies = R->getValueAsCode("MethodBodies");
-  MethodProtos = R->getValueAsCode("MethodProtos");
-}
-
-const std::string &CodeGenRegisterClass::getName() const {
-  return TheDef->getName();
-}
-
 void CodeGenTarget::ReadLegalValueTypes() const {
   const std::vector<CodeGenRegisterClass> &RCs = getRegisterClasses();
   for (unsigned i = 0, e = RCs.size(); i != e; ++i)
@@ -453,6 +370,7 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) {
   ModRef = ReadWriteMem;
   isOverloaded = false;
   isCommutative = false;
+  canThrow = false;
 
   if (DefName.size() <= 4 ||
       std::string(DefName.begin(), DefName.begin() + 4) != "int_")
@@ -575,10 +493,15 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) {
       ModRef = ReadWriteArgMem;
     else if (Property->getName() == "Commutative")
       isCommutative = true;
+    else if (Property->getName() == "Throws")
+      canThrow = true;
     else if (Property->isSubClassOf("NoCapture")) {
       unsigned ArgNo = Property->getValueAsInt("ArgNo");
       ArgumentAttributes.push_back(std::make_pair(ArgNo, NoCapture));
     } else
       assert(0 && "Unknown property!");
   }
+
+  // Sort the argument attributes for later benefit.
+  std::sort(ArgumentAttributes.begin(), ArgumentAttributes.end());
 }
diff --git a/utils/TableGen/CodeGenTarget.h b/utils/TableGen/CodeGenTarget.h
index 4e04154..1f1c34c 100644
--- a/utils/TableGen/CodeGenTarget.h
+++ b/utils/TableGen/CodeGenTarget.h
@@ -65,12 +65,9 @@ class CodeGenTarget {
   Record *TargetRec;
 
   mutable DenseMap<const Record*, CodeGenInstruction*> Instructions;
-  mutable std::vector<CodeGenRegister> Registers;
-  mutable std::vector<Record*> SubRegIndices;
+  mutable CodeGenRegBank *RegBank;
   mutable std::vector<CodeGenRegisterClass> RegisterClasses;
   mutable std::vector<MVT::SimpleValueType> LegalValueTypes;
-  void ReadRegisters() const;
-  void ReadSubRegIndices() const;
   void ReadRegisterClasses() const;
   void ReadInstructions() const;
   void ReadLegalValueTypes() const;
@@ -98,29 +95,17 @@ public:
   ///
   Record *getAsmWriter() const;
 
+  /// getRegBank - Return the register bank description.
+  CodeGenRegBank &getRegBank() const;
+
   const std::vector<CodeGenRegister> &getRegisters() const {
-    if (Registers.empty()) ReadRegisters();
-    return Registers;
+    return getRegBank().getRegisters();
   }
 
   /// getRegisterByName - If there is a register with the specific AsmName,
   /// return it.
   const CodeGenRegister *getRegisterByName(StringRef Name) const;
 
-  const std::vector<Record*> &getSubRegIndices() const {
-    if (SubRegIndices.empty()) ReadSubRegIndices();
-    return SubRegIndices;
-  }
-
-  // Map a SubRegIndex Record to its number.
-  unsigned getSubRegIndexNo(Record *idx) const {
-    if (SubRegIndices.empty()) ReadSubRegIndices();
-    std::vector<Record*>::const_iterator i =
-      std::find(SubRegIndices.begin(), SubRegIndices.end(), idx);
-    assert(i != SubRegIndices.end() && "Not a SubRegIndex");
-    return (i - SubRegIndices.begin()) + 1;
-  }
-
   const std::vector<CodeGenRegisterClass> &getRegisterClasses() const {
     if (RegisterClasses.empty()) ReadRegisterClasses();
     return RegisterClasses;
diff --git a/utils/TableGen/DAGISelMatcherGen.cpp b/utils/TableGen/DAGISelMatcherGen.cpp
index 393ac69..402a239 100644
--- a/utils/TableGen/DAGISelMatcherGen.cpp
+++ b/utils/TableGen/DAGISelMatcherGen.cpp
@@ -646,6 +646,35 @@ GetInstPatternNode(const DAGInstruction &Inst, const TreePatternNode *N) {
   return InstPatNode;
 }
 
+static bool
+mayInstNodeLoadOrStore(const TreePatternNode *N,
+                       const CodeGenDAGPatterns &CGP) {
+  Record *Op = N->getOperator();
+  const CodeGenTarget &CGT = CGP.getTargetInfo();
+  CodeGenInstruction &II = CGT.getInstruction(Op);
+  return II.mayLoad || II.mayStore;
+}
+
+static unsigned
+numNodesThatMayLoadOrStore(const TreePatternNode *N,
+                           const CodeGenDAGPatterns &CGP) {
+  if (N->isLeaf())
+    return 0;
+
+  Record *OpRec = N->getOperator();
+  if (!OpRec->isSubClassOf("Instruction"))
+    return 0;
+
+  unsigned Count = 0;
+  if (mayInstNodeLoadOrStore(N, CGP))
+    ++Count;
+
+  for (unsigned i = 0, e = N->getNumChildren(); i != e; ++i)
+    Count += numNodesThatMayLoadOrStore(N->getChild(i), CGP);
+
+  return Count;
+}
+
 void MatcherGen::
 EmitResultInstructionAsOperand(const TreePatternNode *N,
                                SmallVectorImpl<unsigned> &OutputOps) {
@@ -772,21 +801,26 @@ EmitResultInstructionAsOperand(const TreePatternNode *N,
       (Pattern.getSrcPattern()->NodeHasProperty(SDNPVariadic, CGP)))
     NumFixedArityOperands = Pattern.getSrcPattern()->getNumChildren();
 
-  // If this is the root node and any of the nodes matched nodes in the input
-  // pattern have MemRefs in them, have the interpreter collect them and plop
-  // them onto this node.
-  //
-  // FIXME3: This is actively incorrect for result patterns where the root of
-  // the pattern is not the memory reference and is also incorrect when the
-  // result pattern has multiple memory-referencing instructions.  For example,
-  // in the X86 backend, this pattern causes the memrefs to get attached to the
-  // CVTSS2SDrr instead of the MOVSSrm:
+  // If this is the root node and multiple matched nodes in the input pattern
+  // have MemRefs in them, have the interpreter collect them and plop them onto
+  // this node. If there is just one node with MemRefs, leave them on that node
+  // even if it is not the root.
   //
-  //  def : Pat<(extloadf32 addr:$src),
-  //            (CVTSS2SDrr (MOVSSrm addr:$src))>;
-  //
-  bool NodeHasMemRefs =
-    isRoot && Pattern.getSrcPattern()->TreeHasProperty(SDNPMemOperand, CGP);
+  // FIXME3: This is actively incorrect for result patterns with multiple
+  // memory-referencing instructions.
+  bool PatternHasMemOperands =
+    Pattern.getSrcPattern()->TreeHasProperty(SDNPMemOperand, CGP);
+
+  bool NodeHasMemRefs = false;
+  if (PatternHasMemOperands) {
+    unsigned NumNodesThatLoadOrStore =
+      numNodesThatMayLoadOrStore(Pattern.getDstPattern(), CGP);
+    bool NodeIsUniqueLoadOrStore = mayInstNodeLoadOrStore(N, CGP) &&
+                                   NumNodesThatLoadOrStore == 1;
+    NodeHasMemRefs =
+      NodeIsUniqueLoadOrStore || (isRoot && (mayInstNodeLoadOrStore(N, CGP) ||
+                                             NumNodesThatLoadOrStore != 1));
+  }
 
   assert((!ResultVTs.empty() || TreeHasOutGlue || NodeHasChain) &&
          "Node has no result");
diff --git a/utils/TableGen/EDEmitter.cpp b/utils/TableGen/EDEmitter.cpp
index 5358c0c..daf9617 100644
--- a/utils/TableGen/EDEmitter.cpp
+++ b/utils/TableGen/EDEmitter.cpp
@@ -596,6 +596,7 @@ static int ARMFlagFromOpName(LiteralConstantEmitter *type,
   IMM("t_adrlabel");
   IMM("t2adrlabel");
   IMM("shift_imm");
+  IMM("ssat_imm");
   IMM("neon_vcvt_imm32");
   IMM("shr_imm8");
   IMM("shr_imm16");
@@ -635,6 +636,7 @@ static int ARMFlagFromOpName(LiteralConstantEmitter *type,
   MISC("addrmode6", "kOperandTypeARMAddrMode6");                  // R, R, I, I
   MISC("am6offset", "kOperandTypeARMAddrMode6Offset");            // R, I, I
   MISC("addrmode6dup", "kOperandTypeARMAddrMode6");               // R, R, I, I
+  MISC("addrmode6oneL32", "kOperandTypeARMAddrMode6");            // R, R, I, I
   MISC("addrmodepc", "kOperandTypeARMAddrModePC");                // R, I
   MISC("addrmode7", "kOperandTypeARMAddrMode7");                  // R
   MISC("reglist", "kOperandTypeARMRegisterList");                 // I, R, ...
diff --git a/utils/TableGen/FastISelEmitter.cpp b/utils/TableGen/FastISelEmitter.cpp
index 9c11bf6..78ac556 100644
--- a/utils/TableGen/FastISelEmitter.cpp
+++ b/utils/TableGen/FastISelEmitter.cpp
@@ -19,9 +19,10 @@
 
 #include "FastISelEmitter.h"
 #include "Record.h"
-#include "llvm/Support/Debug.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/VectorExtras.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
 using namespace llvm;
 
 namespace {
@@ -278,8 +279,7 @@ struct OperandsSignature {
       } else if (Operands[i].isFP()) {
         OS << "ConstantFP *f" << i;
       } else {
-        assert("Unknown operand kind!");
-        abort();
+        llvm_unreachable("Unknown operand kind!");
       }
       if (i + 1 != e)
         OS << ", ";
@@ -307,8 +307,7 @@ struct OperandsSignature {
         OS << "f" << i;
         PrintedArg = true;
       } else {
-        assert("Unknown operand kind!");
-        abort();
+        llvm_unreachable("Unknown operand kind!");
       }
     }
   }
@@ -322,8 +321,7 @@ struct OperandsSignature {
       } else if (Operands[i].isFP()) {
         OS << "f" << i;
       } else {
-        assert("Unknown operand kind!");
-        abort();
+        llvm_unreachable("Unknown operand kind!");
       }
       if (i + 1 != e)
         OS << ", ";
diff --git a/utils/TableGen/IntrinsicEmitter.cpp b/utils/TableGen/IntrinsicEmitter.cpp
index 08f6728..39eb3bd 100644
--- a/utils/TableGen/IntrinsicEmitter.cpp
+++ b/utils/TableGen/IntrinsicEmitter.cpp
@@ -465,6 +465,46 @@ void IntrinsicEmitter::EmitGenerator(const std::vector<CodeGenIntrinsic> &Ints,
   OS << "#endif\n\n";
 }
 
+namespace {
+  enum ModRefKind {
+    MRK_none,
+    MRK_readonly,
+    MRK_readnone
+  };
+
+  ModRefKind getModRefKind(const CodeGenIntrinsic &intrinsic) {
+    switch (intrinsic.ModRef) {
+    case CodeGenIntrinsic::NoMem:
+      return MRK_readnone;
+    case CodeGenIntrinsic::ReadArgMem:
+    case CodeGenIntrinsic::ReadMem:
+      return MRK_readonly;
+    case CodeGenIntrinsic::ReadWriteArgMem:
+    case CodeGenIntrinsic::ReadWriteMem:
+      return MRK_none;
+    }
+    assert(0 && "bad mod-ref kind");
+    return MRK_none;
+  }
+
+  struct AttributeComparator {
+    bool operator()(const CodeGenIntrinsic *L, const CodeGenIntrinsic *R) const {
+      // Sort throwing intrinsics after non-throwing intrinsics.
+      if (L->canThrow != R->canThrow)
+        return R->canThrow;
+
+      // Try to order by readonly/readnone attribute.
+      ModRefKind LK = getModRefKind(*L);
+      ModRefKind RK = getModRefKind(*R);
+      if (LK != RK) return (LK > RK);
+
+      // Order by argument attributes.
+      // This is reliable because each side is already sorted internally.
+      return (L->ArgumentAttributes < R->ArgumentAttributes);
+    }
+  };
+}
+
 /// EmitAttributes - This emits the Intrinsic::getAttributes method.
 void IntrinsicEmitter::
 EmitAttributes(const std::vector<CodeGenIntrinsic> &Ints, raw_ostream &OS) {
@@ -472,84 +512,96 @@ EmitAttributes(const std::vector<CodeGenIntrinsic> &Ints, raw_ostream &OS) {
   OS << "#ifdef GET_INTRINSIC_ATTRIBUTES\n";
   if (TargetOnly)
     OS << "static AttrListPtr getAttributes(" << TargetPrefix 
-       << "Intrinsic::ID id) {";
+       << "Intrinsic::ID id) {\n";
   else
-    OS << "AttrListPtr Intrinsic::getAttributes(ID id) {";
-  OS << "  // No intrinsic can throw exceptions.\n";
-  OS << "  Attributes Attr = Attribute::NoUnwind;\n";
-  OS << "  switch (id) {\n";
-  OS << "  default: break;\n";
-  unsigned MaxArgAttrs = 0;
+    OS << "AttrListPtr Intrinsic::getAttributes(ID id) {\n";
+
+  // Compute the maximum number of attribute arguments.
+  std::vector<const CodeGenIntrinsic*> sortedIntrinsics(Ints.size());
+  unsigned maxArgAttrs = 0;
   for (unsigned i = 0, e = Ints.size(); i != e; ++i) {
-    MaxArgAttrs =
-      std::max(MaxArgAttrs, unsigned(Ints[i].ArgumentAttributes.size()));
-    switch (Ints[i].ModRef) {
-    default: break;
-    case CodeGenIntrinsic::NoMem:
-      OS << "  case " << TargetPrefix << "Intrinsic::" << Ints[i].EnumName 
-         << ":\n";
-      break;
-    }
+    const CodeGenIntrinsic &intrinsic = Ints[i];
+    sortedIntrinsics[i] = &intrinsic;
+    maxArgAttrs =
+      std::max(maxArgAttrs, unsigned(intrinsic.ArgumentAttributes.size()));
   }
-  OS << "    Attr |= Attribute::ReadNone; // These do not access memory.\n";
-  OS << "    break;\n";
-  for (unsigned i = 0, e = Ints.size(); i != e; ++i) {
-    switch (Ints[i].ModRef) {
-    default: break;
-    case CodeGenIntrinsic::ReadArgMem:
-    case CodeGenIntrinsic::ReadMem:
-      OS << "  case " << TargetPrefix << "Intrinsic::" << Ints[i].EnumName 
-         << ":\n";
-      break;
-    }
-  }
-  OS << "    Attr |= Attribute::ReadOnly; // These do not write memory.\n";
-  OS << "    break;\n";
-  OS << "  }\n";
-  OS << "  AttributeWithIndex AWI[" << MaxArgAttrs+1 << "];\n";
+
+  // Emit an array of AttributeWithIndex.  Most intrinsics will have
+  // at least one entry, for the function itself (index ~1), which is
+  // usually nounwind.
+  OS << "  AttributeWithIndex AWI[" << maxArgAttrs+1 << "];\n";
   OS << "  unsigned NumAttrs = 0;\n";
   OS << "  switch (id) {\n";
-  OS << "  default: break;\n";
-  
-  // Add argument attributes for any intrinsics that have them.
-  for (unsigned i = 0, e = Ints.size(); i != e; ++i) {
-    if (Ints[i].ArgumentAttributes.empty()) continue;
-    
-    OS << "  case " << TargetPrefix << "Intrinsic::" << Ints[i].EnumName 
-       << ":\n";
+  OS << "    default: break;\n";
+
+  AttributeComparator precedes;
 
-    std::vector<std::pair<unsigned, CodeGenIntrinsic::ArgAttribute> > ArgAttrs =
-      Ints[i].ArgumentAttributes;
-    // Sort by argument index.
-    std::sort(ArgAttrs.begin(), ArgAttrs.end());
+  std::stable_sort(sortedIntrinsics.begin(), sortedIntrinsics.end(), precedes);
+
+  for (unsigned i = 0, e = sortedIntrinsics.size(); i != e; ++i) {
+    const CodeGenIntrinsic &intrinsic = *sortedIntrinsics[i];
+    OS << "  case " << TargetPrefix << "Intrinsic::"
+       << intrinsic.EnumName << ":\n";
+
+    // Fill out the case if this is the last case for this range of
+    // intrinsics.
+    if (i + 1 != e && !precedes(&intrinsic, sortedIntrinsics[i + 1]))
+      continue;
 
-    unsigned NumArgsWithAttrs = 0;
+    // Keep track of the number of attributes we're writing out.
+    unsigned numAttrs = 0;
 
-    while (!ArgAttrs.empty()) {
-      unsigned ArgNo = ArgAttrs[0].first;
+    // The argument attributes are alreadys sorted by argument index.
+    for (unsigned ai = 0, ae = intrinsic.ArgumentAttributes.size(); ai != ae;) {
+      unsigned argNo = intrinsic.ArgumentAttributes[ai].first;
       
-      OS << "    AWI[" << NumArgsWithAttrs++ << "] = AttributeWithIndex::get("
-         << ArgNo+1 << ", 0";
+      OS << "    AWI[" << numAttrs++ << "] = AttributeWithIndex::get("
+         << argNo+1 << ", ";
 
-      while (!ArgAttrs.empty() && ArgAttrs[0].first == ArgNo) {
-        switch (ArgAttrs[0].second) {
-        default: assert(0 && "Unknown arg attribute");
+      bool moreThanOne = false;
+
+      do {
+        if (moreThanOne) OS << '|';
+
+        switch (intrinsic.ArgumentAttributes[ai].second) {
         case CodeGenIntrinsic::NoCapture:
-          OS << "|Attribute::NoCapture";
+          OS << "Attribute::NoCapture";
           break;
         }
-        ArgAttrs.erase(ArgAttrs.begin());
+
+        ++ai;
+        moreThanOne = true;
+      } while (ai != ae && intrinsic.ArgumentAttributes[ai].first == argNo);
+
+      OS << ");\n";
+    }
+
+    ModRefKind modRef = getModRefKind(intrinsic);
+
+    if (!intrinsic.canThrow || modRef) {
+      OS << "    AWI[" << numAttrs++ << "] = AttributeWithIndex::get(~0, ";
+      if (!intrinsic.canThrow) {
+        OS << "Attribute::NoUnwind";
+        if (modRef) OS << '|';
+      }
+      switch (modRef) {
+      case MRK_none: break;
+      case MRK_readonly: OS << "Attribute::ReadOnly"; break;
+      case MRK_readnone: OS << "Attribute::ReadNone"; break;
       }
       OS << ");\n";
     }
-    
-    OS << "    NumAttrs = " << NumArgsWithAttrs << ";\n";
-    OS << "    break;\n";
+
+    if (numAttrs) {
+      OS << "    NumAttrs = " << numAttrs << ";\n";
+      OS << "    break;\n";
+    } else {
+      OS << "    return AttrListPtr();\n";
+    }
   }
   
   OS << "  }\n";
-  OS << "  AWI[NumAttrs] = AttributeWithIndex::get(~0, Attr);\n";
-  OS << "  return AttrListPtr::get(AWI, NumAttrs+1);\n";
+  OS << "  return AttrListPtr::get(AWI, NumAttrs);\n";
   OS << "}\n";
   OS << "#endif // GET_INTRINSIC_ATTRIBUTES\n\n";
 }
diff --git a/utils/TableGen/LLVMCConfigurationEmitter.cpp b/utils/TableGen/LLVMCConfigurationEmitter.cpp
index 6572595..090faf5 100644
--- a/utils/TableGen/LLVMCConfigurationEmitter.cpp
+++ b/utils/TableGen/LLVMCConfigurationEmitter.cpp
@@ -74,6 +74,25 @@ int InitPtrToInt(const Init* ptr) {
   return val.getValue();
 }
 
+bool InitPtrToBool(const Init* ptr) {
+  bool ret = false;
+  const DefInit& val = dynamic_cast<const DefInit&>(*ptr);
+  const std::string& str = val.getAsString();
+
+  if (str == "true") {
+    ret = true;
+  }
+  else if (str == "false") {
+    ret = false;
+  }
+  else {
+    throw "Incorrect boolean value: '" + str +
+      "': must be either 'true' or 'false'";
+  }
+
+  return ret;
+}
+
 const std::string& InitPtrToString(const Init* ptr) {
   const StringInit& val = dynamic_cast<const StringInit&>(*ptr);
   return val.getValue();
@@ -95,13 +114,7 @@ const std::string GetOperatorName(const DagInit& D) {
 
 /// CheckBooleanConstant - Check that the provided value is a boolean constant.
 void CheckBooleanConstant(const Init* I) {
-  const DefInit& val = dynamic_cast<const DefInit&>(*I);
-  const std::string& str = val.getAsString();
-
-  if (str != "true" && str != "false") {
-    throw "Incorrect boolean value: '" + str +
-      "': must be either 'true' or 'false'";
-  }
+  InitPtrToBool(I);
 }
 
 // CheckNumberOfArguments - Ensure that the number of args in d is
@@ -935,8 +948,22 @@ private:
   }
 
   void onJoin (const DagInit& d) {
-    CheckNumberOfArguments(d, 0);
-    toolDesc_.setJoin();
+    bool isReallyJoin = false;
+
+    if (d.getNumArgs() == 0) {
+      isReallyJoin = true;
+    }
+    else {
+      Init* I = d.getArg(0);
+      isReallyJoin = InitPtrToBool(I);
+    }
+
+    // Is this *really* a join tool? We allow (join false) for generating two
+    // tool descriptions from a single generic one.
+    // TOFIX: come up with a cleaner solution.
+    if (isReallyJoin) {
+      toolDesc_.setJoin();
+    }
   }
 
   void onOutLanguage (const DagInit& d) {
diff --git a/utils/TableGen/NeonEmitter.cpp b/utils/TableGen/NeonEmitter.cpp
index 123abef..23fdbde 100644
--- a/utils/TableGen/NeonEmitter.cpp
+++ b/utils/TableGen/NeonEmitter.cpp
@@ -1398,9 +1398,14 @@ void NeonEmitter::runHeader(raw_ostream &OS) {
     for (unsigned ti = 0, te = TypeVec.size(); ti != te; ++ti) {
       std::string namestr, shiftstr, rangestr;
 
-      // Builtins which are overloaded by type will need to have their upper
-      // bound computed at Sema time based on the type constant.
-      if (Proto.find('s') == std::string::npos) {
+      if (R->getValueAsBit("isVCVT_N")) {
+        // VCVT between floating- and fixed-point values takes an immediate
+        // in the range 1 to 32.
+        ck = ClassB;
+        rangestr = "l = 1; u = 31"; // upper bound = l + u
+      } else if (Proto.find('s') == std::string::npos) {
+        // Builtins which are overloaded by type will need to have their upper
+        // bound computed at Sema time based on the type constant.
         ck = ClassB;
         if (R->getValueAsBit("isShift")) {
           shiftstr = ", true";
diff --git a/utils/TableGen/Record.cpp b/utils/TableGen/Record.cpp
index abbbafe..8ac8cd9 100644
--- a/utils/TableGen/Record.cpp
+++ b/utils/TableGen/Record.cpp
@@ -68,14 +68,9 @@ Init *BitsRecTy::convertValue(BitInit *UI) {
 /// canFitInBitfield - Return true if the number of bits is large enough to hold
 /// the integer value.
 static bool canFitInBitfield(int64_t Value, unsigned NumBits) {
-  if (Value >= 0) {
-    if (Value & ~((1LL << NumBits) - 1))
-      return false;
-  } else if ((Value >> NumBits) != -1 || (Value & (1LL << (NumBits-1))) == 0) {
-    return false;
-  }
-
-  return true;
+  // For example, with NumBits == 4, we permit Values from [-7 .. 15].
+  return (NumBits >= sizeof(Value) * 8) ||
+         (Value >> NumBits == 0) || (Value >> (NumBits-1) == -1);
 }
 
 /// convertValue from Int initializer to bits type: Split the integer up into the
@@ -583,9 +578,7 @@ Init *UnOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) {
         if (Record *D = (CurRec->getRecords()).getDef(Name))
           return new DefInit(D);
 
-        errs() << "Variable not defined: '" + Name + "'\n";
-        assert(0 && "Variable not found");
-        return 0;
+        throw TGError(CurRec->getLoc(), "Undefined reference:'" + Name + "'\n");
       }
     }
     break;
@@ -813,15 +806,13 @@ static Init *ForeachHelper(Init *LHS, Init *MHS, Init *RHS, RecTy *Type,
   OpInit *RHSo = dynamic_cast<OpInit*>(RHS);
 
   if (!RHSo) {
-    errs() << "!foreach requires an operator\n";
-    assert(0 && "No operator for !foreach");
+    throw TGError(CurRec->getLoc(), "!foreach requires an operator\n");
   }
 
   TypedInit *LHSt = dynamic_cast<TypedInit*>(LHS);
 
   if (!LHSt) {
-    errs() << "!foreach requires typed variable\n";
-    assert(0 && "No typed variable for !foreach");
+    throw TGError(CurRec->getLoc(), "!foreach requires typed variable\n");
   }
 
   if ((MHSd && DagType) || (MHSl && ListType)) {
diff --git a/utils/TableGen/RegisterInfoEmitter.cpp b/utils/TableGen/RegisterInfoEmitter.cpp
index 4ddc47d..5a441e2 100644
--- a/utils/TableGen/RegisterInfoEmitter.cpp
+++ b/utils/TableGen/RegisterInfoEmitter.cpp
@@ -26,6 +26,7 @@ using namespace llvm;
 // runEnums - Print out enum values for all of the registers.
 void RegisterInfoEmitter::runEnums(raw_ostream &OS) {
   CodeGenTarget Target(Records);
+  CodeGenRegBank &Bank = Target.getRegBank();
   const std::vector<CodeGenRegister> &Registers = Target.getRegisters();
 
   std::string Namespace = Registers[0].TheDef->getValueAsString("Namespace");
@@ -47,16 +48,16 @@ void RegisterInfoEmitter::runEnums(raw_ostream &OS) {
   if (!Namespace.empty())
     OS << "}\n";
 
-  const std::vector<Record*> SubRegIndices = Target.getSubRegIndices();
+  const std::vector<Record*> &SubRegIndices = Bank.getSubRegIndices();
   if (!SubRegIndices.empty()) {
     OS << "\n// Subregister indices\n";
     Namespace = SubRegIndices[0]->getValueAsString("Namespace");
     if (!Namespace.empty())
       OS << "namespace " << Namespace << " {\n";
     OS << "enum {\n  NoSubRegister,\n";
-    for (unsigned i = 0, e = SubRegIndices.size(); i != e; ++i)
+    for (unsigned i = 0, e = Bank.getNumNamedIndices(); i != e; ++i)
       OS << "  " << SubRegIndices[i]->getName() << ",\t// " << i+1 << "\n";
-    OS << "  NUM_TARGET_SUBREGS = " << SubRegIndices.size()+1 << "\n";
+    OS << "  NUM_TARGET_NAMED_SUBREGS = " << SubRegIndices.size()+1 << "\n";
     OS << "};\n";
     if (!Namespace.empty())
       OS << "}\n";
@@ -80,6 +81,8 @@ void RegisterInfoEmitter::runHeader(raw_ostream &OS) {
      << "(int CallFrameSetupOpcode = -1, int CallFrameDestroyOpcode = -1);\n"
      << "  virtual int getDwarfRegNumFull(unsigned RegNum, "
      << "unsigned Flavour) const;\n"
+     << "  virtual int getLLVMRegNumFull(unsigned DwarfRegNum, "
+     << "unsigned Flavour) const;\n"
      << "  virtual int getDwarfRegNum(unsigned RegNum, bool isEH) const = 0;\n"
      << "  virtual bool needsStackRealignment(const MachineFunction &) const\n"
      << "     { return false; }\n"
@@ -165,160 +168,6 @@ static void addSubSuperReg(Record *R, Record *S,
       addSubSuperReg(R, *I, SubRegs, SuperRegs, Aliases);
 }
 
-struct RegisterMaps {
-  // Map SubRegIndex -> Register
-  typedef std::map<Record*, Record*, LessRecord> SubRegMap;
-  // Map Register -> SubRegMap
-  typedef std::map<Record*, SubRegMap> SubRegMaps;
-
-  SubRegMaps SubReg;
-  SubRegMap &inferSubRegIndices(Record *Reg);
-
-  // Composite SubRegIndex instances.
-  // Map (SubRegIndex,SubRegIndex) -> SubRegIndex
-  typedef DenseMap<std::pair<Record*,Record*>,Record*> CompositeMap;
-  CompositeMap Composite;
-
-  // Compute SubRegIndex compositions after inferSubRegIndices has run on all
-  // registers.
-  void computeComposites();
-};
-
-// Calculate all subregindices for Reg. Loopy subregs cause infinite recursion.
-RegisterMaps::SubRegMap &RegisterMaps::inferSubRegIndices(Record *Reg) {
-  SubRegMap &SRM = SubReg[Reg];
-  if (!SRM.empty())
-    return SRM;
-  std::vector<Record*> SubRegs = Reg->getValueAsListOfDefs("SubRegs");
-  std::vector<Record*> Indices = Reg->getValueAsListOfDefs("SubRegIndices");
-  if (SubRegs.size() != Indices.size())
-    throw "Register " + Reg->getName() + " SubRegIndices doesn't match SubRegs";
-
-  // First insert the direct subregs and make sure they are fully indexed.
-  for (unsigned i = 0, e = SubRegs.size(); i != e; ++i) {
-    if (!SRM.insert(std::make_pair(Indices[i], SubRegs[i])).second)
-      throw "SubRegIndex " + Indices[i]->getName()
-        + " appears twice in Register " + Reg->getName();
-    inferSubRegIndices(SubRegs[i]);
-  }
-
-  // Keep track of inherited subregs and how they can be reached.
-  // Register -> (SubRegIndex, SubRegIndex)
-  typedef std::map<Record*, std::pair<Record*,Record*>, LessRecord> OrphanMap;
-  OrphanMap Orphans;
-
-  // Clone inherited subregs. Here the order is important - earlier subregs take
-  // precedence.
-  for (unsigned i = 0, e = SubRegs.size(); i != e; ++i) {
-    SubRegMap &M = SubReg[SubRegs[i]];
-    for (SubRegMap::iterator si = M.begin(), se = M.end(); si != se; ++si)
-      if (!SRM.insert(*si).second)
-        Orphans[si->second] = std::make_pair(Indices[i], si->first);
-  }
-
-  // Finally process the composites.
-  ListInit *Comps = Reg->getValueAsListInit("CompositeIndices");
-  for (unsigned i = 0, e = Comps->size(); i != e; ++i) {
-    DagInit *Pat = dynamic_cast<DagInit*>(Comps->getElement(i));
-    if (!Pat)
-      throw "Invalid dag '" + Comps->getElement(i)->getAsString()
-        + "' in CompositeIndices";
-    DefInit *BaseIdxInit = dynamic_cast<DefInit*>(Pat->getOperator());
-    if (!BaseIdxInit || !BaseIdxInit->getDef()->isSubClassOf("SubRegIndex"))
-      throw "Invalid SubClassIndex in " + Pat->getAsString();
-
-    // Resolve list of subreg indices into R2.
-    Record *R2 = Reg;
-    for (DagInit::const_arg_iterator di = Pat->arg_begin(),
-         de = Pat->arg_end(); di != de; ++di) {
-      DefInit *IdxInit = dynamic_cast<DefInit*>(*di);
-      if (!IdxInit || !IdxInit->getDef()->isSubClassOf("SubRegIndex"))
-        throw "Invalid SubClassIndex in " + Pat->getAsString();
-      SubRegMap::const_iterator ni = SubReg[R2].find(IdxInit->getDef());
-      if (ni == SubReg[R2].end())
-        throw "Composite " + Pat->getAsString() + " refers to bad index in "
-          + R2->getName();
-      R2 = ni->second;
-    }
-
-    // Insert composite index. Allow overriding inherited indices etc.
-    SRM[BaseIdxInit->getDef()] = R2;
-
-    // R2 is now directly addressable, no longer an orphan.
-    Orphans.erase(R2);
-  }
-
-  // Now, Orphans contains the inherited subregisters without a direct index.
-  if (!Orphans.empty()) {
-    errs() << "Error: Register " << getQualifiedName(Reg)
-           << " inherited subregisters without an index:\n";
-    for (OrphanMap::iterator i = Orphans.begin(), e = Orphans.end(); i != e;
-         ++i) {
-      errs() << "  " << getQualifiedName(i->first)
-             << " = " << i->second.first->getName()
-             << ", " << i->second.second->getName() << "\n";
-    }
-    abort();
-  }
-  return SRM;
-}
-
-void RegisterMaps::computeComposites() {
-  for (SubRegMaps::const_iterator sri = SubReg.begin(), sre = SubReg.end();
-       sri != sre; ++sri) {
-    Record *Reg1 = sri->first;
-    const SubRegMap &SRM1 = sri->second;
-    for (SubRegMap::const_iterator i1 = SRM1.begin(), e1 = SRM1.end();
-         i1 != e1; ++i1) {
-      Record *Idx1 = i1->first;
-      Record *Reg2 = i1->second;
-      // Ignore identity compositions.
-      if (Reg1 == Reg2)
-        continue;
-      // If Reg2 has no subregs, Idx1 doesn't compose.
-      if (!SubReg.count(Reg2))
-        continue;
-      const SubRegMap &SRM2 = SubReg[Reg2];
-      // Try composing Idx1 with another SubRegIndex.
-      for (SubRegMap::const_iterator i2 = SRM2.begin(), e2 = SRM2.end();
-           i2 != e2; ++i2) {
-        std::pair<Record*,Record*> IdxPair(Idx1, i2->first);
-        Record *Reg3 = i2->second;
-        // OK Reg1:IdxPair == Reg3. Find the index with Reg:Idx == Reg3.
-        for (SubRegMap::const_iterator i1d = SRM1.begin(), e1d = SRM1.end();
-             i1d != e1d; ++i1d) {
-          // Ignore identity compositions.
-          if (Reg2 == Reg3)
-            continue;
-          if (i1d->second == Reg3) {
-            std::pair<CompositeMap::iterator,bool> Ins =
-              Composite.insert(std::make_pair(IdxPair, i1d->first));
-            // Conflicting composition?
-            if (!Ins.second && Ins.first->second != i1d->first) {
-              errs() << "Error: SubRegIndex " << getQualifiedName(Idx1)
-                     << " and " << getQualifiedName(IdxPair.second)
-                     << " compose ambiguously as "
-                     << getQualifiedName(Ins.first->second) << " or "
-                     << getQualifiedName(i1d->first) << "\n";
-              abort();
-            }
-          }
-        }
-      }
-    }
-  }
-
-  // We don't care about the difference between (Idx1, Idx2) -> Idx2 and invalid
-  // compositions, so remove any mappings of that form.
-  for (CompositeMap::iterator i = Composite.begin(), e = Composite.end();
-       i != e;) {
-    CompositeMap::iterator j = i;
-    ++i;
-    if (j->first.second == j->second)
-      Composite.erase(j);
-  }
-}
-
 class RegisterSorter {
 private:
   std::map<Record*, std::set<Record*>, LessRecord> &RegisterSubRegs;
@@ -337,28 +186,30 @@ public:
 //
 void RegisterInfoEmitter::run(raw_ostream &OS) {
   CodeGenTarget Target(Records);
+  CodeGenRegBank &RegBank = Target.getRegBank();
+  RegBank.computeDerivedInfo();
   EmitSourceFileHeader("Register Information Source Fragment", OS);
 
   OS << "namespace llvm {\n\n";
 
-  // Start out by emitting each of the register classes... to do this, we build
-  // a set of registers which belong to a register class, this is to ensure that
-  // each register is only in a single register class.
-  //
+  // Start out by emitting each of the register classes.
   const std::vector<CodeGenRegisterClass> &RegisterClasses =
     Target.getRegisterClasses();
 
+  // Collect all registers belonging to any allocatable class.
+  std::set<Record*> AllocatableRegs;
+
   // Loop over all of the register classes... emitting each one.
   OS << "namespace {     // Register classes...\n";
 
-  // RegClassesBelongedTo - Keep track of which register classes each reg
-  // belongs to.
-  std::multimap<Record*, const CodeGenRegisterClass*> RegClassesBelongedTo;
-
   // Emit the register enum value arrays for each RegisterClass
   for (unsigned rc = 0, e = RegisterClasses.size(); rc != e; ++rc) {
     const CodeGenRegisterClass &RC = RegisterClasses[rc];
 
+    // Collect allocatable registers.
+    if (RC.Allocatable)
+      AllocatableRegs.insert(RC.Elements.begin(), RC.Elements.end());
+
     // Give the register class a legal C name if it's anonymous.
     std::string Name = RC.TheDef->getName();
 
@@ -369,9 +220,6 @@ void RegisterInfoEmitter::run(raw_ostream &OS) {
     for (unsigned i = 0, e = RC.Elements.size(); i != e; ++i) {
       Record *Reg = RC.Elements[i];
       OS << getQualifiedName(Reg) << ", ";
-
-      // Keep track of which regclasses this register is in.
-      RegClassesBelongedTo.insert(std::make_pair(Reg, &RC));
     }
     OS << "\n  };\n\n";
   }
@@ -406,7 +254,7 @@ void RegisterInfoEmitter::run(raw_ostream &OS) {
     std::map<unsigned, std::set<unsigned> > SuperRegClassMap;
     OS << "\n";
 
-    unsigned NumSubRegIndices = Target.getSubRegIndices().size();
+    unsigned NumSubRegIndices = RegBank.getSubRegIndices().size();
 
     if (NumSubRegIndices) {
       // Emit the sub-register classes for each RegisterClass
@@ -417,7 +265,7 @@ void RegisterInfoEmitter::run(raw_ostream &OS) {
              i = RC.SubRegClasses.begin(),
              e = RC.SubRegClasses.end(); i != e; ++i) {
           // Build SRC array.
-          unsigned idx = Target.getSubRegIndexNo(i->first);
+          unsigned idx = RegBank.getSubRegIndexNo(i->first);
           SRC.at(idx-1) = i->second;
 
           // Find the register class number of i->second for SuperRegClassMap.
@@ -567,6 +415,7 @@ void RegisterInfoEmitter::run(raw_ostream &OS) {
          << RC.SpillSize/8 << ", "
          << RC.SpillAlignment/8 << ", "
          << RC.CopyCost << ", "
+         << RC.Allocatable << ", "
          << RC.getName() << ", " << RC.getName() << " + " << RC.Elements.size()
          << ") {}\n";
     }
@@ -841,7 +690,7 @@ void RegisterInfoEmitter::run(raw_ostream &OS) {
   }
 
   OS<<"\n  const TargetRegisterDesc RegisterDescriptors[] = { // Descriptors\n";
-  OS << "    { \"NOREG\",\t0,\t0,\t0,\t0 },\n";
+  OS << "    { \"NOREG\",\t0,\t0,\t0,\t0,\t0 },\n";
 
   // Now that register alias and sub-registers sets have been emitted, emit the
   // register descriptors now.
@@ -857,12 +706,17 @@ void RegisterInfoEmitter::run(raw_ostream &OS) {
       OS << Reg.getName() << "_SuperRegsSet,\t";
     else
       OS << "Empty_SuperRegsSet,\t";
-    OS << Reg.CostPerUse << " },\n";
+    OS << Reg.CostPerUse << ",\t"
+       << int(AllocatableRegs.count(Reg.TheDef)) << " },\n";
   }
   OS << "  };\n";      // End of register descriptors...
 
+  // Calculate the mapping of subregister+index pairs to physical registers.
+  // This will also create further anonymous indexes.
+  unsigned NamedIndices = RegBank.getNumNamedIndices();
+
   // Emit SubRegIndex names, skipping 0
-  const std::vector<Record*> SubRegIndices = Target.getSubRegIndices();
+  const std::vector<Record*> &SubRegIndices = RegBank.getSubRegIndices();
   OS << "\n  const char *const SubRegIndexTable[] = { \"";
   for (unsigned i = 0, e = SubRegIndices.size(); i != e; ++i) {
     OS << SubRegIndices[i]->getName();
@@ -870,13 +724,21 @@ void RegisterInfoEmitter::run(raw_ostream &OS) {
       OS << "\", \"";
   }
   OS << "\" };\n\n";
+
+  // Emit names of the anonymus subreg indexes.
+  if (SubRegIndices.size() > NamedIndices) {
+    OS << "  enum {";
+    for (unsigned i = NamedIndices, e = SubRegIndices.size(); i != e; ++i) {
+      OS << "\n    " << SubRegIndices[i]->getName() << " = " << i+1;
+      if (i+1 != e)
+        OS << ',';
+    }
+    OS << "\n  };\n\n";
+  }
   OS << "}\n\n";       // End of anonymous namespace...
 
   std::string ClassName = Target.getName() + "GenRegisterInfo";
 
-  // Calculate the mapping of subregister+index pairs to physical registers.
-  RegisterMaps RegMaps;
-
   // Emit the subregister + index mapping function based on the information
   // calculated above.
   OS << "unsigned " << ClassName
@@ -884,16 +746,16 @@ void RegisterInfoEmitter::run(raw_ostream &OS) {
      << "  switch (RegNo) {\n"
      << "  default:\n    return 0;\n";
   for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
-    RegisterMaps::SubRegMap &SRM = RegMaps.inferSubRegIndices(Regs[i].TheDef);
+    const CodeGenRegister::SubRegMap &SRM = Regs[i].getSubRegs();
     if (SRM.empty())
       continue;
     OS << "  case " << getQualifiedName(Regs[i].TheDef) << ":\n";
     OS << "    switch (Index) {\n";
     OS << "    default: return 0;\n";
-    for (RegisterMaps::SubRegMap::const_iterator ii = SRM.begin(),
+    for (CodeGenRegister::SubRegMap::const_iterator ii = SRM.begin(),
          ie = SRM.end(); ii != ie; ++ii)
       OS << "    case " << getQualifiedName(ii->first)
-         << ": return " << getQualifiedName(ii->second) << ";\n";
+         << ": return " << getQualifiedName(ii->second->TheDef) << ";\n";
     OS << "    };\n" << "    break;\n";
   }
   OS << "  };\n";
@@ -905,13 +767,13 @@ void RegisterInfoEmitter::run(raw_ostream &OS) {
      << "  switch (RegNo) {\n"
      << "  default:\n    return 0;\n";
    for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
-     RegisterMaps::SubRegMap &SRM = RegMaps.SubReg[Regs[i].TheDef];
+     const CodeGenRegister::SubRegMap &SRM = Regs[i].getSubRegs();
      if (SRM.empty())
        continue;
     OS << "  case " << getQualifiedName(Regs[i].TheDef) << ":\n";
-    for (RegisterMaps::SubRegMap::const_iterator ii = SRM.begin(),
+    for (CodeGenRegister::SubRegMap::const_iterator ii = SRM.begin(),
          ie = SRM.end(); ii != ie; ++ii)
-      OS << "    if (SubRegNo == " << getQualifiedName(ii->second)
+      OS << "    if (SubRegNo == " << getQualifiedName(ii->second->TheDef)
          << ")  return " << getQualifiedName(ii->first) << ";\n";
     OS << "    return 0;\n";
   }
@@ -920,7 +782,6 @@ void RegisterInfoEmitter::run(raw_ostream &OS) {
   OS << "}\n\n";
 
   // Emit composeSubRegIndices
-  RegMaps.computeComposites();
   OS << "unsigned " << ClassName
      << "::composeSubRegIndices(unsigned IdxA, unsigned IdxB) const {\n"
      << "  switch (IdxA) {\n"
@@ -928,8 +789,8 @@ void RegisterInfoEmitter::run(raw_ostream &OS) {
   for (unsigned i = 0, e = SubRegIndices.size(); i != e; ++i) {
     bool Open = false;
     for (unsigned j = 0; j != e; ++j) {
-      if (Record *Comp = RegMaps.Composite.lookup(
-                          std::make_pair(SubRegIndices[i], SubRegIndices[j]))) {
+      if (Record *Comp = RegBank.getCompositeSubRegIndex(SubRegIndices[i],
+                                                         SubRegIndices[j])) {
         if (!Open) {
           OS << "  case " << getQualifiedName(SubRegIndices[i])
              << ": switch(IdxB) {\n    default: return IdxB;\n";
@@ -975,6 +836,44 @@ void RegisterInfoEmitter::run(raw_ostream &OS) {
     for (unsigned i = I->second.size(), e = maxLength; i != e; ++i)
       I->second.push_back(-1);
 
+  // Emit reverse information about the dwarf register numbers.
+  OS << "int " << ClassName << "::getLLVMRegNumFull(unsigned DwarfRegNum, "
+     << "unsigned Flavour) const {\n"
+     << "  switch (Flavour) {\n"
+     << "  default:\n"
+     << "    assert(0 && \"Unknown DWARF flavour\");\n"
+     << "    return -1;\n";
+
+  for (unsigned i = 0, e = maxLength; i != e; ++i) {
+    OS << "  case " << i << ":\n"
+       << "    switch (DwarfRegNum) {\n"
+       << "    default:\n"
+       << "      assert(0 && \"Invalid DwarfRegNum\");\n"
+       << "      return -1;\n";
+
+    for (DwarfRegNumsMapTy::iterator
+           I = DwarfRegNums.begin(), E = DwarfRegNums.end(); I != E; ++I) {
+      int DwarfRegNo = I->second[i];
+      if (DwarfRegNo >= 0)
+        OS << "    case " <<  DwarfRegNo << ":\n"
+           << "      return " << getQualifiedName(I->first) << ";\n";
+    }
+    OS << "    };\n";
+  }
+
+  OS << "  };\n}\n\n";
+
+  for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
+    Record *Reg = Regs[i].TheDef;
+    const RecordVal *V = Reg->getValue("DwarfAlias");
+    if (!V || !V->getValue())
+      continue;
+
+    DefInit *DI = dynamic_cast<DefInit*>(V->getValue());
+    Record *Alias = DI->getDef();
+    DwarfRegNums[Reg] = DwarfRegNums[Alias];
+  }
+
   // Emit information about the dwarf register numbers.
   OS << "int " << ClassName << "::getDwarfRegNumFull(unsigned RegNum, "
      << "unsigned Flavour) const {\n"
@@ -996,13 +895,8 @@ void RegisterInfoEmitter::run(raw_ostream &OS) {
     for (DwarfRegNumsMapTy::iterator
            I = DwarfRegNums.begin(), E = DwarfRegNums.end(); I != E; ++I) {
       int RegNo = I->second[i];
-      if (RegNo != -2)
-        OS << "    case " << getQualifiedName(I->first) << ":\n"
-           << "      return " << RegNo << ";\n";
-      else
-        OS << "    case " << getQualifiedName(I->first) << ":\n"
-           << "      assert(0 && \"Invalid register for this mode\");\n"
-           << "      return -1;\n";
+      OS << "    case " << getQualifiedName(I->first) << ":\n"
+         << "      return " << RegNo << ";\n";
     }
     OS << "    };\n";
   }
diff --git a/utils/TableGen/SetTheory.cpp b/utils/TableGen/SetTheory.cpp
new file mode 100644
index 0000000..ade1825
--- /dev/null
+++ b/utils/TableGen/SetTheory.cpp
@@ -0,0 +1,270 @@
+//===- SetTheory.cpp - Generate ordered sets from DAG expressions ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SetTheory class that computes ordered sets of
+// Records from DAG expressions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SetTheory.h"
+#include "Record.h"
+#include "llvm/Support/Format.h"
+
+using namespace llvm;
+
+// Define the standard operators.
+namespace {
+
+typedef SetTheory::RecSet RecSet;
+typedef SetTheory::RecVec RecVec;
+
+// (add a, b, ...) Evaluate and union all arguments.
+struct AddOp : public SetTheory::Operator {
+  void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts) {
+    ST.evaluate(Expr->arg_begin(), Expr->arg_end(), Elts);
+  }
+};
+
+// (sub Add, Sub, ...) Set difference.
+struct SubOp : public SetTheory::Operator {
+  void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts) {
+    if (Expr->arg_size() < 2)
+      throw "Set difference needs at least two arguments: " +
+        Expr->getAsString();
+    RecSet Add, Sub;
+    ST.evaluate(*Expr->arg_begin(), Add);
+    ST.evaluate(Expr->arg_begin() + 1, Expr->arg_end(), Sub);
+    for (RecSet::iterator I = Add.begin(), E = Add.end(); I != E; ++I)
+      if (!Sub.count(*I))
+        Elts.insert(*I);
+  }
+};
+
+// (and S1, S2) Set intersection.
+struct AndOp : public SetTheory::Operator {
+  void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts) {
+    if (Expr->arg_size() != 2)
+      throw "Set intersection requires two arguments: " + Expr->getAsString();
+    RecSet S1, S2;
+    ST.evaluate(Expr->arg_begin()[0], S1);
+    ST.evaluate(Expr->arg_begin()[1], S2);
+    for (RecSet::iterator I = S1.begin(), E = S1.end(); I != E; ++I)
+      if (S2.count(*I))
+        Elts.insert(*I);
+  }
+};
+
+// SetIntBinOp - Abstract base class for (Op S, N) operators.
+struct SetIntBinOp : public SetTheory::Operator {
+  virtual void apply2(SetTheory &ST, DagInit *Expr,
+                     RecSet &Set, int64_t N,
+                     RecSet &Elts) =0;
+
+  void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts) {
+    if (Expr->arg_size() != 2)
+      throw "Operator requires (Op Set, Int) arguments: " + Expr->getAsString();
+    RecSet Set;
+    ST.evaluate(Expr->arg_begin()[0], Set);
+    IntInit *II = dynamic_cast<IntInit*>(Expr->arg_begin()[1]);
+    if (!II)
+      throw "Second argument must be an integer: " + Expr->getAsString();
+    apply2(ST, Expr, Set, II->getValue(), Elts);
+  }
+};
+
+// (shl S, N) Shift left, remove the first N elements.
+struct ShlOp : public SetIntBinOp {
+  void apply2(SetTheory &ST, DagInit *Expr,
+             RecSet &Set, int64_t N,
+             RecSet &Elts) {
+    if (N < 0)
+      throw "Positive shift required: " + Expr->getAsString();
+    if (unsigned(N) < Set.size())
+      Elts.insert(Set.begin() + N, Set.end());
+  }
+};
+
+// (trunc S, N) Truncate after the first N elements.
+struct TruncOp : public SetIntBinOp {
+  void apply2(SetTheory &ST, DagInit *Expr,
+             RecSet &Set, int64_t N,
+             RecSet &Elts) {
+    if (N < 0)
+      throw "Positive length required: " + Expr->getAsString();
+    if (unsigned(N) > Set.size())
+      N = Set.size();
+    Elts.insert(Set.begin(), Set.begin() + N);
+  }
+};
+
+// Left/right rotation.
+struct RotOp : public SetIntBinOp {
+  const bool Reverse;
+
+  RotOp(bool Rev) : Reverse(Rev) {}
+
+  void apply2(SetTheory &ST, DagInit *Expr,
+             RecSet &Set, int64_t N,
+             RecSet &Elts) {
+    if (Reverse)
+      N = -N;
+    // N > 0 -> rotate left, N < 0 -> rotate right.
+    if (Set.empty())
+      return;
+    if (N < 0)
+      N = Set.size() - (-N % Set.size());
+    else
+      N %= Set.size();
+    Elts.insert(Set.begin() + N, Set.end());
+    Elts.insert(Set.begin(), Set.begin() + N);
+  }
+};
+
+// (decimate S, N) Pick every N'th element of S.
+struct DecimateOp : public SetIntBinOp {
+  void apply2(SetTheory &ST, DagInit *Expr,
+             RecSet &Set, int64_t N,
+             RecSet &Elts) {
+    if (N <= 0)
+      throw "Positive stride required: " + Expr->getAsString();
+    for (unsigned I = 0; I < Set.size(); I += N)
+      Elts.insert(Set[I]);
+  }
+};
+
+// (sequence "Format", From, To) Generate a sequence of records by name.
+struct SequenceOp : public SetTheory::Operator {
+  void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts) {
+    if (Expr->arg_size() != 3)
+      throw "Bad args to (sequence \"Format\", From, To): " +
+        Expr->getAsString();
+    std::string Format;
+    if (StringInit *SI = dynamic_cast<StringInit*>(Expr->arg_begin()[0]))
+      Format = SI->getValue();
+    else
+      throw "Format must be a string: " + Expr->getAsString();
+
+    int64_t From, To;
+    if (IntInit *II = dynamic_cast<IntInit*>(Expr->arg_begin()[1]))
+      From = II->getValue();
+    else
+      throw "From must be an integer: " + Expr->getAsString();
+    if (IntInit *II = dynamic_cast<IntInit*>(Expr->arg_begin()[2]))
+      To = II->getValue();
+    else
+      throw "From must be an integer: " + Expr->getAsString();
+
+    RecordKeeper &Records =
+      dynamic_cast<DefInit&>(*Expr->getOperator()).getDef()->getRecords();
+
+    int Step = From <= To ? 1 : -1;
+    for (To += Step; From != To; From += Step) {
+      std::string Name;
+      raw_string_ostream OS(Name);
+      OS << format(Format.c_str(), From);
+      Record *Rec = Records.getDef(OS.str());
+      if (!Rec)
+        throw "No def named '" + Name + "': " + Expr->getAsString();
+      // Try to reevaluate Rec in case it is a set.
+      if (const RecVec *Result = ST.expand(Rec))
+        Elts.insert(Result->begin(), Result->end());
+      else
+        Elts.insert(Rec);
+    }
+  }
+};
+
+// Expand a Def into a set by evaluating one of its fields.
+struct FieldExpander : public SetTheory::Expander {
+  StringRef FieldName;
+
+  FieldExpander(StringRef fn) : FieldName(fn) {}
+
+  void expand(SetTheory &ST, Record *Def, RecSet &Elts) {
+    ST.evaluate(Def->getValueInit(FieldName), Elts);
+  }
+};
+} // end anonymous namespace
+
+SetTheory::SetTheory() {
+  addOperator("add", new AddOp);
+  addOperator("sub", new SubOp);
+  addOperator("and", new AndOp);
+  addOperator("shl", new ShlOp);
+  addOperator("trunc", new TruncOp);
+  addOperator("rotl", new RotOp(false));
+  addOperator("rotr", new RotOp(true));
+  addOperator("decimate", new DecimateOp);
+  addOperator("sequence", new SequenceOp);
+}
+
+void SetTheory::addOperator(StringRef Name, Operator *Op) {
+  Operators[Name] = Op;
+}
+
+void SetTheory::addExpander(StringRef ClassName, Expander *E) {
+  Expanders[ClassName] = E;
+}
+
+void SetTheory::addFieldExpander(StringRef ClassName, StringRef FieldName) {
+  addExpander(ClassName, new FieldExpander(FieldName));
+}
+
+void SetTheory::evaluate(Init *Expr, RecSet &Elts) {
+  // A def in a list can be a just an element, or it may expand.
+  if (DefInit *Def = dynamic_cast<DefInit*>(Expr)) {
+    if (const RecVec *Result = expand(Def->getDef()))
+      return Elts.insert(Result->begin(), Result->end());
+    Elts.insert(Def->getDef());
+    return;
+  }
+
+  // Lists simply expand.
+  if (ListInit *LI = dynamic_cast<ListInit*>(Expr))
+    return evaluate(LI->begin(), LI->end(), Elts);
+
+  // Anything else must be a DAG.
+  DagInit *DagExpr = dynamic_cast<DagInit*>(Expr);
+  if (!DagExpr)
+    throw "Invalid set element: " + Expr->getAsString();
+  DefInit *OpInit = dynamic_cast<DefInit*>(DagExpr->getOperator());
+  if (!OpInit)
+    throw "Bad set expression: " + Expr->getAsString();
+  Operator *Op = Operators.lookup(OpInit->getDef()->getName());
+  if (!Op)
+    throw "Unknown set operator: " + Expr->getAsString();
+  Op->apply(*this, DagExpr, Elts);
+}
+
+const RecVec *SetTheory::expand(Record *Set) {
+  // Check existing entries for Set and return early.
+  ExpandMap::iterator I = Expansions.find(Set);
+  if (I != Expansions.end())
+    return &I->second;
+
+  // This is the first time we see Set. Find a suitable expander.
+  try {
+    const std::vector<Record*> &SC = Set->getSuperClasses();
+    for (unsigned i = 0, e = SC.size(); i != e; ++i)
+      if (Expander *Exp = Expanders.lookup(SC[i]->getName())) {
+        // This breaks recursive definitions.
+        RecVec &EltVec = Expansions[Set];
+        RecSet Elts;
+        Exp->expand(*this, Set, Elts);
+        EltVec.assign(Elts.begin(), Elts.end());
+        return &EltVec;
+      }
+  } catch (const std::string &Error) {
+    throw TGError(Set->getLoc(), Error);
+  }
+
+  // Set is not expandable.
+  return 0;
+}
+
diff --git a/utils/TableGen/SetTheory.h b/utils/TableGen/SetTheory.h
new file mode 100644
index 0000000..e37a76e
--- /dev/null
+++ b/utils/TableGen/SetTheory.h
@@ -0,0 +1,136 @@
+//===- SetTheory.h - Generate ordered sets from DAG expressions -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SetTheory class that computes ordered sets of
+// Records from DAG expressions.  Operators for standard set operations are
+// predefined, and it is possible to add special purpose set operators as well.
+//
+// The user may define named sets as Records of predefined classes. Set
+// expanders can be added to a SetTheory instance to teach it how to find the
+// elements of such a named set.
+//
+// These are the predefined operators. The argument lists can be individual
+// elements (defs), other sets (defs of expandable classes), lists, or DAG
+// expressions that are evaluated recursively.
+//
+// - (add S1, S2 ...) Union sets. This is also how sets are created from element
+//   lists.
+//
+// - (sub S1, S2, ...) Set difference. Every element in S1 except for the
+//   elements in S2, ...
+//
+// - (and S1, S2) Set intersection. Every element in S1 that is also in S2.
+//
+// - (shl S, N) Shift left. Remove the first N elements from S.
+//
+// - (trunc S, N) Truncate. The first N elements of S.
+//
+// - (rotl S, N) Rotate left. Same as (add (shl S, N), (trunc S, N)).
+//
+// - (rotr S, N) Rotate right.
+//
+// - (decimate S, N) Decimate S by picking every N'th element, starting with
+//   the first one. For instance, (decimate S, 2) returns the even elements of
+//   S.
+//
+// - (sequence "Format", From, To) Generate a sequence of defs with printf.
+//   For instance, (sequence "R%u", 0, 3) -> [ R0, R1, R2, R3 ]
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SETTHEORY_H
+#define SETTHEORY_H
+
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/SetVector.h"
+#include <map>
+#include <vector>
+
+namespace llvm {
+
+class DagInit;
+struct Init;
+class Record;
+class RecordKeeper;
+
+class SetTheory {
+public:
+  typedef std::vector<Record*> RecVec;
+  typedef SmallSetVector<Record*, 16> RecSet;
+
+  /// Operator - A callback representing a DAG operator.
+  struct Operator {
+    virtual ~Operator() {}
+
+    /// apply - Apply this operator to Expr's arguments and insert the result
+    /// in Elts.
+    virtual void apply(SetTheory&, DagInit *Expr, RecSet &Elts) =0;
+  };
+
+  /// Expander - A callback function that can transform a Record representing a
+  /// set into a fully expanded list of elements. Expanders provide a way for
+  /// users to define named sets that can be used in DAG expressions.
+  struct Expander {
+    virtual ~Expander() {}
+
+    virtual void expand(SetTheory&, Record*, RecSet &Elts) =0;
+  };
+
+private:
+  // Map set defs to their fully expanded contents. This serves as a memoization
+  // cache and it makes it possible to return const references on queries.
+  typedef std::map<Record*, RecVec> ExpandMap;
+  ExpandMap Expansions;
+
+  // Known DAG operators by name.
+  StringMap<Operator*> Operators;
+
+  // Typed expanders by class name.
+  StringMap<Expander*> Expanders;
+
+public:
+  /// Create a SetTheory instance with only the standard operators.
+  SetTheory();
+
+  /// addExpander - Add an expander for Records with the named super class.
+  void addExpander(StringRef ClassName, Expander*);
+
+  /// addFieldExpander - Add an expander for ClassName that simply evaluates
+  /// FieldName in the Record to get the set elements.  That is all that is
+  /// needed for a class like:
+  ///
+  ///   class Set<dag d> {
+  ///     dag Elts = d;
+  ///   }
+  ///
+  void addFieldExpander(StringRef ClassName, StringRef FieldName);
+
+  /// addOperator - Add a DAG operator.
+  void addOperator(StringRef Name, Operator*);
+
+  /// evaluate - Evaluate Expr and append the resulting set to Elts.
+  void evaluate(Init *Expr, RecSet &Elts);
+
+  /// evaluate - Evaluate a sequence of Inits and append to Elts.
+  template<typename Iter>
+  void evaluate(Iter begin, Iter end, RecSet &Elts) {
+    while (begin != end)
+      evaluate(*begin++, Elts);
+  }
+
+  /// expand - Expand a record into a set of elements if possible.  Return a
+  /// pointer to the expanded elements, or NULL if Set cannot be expanded
+  /// further.
+  const RecVec *expand(Record *Set);
+};
+
+} // end namespace llvm
+
+#endif
+
diff --git a/utils/TableGen/TGLexer.cpp b/utils/TableGen/TGLexer.cpp
index 82d2b64..572c36d 100644
--- a/utils/TableGen/TGLexer.cpp
+++ b/utils/TableGen/TGLexer.cpp
@@ -267,14 +267,17 @@ bool TGLexer::LexInclude() {
 
   // Get the string.
   std::string Filename = CurStrVal;
+  std::string IncludedFile;
 
   
-  CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr));
+  CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr),
+                                    IncludedFile);
   if (CurBuffer == -1) {
     PrintError(getLoc(), "Could not find include file '" + Filename + "'");
     return true;
   }
   
+  Dependencies.push_back(IncludedFile);
   // Save the line number and lex buffer of the includer.
   CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
   CurPtr = CurBuf->getBufferStart();
diff --git a/utils/TableGen/TGLexer.h b/utils/TableGen/TGLexer.h
index e1aa5a7..c2a6453 100644
--- a/utils/TableGen/TGLexer.h
+++ b/utils/TableGen/TGLexer.h
@@ -16,6 +16,7 @@
 
 #include "llvm/Support/DataTypes.h"
 #include <string>
+#include <vector>
 #include <cassert>
 
 namespace llvm {
@@ -71,6 +72,8 @@ class TGLexer {
   /// CurBuffer - This is the current buffer index we're lexing from as managed
   /// by the SourceMgr object.
   int CurBuffer;
+  /// Dependencies - This is the list of all included files.
+  std::vector<std::string> Dependencies;
   
 public:
   TGLexer(SourceMgr &SrcMgr);
@@ -79,6 +82,10 @@ public:
   tgtok::TokKind Lex() {
     return CurCode = LexToken();
   }
+
+  const std::vector<std::string> &getDependencies() const {
+    return Dependencies;
+  }
   
   tgtok::TokKind getCode() const { return CurCode; }
 
diff --git a/utils/TableGen/TGParser.h b/utils/TableGen/TGParser.h
index 9cdf68f..419a99b 100644
--- a/utils/TableGen/TGParser.h
+++ b/utils/TableGen/TGParser.h
@@ -66,6 +66,9 @@ public:
   bool TokError(const Twine &Msg) const {
     return Error(Lex.getLoc(), Msg);
   }
+  const std::vector<std::string> &getDependencies() const {
+    return Lex.getDependencies();
+  }
 private:  // Semantic analysis methods.
   bool AddValue(Record *TheRec, SMLoc Loc, const RecordVal &RV);
   bool SetValue(Record *TheRec, SMLoc Loc, const std::string &ValName, 
diff --git a/utils/TableGen/TableGen.cpp b/utils/TableGen/TableGen.cpp
index aa92302..4e4da36 100644
--- a/utils/TableGen/TableGen.cpp
+++ b/utils/TableGen/TableGen.cpp
@@ -37,6 +37,7 @@
 #include "RegisterInfoEmitter.h"
 #include "ARMDecoderEmitter.h"
 #include "SubtargetEmitter.h"
+#include "SetTheory.h"
 #include "TGParser.h"
 #include "llvm/ADT/OwningPtr.h"
 #include "llvm/Support/CommandLine.h"
@@ -80,7 +81,8 @@ enum ActionType {
   GenArmNeon,
   GenArmNeonSema,
   GenArmNeonTest,
-  PrintEnums
+  PrintEnums,
+  PrintSets
 };
 
 namespace {
@@ -162,6 +164,8 @@ namespace {
                                "Generate ARM NEON tests for clang"),
                     clEnumValN(PrintEnums, "print-enums",
                                "Print enum values for a class"),
+                    clEnumValN(PrintSets, "print-sets",
+                               "Print expanded sets for testing DAG exprs"),
                     clEnumValEnd));
 
   cl::opt<std::string>
@@ -173,6 +177,10 @@ namespace {
                  cl::init("-"));
 
   cl::opt<std::string>
+  DependFilename("d", cl::desc("Dependency filename"), cl::value_desc("filename"),
+                 cl::init(""));
+
+  cl::opt<std::string>
   InputFilename(cl::Positional, cl::desc("<input file>"), cl::init("-"));
 
   cl::list<std::string>
@@ -192,34 +200,6 @@ void llvm::PrintError(SMLoc ErrorLoc, const Twine &Msg) {
   SrcMgr.PrintMessage(ErrorLoc, Msg, "error");
 }
 
-
-
-/// ParseFile - this function begins the parsing of the specified tablegen
-/// file.
-static bool ParseFile(const std::string &Filename,
-                      const std::vector<std::string> &IncludeDirs,
-                      SourceMgr &SrcMgr,
-                      RecordKeeper &Records) {
-  OwningPtr<MemoryBuffer> File;
-  if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename.c_str(), File)) {
-    errs() << "Could not open input file '" << Filename << "': "
-           << ec.message() <<"\n";
-    return true;
-  }
-  MemoryBuffer *F = File.take();
-
-  // Tell SrcMgr about this buffer, which is what TGParser will pick up.
-  SrcMgr.AddNewSourceBuffer(F, SMLoc());
-
-  // Record the location of the include directory so that the lexer can find
-  // it later.
-  SrcMgr.setIncludeDirs(IncludeDirs);
-
-  TGParser Parser(SrcMgr, Records);
-
-  return Parser.ParseFile();
-}
-
 int main(int argc, char **argv) {
   RecordKeeper Records;
 
@@ -228,19 +208,57 @@ int main(int argc, char **argv) {
   cl::ParseCommandLineOptions(argc, argv);
 
 
-  // Parse the input file.
-  if (ParseFile(InputFilename, IncludeDirs, SrcMgr, Records))
-    return 1;
+  try {
+    // Parse the input file.
+    OwningPtr<MemoryBuffer> File;
+    if (error_code ec = MemoryBuffer::getFileOrSTDIN(InputFilename.c_str(), File)) {
+      errs() << "Could not open input file '" << InputFilename << "': "
+             << ec.message() <<"\n";
+      return 1;
+    }
+    MemoryBuffer *F = File.take();
 
-  std::string Error;
-  tool_output_file Out(OutputFilename.c_str(), Error);
-  if (!Error.empty()) {
-    errs() << argv[0] << ": error opening " << OutputFilename
-           << ":" << Error << "\n";
-    return 1;
-  }
+    // Tell SrcMgr about this buffer, which is what TGParser will pick up.
+    SrcMgr.AddNewSourceBuffer(F, SMLoc());
+
+    // Record the location of the include directory so that the lexer can find
+    // it later.
+    SrcMgr.setIncludeDirs(IncludeDirs);
+
+    TGParser Parser(SrcMgr, Records);
+
+    if (Parser.ParseFile())
+      return 1;
+
+    std::string Error;
+    tool_output_file Out(OutputFilename.c_str(), Error);
+    if (!Error.empty()) {
+      errs() << argv[0] << ": error opening " << OutputFilename
+        << ":" << Error << "\n";
+      return 1;
+    }
+    if (!DependFilename.empty()) {
+      if (OutputFilename == "-") {
+        errs() << argv[0] << ": the option -d must be used together with -o\n";
+        return 1;
+      }
+      tool_output_file DepOut(DependFilename.c_str(), Error);
+      if (!Error.empty()) {
+        errs() << argv[0] << ": error opening " << DependFilename
+          << ":" << Error << "\n";
+        return 1;
+      }
+      DepOut.os() << DependFilename << ":";
+      const std::vector<std::string> &Dependencies = Parser.getDependencies();
+      for (std::vector<std::string>::const_iterator I = Dependencies.begin(),
+                                                          E = Dependencies.end();
+           I != E; ++I) {
+        DepOut.os() << " " << (*I);
+      }
+      DepOut.os() << "\n";
+      DepOut.keep();
+    }
 
-  try {
     switch (Action) {
     case PrintRecords:
       Out.os() << Records;           // No argument, dump all contents
@@ -360,6 +378,21 @@ int main(int argc, char **argv) {
       Out.os() << "\n";
       break;
     }
+    case PrintSets:
+    {
+      SetTheory Sets;
+      Sets.addFieldExpander("Set", "Elements");
+      std::vector<Record*> Recs = Records.getAllDerivedDefinitions("Set");
+      for (unsigned i = 0, e = Recs.size(); i != e; ++i) {
+        Out.os() << Recs[i]->getName() << " = [";
+        const std::vector<Record*> *Elts = Sets.expand(Recs[i]);
+        assert(Elts && "Couldn't expand Set instance");
+        for (unsigned ei = 0, ee = Elts->size(); ei != ee; ++ei)
+          Out.os() << ' ' << (*Elts)[ei]->getName();
+        Out.os() << " ]\n";
+      }
+      break;
+    }
     default:
       assert(1 && "Invalid Action");
       return 1;
diff --git a/utils/valgrind/i386-pc-linux-gnu.supp b/utils/valgrind/i386-pc-linux-gnu.supp
index 0509791..ddd0a08 100644
--- a/utils/valgrind/i386-pc-linux-gnu.supp
+++ b/utils/valgrind/i386-pc-linux-gnu.supp
@@ -39,3 +39,10 @@
    fun:malloc
    obj:/usr/bin/python*
 }
+
+{
+   We don't care about anything ld.so does.
+   Memcheck:Cond
+   obj:/lib/ld*.so
+}
+
diff --git a/utils/valgrind/x86_64-pc-linux-gnu.supp b/utils/valgrind/x86_64-pc-linux-gnu.supp
index 7b2dd45..3d15d71 100644
--- a/utils/valgrind/x86_64-pc-linux-gnu.supp
+++ b/utils/valgrind/x86_64-pc-linux-gnu.supp
@@ -44,3 +44,10 @@
    fun:malloc
    obj:/usr/bin/python*
 }
+
+{
+   We don't care about anything ld.so does.
+   Memcheck:Cond
+   obj:/lib/ld*.so
+}
+