Diffstat (limited to 'test/CodeGen/ARM')
-rw-r--r--  test/CodeGen/ARM/2008-02-04-LocalRegAllocBug.ll   |    1
-rw-r--r--  test/CodeGen/ARM/2008-02-29-RegAllocLocal.ll      |    1
-rw-r--r--  test/CodeGen/ARM/2009-05-05-DAGCombineBug.ll      |    2
-rw-r--r--  test/CodeGen/ARM/2009-05-07-RegAllocLocal.ll      |    1
-rw-r--r--  test/CodeGen/ARM/2009-11-02-NegativeLane.ll       |   14
-rw-r--r--  test/CodeGen/ARM/2010-05-14-IllegalType.ll        |   10
-rw-r--r--  test/CodeGen/ARM/2010-05-17-DAGCombineAssert.ll   |   17
-rw-r--r--  test/CodeGen/ARM/2010-05-17-FastAllocCrash.ll     |  105
-rw-r--r--  test/CodeGen/ARM/2010-05-18-LocalAllocCrash.ll    |   37
-rw-r--r--  test/CodeGen/ARM/2010-05-18-PostIndexBug.ll       |   25
-rw-r--r--  test/CodeGen/ARM/2010-05-19-Shuffles.ll           |   21
-rw-r--r--  test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll     |   45
-rw-r--r--  test/CodeGen/ARM/2010-05-21-BuildVector.ll        |   43
-rw-r--r--  test/CodeGen/ARM/arm-frameaddr.ll                 |    9
-rw-r--r--  test/CodeGen/ARM/arm-returnaddr.ll                |   24
-rw-r--r--  test/CodeGen/ARM/div.ll                           |   24
-rw-r--r--  test/CodeGen/ARM/fabss.ll                         |    2
-rw-r--r--  test/CodeGen/ARM/fadds.ll                         |    2
-rw-r--r--  test/CodeGen/ARM/fdivs.ll                         |    2
-rw-r--r--  test/CodeGen/ARM/fmacs.ll                         |    2
-rw-r--r--  test/CodeGen/ARM/fmscs.ll                         |    2
-rw-r--r--  test/CodeGen/ARM/fmuls.ll                         |    2
-rw-r--r--  test/CodeGen/ARM/fnmscs.ll                        |    4
-rw-r--r--  test/CodeGen/ARM/lsr-on-unrolled-loops.ll         |  642
-rw-r--r--  test/CodeGen/ARM/mul_const.ll                     |   34
-rw-r--r--  test/CodeGen/ARM/reg_sequence.ll                  |  348
-rw-r--r--  test/CodeGen/ARM/spill-q.ll                       |    3
-rw-r--r--  test/CodeGen/ARM/trap.ll                          |   12
-rw-r--r--  test/CodeGen/ARM/vcgt.ll                          |   13
29 files changed, 1419 insertions, 28 deletions
diff --git a/test/CodeGen/ARM/2008-02-04-LocalRegAllocBug.ll b/test/CodeGen/ARM/2008-02-04-LocalRegAllocBug.ll
index ff01506..f775c61 100644
--- a/test/CodeGen/ARM/2008-02-04-LocalRegAllocBug.ll
+++ b/test/CodeGen/ARM/2008-02-04-LocalRegAllocBug.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple=arm-linux-gnueabi -regalloc=local
+; RUN: llc < %s -mtriple=arm-linux-gnueabi -regalloc=fast
; PR1925
%struct.encode_aux_nearestmatch = type { i32*, i32*, i32*, i32*, i32, i32 }
diff --git a/test/CodeGen/ARM/2008-02-29-RegAllocLocal.ll b/test/CodeGen/ARM/2008-02-29-RegAllocLocal.ll
index 06bc987..8ef8c7b 100644
--- a/test/CodeGen/ARM/2008-02-29-RegAllocLocal.ll
+++ b/test/CodeGen/ARM/2008-02-29-RegAllocLocal.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple=arm-apple-darwin -regalloc=local
+; RUN: llc < %s -mtriple=arm-apple-darwin -regalloc=fast
; PR1925
%"struct.kc::impl_Ccode_option" = type { %"struct.kc::impl_abstract_phylum" }
diff --git a/test/CodeGen/ARM/2009-05-05-DAGCombineBug.ll b/test/CodeGen/ARM/2009-05-05-DAGCombineBug.ll
index 670d204..a48e41f 100644
--- a/test/CodeGen/ARM/2009-05-05-DAGCombineBug.ll
+++ b/test/CodeGen/ARM/2009-05-05-DAGCombineBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=arm-linuxeabi-unknown-gnu -mattr=+v6
+; RUN: llc < %s -mtriple=arm-unknown-linux-gnueabi -mattr=+v6
; PR4166
%"byte[]" = type { i32, i8* }
diff --git a/test/CodeGen/ARM/2009-05-07-RegAllocLocal.ll b/test/CodeGen/ARM/2009-05-07-RegAllocLocal.ll
index 75610ff..912e6f9 100644
--- a/test/CodeGen/ARM/2009-05-07-RegAllocLocal.ll
+++ b/test/CodeGen/ARM/2009-05-07-RegAllocLocal.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple=armv5-unknown-linux-gnueabi -O0 -regalloc=local
+; RUN: llc < %s -mtriple=armv5-unknown-linux-gnueabi -O0 -regalloc=fast
; PR4100
@.str = external constant [30 x i8] ; <[30 x i8]*> [#uses=1]
diff --git a/test/CodeGen/ARM/2009-11-02-NegativeLane.ll b/test/CodeGen/ARM/2009-11-02-NegativeLane.ll
index f2288c3..89c9037 100644
--- a/test/CodeGen/ARM/2009-11-02-NegativeLane.ll
+++ b/test/CodeGen/ARM/2009-11-02-NegativeLane.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=cortex-a8 < %s | grep vdup.32
+; RUN: llc -mcpu=cortex-a8 < %s | grep vdup.16
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
target triple = "armv7-eabi"
@@ -7,12 +7,12 @@ entry:
br i1 undef, label %return, label %bb
bb: ; preds = %bb, %entry
- %0 = load float* undef, align 4 ; <float> [#uses=1]
- %1 = insertelement <4 x float> undef, float %0, i32 2 ; <<4 x float>> [#uses=1]
- %2 = insertelement <4 x float> %1, float undef, i32 3 ; <<4 x float>> [#uses=1]
- %3 = fmul <4 x float> undef, %2 ; <<4 x float>> [#uses=1]
- %4 = extractelement <4 x float> %3, i32 1 ; <float> [#uses=1]
- store float %4, float* undef, align 4
+ %0 = load i16* undef, align 2
+ %1 = insertelement <8 x i16> undef, i16 %0, i32 2
+ %2 = insertelement <8 x i16> %1, i16 undef, i32 3
+ %3 = mul <8 x i16> %2, %2
+ %4 = extractelement <8 x i16> %3, i32 2
+ store i16 %4, i16* undef, align 2
br i1 undef, label %return, label %bb
return: ; preds = %bb, %entry
diff --git a/test/CodeGen/ARM/2010-05-14-IllegalType.ll b/test/CodeGen/ARM/2010-05-14-IllegalType.ll
new file mode 100644
index 0000000..99e5b09
--- /dev/null
+++ b/test/CodeGen/ARM/2010-05-14-IllegalType.ll
@@ -0,0 +1,10 @@
+; RUN: llc -march=thumb -mcpu=cortex-a8 -mtriple=thumbv7-eabi -float-abi=hard < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32-n32"
+target triple = "thumbv7-apple-darwin10"
+
+define <4 x i64> @f_4_i64(<4 x i64> %a, <4 x i64> %b) nounwind {
+; CHECK: vadd.i64
+ %y = add <4 x i64> %a, %b
+ ret <4 x i64> %y
+}
diff --git a/test/CodeGen/ARM/2010-05-17-DAGCombineAssert.ll b/test/CodeGen/ARM/2010-05-17-DAGCombineAssert.ll
new file mode 100644
index 0000000..2a4bbd1
--- /dev/null
+++ b/test/CodeGen/ARM/2010-05-17-DAGCombineAssert.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -mtriple=armv7-eabi -mcpu=cortex-a8
+; PR7158
+
+define arm_aapcs_vfpcc i32 @main() nounwind {
+bb.nph55.bb.nph55.split_crit_edge:
+ br label %bb3
+
+bb3: ; preds = %bb3, %bb.nph55.bb.nph55.split_crit_edge
+ br i1 undef, label %bb.i19, label %bb3
+
+bb.i19: ; preds = %bb.i19, %bb3
+ %0 = insertelement <4 x float> undef, float undef, i32 3 ; <<4 x float>> [#uses=3]
+ %1 = fmul <4 x float> %0, %0 ; <<4 x float>> [#uses=1]
+ %2 = bitcast <4 x float> %1 to <2 x double> ; <<2 x double>> [#uses=0]
+ %3 = fmul <4 x float> %0, undef ; <<4 x float>> [#uses=0]
+ br label %bb.i19
+}
diff --git a/test/CodeGen/ARM/2010-05-17-FastAllocCrash.ll b/test/CodeGen/ARM/2010-05-17-FastAllocCrash.ll
new file mode 100644
index 0000000..813bf3c
--- /dev/null
+++ b/test/CodeGen/ARM/2010-05-17-FastAllocCrash.ll
@@ -0,0 +1,105 @@
+; RUN: llc < %s -regalloc=fast -verify-machineinstrs
+target triple = "arm-pc-linux-gnu"
+
+; This test case would accidentally use the same physreg for two virtregs
+; because allocVirtReg forgot to check if registers were already used in the
+; instruction.
+; This caused the RegScavenger to complain, but -verify-machineinstrs also
+; catches it.
+
+%struct.CHESS_POSITION = type { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i32, i32, i8, i8, [64 x i8], i8, i8, i8, i8, i8 }
+
+@search = external global %struct.CHESS_POSITION ; <%struct.CHESS_POSITION*> [#uses=1]
+@bishop_mobility_rr45 = external global [64 x [256 x i32]] ; <[64 x [256 x i32]]*> [#uses=1]
+
+declare fastcc i32 @FirstOne()
+
+define fastcc void @Evaluate() {
+entry:
+ br i1 false, label %cond_false186, label %cond_true
+
+cond_true: ; preds = %entry
+ ret void
+
+cond_false186: ; preds = %entry
+ br i1 false, label %cond_true293, label %bb203
+
+bb203: ; preds = %cond_false186
+ ret void
+
+cond_true293: ; preds = %cond_false186
+ br i1 false, label %cond_true298, label %cond_next317
+
+cond_true298: ; preds = %cond_true293
+ br i1 false, label %cond_next518, label %cond_true397.preheader
+
+cond_next317: ; preds = %cond_true293
+ ret void
+
+cond_true397.preheader: ; preds = %cond_true298
+ ret void
+
+cond_next518: ; preds = %cond_true298
+ br i1 false, label %bb1069, label %cond_true522
+
+cond_true522: ; preds = %cond_next518
+ ret void
+
+bb1069: ; preds = %cond_next518
+ br i1 false, label %cond_next1131, label %bb1096
+
+bb1096: ; preds = %bb1069
+ ret void
+
+cond_next1131: ; preds = %bb1069
+ br i1 false, label %cond_next1207, label %cond_true1150
+
+cond_true1150: ; preds = %cond_next1131
+ ret void
+
+cond_next1207: ; preds = %cond_next1131
+ br i1 false, label %cond_next1219, label %cond_true1211
+
+cond_true1211: ; preds = %cond_next1207
+ ret void
+
+cond_next1219: ; preds = %cond_next1207
+ br i1 false, label %cond_true1223, label %cond_next1283
+
+cond_true1223: ; preds = %cond_next1219
+ br i1 false, label %cond_true1254, label %cond_true1264
+
+cond_true1254: ; preds = %cond_true1223
+ br i1 false, label %bb1567, label %cond_true1369.preheader
+
+cond_true1264: ; preds = %cond_true1223
+ ret void
+
+cond_next1283: ; preds = %cond_next1219
+ ret void
+
+cond_true1369.preheader: ; preds = %cond_true1254
+ ret void
+
+bb1567: ; preds = %cond_true1254
+ %tmp1591 = load i64* getelementptr inbounds (%struct.CHESS_POSITION* @search, i32 0, i32 4) ; <i64> [#uses=1]
+ %tmp1572 = tail call fastcc i32 @FirstOne() ; <i32> [#uses=1]
+ %tmp1594 = load i32* undef ; <i32> [#uses=1]
+ %tmp1594.upgrd.5 = trunc i32 %tmp1594 to i8 ; <i8> [#uses=1]
+ %shift.upgrd.6 = zext i8 %tmp1594.upgrd.5 to i64 ; <i64> [#uses=1]
+ %tmp1595 = lshr i64 %tmp1591, %shift.upgrd.6 ; <i64> [#uses=1]
+ %tmp1595.upgrd.7 = trunc i64 %tmp1595 to i32 ; <i32> [#uses=1]
+ %tmp1596 = and i32 %tmp1595.upgrd.7, 255 ; <i32> [#uses=1]
+ %gep.upgrd.8 = zext i32 %tmp1596 to i64 ; <i64> [#uses=1]
+ %tmp1598 = getelementptr [64 x [256 x i32]]* @bishop_mobility_rr45, i32 0, i32 %tmp1572, i64 %gep.upgrd.8 ; <i32*> [#uses=1]
+ %tmp1599 = load i32* %tmp1598 ; <i32> [#uses=1]
+ %tmp1602 = sub i32 0, %tmp1599 ; <i32> [#uses=1]
+ br i1 undef, label %cond_next1637, label %cond_true1607
+
+cond_true1607: ; preds = %bb1567
+ ret void
+
+cond_next1637: ; preds = %bb1567
+ %tmp1662 = sub i32 %tmp1602, 0 ; <i32> [#uses=0]
+ ret void
+}
diff --git a/test/CodeGen/ARM/2010-05-18-LocalAllocCrash.ll b/test/CodeGen/ARM/2010-05-18-LocalAllocCrash.ll
new file mode 100644
index 0000000..b158afd
--- /dev/null
+++ b/test/CodeGen/ARM/2010-05-18-LocalAllocCrash.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s -O0 -verify-machineinstrs -regalloc=local
+; RUN: llc < %s -O0 -verify-machineinstrs -regalloc=fast
+; rdar://problem/7948106
+;; This test would spill %R4 before the call to zz, but it forgot to move the
+; 'last use' marker to the spill.
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:64-n32"
+target triple = "armv6-apple-darwin"
+
+%struct.q = type { i32, i32 }
+
+@.str = external constant [1 x i8] ; <[1 x i8]*> [#uses=1]
+
+define arm_apcscc void @yy(%struct.q* %qq) nounwind {
+entry:
+ %vla6 = alloca i8, i32 undef, align 1 ; <i8*> [#uses=1]
+ %vla10 = alloca i8, i32 undef, align 1 ; <i8*> [#uses=1]
+ %vla14 = alloca i8, i32 undef, align 1 ; <i8*> [#uses=1]
+ %vla18 = alloca i8, i32 undef, align 1 ; <i8*> [#uses=1]
+ %tmp21 = load i32* undef ; <i32> [#uses=1]
+ %0 = mul i32 1, %tmp21 ; <i32> [#uses=1]
+ %vla22 = alloca i8, i32 %0, align 1 ; <i8*> [#uses=1]
+ call arm_apcscc void (...)* @zz(i8* getelementptr inbounds ([1 x i8]* @.str, i32 0, i32 0), i32 2, i32 1)
+ br i1 undef, label %if.then, label %if.end36
+
+if.then: ; preds = %entry
+ %call = call arm_apcscc i32 (...)* @x(%struct.q* undef, i8* undef, i8* %vla6, i8* %vla10, i32 undef) ; <i32> [#uses=0]
+ %call35 = call arm_apcscc i32 (...)* @x(%struct.q* undef, i8* %vla14, i8* %vla18, i8* %vla22, i32 undef) ; <i32> [#uses=0]
+ unreachable
+
+if.end36: ; preds = %entry
+ ret void
+}
+
+declare arm_apcscc void @zz(...)
+
+declare arm_apcscc i32 @x(...)
diff --git a/test/CodeGen/ARM/2010-05-18-PostIndexBug.ll b/test/CodeGen/ARM/2010-05-18-PostIndexBug.ll
new file mode 100644
index 0000000..9907228
--- /dev/null
+++ b/test/CodeGen/ARM/2010-05-18-PostIndexBug.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -mtriple=armv7-apple-darwin | FileCheck %s -check-prefix=ARM
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin | FileCheck %s -check-prefix=THUMB
+; rdar://7998649
+
+%struct.foo = type { i64, i64 }
+
+define arm_apcscc zeroext i8 @t(%struct.foo* %this) noreturn optsize {
+entry:
+; ARM: t:
+; ARM: str r0, [r1], r0
+
+; THUMB: t:
+; THUMB-NOT: str r0, [r1], r0
+; THUMB: str r0, [r1]
+ %0 = getelementptr inbounds %struct.foo* %this, i32 0, i32 1 ; <i64*> [#uses=1]
+ store i32 undef, i32* inttoptr (i32 8 to i32*), align 8
+ br i1 undef, label %bb.nph96, label %bb3
+
+bb3: ; preds = %entry
+ %1 = load i64* %0, align 4 ; <i64> [#uses=0]
+ unreachable
+
+bb.nph96: ; preds = %entry
+ unreachable
+}
diff --git a/test/CodeGen/ARM/2010-05-19-Shuffles.ll b/test/CodeGen/ARM/2010-05-19-Shuffles.ll
new file mode 100644
index 0000000..587c0af
--- /dev/null
+++ b/test/CodeGen/ARM/2010-05-19-Shuffles.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -mtriple=armv7-eabi -mcpu=cortex-a8
+; pr7167
+
+define <8 x i8> @f1(<8 x i8> %x) nounwind {
+ %y = shufflevector <8 x i8> %x, <8 x i8> undef,
+ <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
+ ret <8 x i8> %y
+}
+
+define <8 x i8> @f2(<8 x i8> %x) nounwind {
+ %y = shufflevector <8 x i8> %x, <8 x i8> undef,
+ <8 x i32> <i32 1, i32 2, i32 0, i32 5, i32 3, i32 6, i32 7, i32 4>
+ ret <8 x i8> %y
+}
+
+define void @f3(<4 x i64>* %xp) nounwind {
+ %x = load <4 x i64>* %xp
+ %y = shufflevector <4 x i64> %x, <4 x i64> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
+ store <4 x i64> %y, <4 x i64>* %xp
+ ret void
+}
diff --git a/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll b/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll
new file mode 100644
index 0000000..b6fbf9b
--- /dev/null
+++ b/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -march=arm -mattr=+neon -O0
+
+; This test would crash the rewriter when trying to handle a spill after one of
+; the @llvm.arm.neon.vld3.v8i8 calls defined three parts of a register.
+
+%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
+
+declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8*) nounwind readonly
+
+declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>) nounwind
+
+define <8 x i8> @t3(i8* %A1, i8* %A2, i8* %A3, i8* %A4, i8* %A5, i8* %A6, i8* %A7, i8* %A8, i8* %B) nounwind {
+ %tmp1b = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A2) ; <%struct.__neon_int8x8x3_t> [#uses=2]
+ %tmp2b = extractvalue %struct.__neon_int8x8x3_t %tmp1b, 0 ; <<8 x i8>> [#uses=1]
+ %tmp4b = extractvalue %struct.__neon_int8x8x3_t %tmp1b, 1 ; <<8 x i8>> [#uses=1]
+ %tmp1d = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A4) ; <%struct.__neon_int8x8x3_t> [#uses=2]
+ %tmp2d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 0 ; <<8 x i8>> [#uses=1]
+ %tmp4d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 1 ; <<8 x i8>> [#uses=1]
+ %tmp1e = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A5) ; <%struct.__neon_int8x8x3_t> [#uses=1]
+ %tmp2e = extractvalue %struct.__neon_int8x8x3_t %tmp1e, 0 ; <<8 x i8>> [#uses=1]
+ %tmp1f = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A6) ; <%struct.__neon_int8x8x3_t> [#uses=1]
+ %tmp2f = extractvalue %struct.__neon_int8x8x3_t %tmp1f, 0 ; <<8 x i8>> [#uses=1]
+ %tmp1g = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A7) ; <%struct.__neon_int8x8x3_t> [#uses=2]
+ %tmp2g = extractvalue %struct.__neon_int8x8x3_t %tmp1g, 0 ; <<8 x i8>> [#uses=1]
+ %tmp4g = extractvalue %struct.__neon_int8x8x3_t %tmp1g, 1 ; <<8 x i8>> [#uses=1]
+ %tmp1h = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A8) ; <%struct.__neon_int8x8x3_t> [#uses=2]
+ %tmp2h = extractvalue %struct.__neon_int8x8x3_t %tmp1h, 0 ; <<8 x i8>> [#uses=1]
+ %tmp3h = extractvalue %struct.__neon_int8x8x3_t %tmp1h, 2 ; <<8 x i8>> [#uses=1]
+ %tmp2bd = add <8 x i8> %tmp2b, %tmp2d ; <<8 x i8>> [#uses=1]
+ %tmp4bd = add <8 x i8> %tmp4b, %tmp4d ; <<8 x i8>> [#uses=1]
+ %tmp2abcd = mul <8 x i8> undef, %tmp2bd ; <<8 x i8>> [#uses=1]
+ %tmp4abcd = mul <8 x i8> undef, %tmp4bd ; <<8 x i8>> [#uses=2]
+ call void @llvm.arm.neon.vst3.v8i8(i8* %A1, <8 x i8> %tmp4abcd, <8 x i8> zeroinitializer, <8 x i8> %tmp2abcd)
+ %tmp2ef = sub <8 x i8> %tmp2e, %tmp2f ; <<8 x i8>> [#uses=1]
+ %tmp2gh = sub <8 x i8> %tmp2g, %tmp2h ; <<8 x i8>> [#uses=1]
+ %tmp3gh = sub <8 x i8> zeroinitializer, %tmp3h ; <<8 x i8>> [#uses=1]
+ %tmp4ef = sub <8 x i8> zeroinitializer, %tmp4g ; <<8 x i8>> [#uses=1]
+ %tmp2efgh = mul <8 x i8> %tmp2ef, %tmp2gh ; <<8 x i8>> [#uses=1]
+ %tmp3efgh = mul <8 x i8> undef, %tmp3gh ; <<8 x i8>> [#uses=1]
+ %tmp4efgh = mul <8 x i8> %tmp4ef, undef ; <<8 x i8>> [#uses=2]
+ call void @llvm.arm.neon.vst3.v8i8(i8* %A2, <8 x i8> %tmp4efgh, <8 x i8> %tmp3efgh, <8 x i8> %tmp2efgh)
+ %tmp4 = sub <8 x i8> %tmp4efgh, %tmp4abcd ; <<8 x i8>> [#uses=1]
+ tail call void @llvm.arm.neon.vst3.v8i8(i8* %B, <8 x i8> zeroinitializer, <8 x i8> undef, <8 x i8> undef)
+ ret <8 x i8> %tmp4
+}
diff --git a/test/CodeGen/ARM/2010-05-21-BuildVector.ll b/test/CodeGen/ARM/2010-05-21-BuildVector.ll
new file mode 100644
index 0000000..6b19490
--- /dev/null
+++ b/test/CodeGen/ARM/2010-05-21-BuildVector.ll
@@ -0,0 +1,43 @@
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s
+; Radar 7872877
+
+define arm_apcscc void @test(float* %fltp, i32 %packedValue, float* %table) nounwind {
+entry:
+ %0 = load float* %fltp
+ %1 = insertelement <4 x float> undef, float %0, i32 0
+ %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
+ %3 = shl i32 %packedValue, 16
+ %4 = ashr i32 %3, 30
+ %.sum = add i32 %4, 4
+ %5 = getelementptr inbounds float* %table, i32 %.sum
+;CHECK: vldr.32 s
+ %6 = load float* %5, align 4
+ %tmp11 = insertelement <4 x float> undef, float %6, i32 0
+ %7 = shl i32 %packedValue, 18
+ %8 = ashr i32 %7, 30
+ %.sum12 = add i32 %8, 4
+ %9 = getelementptr inbounds float* %table, i32 %.sum12
+;CHECK: vldr.32 s
+ %10 = load float* %9, align 4
+ %tmp9 = insertelement <4 x float> %tmp11, float %10, i32 1
+ %11 = shl i32 %packedValue, 20
+ %12 = ashr i32 %11, 30
+ %.sum13 = add i32 %12, 4
+ %13 = getelementptr inbounds float* %table, i32 %.sum13
+;CHECK: vldr.32 s
+ %14 = load float* %13, align 4
+ %tmp7 = insertelement <4 x float> %tmp9, float %14, i32 2
+ %15 = shl i32 %packedValue, 22
+ %16 = ashr i32 %15, 30
+ %.sum14 = add i32 %16, 4
+ %17 = getelementptr inbounds float* %table, i32 %.sum14
+;CHECK: vldr.32 s
+ %18 = load float* %17, align 4
+ %tmp5 = insertelement <4 x float> %tmp7, float %18, i32 3
+ %19 = fmul <4 x float> %tmp5, %2
+ %20 = bitcast float* %fltp to i8*
+ tail call void @llvm.arm.neon.vst1.v4f32(i8* %20, <4 x float> %19)
+ ret void
+}
+
+declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>) nounwind
diff --git a/test/CodeGen/ARM/arm-frameaddr.ll b/test/CodeGen/ARM/arm-frameaddr.ll
index 2739860..1c7ac25 100644
--- a/test/CodeGen/ARM/arm-frameaddr.ll
+++ b/test/CodeGen/ARM/arm-frameaddr.ll
@@ -1,10 +1,15 @@
-; RUN: llc < %s -mtriple=arm-apple-darwin | grep mov | grep r7
-; RUN: llc < %s -mtriple=arm-linux-gnueabi | grep mov | grep r11
+; RUN: llc < %s -mtriple=arm-apple-darwin | FileCheck %s -check-prefix=DARWIN
+; RUN: llc < %s -mtriple=arm-linux-gnueabi | FileCheck %s -check-prefix=LINUX
; PR4344
; PR4416
define arm_aapcscc i8* @t() nounwind {
entry:
+; DARWIN: t:
+; DARWIN: mov r0, r7
+
+; LINUX: t:
+; LINUX: mov r0, r11
%0 = call i8* @llvm.frameaddress(i32 0)
ret i8* %0
}
diff --git a/test/CodeGen/ARM/arm-returnaddr.ll b/test/CodeGen/ARM/arm-returnaddr.ll
new file mode 100644
index 0000000..2c8f2ab
--- /dev/null
+++ b/test/CodeGen/ARM/arm-returnaddr.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s -mtriple=arm-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv6-apple-darwin
+; rdar://8015977
+; rdar://8020118
+
+define arm_apcscc i8* @rt0(i32 %x) nounwind readnone {
+entry:
+; CHECK: rt0:
+; CHECK: mov r0, lr
+ %0 = tail call i8* @llvm.returnaddress(i32 0)
+ ret i8* %0
+}
+
+define arm_apcscc i8* @rt2() nounwind readnone {
+entry:
+; CHECK: rt2:
+; CHECK: ldr r0, [r7]
+; CHECK: ldr r0, [r0]
+; CHECK: ldr r0, [r0, #4]
+ %0 = tail call i8* @llvm.returnaddress(i32 2)
+ ret i8* %0
+}
+
+declare i8* @llvm.returnaddress(i32) nounwind readnone
diff --git a/test/CodeGen/ARM/div.ll b/test/CodeGen/ARM/div.ll
index 2f724e7..d833afa 100644
--- a/test/CodeGen/ARM/div.ll
+++ b/test/CodeGen/ARM/div.ll
@@ -1,29 +1,43 @@
-; RUN: llc < %s -march=arm > %t
-; RUN: grep __divsi3 %t
-; RUN: grep __udivsi3 %t
-; RUN: grep __modsi3 %t
-; RUN: grep __umodsi3 %t
+; RUN: llc < %s -march=arm | FileCheck %s -check-prefix=CHECK-ARM
+; RUN: llc < %s -march=arm -mcpu=cortex-m3 \
+; RUN: | FileCheck %s -check-prefix=CHECK-ARMV7M
define i32 @f1(i32 %a, i32 %b) {
entry:
+; CHECK-ARM: f1
+; CHECK-ARM: __divsi3
+; CHECK-ARMV7M: f1
+; CHECK-ARMV7M: sdiv
%tmp1 = sdiv i32 %a, %b ; <i32> [#uses=1]
ret i32 %tmp1
}
define i32 @f2(i32 %a, i32 %b) {
entry:
+; CHECK-ARM: f2
+; CHECK-ARM: __udivsi3
+; CHECK-ARMV7M: f2
+; CHECK-ARMV7M: udiv
%tmp1 = udiv i32 %a, %b ; <i32> [#uses=1]
ret i32 %tmp1
}
define i32 @f3(i32 %a, i32 %b) {
entry:
+; CHECK-ARM: f3
+; CHECK-ARM: __modsi3
+; CHECK-ARMV7M: f3
+; CHECK-ARMV7M: sdiv
%tmp1 = srem i32 %a, %b ; <i32> [#uses=1]
ret i32 %tmp1
}
define i32 @f4(i32 %a, i32 %b) {
entry:
+; CHECK-ARM: f4
+; CHECK-ARM: __umodsi3
+; CHECK-ARMV7M: f4
+; CHECK-ARMV7M: udiv
%tmp1 = urem i32 %a, %b ; <i32> [#uses=1]
ret i32 %tmp1
}
diff --git a/test/CodeGen/ARM/fabss.ll b/test/CodeGen/ARM/fabss.ll
index f03282b..dfc1e0a 100644
--- a/test/CodeGen/ARM/fabss.ll
+++ b/test/CodeGen/ARM/fabss.ll
@@ -24,4 +24,4 @@ declare float @fabsf(float)
; CORTEXA8: test:
; CORTEXA8: vabs.f32 d1, d1
; CORTEXA9: test:
-; CORTEXA9: vabs.f32 s1, s1
+; CORTEXA9: vabs.f32 s0, s0
diff --git a/test/CodeGen/ARM/fadds.ll b/test/CodeGen/ARM/fadds.ll
index 749690e..113f0e2 100644
--- a/test/CodeGen/ARM/fadds.ll
+++ b/test/CodeGen/ARM/fadds.ll
@@ -20,4 +20,4 @@ entry:
; CORTEXA8: test:
; CORTEXA8: vadd.f32 d0, d1, d0
; CORTEXA9: test:
-; CORTEXA9: vadd.f32 s0, s1, s0
+; CORTEXA9: vadd.f32 s0, s0, s1
diff --git a/test/CodeGen/ARM/fdivs.ll b/test/CodeGen/ARM/fdivs.ll
index 0c31495..9af1217 100644
--- a/test/CodeGen/ARM/fdivs.ll
+++ b/test/CodeGen/ARM/fdivs.ll
@@ -20,4 +20,4 @@ entry:
; CORTEXA8: test:
; CORTEXA8: vdiv.f32 s0, s1, s0
; CORTEXA9: test:
-; CORTEXA9: vdiv.f32 s0, s1, s0
+; CORTEXA9: vdiv.f32 s0, s0, s1
diff --git a/test/CodeGen/ARM/fmacs.ll b/test/CodeGen/ARM/fmacs.ll
index f8b47b5..c4ceca9 100644
--- a/test/CodeGen/ARM/fmacs.ll
+++ b/test/CodeGen/ARM/fmacs.ll
@@ -21,4 +21,4 @@ entry:
; CORTEXA8: test:
; CORTEXA8: vmul.f32 d0, d1, d0
; CORTEXA9: test:
-; CORTEXA9: vmla.f32 s2, s1, s0
+; CORTEXA9: vmla.f32 s0, s1, s2
diff --git a/test/CodeGen/ARM/fmscs.ll b/test/CodeGen/ARM/fmscs.ll
index 7a70543..103ce33 100644
--- a/test/CodeGen/ARM/fmscs.ll
+++ b/test/CodeGen/ARM/fmscs.ll
@@ -21,4 +21,4 @@ entry:
; CORTEXA8: test:
; CORTEXA8: vnmls.f32 s2, s1, s0
; CORTEXA9: test:
-; CORTEXA9: vnmls.f32 s2, s1, s0
+; CORTEXA9: vnmls.f32 s0, s1, s2
diff --git a/test/CodeGen/ARM/fmuls.ll b/test/CodeGen/ARM/fmuls.ll
index ef4e3e5..bfafd20 100644
--- a/test/CodeGen/ARM/fmuls.ll
+++ b/test/CodeGen/ARM/fmuls.ll
@@ -20,4 +20,4 @@ entry:
; CORTEXA8: test:
; CORTEXA8: vmul.f32 d0, d1, d0
; CORTEXA9: test:
-; CORTEXA9: vmul.f32 s0, s1, s0
+; CORTEXA9: vmul.f32 s0, s0, s1
diff --git a/test/CodeGen/ARM/fnmscs.ll b/test/CodeGen/ARM/fnmscs.ll
index 6b7cefa..0b47edd 100644
--- a/test/CodeGen/ARM/fnmscs.ll
+++ b/test/CodeGen/ARM/fnmscs.ll
@@ -4,7 +4,7 @@
; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s
define float @test1(float %acc, float %a, float %b) nounwind {
-; CHECK: vnmla.f32 s2, s1, s0
+; CHECK: vnmla.f32 s{{.*}}, s{{.*}}, s{{.*}}
entry:
%0 = fmul float %a, %b
%1 = fsub float -0.0, %0
@@ -13,7 +13,7 @@ entry:
}
define float @test2(float %acc, float %a, float %b) nounwind {
-; CHECK: vnmla.f32 s2, s1, s0
+; CHECK: vnmla.f32 s{{.*}}, s{{.*}}, s{{.*}}
entry:
%0 = fmul float %a, %b
%1 = fmul float -1.0, %0
diff --git a/test/CodeGen/ARM/lsr-on-unrolled-loops.ll b/test/CodeGen/ARM/lsr-on-unrolled-loops.ll
new file mode 100644
index 0000000..2ac4084
--- /dev/null
+++ b/test/CodeGen/ARM/lsr-on-unrolled-loops.ll
@@ -0,0 +1,642 @@
+; RUN: llc -mtriple=thumbv7-apple-darwin10 -mcpu=cortex-a8 < %s | FileCheck %s
+
+; LSR should recognize that this is an unrolled loop which can use
+; constant offset addressing, so that each of the following stores
+; uses the same register.
+
+; CHECK: vstr.32 s0, [r12, #-128]
+; CHECK: vstr.32 s0, [r12, #-96]
+; CHECK: vstr.32 s0, [r12, #-64]
+; CHECK: vstr.32 s0, [r12, #-32]
+; CHECK: vstr.32 s0, [r12]
+; CHECK: vstr.32 s0, [r12, #32]
+; CHECK: vstr.32 s0, [r12, #64]
+; CHECK: vstr.32 s0, [r12, #96]
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32-n32"
+
+%0 = type { %1*, %3*, %6*, i8*, i32, i32, %8*, i32, i32, i32, i32, i32, i32, i32, double, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i8**, i32, i32, i32, i32, i32, [64 x i32]*, [4 x %9*], [4 x %10*], [4 x %10*], i32, %11*, i32, i32, [16 x i8], [16 x i8], [16 x i8], i32, i32, i8, i8, i8, i16, i16, i32, i8, i32, %12*, i32, i32, i32, i32, i8*, i32, [4 x %11*], i32, i32, i32, [10 x i32], i32, i32, i32, i32, i32, %13*, %14*, %15*, %16*, %17*, %18*, %19*, %20*, %21*, %22*, %23* }
+%1 = type { void (%2*)*, void (%2*, i32)*, void (%2*)*, void (%2*, i8*)*, void (%2*)*, i32, %7, i32, i32, i8**, i32, i8**, i32, i32 }
+%2 = type { %1*, %3*, %6*, i8*, i32, i32 }
+%3 = type { i8* (%2*, i32, i32)*, i8* (%2*, i32, i32)*, i8** (%2*, i32, i32, i32)*, [64 x i16]** (%2*, i32, i32, i32)*, %4* (%2*, i32, i32, i32, i32, i32)*, %5* (%2*, i32, i32, i32, i32, i32)*, void (%2*)*, i8** (%2*, %4*, i32, i32, i32)*, [64 x i16]** (%2*, %5*, i32, i32, i32)*, void (%2*, i32)*, void (%2*)*, i32, i32 }
+%4 = type opaque
+%5 = type opaque
+%6 = type { void (%2*)*, i32, i32, i32, i32 }
+%7 = type { [8 x i32], [12 x i32] }
+%8 = type { i8*, i32, void (%0*)*, i32 (%0*)*, void (%0*, i32)*, i32 (%0*, i32)*, void (%0*)* }
+%9 = type { [64 x i16], i32 }
+%10 = type { [17 x i8], [256 x i8], i32 }
+%11 = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, %9*, i8* }
+%12 = type { %12*, i8, i32, i32, i8* }
+%13 = type { void (%0*)*, void (%0*)*, i32 }
+%14 = type { void (%0*, i32)*, void (%0*, i8**, i32*, i32)* }
+%15 = type { void (%0*)*, i32 (%0*)*, void (%0*)*, i32 (%0*, i8***)*, %5** }
+%16 = type { void (%0*, i32)*, void (%0*, i8***, i32*, i32, i8**, i32*, i32)* }
+%17 = type { i32 (%0*)*, void (%0*)*, void (%0*)*, void (%0*)*, i32, i32 }
+%18 = type { void (%0*)*, i32 (%0*)*, i32 (%0*)*, i32, i32, i32, i32 }
+%19 = type { void (%0*)*, i32 (%0*, [64 x i16]**)*, i32 }
+%20 = type { void (%0*)*, [10 x void (%0*, %11*, i16*, i8**, i32)*] }
+%21 = type { void (%0*)*, void (%0*, i8***, i32*, i32, i8**, i32*, i32)*, i32 }
+%22 = type { void (%0*)*, void (%0*, i8***, i32, i8**, i32)* }
+%23 = type { void (%0*, i32)*, void (%0*, i8**, i8**, i32)*, void (%0*)*, void (%0*)* }
+
+define arm_apcscc void @test(%0* nocapture %a0, %11* nocapture %a1, i16* nocapture %a2, i8** nocapture %a3, i32 %a4) nounwind {
+bb:
+ %t = alloca [64 x float], align 4
+ %t5 = getelementptr inbounds %0* %a0, i32 0, i32 65
+ %t6 = load i8** %t5, align 4
+ %t7 = getelementptr inbounds %11* %a1, i32 0, i32 20
+ %t8 = load i8** %t7, align 4
+ br label %bb9
+
+bb9:
+ %t10 = phi i32 [ 0, %bb ], [ %t157, %bb156 ]
+ %t11 = add i32 %t10, 8
+ %t12 = getelementptr [64 x float]* %t, i32 0, i32 %t11
+ %t13 = add i32 %t10, 16
+ %t14 = getelementptr [64 x float]* %t, i32 0, i32 %t13
+ %t15 = add i32 %t10, 24
+ %t16 = getelementptr [64 x float]* %t, i32 0, i32 %t15
+ %t17 = add i32 %t10, 32
+ %t18 = getelementptr [64 x float]* %t, i32 0, i32 %t17
+ %t19 = add i32 %t10, 40
+ %t20 = getelementptr [64 x float]* %t, i32 0, i32 %t19
+ %t21 = add i32 %t10, 48
+ %t22 = getelementptr [64 x float]* %t, i32 0, i32 %t21
+ %t23 = add i32 %t10, 56
+ %t24 = getelementptr [64 x float]* %t, i32 0, i32 %t23
+ %t25 = getelementptr [64 x float]* %t, i32 0, i32 %t10
+ %t26 = shl i32 %t10, 5
+ %t27 = or i32 %t26, 8
+ %t28 = getelementptr i8* %t8, i32 %t27
+ %t29 = bitcast i8* %t28 to float*
+ %t30 = or i32 %t26, 16
+ %t31 = getelementptr i8* %t8, i32 %t30
+ %t32 = bitcast i8* %t31 to float*
+ %t33 = or i32 %t26, 24
+ %t34 = getelementptr i8* %t8, i32 %t33
+ %t35 = bitcast i8* %t34 to float*
+ %t36 = or i32 %t26, 4
+ %t37 = getelementptr i8* %t8, i32 %t36
+ %t38 = bitcast i8* %t37 to float*
+ %t39 = or i32 %t26, 12
+ %t40 = getelementptr i8* %t8, i32 %t39
+ %t41 = bitcast i8* %t40 to float*
+ %t42 = or i32 %t26, 20
+ %t43 = getelementptr i8* %t8, i32 %t42
+ %t44 = bitcast i8* %t43 to float*
+ %t45 = or i32 %t26, 28
+ %t46 = getelementptr i8* %t8, i32 %t45
+ %t47 = bitcast i8* %t46 to float*
+ %t48 = getelementptr i8* %t8, i32 %t26
+ %t49 = bitcast i8* %t48 to float*
+ %t50 = shl i32 %t10, 3
+ %t51 = or i32 %t50, 1
+ %t52 = getelementptr i16* %a2, i32 %t51
+ %t53 = or i32 %t50, 2
+ %t54 = getelementptr i16* %a2, i32 %t53
+ %t55 = or i32 %t50, 3
+ %t56 = getelementptr i16* %a2, i32 %t55
+ %t57 = or i32 %t50, 4
+ %t58 = getelementptr i16* %a2, i32 %t57
+ %t59 = or i32 %t50, 5
+ %t60 = getelementptr i16* %a2, i32 %t59
+ %t61 = or i32 %t50, 6
+ %t62 = getelementptr i16* %a2, i32 %t61
+ %t63 = or i32 %t50, 7
+ %t64 = getelementptr i16* %a2, i32 %t63
+ %t65 = getelementptr i16* %a2, i32 %t50
+ %t66 = load i16* %t52, align 2
+ %t67 = icmp eq i16 %t66, 0
+ %t68 = load i16* %t54, align 2
+ %t69 = icmp eq i16 %t68, 0
+ %t70 = and i1 %t67, %t69
+ br i1 %t70, label %bb71, label %bb91
+
+bb71:
+ %t72 = load i16* %t56, align 2
+ %t73 = icmp eq i16 %t72, 0
+ br i1 %t73, label %bb74, label %bb91
+
+bb74:
+ %t75 = load i16* %t58, align 2
+ %t76 = icmp eq i16 %t75, 0
+ br i1 %t76, label %bb77, label %bb91
+
+bb77:
+ %t78 = load i16* %t60, align 2
+ %t79 = icmp eq i16 %t78, 0
+ br i1 %t79, label %bb80, label %bb91
+
+bb80:
+ %t81 = load i16* %t62, align 2
+ %t82 = icmp eq i16 %t81, 0
+ br i1 %t82, label %bb83, label %bb91
+
+bb83:
+ %t84 = load i16* %t64, align 2
+ %t85 = icmp eq i16 %t84, 0
+ br i1 %t85, label %bb86, label %bb91
+
+bb86:
+ %t87 = load i16* %t65, align 2
+ %t88 = sitofp i16 %t87 to float
+ %t89 = load float* %t49, align 4
+ %t90 = fmul float %t88, %t89
+ store float %t90, float* %t25, align 4
+ store float %t90, float* %t12, align 4
+ store float %t90, float* %t14, align 4
+ store float %t90, float* %t16, align 4
+ store float %t90, float* %t18, align 4
+ store float %t90, float* %t20, align 4
+ store float %t90, float* %t22, align 4
+ store float %t90, float* %t24, align 4
+ br label %bb156
+
+bb91:
+ %t92 = load i16* %t65, align 2
+ %t93 = sitofp i16 %t92 to float
+ %t94 = load float* %t49, align 4
+ %t95 = fmul float %t93, %t94
+ %t96 = sitofp i16 %t68 to float
+ %t97 = load float* %t29, align 4
+ %t98 = fmul float %t96, %t97
+ %t99 = load i16* %t58, align 2
+ %t100 = sitofp i16 %t99 to float
+ %t101 = load float* %t32, align 4
+ %t102 = fmul float %t100, %t101
+ %t103 = load i16* %t62, align 2
+ %t104 = sitofp i16 %t103 to float
+ %t105 = load float* %t35, align 4
+ %t106 = fmul float %t104, %t105
+ %t107 = fadd float %t95, %t102
+ %t108 = fsub float %t95, %t102
+ %t109 = fadd float %t98, %t106
+ %t110 = fsub float %t98, %t106
+ %t111 = fmul float %t110, 0x3FF6A09E60000000
+ %t112 = fsub float %t111, %t109
+ %t113 = fadd float %t107, %t109
+ %t114 = fsub float %t107, %t109
+ %t115 = fadd float %t108, %t112
+ %t116 = fsub float %t108, %t112
+ %t117 = sitofp i16 %t66 to float
+ %t118 = load float* %t38, align 4
+ %t119 = fmul float %t117, %t118
+ %t120 = load i16* %t56, align 2
+ %t121 = sitofp i16 %t120 to float
+ %t122 = load float* %t41, align 4
+ %t123 = fmul float %t121, %t122
+ %t124 = load i16* %t60, align 2
+ %t125 = sitofp i16 %t124 to float
+ %t126 = load float* %t44, align 4
+ %t127 = fmul float %t125, %t126
+ %t128 = load i16* %t64, align 2
+ %t129 = sitofp i16 %t128 to float
+ %t130 = load float* %t47, align 4
+ %t131 = fmul float %t129, %t130
+ %t132 = fadd float %t127, %t123
+ %t133 = fsub float %t127, %t123
+ %t134 = fadd float %t119, %t131
+ %t135 = fsub float %t119, %t131
+ %t136 = fadd float %t134, %t132
+ %t137 = fsub float %t134, %t132
+ %t138 = fmul float %t137, 0x3FF6A09E60000000
+ %t139 = fadd float %t133, %t135
+ %t140 = fmul float %t139, 0x3FFD906BC0000000
+ %t141 = fmul float %t135, 0x3FF1517A80000000
+ %t142 = fsub float %t141, %t140
+ %t143 = fmul float %t133, 0xC004E7AEA0000000
+ %t144 = fadd float %t143, %t140
+ %t145 = fsub float %t144, %t136
+ %t146 = fsub float %t138, %t145
+ %t147 = fadd float %t142, %t146
+ %t148 = fadd float %t113, %t136
+ store float %t148, float* %t25, align 4
+ %t149 = fsub float %t113, %t136
+ store float %t149, float* %t24, align 4
+ %t150 = fadd float %t115, %t145
+ store float %t150, float* %t12, align 4
+ %t151 = fsub float %t115, %t145
+ store float %t151, float* %t22, align 4
+ %t152 = fadd float %t116, %t146
+ store float %t152, float* %t14, align 4
+ %t153 = fsub float %t116, %t146
+ store float %t153, float* %t20, align 4
+ %t154 = fadd float %t114, %t147
+ store float %t154, float* %t18, align 4
+ %t155 = fsub float %t114, %t147
+ store float %t155, float* %t16, align 4
+ br label %bb156
+
+bb156:
+ %t157 = add i32 %t10, 1
+ %t158 = icmp eq i32 %t157, 8
+ br i1 %t158, label %bb159, label %bb9
+
+bb159:
+ %t160 = add i32 %a4, 7
+ %t161 = add i32 %a4, 1
+ %t162 = add i32 %a4, 6
+ %t163 = add i32 %a4, 2
+ %t164 = add i32 %a4, 5
+ %t165 = add i32 %a4, 4
+ %t166 = add i32 %a4, 3
+ br label %bb167
+
+bb167:
+ %t168 = phi i32 [ 0, %bb159 ], [ %t293, %bb167 ]
+ %t169 = getelementptr i8** %a3, i32 %t168
+ %t170 = shl i32 %t168, 3
+ %t171 = or i32 %t170, 4
+ %t172 = getelementptr [64 x float]* %t, i32 0, i32 %t171
+ %t173 = or i32 %t170, 2
+ %t174 = getelementptr [64 x float]* %t, i32 0, i32 %t173
+ %t175 = or i32 %t170, 6
+ %t176 = getelementptr [64 x float]* %t, i32 0, i32 %t175
+ %t177 = or i32 %t170, 5
+ %t178 = getelementptr [64 x float]* %t, i32 0, i32 %t177
+ %t179 = or i32 %t170, 3
+ %t180 = getelementptr [64 x float]* %t, i32 0, i32 %t179
+ %t181 = or i32 %t170, 1
+ %t182 = getelementptr [64 x float]* %t, i32 0, i32 %t181
+ %t183 = or i32 %t170, 7
+ %t184 = getelementptr [64 x float]* %t, i32 0, i32 %t183
+ %t185 = getelementptr [64 x float]* %t, i32 0, i32 %t170
+ %t186 = load i8** %t169, align 4
+ %t187 = getelementptr inbounds i8* %t186, i32 %a4
+ %t188 = load float* %t185, align 4
+ %t189 = load float* %t172, align 4
+ %t190 = fadd float %t188, %t189
+ %t191 = fsub float %t188, %t189
+ %t192 = load float* %t174, align 4
+ %t193 = load float* %t176, align 4
+ %t194 = fadd float %t192, %t193
+ %t195 = fsub float %t192, %t193
+ %t196 = fmul float %t195, 0x3FF6A09E60000000
+ %t197 = fsub float %t196, %t194
+ %t198 = fadd float %t190, %t194
+ %t199 = fsub float %t190, %t194
+ %t200 = fadd float %t191, %t197
+ %t201 = fsub float %t191, %t197
+ %t202 = load float* %t178, align 4
+ %t203 = load float* %t180, align 4
+ %t204 = fadd float %t202, %t203
+ %t205 = fsub float %t202, %t203
+ %t206 = load float* %t182, align 4
+ %t207 = load float* %t184, align 4
+ %t208 = fadd float %t206, %t207
+ %t209 = fsub float %t206, %t207
+ %t210 = fadd float %t208, %t204
+ %t211 = fsub float %t208, %t204
+ %t212 = fmul float %t211, 0x3FF6A09E60000000
+ %t213 = fadd float %t205, %t209
+ %t214 = fmul float %t213, 0x3FFD906BC0000000
+ %t215 = fmul float %t209, 0x3FF1517A80000000
+ %t216 = fsub float %t215, %t214
+ %t217 = fmul float %t205, 0xC004E7AEA0000000
+ %t218 = fadd float %t217, %t214
+ %t219 = fsub float %t218, %t210
+ %t220 = fsub float %t212, %t219
+ %t221 = fadd float %t216, %t220
+ %t222 = fadd float %t198, %t210
+ %t223 = fptosi float %t222 to i32
+ %t224 = add nsw i32 %t223, 4
+ %t225 = lshr i32 %t224, 3
+ %t226 = and i32 %t225, 1023
+ %t227 = add i32 %t226, 128
+ %t228 = getelementptr inbounds i8* %t6, i32 %t227
+ %t229 = load i8* %t228, align 1
+ store i8 %t229, i8* %t187, align 1
+ %t230 = fsub float %t198, %t210
+ %t231 = fptosi float %t230 to i32
+ %t232 = add nsw i32 %t231, 4
+ %t233 = lshr i32 %t232, 3
+ %t234 = and i32 %t233, 1023
+ %t235 = add i32 %t234, 128
+ %t236 = getelementptr inbounds i8* %t6, i32 %t235
+ %t237 = load i8* %t236, align 1
+ %t238 = getelementptr inbounds i8* %t186, i32 %t160
+ store i8 %t237, i8* %t238, align 1
+ %t239 = fadd float %t200, %t219
+ %t240 = fptosi float %t239 to i32
+ %t241 = add nsw i32 %t240, 4
+ %t242 = lshr i32 %t241, 3
+ %t243 = and i32 %t242, 1023
+ %t244 = add i32 %t243, 128
+ %t245 = getelementptr inbounds i8* %t6, i32 %t244
+ %t246 = load i8* %t245, align 1
+ %t247 = getelementptr inbounds i8* %t186, i32 %t161
+ store i8 %t246, i8* %t247, align 1
+ %t248 = fsub float %t200, %t219
+ %t249 = fptosi float %t248 to i32
+ %t250 = add nsw i32 %t249, 4
+ %t251 = lshr i32 %t250, 3
+ %t252 = and i32 %t251, 1023
+ %t253 = add i32 %t252, 128
+ %t254 = getelementptr inbounds i8* %t6, i32 %t253
+ %t255 = load i8* %t254, align 1
+ %t256 = getelementptr inbounds i8* %t186, i32 %t162
+ store i8 %t255, i8* %t256, align 1
+ %t257 = fadd float %t201, %t220
+ %t258 = fptosi float %t257 to i32
+ %t259 = add nsw i32 %t258, 4
+ %t260 = lshr i32 %t259, 3
+ %t261 = and i32 %t260, 1023
+ %t262 = add i32 %t261, 128
+ %t263 = getelementptr inbounds i8* %t6, i32 %t262
+ %t264 = load i8* %t263, align 1
+ %t265 = getelementptr inbounds i8* %t186, i32 %t163
+ store i8 %t264, i8* %t265, align 1
+ %t266 = fsub float %t201, %t220
+ %t267 = fptosi float %t266 to i32
+ %t268 = add nsw i32 %t267, 4
+ %t269 = lshr i32 %t268, 3
+ %t270 = and i32 %t269, 1023
+ %t271 = add i32 %t270, 128
+ %t272 = getelementptr inbounds i8* %t6, i32 %t271
+ %t273 = load i8* %t272, align 1
+ %t274 = getelementptr inbounds i8* %t186, i32 %t164
+ store i8 %t273, i8* %t274, align 1
+ %t275 = fadd float %t199, %t221
+ %t276 = fptosi float %t275 to i32
+ %t277 = add nsw i32 %t276, 4
+ %t278 = lshr i32 %t277, 3
+ %t279 = and i32 %t278, 1023
+ %t280 = add i32 %t279, 128
+ %t281 = getelementptr inbounds i8* %t6, i32 %t280
+ %t282 = load i8* %t281, align 1
+ %t283 = getelementptr inbounds i8* %t186, i32 %t165
+ store i8 %t282, i8* %t283, align 1
+ %t284 = fsub float %t199, %t221
+ %t285 = fptosi float %t284 to i32
+ %t286 = add nsw i32 %t285, 4
+ %t287 = lshr i32 %t286, 3
+ %t288 = and i32 %t287, 1023
+ %t289 = add i32 %t288, 128
+ %t290 = getelementptr inbounds i8* %t6, i32 %t289
+ %t291 = load i8* %t290, align 1
+ %t292 = getelementptr inbounds i8* %t186, i32 %t166
+ store i8 %t291, i8* %t292, align 1
+ %t293 = add nsw i32 %t168, 1
+ %t294 = icmp eq i32 %t293, 8
+ br i1 %t294, label %bb295, label %bb167
+
+bb295:
+ ret void
+}
+
+%struct.ct_data_s = type { %union.anon, %union.anon }
+%struct.gz_header = type { i32, i32, i32, i32, i8*, i32, i32, i8*, i32, i8*, i32, i32, i32 }
+%struct.internal_state = type { %struct.z_stream*, i32, i8*, i32, i8*, i32, i32, %struct.gz_header*, i32, i8, i32, i32, i32, i32, i8*, i32, i16*, i16*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [573 x %struct.ct_data_s], [61 x %struct.ct_data_s], [39 x %struct.ct_data_s], %struct.tree_desc_s, %struct.tree_desc_s, %struct.tree_desc_s, [16 x i16], [573 x i32], i32, i32, [573 x i8], i8*, i32, i32, i16*, i32, i32, i32, i32, i16, i32 }
+%struct.static_tree_desc = type { i32 }
+%struct.tree_desc_s = type { %struct.ct_data_s*, i32, %struct.static_tree_desc* }
+%struct.z_stream = type { i8*, i32, i32, i8*, i32, i32, i8*, %struct.internal_state*, i8* (i8*, i32, i32)*, void (i8*, i8*)*, i8*, i32, i32, i32 }
+%union.anon = type { i16 }
+
+define arm_apcscc i32 @longest_match(%struct.internal_state* %s, i32 %cur_match) nounwind optsize {
+entry:
+ %0 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 31 ; <i32*> [#uses=1]
+ %1 = load i32* %0, align 4 ; <i32> [#uses=2]
+ %2 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 14 ; <i8**> [#uses=1]
+ %3 = load i8** %2, align 4 ; <i8*> [#uses=27]
+ %4 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 27 ; <i32*> [#uses=1]
+ %5 = load i32* %4, align 4 ; <i32> [#uses=17]
+ %6 = getelementptr inbounds i8* %3, i32 %5 ; <i8*> [#uses=1]
+ %7 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 30 ; <i32*> [#uses=1]
+ %8 = load i32* %7, align 4 ; <i32> [#uses=4]
+ %9 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 36 ; <i32*> [#uses=1]
+ %10 = load i32* %9, align 4 ; <i32> [#uses=2]
+ %11 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 11 ; <i32*> [#uses=1]
+ %12 = load i32* %11, align 4 ; <i32> [#uses=2]
+ %13 = add i32 %12, -262 ; <i32> [#uses=1]
+ %14 = icmp ugt i32 %5, %13 ; <i1> [#uses=1]
+ br i1 %14, label %bb, label %bb2
+
+bb: ; preds = %entry
+ %15 = add i32 %5, 262 ; <i32> [#uses=1]
+ %16 = sub i32 %15, %12 ; <i32> [#uses=1]
+ br label %bb2
+
+bb2: ; preds = %bb, %entry
+ %iftmp.48.0 = phi i32 [ %16, %bb ], [ 0, %entry ] ; <i32> [#uses=1]
+ %17 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 16 ; <i16**> [#uses=1]
+ %18 = load i16** %17, align 4 ; <i16*> [#uses=1]
+ %19 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 13 ; <i32*> [#uses=1]
+ %20 = load i32* %19, align 4 ; <i32> [#uses=1]
+ %.sum = add i32 %5, 258 ; <i32> [#uses=2]
+ %21 = getelementptr inbounds i8* %3, i32 %.sum ; <i8*> [#uses=1]
+ %22 = add nsw i32 %5, -1 ; <i32> [#uses=1]
+ %.sum30 = add i32 %22, %8 ; <i32> [#uses=1]
+ %23 = getelementptr inbounds i8* %3, i32 %.sum30 ; <i8*> [#uses=1]
+ %24 = load i8* %23, align 1 ; <i8> [#uses=1]
+ %.sum31 = add i32 %8, %5 ; <i32> [#uses=1]
+ %25 = getelementptr inbounds i8* %3, i32 %.sum31 ; <i8*> [#uses=1]
+ %26 = load i8* %25, align 1 ; <i8> [#uses=1]
+ %27 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 35 ; <i32*> [#uses=1]
+ %28 = load i32* %27, align 4 ; <i32> [#uses=1]
+ %29 = lshr i32 %1, 2 ; <i32> [#uses=1]
+ %30 = icmp ult i32 %8, %28 ; <i1> [#uses=1]
+ %. = select i1 %30, i32 %1, i32 %29 ; <i32> [#uses=1]
+ %31 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 29 ; <i32*> [#uses=1]
+ %32 = load i32* %31, align 4 ; <i32> [#uses=4]
+ %33 = icmp ugt i32 %10, %32 ; <i1> [#uses=1]
+ %nice_match.0.ph = select i1 %33, i32 %32, i32 %10 ; <i32> [#uses=1]
+ %34 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 28 ; <i32*> [#uses=1]
+ %35 = ptrtoint i8* %21 to i32 ; <i32> [#uses=1]
+ %36 = add nsw i32 %5, 257 ; <i32> [#uses=1]
+ %tmp81 = add i32 %., -1 ; <i32> [#uses=1]
+ br label %bb6
+
+bb6: ; preds = %bb24, %bb2
+ %indvar78 = phi i32 [ 0, %bb2 ], [ %indvar.next79, %bb24 ] ; <i32> [#uses=2]
+ %best_len.2 = phi i32 [ %8, %bb2 ], [ %best_len.0, %bb24 ] ; <i32> [#uses=8]
+ %scan_end1.1 = phi i8 [ %24, %bb2 ], [ %scan_end1.0, %bb24 ] ; <i8> [#uses=6]
+ %cur_match_addr.0 = phi i32 [ %cur_match, %bb2 ], [ %90, %bb24 ] ; <i32> [#uses=14]
+ %scan_end.1 = phi i8 [ %26, %bb2 ], [ %scan_end.0, %bb24 ] ; <i8> [#uses=6]
+ %37 = getelementptr inbounds i8* %3, i32 %cur_match_addr.0 ; <i8*> [#uses=1]
+ %.sum32 = add i32 %cur_match_addr.0, %best_len.2 ; <i32> [#uses=1]
+ %38 = getelementptr inbounds i8* %3, i32 %.sum32 ; <i8*> [#uses=1]
+ %39 = load i8* %38, align 1 ; <i8> [#uses=1]
+ %40 = icmp eq i8 %39, %scan_end.1 ; <i1> [#uses=1]
+ br i1 %40, label %bb7, label %bb23
+
+bb7: ; preds = %bb6
+ %41 = add nsw i32 %best_len.2, -1 ; <i32> [#uses=1]
+ %.sum33 = add i32 %41, %cur_match_addr.0 ; <i32> [#uses=1]
+ %42 = getelementptr inbounds i8* %3, i32 %.sum33 ; <i8*> [#uses=1]
+ %43 = load i8* %42, align 1 ; <i8> [#uses=1]
+ %44 = icmp eq i8 %43, %scan_end1.1 ; <i1> [#uses=1]
+ br i1 %44, label %bb8, label %bb23
+
+bb8: ; preds = %bb7
+ %45 = load i8* %37, align 1 ; <i8> [#uses=1]
+ %46 = load i8* %6, align 1 ; <i8> [#uses=1]
+ %47 = icmp eq i8 %45, %46 ; <i1> [#uses=1]
+ br i1 %47, label %bb9, label %bb23
+
+bb9: ; preds = %bb8
+ %.sum34 = add i32 %cur_match_addr.0, 1 ; <i32> [#uses=1]
+ %48 = getelementptr inbounds i8* %3, i32 %.sum34 ; <i8*> [#uses=1]
+ %49 = load i8* %48, align 1 ; <i8> [#uses=1]
+ %.sum88 = add i32 %5, 1 ; <i32> [#uses=1]
+ %50 = getelementptr inbounds i8* %3, i32 %.sum88 ; <i8*> [#uses=1]
+ %51 = load i8* %50, align 1 ; <i8> [#uses=1]
+ %52 = icmp eq i8 %49, %51 ; <i1> [#uses=1]
+ br i1 %52, label %bb10, label %bb23
+
+bb10: ; preds = %bb9
+ %tmp39 = add i32 %cur_match_addr.0, 10 ; <i32> [#uses=1]
+ %tmp41 = add i32 %cur_match_addr.0, 9 ; <i32> [#uses=1]
+ %tmp44 = add i32 %cur_match_addr.0, 8 ; <i32> [#uses=1]
+ %tmp47 = add i32 %cur_match_addr.0, 7 ; <i32> [#uses=1]
+ %tmp50 = add i32 %cur_match_addr.0, 6 ; <i32> [#uses=1]
+ %tmp53 = add i32 %cur_match_addr.0, 5 ; <i32> [#uses=1]
+ %tmp56 = add i32 %cur_match_addr.0, 4 ; <i32> [#uses=1]
+ %tmp59 = add i32 %cur_match_addr.0, 3 ; <i32> [#uses=1]
+ br label %bb11
+
+bb11: ; preds = %bb18, %bb10
+ %indvar = phi i32 [ %indvar.next, %bb18 ], [ 0, %bb10 ] ; <i32> [#uses=2]
+ %tmp = shl i32 %indvar, 3 ; <i32> [#uses=16]
+ %tmp40 = add i32 %tmp39, %tmp ; <i32> [#uses=1]
+ %scevgep = getelementptr i8* %3, i32 %tmp40 ; <i8*> [#uses=1]
+ %tmp42 = add i32 %tmp41, %tmp ; <i32> [#uses=1]
+ %scevgep43 = getelementptr i8* %3, i32 %tmp42 ; <i8*> [#uses=1]
+ %tmp45 = add i32 %tmp44, %tmp ; <i32> [#uses=1]
+ %scevgep46 = getelementptr i8* %3, i32 %tmp45 ; <i8*> [#uses=1]
+ %tmp48 = add i32 %tmp47, %tmp ; <i32> [#uses=1]
+ %scevgep49 = getelementptr i8* %3, i32 %tmp48 ; <i8*> [#uses=1]
+ %tmp51 = add i32 %tmp50, %tmp ; <i32> [#uses=1]
+ %scevgep52 = getelementptr i8* %3, i32 %tmp51 ; <i8*> [#uses=1]
+ %tmp54 = add i32 %tmp53, %tmp ; <i32> [#uses=1]
+ %scevgep55 = getelementptr i8* %3, i32 %tmp54 ; <i8*> [#uses=1]
+ %tmp60 = add i32 %tmp59, %tmp ; <i32> [#uses=1]
+ %scevgep61 = getelementptr i8* %3, i32 %tmp60 ; <i8*> [#uses=1]
+ %tmp62 = add i32 %tmp, 10 ; <i32> [#uses=1]
+ %.sum89 = add i32 %5, %tmp62 ; <i32> [#uses=2]
+ %scevgep63 = getelementptr i8* %3, i32 %.sum89 ; <i8*> [#uses=2]
+ %tmp64 = add i32 %tmp, 9 ; <i32> [#uses=1]
+ %.sum90 = add i32 %5, %tmp64 ; <i32> [#uses=1]
+ %scevgep65 = getelementptr i8* %3, i32 %.sum90 ; <i8*> [#uses=2]
+ %tmp66 = add i32 %tmp, 8 ; <i32> [#uses=1]
+ %.sum91 = add i32 %5, %tmp66 ; <i32> [#uses=1]
+ %scevgep67 = getelementptr i8* %3, i32 %.sum91 ; <i8*> [#uses=2]
+ %tmp6883 = or i32 %tmp, 7 ; <i32> [#uses=1]
+ %.sum92 = add i32 %5, %tmp6883 ; <i32> [#uses=1]
+ %scevgep69 = getelementptr i8* %3, i32 %.sum92 ; <i8*> [#uses=2]
+ %tmp7084 = or i32 %tmp, 6 ; <i32> [#uses=1]
+ %.sum93 = add i32 %5, %tmp7084 ; <i32> [#uses=1]
+ %scevgep71 = getelementptr i8* %3, i32 %.sum93 ; <i8*> [#uses=2]
+ %tmp7285 = or i32 %tmp, 5 ; <i32> [#uses=1]
+ %.sum94 = add i32 %5, %tmp7285 ; <i32> [#uses=1]
+ %scevgep73 = getelementptr i8* %3, i32 %.sum94 ; <i8*> [#uses=2]
+ %tmp7486 = or i32 %tmp, 4 ; <i32> [#uses=1]
+ %.sum95 = add i32 %5, %tmp7486 ; <i32> [#uses=1]
+ %scevgep75 = getelementptr i8* %3, i32 %.sum95 ; <i8*> [#uses=2]
+ %tmp7687 = or i32 %tmp, 3 ; <i32> [#uses=1]
+ %.sum96 = add i32 %5, %tmp7687 ; <i32> [#uses=1]
+ %scevgep77 = getelementptr i8* %3, i32 %.sum96 ; <i8*> [#uses=2]
+ %53 = load i8* %scevgep77, align 1 ; <i8> [#uses=1]
+ %54 = load i8* %scevgep61, align 1 ; <i8> [#uses=1]
+ %55 = icmp eq i8 %53, %54 ; <i1> [#uses=1]
+ br i1 %55, label %bb12, label %bb20
+
+bb12: ; preds = %bb11
+ %tmp57 = add i32 %tmp56, %tmp ; <i32> [#uses=1]
+ %scevgep58 = getelementptr i8* %3, i32 %tmp57 ; <i8*> [#uses=1]
+ %56 = load i8* %scevgep75, align 1 ; <i8> [#uses=1]
+ %57 = load i8* %scevgep58, align 1 ; <i8> [#uses=1]
+ %58 = icmp eq i8 %56, %57 ; <i1> [#uses=1]
+ br i1 %58, label %bb13, label %bb20
+
+bb13: ; preds = %bb12
+ %59 = load i8* %scevgep73, align 1 ; <i8> [#uses=1]
+ %60 = load i8* %scevgep55, align 1 ; <i8> [#uses=1]
+ %61 = icmp eq i8 %59, %60 ; <i1> [#uses=1]
+ br i1 %61, label %bb14, label %bb20
+
+bb14: ; preds = %bb13
+ %62 = load i8* %scevgep71, align 1 ; <i8> [#uses=1]
+ %63 = load i8* %scevgep52, align 1 ; <i8> [#uses=1]
+ %64 = icmp eq i8 %62, %63 ; <i1> [#uses=1]
+ br i1 %64, label %bb15, label %bb20
+
+bb15: ; preds = %bb14
+ %65 = load i8* %scevgep69, align 1 ; <i8> [#uses=1]
+ %66 = load i8* %scevgep49, align 1 ; <i8> [#uses=1]
+ %67 = icmp eq i8 %65, %66 ; <i1> [#uses=1]
+ br i1 %67, label %bb16, label %bb20
+
+bb16: ; preds = %bb15
+ %68 = load i8* %scevgep67, align 1 ; <i8> [#uses=1]
+ %69 = load i8* %scevgep46, align 1 ; <i8> [#uses=1]
+ %70 = icmp eq i8 %68, %69 ; <i1> [#uses=1]
+ br i1 %70, label %bb17, label %bb20
+
+bb17: ; preds = %bb16
+ %71 = load i8* %scevgep65, align 1 ; <i8> [#uses=1]
+ %72 = load i8* %scevgep43, align 1 ; <i8> [#uses=1]
+ %73 = icmp eq i8 %71, %72 ; <i1> [#uses=1]
+ br i1 %73, label %bb18, label %bb20
+
+bb18: ; preds = %bb17
+ %74 = load i8* %scevgep63, align 1 ; <i8> [#uses=1]
+ %75 = load i8* %scevgep, align 1 ; <i8> [#uses=1]
+ %76 = icmp eq i8 %74, %75 ; <i1> [#uses=1]
+ %77 = icmp slt i32 %.sum89, %.sum ; <i1> [#uses=1]
+ %or.cond = and i1 %76, %77 ; <i1> [#uses=1]
+ %indvar.next = add i32 %indvar, 1 ; <i32> [#uses=1]
+ br i1 %or.cond, label %bb11, label %bb20
+
+bb20: ; preds = %bb18, %bb17, %bb16, %bb15, %bb14, %bb13, %bb12, %bb11
+ %scan.3 = phi i8* [ %scevgep77, %bb11 ], [ %scevgep75, %bb12 ], [ %scevgep73, %bb13 ], [ %scevgep71, %bb14 ], [ %scevgep69, %bb15 ], [ %scevgep67, %bb16 ], [ %scevgep65, %bb17 ], [ %scevgep63, %bb18 ] ; <i8*> [#uses=1]
+ %78 = ptrtoint i8* %scan.3 to i32 ; <i32> [#uses=1]
+ %79 = sub nsw i32 %78, %35 ; <i32> [#uses=2]
+ %80 = add i32 %79, 258 ; <i32> [#uses=5]
+ %81 = icmp sgt i32 %80, %best_len.2 ; <i1> [#uses=1]
+ br i1 %81, label %bb21, label %bb23
+
+bb21: ; preds = %bb20
+ store i32 %cur_match_addr.0, i32* %34, align 4
+ %82 = icmp slt i32 %80, %nice_match.0.ph ; <i1> [#uses=1]
+ br i1 %82, label %bb22, label %bb25
+
+bb22: ; preds = %bb21
+ %.sum37 = add i32 %36, %79 ; <i32> [#uses=1]
+ %83 = getelementptr inbounds i8* %3, i32 %.sum37 ; <i8*> [#uses=1]
+ %84 = load i8* %83, align 1 ; <i8> [#uses=1]
+ %.sum38 = add i32 %80, %5 ; <i32> [#uses=1]
+ %85 = getelementptr inbounds i8* %3, i32 %.sum38 ; <i8*> [#uses=1]
+ %86 = load i8* %85, align 1 ; <i8> [#uses=1]
+ br label %bb23
+
+bb23: ; preds = %bb22, %bb20, %bb9, %bb8, %bb7, %bb6
+ %best_len.0 = phi i32 [ %best_len.2, %bb6 ], [ %best_len.2, %bb7 ], [ %best_len.2, %bb8 ], [ %best_len.2, %bb9 ], [ %80, %bb22 ], [ %best_len.2, %bb20 ] ; <i32> [#uses=3]
+ %scan_end1.0 = phi i8 [ %scan_end1.1, %bb6 ], [ %scan_end1.1, %bb7 ], [ %scan_end1.1, %bb8 ], [ %scan_end1.1, %bb9 ], [ %84, %bb22 ], [ %scan_end1.1, %bb20 ] ; <i8> [#uses=1]
+ %scan_end.0 = phi i8 [ %scan_end.1, %bb6 ], [ %scan_end.1, %bb7 ], [ %scan_end.1, %bb8 ], [ %scan_end.1, %bb9 ], [ %86, %bb22 ], [ %scan_end.1, %bb20 ] ; <i8> [#uses=1]
+ %87 = and i32 %cur_match_addr.0, %20 ; <i32> [#uses=1]
+ %88 = getelementptr inbounds i16* %18, i32 %87 ; <i16*> [#uses=1]
+ %89 = load i16* %88, align 2 ; <i16> [#uses=1]
+ %90 = zext i16 %89 to i32 ; <i32> [#uses=2]
+ %91 = icmp ugt i32 %90, %iftmp.48.0 ; <i1> [#uses=1]
+ br i1 %91, label %bb24, label %bb25
+
+bb24: ; preds = %bb23
+
+; LSR should use count-down iteration to avoid requiring the trip count
+; in a register, and it shouldn't require any reloads here.
+
+; CHECK: sub.w r9, r9, #1
+; CHECK-NEXT: cmp.w r9, #0
+; CHECK-NEXT: bne.w
+
+ %92 = icmp eq i32 %tmp81, %indvar78 ; <i1> [#uses=1]
+ %indvar.next79 = add i32 %indvar78, 1 ; <i32> [#uses=1]
+ br i1 %92, label %bb25, label %bb6
+
+bb25: ; preds = %bb24, %bb23, %bb21
+ %best_len.1 = phi i32 [ %best_len.0, %bb23 ], [ %best_len.0, %bb24 ], [ %80, %bb21 ] ; <i32> [#uses=2]
+ %93 = icmp ugt i32 %best_len.1, %32 ; <i1> [#uses=1]
+ %merge = select i1 %93, i32 %32, i32 %best_len.1 ; <i32> [#uses=1]
+ ret i32 %merge
+}
diff --git a/test/CodeGen/ARM/mul_const.ll b/test/CodeGen/ARM/mul_const.ll
index 93188cd..8c10246 100644
--- a/test/CodeGen/ARM/mul_const.ll
+++ b/test/CodeGen/ARM/mul_const.ll
@@ -1,17 +1,43 @@
; RUN: llc < %s -march=arm | FileCheck %s
-define i32 @t1(i32 %v) nounwind readnone {
+define i32 @t9(i32 %v) nounwind readnone {
entry:
-; CHECK: t1:
+; CHECK: t9:
; CHECK: add r0, r0, r0, lsl #3
%0 = mul i32 %v, 9
ret i32 %0
}
-define i32 @t2(i32 %v) nounwind readnone {
+define i32 @t7(i32 %v) nounwind readnone {
entry:
-; CHECK: t2:
+; CHECK: t7:
; CHECK: rsb r0, r0, r0, lsl #3
%0 = mul i32 %v, 7
ret i32 %0
}
+
+define i32 @t5(i32 %v) nounwind readnone {
+entry:
+; CHECK: t5:
+; CHECK: add r0, r0, r0, lsl #2
+ %0 = mul i32 %v, 5
+ ret i32 %0
+}
+
+define i32 @t3(i32 %v) nounwind readnone {
+entry:
+; CHECK: t3:
+; CHECK: add r0, r0, r0, lsl #1
+ %0 = mul i32 %v, 3
+ ret i32 %0
+}
+
+define i32 @t12288(i32 %v) nounwind readnone {
+entry:
+; CHECK: t12288:
+; CHECK: add r0, r0, r0, lsl #1
+; CHECK: mov r0, r0, lsl #12
+ %0 = mul i32 %v, 12288
+ ret i32 %0
+}
+
diff --git a/test/CodeGen/ARM/reg_sequence.ll b/test/CodeGen/ARM/reg_sequence.ll
new file mode 100644
index 0000000..3ba82cc
--- /dev/null
+++ b/test/CodeGen/ARM/reg_sequence.ll
@@ -0,0 +1,348 @@
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s
+; Implementing vld / vst as REG_SEQUENCE eliminates the extra vmov's.
+
+%struct.int16x8_t = type { <8 x i16> }
+%struct.int32x4_t = type { <4 x i32> }
+%struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
+%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
+%struct.__neon_int16x8x2_t = type { <8 x i16>, <8 x i16> }
+%struct.__neon_int32x4x2_t = type { <4 x i32>, <4 x i32> }
+
+define arm_apcscc void @t1(i16* %i_ptr, i16* %o_ptr, %struct.int32x4_t* nocapture %vT0ptr, %struct.int32x4_t* nocapture %vT1ptr) nounwind {
+entry:
+; CHECK: t1:
+; CHECK: vld1.16
+; CHECK-NOT: vmov d
+; CHECK: vmovl.s16
+; CHECK: vshrn.i32
+; CHECK: vshrn.i32
+; CHECK-NOT: vmov d
+; CHECK-NEXT: vst1.16
+ %0 = getelementptr inbounds %struct.int32x4_t* %vT0ptr, i32 0, i32 0 ; <<4 x i32>*> [#uses=1]
+ %1 = load <4 x i32>* %0, align 16 ; <<4 x i32>> [#uses=1]
+ %2 = getelementptr inbounds %struct.int32x4_t* %vT1ptr, i32 0, i32 0 ; <<4 x i32>*> [#uses=1]
+ %3 = load <4 x i32>* %2, align 16 ; <<4 x i32>> [#uses=1]
+ %4 = bitcast i16* %i_ptr to i8* ; <i8*> [#uses=1]
+ %5 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %4) ; <<8 x i16>> [#uses=1]
+ %6 = bitcast <8 x i16> %5 to <2 x double> ; <<2 x double>> [#uses=2]
+ %7 = extractelement <2 x double> %6, i32 0 ; <double> [#uses=1]
+ %8 = bitcast double %7 to <4 x i16> ; <<4 x i16>> [#uses=1]
+ %9 = tail call <4 x i32> @llvm.arm.neon.vmovls.v4i32(<4 x i16> %8) ; <<4 x i32>> [#uses=1]
+ %10 = extractelement <2 x double> %6, i32 1 ; <double> [#uses=1]
+ %11 = bitcast double %10 to <4 x i16> ; <<4 x i16>> [#uses=1]
+ %12 = tail call <4 x i32> @llvm.arm.neon.vmovls.v4i32(<4 x i16> %11) ; <<4 x i32>> [#uses=1]
+ %13 = mul <4 x i32> %1, %9 ; <<4 x i32>> [#uses=1]
+ %14 = mul <4 x i32> %3, %12 ; <<4 x i32>> [#uses=1]
+ %15 = tail call <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32> %13, <4 x i32> <i32 -12, i32 -12, i32 -12, i32 -12>) ; <<4 x i16>> [#uses=1]
+ %16 = tail call <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32> %14, <4 x i32> <i32 -12, i32 -12, i32 -12, i32 -12>) ; <<4 x i16>> [#uses=1]
+ %17 = shufflevector <4 x i16> %15, <4 x i16> %16, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; <<8 x i16>> [#uses=1]
+ %18 = bitcast i16* %o_ptr to i8* ; <i8*> [#uses=1]
+ tail call void @llvm.arm.neon.vst1.v8i16(i8* %18, <8 x i16> %17)
+ ret void
+}
+
+define arm_apcscc void @t2(i16* %i_ptr, i16* %o_ptr, %struct.int16x8_t* nocapture %vT0ptr, %struct.int16x8_t* nocapture %vT1ptr) nounwind {
+entry:
+; CHECK: t2:
+; CHECK: vld1.16
+; CHECK: vld1.16
+; CHECK-NOT: vmov
+; CHECK: vmul.i16
+; CHECK: vmul.i16
+; CHECK-NOT: vmov
+; CHECK: vst1.16
+; CHECK: vst1.16
+ %0 = getelementptr inbounds %struct.int16x8_t* %vT0ptr, i32 0, i32 0 ; <<8 x i16>*> [#uses=1]
+ %1 = load <8 x i16>* %0, align 16 ; <<8 x i16>> [#uses=1]
+ %2 = getelementptr inbounds %struct.int16x8_t* %vT1ptr, i32 0, i32 0 ; <<8 x i16>*> [#uses=1]
+ %3 = load <8 x i16>* %2, align 16 ; <<8 x i16>> [#uses=1]
+ %4 = bitcast i16* %i_ptr to i8* ; <i8*> [#uses=1]
+ %5 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %4) ; <<8 x i16>> [#uses=1]
+ %6 = getelementptr inbounds i16* %i_ptr, i32 8 ; <i16*> [#uses=1]
+ %7 = bitcast i16* %6 to i8* ; <i8*> [#uses=1]
+ %8 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %7) ; <<8 x i16>> [#uses=1]
+ %9 = mul <8 x i16> %1, %5 ; <<8 x i16>> [#uses=1]
+ %10 = mul <8 x i16> %3, %8 ; <<8 x i16>> [#uses=1]
+ %11 = bitcast i16* %o_ptr to i8* ; <i8*> [#uses=1]
+ tail call void @llvm.arm.neon.vst1.v8i16(i8* %11, <8 x i16> %9)
+ %12 = getelementptr inbounds i16* %o_ptr, i32 8 ; <i16*> [#uses=1]
+ %13 = bitcast i16* %12 to i8* ; <i8*> [#uses=1]
+ tail call void @llvm.arm.neon.vst1.v8i16(i8* %13, <8 x i16> %10)
+ ret void
+}
+
+define <8 x i8> @t3(i8* %A, i8* %B) nounwind {
+; CHECK: t3:
+; CHECK: vld3.8
+; CHECK: vmul.i8
+; CHECK-NOT: vmov
+; CHECK: vst3.8
+ %tmp1 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A) ; <%struct.__neon_int8x8x3_t> [#uses=2]
+ %tmp2 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 0 ; <<8 x i8>> [#uses=1]
+ %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 2 ; <<8 x i8>> [#uses=1]
+ %tmp4 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 1 ; <<8 x i8>> [#uses=1]
+ %tmp5 = sub <8 x i8> %tmp3, %tmp4
+ %tmp6 = add <8 x i8> %tmp2, %tmp3 ; <<8 x i8>> [#uses=1]
+ %tmp7 = mul <8 x i8> %tmp4, %tmp2
+ tail call void @llvm.arm.neon.vst3.v8i8(i8* %B, <8 x i8> %tmp5, <8 x i8> %tmp6, <8 x i8> %tmp7)
+ ret <8 x i8> %tmp4
+}
+
+define arm_apcscc void @t4(i32* %in, i32* %out) nounwind {
+entry:
+; CHECK: t4:
+; CHECK: vld2.32
+; CHECK-NOT: vmov
+; CHECK: vld2.32
+; CHECK-NOT: vmov
+; CHECK: bne
+ %tmp1 = bitcast i32* %in to i8* ; <i8*> [#uses=1]
+ %tmp2 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %tmp1) ; <%struct.__neon_int32x4x2_t> [#uses=2]
+ %tmp3 = getelementptr inbounds i32* %in, i32 8 ; <i32*> [#uses=1]
+ %tmp4 = bitcast i32* %tmp3 to i8* ; <i8*> [#uses=1]
+ %tmp5 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %tmp4) ; <%struct.__neon_int32x4x2_t> [#uses=2]
+ %tmp8 = bitcast i32* %out to i8* ; <i8*> [#uses=1]
+ br i1 undef, label %return1, label %return2
+
+return1:
+; CHECK: %return1
+; CHECK-NOT: vmov
+; CHECK-NEXT: vadd.i32
+; CHECK-NEXT: vadd.i32
+; CHECK-NEXT: vst2.32
+ %tmp52 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0 ; <<4 x i32>> [#uses=1]
+ %tmp57 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 1 ; <<4 x i32>> [#uses=1]
+ %tmp = extractvalue %struct.__neon_int32x4x2_t %tmp5, 0 ; <<4 x i32>> [#uses=1]
+ %tmp39 = extractvalue %struct.__neon_int32x4x2_t %tmp5, 1 ; <<4 x i32>> [#uses=1]
+ %tmp6 = add <4 x i32> %tmp52, %tmp ; <<4 x i32>> [#uses=1]
+ %tmp7 = add <4 x i32> %tmp57, %tmp39 ; <<4 x i32>> [#uses=1]
+ tail call void @llvm.arm.neon.vst2.v4i32(i8* %tmp8, <4 x i32> %tmp6, <4 x i32> %tmp7)
+ ret void
+
+return2:
+; CHECK: %return2
+; CHECK: vadd.i32
+; CHECK: vmov q1, q3
+; CHECK-NOT: vmov
+; CHECK: vst2.32 {d0, d1, d2, d3}
+ %tmp100 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0 ; <<4 x i32>> [#uses=1]
+ %tmp101 = extractvalue %struct.__neon_int32x4x2_t %tmp5, 1 ; <<4 x i32>> [#uses=1]
+ %tmp102 = add <4 x i32> %tmp100, %tmp101 ; <<4 x i32>> [#uses=1]
+ tail call void @llvm.arm.neon.vst2.v4i32(i8* %tmp8, <4 x i32> %tmp102, <4 x i32> %tmp101)
+ call void @llvm.trap()
+ unreachable
+}
+
+define <8 x i16> @t5(i16* %A, <8 x i16>* %B) nounwind {
+; CHECK: t5:
+; CHECK: vldmia
+; CHECK: vmov q1, q0
+; CHECK-NOT: vmov
+; CHECK: vld2.16 {d0[1], d2[1]}, [r0]
+; CHECK-NOT: vmov
+; CHECK: vadd.i16
+ %tmp0 = bitcast i16* %A to i8* ; <i8*> [#uses=1]
+ %tmp1 = load <8 x i16>* %B ; <<8 x i16>> [#uses=2]
+ %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1) ; <%struct.__neon_int16x8x2_t> [#uses=2]
+ %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 0 ; <<8 x i16>> [#uses=1]
+ %tmp4 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 1 ; <<8 x i16>> [#uses=1]
+ %tmp5 = add <8 x i16> %tmp3, %tmp4 ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %tmp5
+}
+
+define <8 x i8> @t6(i8* %A, <8 x i8>* %B) nounwind {
+; CHECK: t6:
+; CHECK: vldr.64
+; CHECK: vmov d1, d0
+; CHECK-NEXT: vld2.8 {d0[1], d1[1]}
+ %tmp1 = load <8 x i8>* %B ; <<8 x i8>> [#uses=2]
+ %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1) ; <%struct.__neon_int8x8x2_t> [#uses=2]
+ %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0 ; <<8 x i8>> [#uses=1]
+ %tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1 ; <<8 x i8>> [#uses=1]
+ %tmp5 = add <8 x i8> %tmp3, %tmp4 ; <<8 x i8>> [#uses=1]
+ ret <8 x i8> %tmp5
+}
+
+define arm_apcscc void @t7(i32* %iptr, i32* %optr) nounwind {
+entry:
+; CHECK: t7:
+; CHECK: vld2.32
+; CHECK: vst2.32
+; CHECK: vld1.32 {d0, d1},
+; CHECK: vmov q1, q0
+; CHECK-NOT: vmov
+; CHECK: vuzp.32 q0, q1
+; CHECK: vst1.32
+ %0 = bitcast i32* %iptr to i8* ; <i8*> [#uses=2]
+ %1 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %0) ; <%struct.__neon_int32x4x2_t> [#uses=2]
+ %tmp57 = extractvalue %struct.__neon_int32x4x2_t %1, 0 ; <<4 x i32>> [#uses=1]
+ %tmp60 = extractvalue %struct.__neon_int32x4x2_t %1, 1 ; <<4 x i32>> [#uses=1]
+ %2 = bitcast i32* %optr to i8* ; <i8*> [#uses=2]
+ tail call void @llvm.arm.neon.vst2.v4i32(i8* %2, <4 x i32> %tmp57, <4 x i32> %tmp60)
+ %3 = tail call <4 x i32> @llvm.arm.neon.vld1.v4i32(i8* %0) ; <<4 x i32>> [#uses=1]
+ %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2> ; <<4 x i32>> [#uses=1]
+ tail call void @llvm.arm.neon.vst1.v4i32(i8* %2, <4 x i32> %4)
+ ret void
+}
+
+; PR7156
+define arm_aapcs_vfpcc i32 @t8() nounwind {
+; CHECK: t8:
+; CHECK: vrsqrte.f32 q0, q0
+bb.nph55.bb.nph55.split_crit_edge:
+ br label %bb3
+
+bb3: ; preds = %bb3, %bb.nph55.bb.nph55.split_crit_edge
+ br i1 undef, label %bb5, label %bb3
+
+bb5: ; preds = %bb3
+ br label %bb.i25
+
+bb.i25: ; preds = %bb.i25, %bb5
+ %0 = shufflevector <2 x float> undef, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; <<4 x float>> [#uses=1]
+ %1 = call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %0) nounwind ; <<4 x float>> [#uses=1]
+ %2 = fmul <4 x float> %1, undef ; <<4 x float>> [#uses=1]
+ %3 = fmul <4 x float> undef, %2 ; <<4 x float>> [#uses=1]
+ %tmp26.i = bitcast <4 x float> %3 to <2 x double> ; <<2 x double>> [#uses=1]
+ %4 = extractelement <2 x double> %tmp26.i, i32 0 ; <double> [#uses=1]
+ %5 = bitcast double %4 to <2 x float> ; <<2 x float>> [#uses=1]
+ %6 = extractelement <2 x float> %5, i32 1 ; <float> [#uses=1]
+ store float %6, float* undef, align 4
+ br i1 undef, label %bb6, label %bb.i25
+
+bb6: ; preds = %bb.i25
+ br i1 undef, label %bb7, label %bb14
+
+bb7: ; preds = %bb6
+ br label %bb.i49
+
+bb.i49: ; preds = %bb.i49, %bb7
+ br i1 undef, label %bb.i19, label %bb.i49
+
+bb.i19: ; preds = %bb.i19, %bb.i49
+ br i1 undef, label %exit, label %bb.i19
+
+exit: ; preds = %bb.i19
+ unreachable
+
+bb14: ; preds = %bb6
+ ret i32 0
+}
+
+%0 = type { %1, %1, %1, %1 }
+%1 = type { %2 }
+%2 = type { <4 x float> }
+%3 = type { %0, %1 }
+
+; PR7157
+define arm_aapcs_vfpcc float @t9(%0* nocapture, %3* nocapture) nounwind {
+; CHECK: t9:
+; CHECK: vldr.64
+; CHECK: vmov.i8 d1
+; CHECK-NEXT: vstmia r0, {d2,d3}
+; CHECK-NEXT: vstmia r0, {d0,d1}
+ %3 = bitcast double 0.000000e+00 to <2 x float> ; <<2 x float>> [#uses=2]
+ %4 = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; <<4 x float>> [#uses=1]
+ store <4 x float> %4, <4 x float>* undef, align 16
+ %5 = shufflevector <2 x float> %3, <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; <<4 x float>> [#uses=1]
+ store <4 x float> %5, <4 x float>* undef, align 16
+ br label %8
+
+; <label>:6 ; preds = %8
+ br i1 undef, label %7, label %10
+
+; <label>:7 ; preds = %6
+ br label %8
+
+; <label>:8 ; preds = %7, %2
+ br i1 undef, label %6, label %9
+
+; <label>:9 ; preds = %8
+ ret float undef
+
+; <label>:10 ; preds = %6
+ ret float 9.990000e+02
+}
+
+; PR7162
+define arm_aapcs_vfpcc i32 @t10() nounwind {
+entry:
+; CHECK: t10:
+; CHECK: vmov.i32 q1, #0x3F000000
+; CHECK: vdup.32 q0, d0[0]
+; CHECK: vmov d0, d1
+; CHECK: vmla.f32 q0, q0, d0[0]
+ %0 = shufflevector <4 x float> zeroinitializer, <4 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
+ %1 = insertelement <4 x float> %0, float undef, i32 1 ; <<4 x float>> [#uses=1]
+ %2 = insertelement <4 x float> %1, float undef, i32 2 ; <<4 x float>> [#uses=1]
+ %3 = insertelement <4 x float> %2, float undef, i32 3 ; <<4 x float>> [#uses=1]
+ %tmp54.i = bitcast <4 x float> %3 to <2 x double> ; <<2 x double>> [#uses=1]
+ %4 = extractelement <2 x double> %tmp54.i, i32 1 ; <double> [#uses=1]
+ %5 = bitcast double %4 to <2 x float> ; <<2 x float>> [#uses=1]
+ %6 = shufflevector <2 x float> %5, <2 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
+ %7 = fmul <4 x float> undef, %6 ; <<4 x float>> [#uses=1]
+ %8 = fadd <4 x float> %7, undef ; <<4 x float>> [#uses=1]
+ %9 = fadd <4 x float> %8, undef ; <<4 x float>> [#uses=1]
+ %10 = shufflevector <4 x float> undef, <4 x float> %9, <4 x i32> <i32 0, i32 1, i32 2, i32 7> ; <<4 x float>> [#uses=1]
+ %11 = fmul <4 x float> %10, <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> ; <<4 x float>> [#uses=1]
+ %12 = shufflevector <4 x float> %11, <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef> ; <<4 x float>> [#uses=1]
+ %13 = shufflevector <4 x float> %12, <4 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
+ %14 = fmul <4 x float> %13, undef ; <<4 x float>> [#uses=1]
+ %15 = fadd <4 x float> undef, %14 ; <<4 x float>> [#uses=1]
+ %16 = shufflevector <4 x float> undef, <4 x float> %15, <4 x i32> <i32 0, i32 1, i32 6, i32 3> ; <<4 x float>> [#uses=1]
+ %17 = fmul <4 x float> %16, undef ; <<4 x float>> [#uses=1]
+ %18 = extractelement <4 x float> %17, i32 2 ; <float> [#uses=1]
+ store float %18, float* undef, align 4
+ br i1 undef, label %exit, label %bb14
+
+exit: ; preds = %bb.i19
+ unreachable
+
+bb14: ; preds = %bb6
+ ret i32 0
+}
+
+; This test crashes the coalescer because live variables were not updated properly.
+define <8 x i8> @t11(i8* %A1, i8* %A2, i8* %A3, i8* %A4, i8* %A5, i8* %A6, i8* %A7, i8* %A8, i8* %B) nounwind {
+ %tmp1d = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A4) ; <%struct.__neon_int8x8x3_t> [#uses=1]
+ %tmp2d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 0 ; <<8 x i8>> [#uses=1]
+ %tmp1f = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A6) ; <%struct.__neon_int8x8x3_t> [#uses=1]
+ %tmp2f = extractvalue %struct.__neon_int8x8x3_t %tmp1f, 0 ; <<8 x i8>> [#uses=1]
+ %tmp2bd = add <8 x i8> zeroinitializer, %tmp2d ; <<8 x i8>> [#uses=1]
+ %tmp2abcd = mul <8 x i8> zeroinitializer, %tmp2bd ; <<8 x i8>> [#uses=1]
+ %tmp2ef = sub <8 x i8> zeroinitializer, %tmp2f ; <<8 x i8>> [#uses=1]
+ %tmp2efgh = mul <8 x i8> %tmp2ef, undef ; <<8 x i8>> [#uses=2]
+ call void @llvm.arm.neon.vst3.v8i8(i8* %A2, <8 x i8> undef, <8 x i8> undef, <8 x i8> %tmp2efgh)
+ %tmp2 = sub <8 x i8> %tmp2efgh, %tmp2abcd ; <<8 x i8>> [#uses=1]
+ %tmp7 = mul <8 x i8> undef, %tmp2 ; <<8 x i8>> [#uses=1]
+ tail call void @llvm.arm.neon.vst3.v8i8(i8* %B, <8 x i8> undef, <8 x i8> undef, <8 x i8> %tmp7)
+ ret <8 x i8> undef
+}
+
+declare <4 x i32> @llvm.arm.neon.vld1.v4i32(i8*) nounwind readonly
+
+declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*) nounwind readonly
+
+declare <4 x i32> @llvm.arm.neon.vmovls.v4i32(<4 x i16>) nounwind readnone
+
+declare <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare void @llvm.arm.neon.vst1.v4i32(i8*, <4 x i32>) nounwind
+
+declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>) nounwind
+
+declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>) nounwind
+
+declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8*) nounwind readonly
+
+declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8*) nounwind readonly
+
+declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind readonly
+
+declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32) nounwind readonly
+
+declare void @llvm.arm.neon.vst2.v4i32(i8*, <4 x i32>, <4 x i32>) nounwind
+
+declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone
+
+declare void @llvm.trap() nounwind
diff --git a/test/CodeGen/ARM/spill-q.ll b/test/CodeGen/ARM/spill-q.ll
index 5ad7ecc..03de0c8 100644
--- a/test/CodeGen/ARM/spill-q.ll
+++ b/test/CodeGen/ARM/spill-q.ll
@@ -46,7 +46,8 @@ bb4: ; preds = %bb193, %entry
%20 = shufflevector <2 x float> %19, <2 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
%21 = fadd <4 x float> zeroinitializer, %20 ; <<4 x float>> [#uses=2]
%22 = fcmp ogt <4 x float> %besterror.0.2264, %21 ; <<4 x i1>> [#uses=0]
- br i1 undef, label %bb193, label %bb186
+ %tmp = extractelement <4 x i1> %22, i32 0
+ br i1 %tmp, label %bb193, label %bb186
bb186: ; preds = %bb4
br label %bb193
diff --git a/test/CodeGen/ARM/trap.ll b/test/CodeGen/ARM/trap.ll
new file mode 100644
index 0000000..763dff3
--- /dev/null
+++ b/test/CodeGen/ARM/trap.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -march=arm | FileCheck %s
+; rdar://7961298
+
+define arm_apcscc void @t() nounwind {
+entry:
+; CHECK: t:
+; CHECK: trap
+ call void @llvm.trap()
+ unreachable
+}
+
+declare void @llvm.trap() nounwind
diff --git a/test/CodeGen/ARM/vcgt.ll b/test/CodeGen/ARM/vcgt.ll
index 6b11ba5..194093c 100644
--- a/test/CodeGen/ARM/vcgt.ll
+++ b/test/CodeGen/ARM/vcgt.ll
@@ -158,5 +158,18 @@ define <4 x i32> @vacgtQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
ret <4 x i32> %tmp3
}
+; rdar://7923010
+define <4 x i32> @vcgt_zext(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK: vcgt_zext:
+;CHECK: vcgt.f32 q0
+;CHECK: vmov.i32 q1, #0x1
+;CHECK: vand q0, q0, q1
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = fcmp ogt <4 x float> %tmp1, %tmp2
+ %tmp4 = zext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
declare <2 x i32> @llvm.arm.neon.vacgtd(<2 x float>, <2 x float>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vacgtq(<4 x float>, <4 x float>) nounwind readnone