67 files changed, 1717 insertions, 106 deletions
diff --git a/test/CodeGen/ARM/2007-03-13-InstrSched.ll b/test/CodeGen/ARM/2007-03-13-InstrSched.ll
index 33f935e..a63cdd4 100644
--- a/test/CodeGen/ARM/2007-03-13-InstrSched.ll
+++ b/test/CodeGen/ARM/2007-03-13-InstrSched.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -mtriple=arm-apple-darwin -relocation-model=pic \
 ; RUN:   -mattr=+v6 | grep r9
 ; RUN: llc < %s -mtriple=arm-apple-darwin -relocation-model=pic \
-; RUN:   -mattr=+v6 -arm-reserve-r9 -ifcvt-limit=0 -stats |& grep asm-printer
+; RUN:   -mattr=+v6 -arm-reserve-r9 -ifcvt-limit=0 -stats 2>&1 | grep asm-printer
 ; | grep 35
 
 define void @test(i32 %tmp56222, i32 %tmp36224, i32 %tmp46223, i32 %i.0196.0.ph, i32 %tmp8, i32* %tmp1011, i32** %tmp1, i32* %d2.1.out, i32* %d3.1.out, i32* %d0.1.out, i32* %d1.1.out) {
diff --git a/test/CodeGen/ARM/2007-04-03-PEIBug.ll b/test/CodeGen/ARM/2007-04-03-PEIBug.ll
index b543c57..8d3337c 100644
--- a/test/CodeGen/ARM/2007-04-03-PEIBug.ll
+++ b/test/CodeGen/ARM/2007-04-03-PEIBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm | not grep {add.*#0}
+; RUN: llc < %s -march=arm | not grep "add.*#0"
 
 define i32 @foo() {
 entry:
diff --git a/test/CodeGen/ARM/2007-05-23-BadPreIndexedStore.ll b/test/CodeGen/ARM/2007-05-23-BadPreIndexedStore.ll
index d2eb85d..670048b 100644
--- a/test/CodeGen/ARM/2007-05-23-BadPreIndexedStore.ll
+++ b/test/CodeGen/ARM/2007-05-23-BadPreIndexedStore.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm | not grep {str.*\\!}
+; RUN: llc < %s -march=arm | not grep "str.*\!"
 
 	%struct.shape_edge_t = type { %struct.shape_edge_t*, %struct.shape_edge_t*, i32, i32, i32, i32 }
 	%struct.shape_path_t = type { %struct.shape_edge_t*, %struct.shape_edge_t*, i32, i32, i32, i32, i32, i32 }
diff --git a/test/CodeGen/ARM/2008-02-04-LocalRegAllocBug.ll b/test/CodeGen/ARM/2008-02-04-LocalRegAllocBug.ll
index fd2f462..3754db0 100644
--- a/test/CodeGen/ARM/2008-02-04-LocalRegAllocBug.ll
+++ b/test/CodeGen/ARM/2008-02-04-LocalRegAllocBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=arm-linux-gnueabi -regalloc=fast
+; RUN: llc < %s -mtriple=arm-linux-gnueabi -regalloc=fast -optimize-regalloc=0
 ; PR1925
 
 	%struct.encode_aux_nearestmatch = type { i32*, i32*, i32*, i32*, i32, i32 }
diff --git a/test/CodeGen/ARM/2008-02-29-RegAllocLocal.ll b/test/CodeGen/ARM/2008-02-29-RegAllocLocal.ll
index 44da8e7..5fbed0d 100644
--- a/test/CodeGen/ARM/2008-02-29-RegAllocLocal.ll
+++ b/test/CodeGen/ARM/2008-02-29-RegAllocLocal.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=arm-apple-darwin -regalloc=fast
+; RUN: llc < %s -mtriple=arm-apple-darwin -regalloc=fast -optimize-regalloc=0
 ; PR1925
 
 	%"struct.kc::impl_Ccode_option" = type { %"struct.kc::impl_abstract_phylum" }
diff --git a/test/CodeGen/ARM/2009-04-06-AsmModifier.ll b/test/CodeGen/ARM/2009-04-06-AsmModifier.ll
index 3526722..7342f69 100644
--- a/test/CodeGen/ARM/2009-04-06-AsmModifier.ll
+++ b/test/CodeGen/ARM/2009-04-06-AsmModifier.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm | grep {swi 107}
+; RUN: llc < %s -march=arm | grep "swi 107"
 
 define i32 @_swilseek(i32) nounwind {
 entry:
diff --git a/test/CodeGen/ARM/2010-05-17-FastAllocCrash.ll b/test/CodeGen/ARM/2010-05-17-FastAllocCrash.ll
index 813bf3c..7d4cc6e 100644
--- a/test/CodeGen/ARM/2010-05-17-FastAllocCrash.ll
+++ b/test/CodeGen/ARM/2010-05-17-FastAllocCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -regalloc=fast -verify-machineinstrs
+; RUN: llc < %s -regalloc=fast -optimize-regalloc=0 -verify-machineinstrs
 target triple = "arm-pc-linux-gnu"
 
 ; This test case would accidentally use the same physreg for two virtregs
diff --git a/test/CodeGen/ARM/2011-12-14-machine-sink.ll b/test/CodeGen/ARM/2011-12-14-machine-sink.ll
index 5ce600d..b21bb00 100644
--- a/test/CodeGen/ARM/2011-12-14-machine-sink.ll
+++ b/test/CodeGen/ARM/2011-12-14-machine-sink.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -o /dev/null -stats |& FileCheck %s -check-prefix=STATS
+; RUN: llc < %s -o /dev/null -stats 2>&1 | FileCheck %s -check-prefix=STATS
 ; Radar 10266272
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
 target triple = "thumbv7-apple-ios4.0.0"
diff --git a/test/CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll b/test/CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll
index 872eca3..f1c85f1 100644
--- a/test/CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll
+++ b/test/CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll
@@ -60,8 +60,16 @@ for.end:                                          ; preds = %entry
   ret void
 }
 
+; Check that pseudo-expansion preserves <undef> flags.
+define void @foo3(i8* %p) nounwind ssp {
+entry:
+  tail call void @llvm.arm.neon.vst2.v4f32(i8* %p, <4 x float> undef, <4 x float> undef, i32 4)
+  ret void
+}
+
 declare arm_aapcs_vfpcc void @bar(i8*, float, float, float)
 declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind
+declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind
 
 !0 = metadata !{metadata !"omnipotent char", metadata !1}
 !1 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/test/CodeGen/ARM/2012-04-24-SplitEHCriticalEdge.ll b/test/CodeGen/ARM/2012-04-24-SplitEHCriticalEdge.ll
new file mode 100644
index 0000000..b3a7e34
--- /dev/null
+++ b/test/CodeGen/ARM/2012-04-24-SplitEHCriticalEdge.ll
@@ -0,0 +1,71 @@
+; RUN: llc -mtriple=thumbv7-apple-ios -relocation-model=pic -disable-fp-elim -mcpu=cortex-a8 < %s
+
+; CodeGen SplitCriticalEdge() shouldn't try to break edge to a landing pad.
+; rdar://11300144
+
+%0 = type opaque
+%class.FunctionInterpreter.3.15.31 = type { %class.Parser.1.13.29, %class.Parser.1.13.29*, %struct.ParserVariable.2.14.30*, i32 }
+%class.Parser.1.13.29 = type { i32 (...)**, %class.Parser.1.13.29* }
+%struct.ParserVariable.2.14.30 = type opaque
+%struct.ParseErrorMsg.0.12.28 = type { i32, i32, i32 }
+
+@_ZTI13ParseErrorMsg = external hidden unnamed_addr constant { i8*, i8* }
+@"OBJC_IVAR_$_MUMathExpressionDoubleBased.mInterpreter" = external hidden global i32, section "__DATA, __objc_ivar", align 4
+@"\01L_OBJC_SELECTOR_REFERENCES_14" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
+
+declare i8* @objc_msgSend(i8*, i8*, ...)
+
+declare i32 @llvm.eh.typeid.for(i8*) nounwind readnone
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+
+declare void @__cxa_end_catch()
+
+declare void @_ZSt9terminatev()
+
+define hidden double @t(%0* %self, i8* nocapture %_cmd) optsize ssp {
+entry:
+  %call = invoke double undef(%class.FunctionInterpreter.3.15.31* undef) optsize
+          to label %try.cont unwind label %lpad
+
+lpad:                                             ; preds = %entry
+  %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          catch i8* bitcast ({ i8*, i8* }* @_ZTI13ParseErrorMsg to i8*)
+  br i1 undef, label %catch, label %eh.resume
+
+catch:                                            ; preds = %lpad
+  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, %struct.ParseErrorMsg.0.12.28*)*)(i8* undef, i8* undef, %struct.ParseErrorMsg.0.12.28* undef) optsize
+          to label %invoke.cont2 unwind label %lpad1
+
+invoke.cont2:                                     ; preds = %catch
+  br label %try.cont
+
+try.cont:                                         ; preds = %invoke.cont2, %entry
+  %value.0 = phi double [ 0x7FF8000000000000, %invoke.cont2 ], [ %call, %entry ]
+  ret double %value.0
+
+lpad1:                                            ; preds = %catch
+  %1 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  invoke void @__cxa_end_catch()
+          to label %eh.resume unwind label %terminate.lpad
+
+eh.resume:                                        ; preds = %lpad1, %lpad
+  resume { i8*, i32 } undef
+
+terminate.lpad:                                   ; preds = %lpad1
+  %2 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          catch i8* null
+  unreachable
+}
+
+declare i32 @__gxx_personality_sj0(...)
+
+!llvm.module.flags = !{!0, !1, !2, !3}
+
+!0 = metadata !{i32 1, metadata !"Objective-C Version", i32 2}
+!1 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0}
+!2 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
+!3 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0}
diff --git a/test/CodeGen/ARM/2012-05-29-TailDupBug.ll b/test/CodeGen/ARM/2012-05-29-TailDupBug.ll
new file mode 100644
index 0000000..1a57f04
--- /dev/null
+++ b/test/CodeGen/ARM/2012-05-29-TailDupBug.ll
@@ -0,0 +1,140 @@
+; RUN: llc -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -verify-machineinstrs < %s
+
+; Teach taildup to update livein set to appease verifier.
+; rdar://11538365
+
+%struct.__CFString.2 = type opaque
+
+declare void @CFRelease(i8*)
+
+define hidden fastcc i32 @t() ssp {
+entry:
+  %mylocale.i.i = alloca [256 x i8], align 1
+  br i1 undef, label %return, label %CFStringIsHyphenationAvailableForLocale.exit
+
+CFStringIsHyphenationAvailableForLocale.exit:     ; preds = %entry
+  br i1 undef, label %return, label %if.end
+
+if.end:                                           ; preds = %CFStringIsHyphenationAvailableForLocale.exit
+  br i1 undef, label %if.end8.thread.i, label %if.then.i
+
+if.then.i:                                        ; preds = %if.end
+  br i1 undef, label %if.end8.thread.i, label %if.end8.i
+
+if.end8.thread.i:                                 ; preds = %if.then.i, %if.end
+  unreachable
+
+if.end8.i:                                        ; preds = %if.then.i
+  br i1 undef, label %if.then11.i, label %__CFHyphenationPullTokenizer.exit
+
+if.then11.i:                                      ; preds = %if.end8.i
+  unreachable
+
+__CFHyphenationPullTokenizer.exit:                ; preds = %if.end8.i
+  br i1 undef, label %if.end68, label %if.then3
+
+if.then3:                                         ; preds = %__CFHyphenationPullTokenizer.exit
+  br i1 undef, label %cond.end, label %cond.false
+
+cond.false:                                       ; preds = %if.then3
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %if.then3
+  br i1 undef, label %while.end, label %while.body
+
+while.body:                                       ; preds = %cond.end
+  unreachable
+
+while.end:                                        ; preds = %cond.end
+  br i1 undef, label %if.end5.i, label %if.then.i16
+
+if.then.i16:                                      ; preds = %while.end
+  br i1 undef, label %if.then4.i, label %if.end5.i
+
+if.then4.i:                                       ; preds = %if.then.i16
+  br i1 false, label %cleanup.thread, label %if.end.i20
+
+if.end5.i:                                        ; preds = %if.then.i16, %while.end
+  unreachable
+
+if.end.i20:                                       ; preds = %if.then4.i
+  br label %for.body.i146.i
+
+for.body.i146.i:                                  ; preds = %for.body.i146.i, %if.end.i20
+  br i1 undef, label %if.end20.i, label %for.body.i146.i
+
+if.end20.i:                                       ; preds = %for.body.i146.i
+  br i1 undef, label %cleanup.thread, label %if.end23.i
+
+if.end23.i:                                       ; preds = %if.end20.i
+  br label %for.body.i94.i
+
+for.body.i94.i:                                   ; preds = %for.body.i94.i, %if.end23.i
+  br i1 undef, label %if.then28.i, label %for.body.i94.i
+
+if.then28.i:                                      ; preds = %for.body.i94.i
+  br i1 undef, label %cond.true.i26, label %land.lhs.true
+
+cond.true.i26:                                    ; preds = %if.then28.i
+  br label %land.lhs.true
+
+land.lhs.true:                                    ; preds = %cond.true.i26, %if.then28.i
+  br i1 false, label %cleanup.thread, label %if.end35
+
+if.end35:                                         ; preds = %land.lhs.true
+  br i1 undef, label %cleanup.thread, label %if.end45
+
+if.end45:                                         ; preds = %if.end35
+  br i1 undef, label %if.then50, label %if.end.i37
+
+if.end.i37:                                       ; preds = %if.end45
+  br label %if.then50
+
+if.then50:                                        ; preds = %if.end.i37, %if.end45
+  br i1 undef, label %__CFHyphenationGetHyphensForString.exit, label %if.end.i
+
+if.end.i:                                         ; preds = %if.then50
+  br i1 undef, label %cleanup.i, label %cond.true.i
+
+cond.true.i:                                      ; preds = %if.end.i
+  br i1 undef, label %for.cond16.preheader.i, label %for.cond57.preheader.i
+
+for.cond16.preheader.i:                           ; preds = %cond.true.i
+  %cmp1791.i = icmp sgt i32 undef, 1
+  br i1 %cmp1791.i, label %for.body18.i, label %for.cond57.preheader.i
+
+for.cond57.preheader.i:                           ; preds = %for.cond16.preheader.i, %cond.true.i
+  %sub69.i = add i32 undef, -2
+  br label %cleanup.i
+
+for.body18.i:                                     ; preds = %for.cond16.preheader.i
+  store i16 0, i16* undef, align 2
+  br label %while.body.i
+
+while.body.i:                                     ; preds = %while.body.i, %for.body18.i
+  br label %while.body.i
+
+cleanup.i:                                        ; preds = %for.cond57.preheader.i, %if.end.i
+  br label %__CFHyphenationGetHyphensForString.exit
+
+__CFHyphenationGetHyphensForString.exit:          ; preds = %cleanup.i, %if.then50
+  %retval.1.i = phi i32 [ 0, %cleanup.i ], [ -1, %if.then50 ]
+  %phitmp = bitcast %struct.__CFString.2* null to i8*
+  br label %if.end68
+
+cleanup.thread:                                   ; preds = %if.end35, %land.lhs.true, %if.end20.i, %if.then4.i
+  call void @llvm.stackrestore(i8* null)
+  br label %return
+
+if.end68:                                         ; preds = %__CFHyphenationGetHyphensForString.exit, %__CFHyphenationPullTokenizer.exit
+  %hyphenCount.2 = phi i32 [ %retval.1.i, %__CFHyphenationGetHyphensForString.exit ], [ 0, %__CFHyphenationPullTokenizer.exit ]
+  %_token.1 = phi i8* [ %phitmp, %__CFHyphenationGetHyphensForString.exit ], [ undef, %__CFHyphenationPullTokenizer.exit ]
+  call void @CFRelease(i8* %_token.1)
+  br label %return
+
+return:                                           ; preds = %if.end68, %cleanup.thread, %CFStringIsHyphenationAvailableForLocale.exit, %entry
+  %retval.1 = phi i32 [ %hyphenCount.2, %if.end68 ], [ -1, %CFStringIsHyphenationAvailableForLocale.exit ], [ -1, %cleanup.thread ], [ -1, %entry ]
+  ret i32 %retval.1
+}
+
+declare void @llvm.stackrestore(i8*) nounwind
diff --git a/test/CodeGen/ARM/2012-06-12-SchedMemLatency.ll b/test/CodeGen/ARM/2012-06-12-SchedMemLatency.ll
new file mode 100644
index 0000000..b05ec63
--- /dev/null
+++ b/test/CodeGen/ARM/2012-06-12-SchedMemLatency.ll
@@ -0,0 +1,41 @@
+; RUN: llc < %s -o /dev/null "-mtriple=thumbv7-apple-ios" -debug-only=post-RA-sched 2> %t
+; RUN: FileCheck %s < %t
+; REQUIRES: asserts
+; Make sure that mayalias store-load dependencies have one cycle
+; latency regardless of whether they are barriers or not.
+
+; CHECK: ** List Scheduling
+; CHECK: SU(2){{.*}}STR{{.*}}Volatile
+; CHECK-NOT: ch SU
+; CHECK: ch SU(3): Latency=1
+; CHECK-NOT: ch SU
+; CHECK: SU(3){{.*}}LDR{{.*}}Volatile
+; CHECK-NOT: ch SU
+; CHECK: ch SU(2): Latency=1
+; CHECK-NOT: ch SU
+; CHECK: ** List Scheduling
+; CHECK: SU(2){{.*}}STR{{.*}}
+; CHECK-NOT: ch SU
+; CHECK: ch SU(3): Latency=1
+; CHECK-NOT: ch SU
+; CHECK: SU(3){{.*}}LDR{{.*}}
+; CHECK-NOT: ch SU
+; CHECK: ch SU(2): Latency=1
+; CHECK-NOT: ch SU
+define i32 @f1(i32* nocapture %p1, i32* nocapture %p2) nounwind {
+entry:
+  store volatile i32 65540, i32* %p1, align 4, !tbaa !0
+  %0 = load volatile i32* %p2, align 4, !tbaa !0
+  ret i32 %0
+}
+
+define i32 @f2(i32* nocapture %p1, i32* nocapture %p2) nounwind {
+entry:
+  store i32 65540, i32* %p1, align 4, !tbaa !0
+  %0 = load i32* %p2, align 4, !tbaa !0
+  ret i32 %0
+}
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/ARM/2012-08-04-DtripleSpillReload.ll b/test/CodeGen/ARM/2012-08-04-DtripleSpillReload.ll
new file mode 100644
index 0000000..e4ad45b
--- /dev/null
+++ b/test/CodeGen/ARM/2012-08-04-DtripleSpillReload.ll
@@ -0,0 +1,174 @@
+; RUN: llc < %s
+; PR13377
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
+target triple = "armv7-none-linux-gnueabi"
+
+%0 = type { <4 x float> }
+
+define arm_aapcs_vfpcc void @foo(float, i1 zeroext, i1 zeroext) nounwind uwtable {
+  br i1 undef, label %4, label %5
+
+; <label>:4                                       ; preds = %3
+  unreachable
+
+; <label>:5                                       ; preds = %3
+  br i1 undef, label %7, label %6
+
+; <label>:6                                       ; preds = %5
+  unreachable
+
+; <label>:7                                       ; preds = %5
+  br i1 undef, label %8, label %10
+
+; <label>:8                                       ; preds = %7
+  br i1 undef, label %9, label %10
+
+; <label>:9                                       ; preds = %8
+  br i1 undef, label %11, label %10
+
+; <label>:10                                      ; preds = %9, %8, %7
+  unreachable
+
+; <label>:11                                      ; preds = %9
+  br i1 undef, label %13, label %12
+
+; <label>:12                                      ; preds = %11
+  unreachable
+
+; <label>:13                                      ; preds = %11
+  br i1 undef, label %15, label %14
+
+; <label>:14                                      ; preds = %13
+  unreachable
+
+; <label>:15                                      ; preds = %13
+  br i1 undef, label %18, label %16
+
+; <label>:16                                      ; preds = %15
+  br i1 undef, label %17, label %18
+
+; <label>:17                                      ; preds = %16
+  unreachable
+
+; <label>:18                                      ; preds = %16, %15
+  br i1 undef, label %68, label %19
+
+; <label>:19                                      ; preds = %18
+  br label %20
+
+; <label>:20                                      ; preds = %20, %19
+  br i1 undef, label %21, label %20
+
+; <label>:21                                      ; preds = %20
+  br i1 undef, label %22, label %68
+
+; <label>:22                                      ; preds = %21
+  br i1 undef, label %23, label %24
+
+; <label>:23                                      ; preds = %22
+  unreachable
+
+; <label>:24                                      ; preds = %22
+  br i1 undef, label %26, label %25
+
+; <label>:25                                      ; preds = %24
+  unreachable
+
+; <label>:26                                      ; preds = %24
+  br i1 undef, label %28, label %27
+
+; <label>:27                                      ; preds = %26
+  unreachable
+
+; <label>:28                                      ; preds = %26
+  br i1 undef, label %29, label %30, !prof !0
+
+; <label>:29                                      ; preds = %28
+  br label %30
+
+; <label>:30                                      ; preds = %29, %28
+  br i1 undef, label %31, label %32, !prof !0
+
+; <label>:31                                      ; preds = %30
+  br label %32
+
+; <label>:32                                      ; preds = %31, %30
+  br i1 undef, label %34, label %33
+
+; <label>:33                                      ; preds = %32
+  unreachable
+
+; <label>:34                                      ; preds = %32
+  br i1 undef, label %35, label %36, !prof !0
+
+; <label>:35                                      ; preds = %34
+  br label %36
+
+; <label>:36                                      ; preds = %35, %34
+  br i1 undef, label %37, label %38, !prof !0
+
+; <label>:37                                      ; preds = %36
+  br label %38
+
+; <label>:38                                      ; preds = %37, %36
+  br i1 undef, label %39, label %67
+
+; <label>:39                                      ; preds = %38
+  br i1 undef, label %40, label %41
+
+; <label>:40                                      ; preds = %39
+  br i1 undef, label %64, label %41
+
+; <label>:41                                      ; preds = %40, %39
+  br i1 undef, label %64, label %42
+
+; <label>:42                                      ; preds = %41
+  %43 = fadd <4 x float> undef, undef
+  %44 = fadd <4 x float> undef, undef
+  %45 = fmul <4 x float> undef, undef
+  %46 = fmul <4 x float> %45, %43
+  %47 = fmul <4 x float> undef, %44
+  %48 = load <4 x float>* undef, align 8, !tbaa !1
+  %49 = bitcast <4 x float> %48 to <2 x i64>
+  %50 = shufflevector <2 x i64> %49, <2 x i64> undef, <1 x i32> <i32 1>
+  %51 = bitcast <1 x i64> %50 to <2 x float>
+  %52 = shufflevector <2 x float> %51, <2 x float> undef, <4 x i32> zeroinitializer
+  %53 = bitcast <4 x float> %52 to <2 x i64>
+  %54 = shufflevector <2 x i64> %53, <2 x i64> undef, <1 x i32> zeroinitializer
+  %55 = bitcast <1 x i64> %54 to <2 x float>
+  %56 = extractelement <2 x float> %55, i32 0
+  %57 = insertelement <4 x float> undef, float %56, i32 2
+  %58 = insertelement <4 x float> %57, float 1.000000e+00, i32 3
+  %59 = fsub <4 x float> %47, %58
+  %60 = fmul <4 x float> undef, undef
+  %61 = fmul <4 x float> %59, %60
+  %62 = fmul <4 x float> %61, <float 6.000000e+01, float 6.000000e+01, float 6.000000e+01, float 6.000000e+01>
+  %63 = fadd <4 x float> %47, %62
+  store <4 x float> %46, <4 x float>* undef, align 8, !tbaa !1
+  call arm_aapcs_vfpcc  void @bar(%0* undef, float 0.000000e+00) nounwind
+  call arm_aapcs_vfpcc  void @bar(%0* undef, float 0.000000e+00) nounwind
+  store <4 x float> %63, <4 x float>* undef, align 8, !tbaa !1
+  unreachable
+
+; <label>:64                                      ; preds = %41, %40
+  br i1 undef, label %65, label %66
+
+; <label>:65                                      ; preds = %64
+  unreachable
+
+; <label>:66                                      ; preds = %64
+  unreachable
+
+; <label>:67                                      ; preds = %38
+  unreachable
+
+; <label>:68                                      ; preds = %21, %18
+  ret void
+}
+
+declare arm_aapcs_vfpcc void @bar(%0*, float)
+
+!0 = metadata !{metadata !"branch_weights", i32 64, i32 4}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/ARM/2012-08-08-legalize-unaligned.ll b/test/CodeGen/ARM/2012-08-08-legalize-unaligned.ll
new file mode 100644
index 0000000..bdcd1b6
--- /dev/null
+++ b/test/CodeGen/ARM/2012-08-08-legalize-unaligned.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s
+; PR13111
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32"
+target triple = "armv7-none-linux-gnueabi"
+
+define void @test_hi_char8() noinline {
+entry:
+  %0 = load <4 x i8>* undef, align 1
+  store <4 x i8> %0, <4 x i8>* null, align 4
+  ret void
+}
diff --git a/test/CodeGen/ARM/2012-08-09-neon-extload.ll b/test/CodeGen/ARM/2012-08-09-neon-extload.ll
new file mode 100644
index 0000000..b55f1ca
--- /dev/null
+++ b/test/CodeGen/ARM/2012-08-09-neon-extload.ll
@@ -0,0 +1,102 @@
+; RUN: llc -mtriple=armv7-none-linux-gnueabi < %s | FileCheck %s
+
+@var_v2i8 = global <2 x i8> zeroinitializer
+@var_v4i8 = global <4 x i8> zeroinitializer
+
+@var_v2i16 = global <2 x i16> zeroinitializer
+@var_v4i16 = global <4 x i16> zeroinitializer
+
+@var_v2i32 = global <2 x i32> zeroinitializer
+@var_v4i32 = global <4 x i32> zeroinitializer
+
+@var_v2i64 = global <2 x i64> zeroinitializer
+
+define void @test_v2i8tov2i32() {
+; CHECK: test_v2i8tov2i32:
+
+  %i8val = load <2 x i8>* @var_v2i8
+
+  %i32val = sext <2 x i8> %i8val to <2 x i32>
+  store <2 x i32> %i32val, <2 x i32>* @var_v2i32
+; CHECK: vld1.16 {d[[LOAD:[0-9]+]][0]}, [{{r[0-9]+}}, :16]
+; CHECK: vmovl.s8 {{q[0-9]+}}, d[[LOAD]]
+; CHECK: vmovl.s16 {{q[0-9]+}}, {{d[0-9]+}}
+
+  ret void
+}
+
+define void @test_v2i8tov2i64() {
+; CHECK: test_v2i8tov2i64:
+
+  %i8val = load <2 x i8>* @var_v2i8
+
+  %i64val = sext <2 x i8> %i8val to <2 x i64>
+  store <2 x i64> %i64val, <2 x i64>* @var_v2i64
+; CHECK: vld1.16 {d{{[0-9]+}}[0]}, [{{r[0-9]+}}, :16]
+; CHECK: vmovl.s8 {{q[0-9]+}}, d[[LOAD]]
+; CHECK: vmovl.s16 {{q[0-9]+}}, {{d[0-9]+}}
+; CHECK: vmovl.s32 {{q[0-9]+}}, {{d[0-9]+}}
+
+;  %i64val = sext <2 x i8> %i8val to <2 x i64>
+;  store <2 x i64> %i64val, <2 x i64>* @var_v2i64
+
+  ret void
+}
+
+define void @test_v4i8tov4i16() {
+; CHECK: test_v4i8tov4i16:
+
+  %i8val = load <4 x i8>* @var_v4i8
+
+  %i16val = sext <4 x i8> %i8val to <4 x i16>
+  store <4 x i16> %i16val, <4 x i16>* @var_v4i16
+; CHECK: vld1.32 {d[[LOAD:[0-9]+]][0]}, [{{r[0-9]+}}, :32]
+; CHECK: vmovl.s8 {{q[0-9]+}}, d[[LOAD]]
+; CHECK-NOT: vmovl.s16
+
+  ret void
+; CHECK: bx lr
+}
+
+define void @test_v4i8tov4i32() {
+; CHECK: test_v4i8tov4i32:
+
+  %i8val = load <4 x i8>* @var_v4i8
+
+  %i16val = sext <4 x i8> %i8val to <4 x i32>
+  store <4 x i32> %i16val, <4 x i32>* @var_v4i32
+; CHECK: vld1.32 {d[[LOAD:[0-9]+]][0]}, [{{r[0-9]+}}, :32]
+; CHECK: vmovl.s8 {{q[0-9]+}}, d[[LOAD]]
+; CHECK: vmovl.s16 {{q[0-9]+}}, {{d[0-9]+}}
+
+  ret void
+}
+
+define void @test_v2i16tov2i32() {
+; CHECK: test_v2i16tov2i32:
+
+  %i16val = load <2 x i16>* @var_v2i16
+
+  %i32val = sext <2 x i16> %i16val to <2 x i32>
+  store <2 x i32> %i32val, <2 x i32>* @var_v2i32
+; CHECK: vld1.32 {d[[LOAD:[0-9]+]][0]}, [{{r[0-9]+}}, :32]
+; CHECK: vmovl.s16 {{q[0-9]+}}, d[[LOAD]]
+; CHECK-NOT: vmovl
+
+  ret void
+; CHECK: bx lr
+}
+
+define void @test_v2i16tov2i64() {
+; CHECK: test_v2i16tov2i64:
+
+  %i16val = load <2 x i16>* @var_v2i16
+
+  %i64val = sext <2 x i16> %i16val to <2 x i64>
+  store <2 x i64> %i64val, <2 x i64>* @var_v2i64
+; CHECK: vld1.32 {d[[LOAD:[0-9]+]][0]}, [{{r[0-9]+}}, :32]
+; CHECK: vmovl.s16 {{q[0-9]+}}, d[[LOAD]]
+; CHECK: vmovl.s32 {{q[0-9]+}}, d[[LOAD]]
+
+  ret void
+}
diff --git a/test/CodeGen/ARM/2012-08-13-bfi.ll b/test/CodeGen/ARM/2012-08-13-bfi.ll
new file mode 100644
index 0000000..8263833
--- /dev/null
+++ b/test/CodeGen/ARM/2012-08-13-bfi.ll
@@ -0,0 +1,17 @@
+; RUN: llc -march=thumb -mcpu=cortex-a8 < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; CHECK: foo
+; CHECK-NOT: bfi
+; CHECK: bx
+define i32 @foo(i8 zeroext %i) nounwind uwtable readnone ssp {
+  %1 = and i8 %i, 15
+  %2 = zext i8 %1 to i32
+  %3 = icmp ult i8 %1, 10
+  %4 = or i32 %2, 48
+  %5 = add nsw i32 %2, 55
+  %6 = select i1 %3, i32 %4, i32 %5
+  ret i32 %6
+}
diff --git a/test/CodeGen/ARM/addrmode.ll b/test/CodeGen/ARM/addrmode.ll
index 9ccff07..6da9089 100644
--- a/test/CodeGen/ARM/addrmode.ll
+++ b/test/CodeGen/ARM/addrmode.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -stats |& grep asm-printer | grep 4
+; RUN: llc < %s -march=arm -stats 2>&1 | grep asm-printer | grep 4
 
 define i32 @t1(i32 %a) {
 	%b = mul i32 %a, 9
diff --git a/test/CodeGen/ARM/aliases.ll b/test/CodeGen/ARM/aliases.ll
index 31c5007..d668334 100644
--- a/test/CodeGen/ARM/aliases.ll
+++ b/test/CodeGen/ARM/aliases.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple=arm-linux-gnueabi -o %t
-; RUN: grep { = } %t   | count 5
+; RUN: grep " = " %t   | count 5
 ; RUN: grep globl %t | count 4
 ; RUN: grep weak %t  | count 1
 
diff --git a/test/CodeGen/ARM/arm-modifier.ll b/test/CodeGen/ARM/arm-modifier.ll
index 396de37..5e12d8e 100644
--- a/test/CodeGen/ARM/arm-modifier.ll
+++ b/test/CodeGen/ARM/arm-modifier.ll
@@ -57,3 +57,12 @@ store i64 %0, i64* @f3_var, align 4
 store i64 %1, i64* @f3_var, align 4
 ret void
 }
+
+define i64 @f4(i64* %val) nounwind {
+entry:
+  ;CHECK: f4
+  ;CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], {{r[0-9]?[13579]}}, [r0]
+  ;CHECK: mov r0, [[REG1]]
+  %0 = tail call i64 asm sideeffect "ldrexd $0, ${0:H}, [$1]", "=&r,r,*Qo"(i64* %val, i64* %val) nounwind
+  ret i64 %0
+}
diff --git a/test/CodeGen/ARM/bicZext.ll b/test/CodeGen/ARM/bicZext.ll
new file mode 100644
index 0000000..cf4b7ba
--- /dev/null
+++ b/test/CodeGen/ARM/bicZext.ll
@@ -0,0 +1,19 @@
+; RUN: llc %s -o - | FileCheck %s
+; ModuleID = 'bic.c'
+target triple = "thumbv7-apple-ios3.0.0"
+
+define zeroext i16 @foo16(i16 zeroext %f) nounwind readnone optsize ssp {
+entry:
+  ; CHECK: .thumb_func	_foo16
+  ; CHECK: {{bic[^#]*#3}}
+  %and = and i16 %f, -4
+  ret i16 %and
+}
+
+define i32 @foo32(i32 %f) nounwind readnone optsize ssp {
+entry:
+  ; CHECK: .thumb_func	_foo32
+  ; CHECK: {{bic[^#]*#3}}
+  %and = and i32 %f, -4
+  ret i32 %and
+}
diff --git a/test/CodeGen/ARM/call_nolink.ll b/test/CodeGen/ARM/call_nolink.ll
index efe29d8..00b1688 100644
--- a/test/CodeGen/ARM/call_nolink.ll
+++ b/test/CodeGen/ARM/call_nolink.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi | \
-; RUN:   not grep {bx lr}
+; RUN:   not grep "bx lr"
 
 	%struct.anon = type { i32 (i32, i32, i32)*, i32, i32, [3 x i32], i8*, i8*, i8* }
 @r = external global [14 x i32]		; <[14 x i32]*> [#uses=4]
diff --git a/test/CodeGen/ARM/cmn.ll b/test/CodeGen/ARM/cmn.ll
new file mode 100644
index 0000000..ef73165
--- /dev/null
+++ b/test/CodeGen/ARM/cmn.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -mtriple thumbv7-apple-ios | FileCheck %s
+; <rdar://problem/7569620>
+
+define i32 @compare_i_gt(i32 %a) {
+entry:
+; CHECK:     compare_i_gt
+; CHECK-NOT: mvn
+; CHECK:     cmn
+  %cmp = icmp sgt i32 %a, -78
+  %. = zext i1 %cmp to i32
+  ret i32 %.
+}
+
+define i32 @compare_r_eq(i32 %a, i32 %b) {
+entry:
+; CHECK: compare_r_eq
+; CHECK: cmn
+  %sub = sub nsw i32 0, %b
+  %cmp = icmp eq i32 %a, %sub
+  %. = zext i1 %cmp to i32
+  ret i32 %.
+}
diff --git a/test/CodeGen/ARM/coalesce-subregs.ll b/test/CodeGen/ARM/coalesce-subregs.ll
new file mode 100644
index 0000000..fb0f4c6
--- /dev/null
+++ b/test/CodeGen/ARM/coalesce-subregs.ll
@@ -0,0 +1,68 @@
+; RUN: llc < %s -mcpu=cortex-a9 | FileCheck %s
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios0.0.0"
+
+; CHECK: f
+; The vld2 and vst2 are not aligned wrt each other, the second Q loaded is the
+; first one stored.
+; The coalescer must find a super-register larger than QQ to eliminate the copy
+; setting up the vst2 data.
+; CHECK: vld2
+; CHECK-NOT: vorr
+; CHECK-NOT: vmov
+; CHECK: vst2
+define void @f(float* %p, i32 %c) nounwind ssp {
+entry:
+  %0 = bitcast float* %p to i8*
+  %vld2 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %0, i32 4)
+  %vld221 = extractvalue { <4 x float>, <4 x float> } %vld2, 1
+  %add.ptr = getelementptr inbounds float* %p, i32 8
+  %1 = bitcast float* %add.ptr to i8*
+  tail call void @llvm.arm.neon.vst2.v4f32(i8* %1, <4 x float> %vld221, <4 x float> undef, i32 4)
+  ret void
+}
+
+; CHECK: f1
+; FIXME: This function still has copies.
+define void @f1(float* %p, i32 %c) nounwind ssp {
+entry:
+  %0 = bitcast float* %p to i8*
+  %vld2 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %0, i32 4)
+  %vld221 = extractvalue { <4 x float>, <4 x float> } %vld2, 1
+  %add.ptr = getelementptr inbounds float* %p, i32 8
+  %1 = bitcast float* %add.ptr to i8*
+  %vld22 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %1, i32 4)
+  %vld2215 = extractvalue { <4 x float>, <4 x float> } %vld22, 0
+  tail call void @llvm.arm.neon.vst2.v4f32(i8* %1, <4 x float> %vld221, <4 x float> %vld2215, i32 4)
+  ret void
+}
+
+; CHECK: f2
+; FIXME: This function still has copies.
+define void @f2(float* %p, i32 %c) nounwind ssp {
+entry:
+  %0 = bitcast float* %p to i8*
+  %vld2 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %0, i32 4)
+  %vld224 = extractvalue { <4 x float>, <4 x float> } %vld2, 1
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+  %qq0.0.1.0 = phi <4 x float> [ %vld224, %entry ], [ %vld2216, %do.body ]
+  %c.addr.0 = phi i32 [ %c, %entry ], [ %dec, %do.body ]
+  %p.addr.0 = phi float* [ %p, %entry ], [ %add.ptr, %do.body ]
+  %add.ptr = getelementptr inbounds float* %p.addr.0, i32 8
+  %1 = bitcast float* %add.ptr to i8*
+  %vld22 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %1, i32 4)
+  %vld2215 = extractvalue { <4 x float>, <4 x float> } %vld22, 0
+  %vld2216 = extractvalue { <4 x float>, <4 x float> } %vld22, 1
+  tail call void @llvm.arm.neon.vst2.v4f32(i8* %1, <4 x float> %qq0.0.1.0, <4 x float> %vld2215, i32 4)
+  %dec = add nsw i32 %c.addr.0, -1
+  %tobool = icmp eq i32 %dec, 0
+  br i1 %tobool, label %do.end, label %do.body
+
+do.end:                                           ; preds = %do.body
+  ret void
+}
+
+declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8*, i32) nounwind readonly
+declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind
diff --git a/test/CodeGen/ARM/crash-greedy.ll b/test/CodeGen/ARM/crash-greedy.ll
index 8a865e2..a3d49f6 100644
--- a/test/CodeGen/ARM/crash-greedy.ll
+++ b/test/CodeGen/ARM/crash-greedy.ll
@@ -82,3 +82,49 @@ if.then195:                                       ; preds = %if.then84
 if.end251:                                        ; preds = %if.then195, %if.then84, %entry
   ret void
 }
+
+; Coalescer failure: removeCopyByCommutingDef leaves a bad kill flag
+; behind.
+define void @rdar11950722() nounwind readonly optsize ssp align 2 {
+entry:
+  br i1 undef, label %land.lhs.true7, label %lor.lhs.false.i
+
+lor.lhs.false.i:
+  br i1 undef, label %if.then10.i, label %land.lhs.true7
+
+if.then10.i:
+  %xFlags.1.i = select i1 undef, i32 0, i32 undef
+  br i1 undef, label %land.lhs.true33.i, label %f.exit
+
+land.lhs.true33.i:
+  %and26.i = and i32 %xFlags.1.i, 8
+  %cmp27.i = icmp eq i32 %and26.i, 0
+  %and29.i = and i32 %xFlags.1.i, 2147483645
+  %xFlags.1.and29.i = select i1 %cmp27.i, i32 %xFlags.1.i, i32 %and29.i
+  %and34.i = and i32 %xFlags.1.i, 8
+  %cmp35.i = icmp eq i32 %and34.i, 0
+  %and37.i = and i32 %xFlags.1.i, 2147483645
+  %yFlags.1.and37.i = select i1 %cmp35.i, i32 %xFlags.1.i, i32 %and37.i
+  br label %f.exit
+
+f.exit:
+  %xFlags.3.i = phi i32 [ %xFlags.1.and29.i, %land.lhs.true33.i ], [ %xFlags.1.i, %if.then10.i ]
+  %yFlags.2.i = phi i32 [ %yFlags.1.and37.i, %land.lhs.true33.i ], [ %xFlags.1.i, %if.then10.i ]
+  %cmp40.i = icmp eq i32 %xFlags.3.i, %yFlags.2.i
+  br i1 %cmp40.i, label %land.lhs.true7, label %land.end
+
+land.lhs.true7:
+  br i1 undef, label %land.lhs.true34, label %lor.lhs.false27
+
+lor.lhs.false27:
+  br i1 undef, label %land.lhs.true34, label %land.end
+
+land.lhs.true34:
+  br i1 undef, label %land.end, label %lor.lhs.false44
+
+lor.lhs.false44:
+  ret void
+
+land.end:
+  ret void
+}
diff --git a/test/CodeGen/ARM/cse-libcalls.ll b/test/CodeGen/ARM/cse-libcalls.ll
index 1d011be..62b9e43 100644
--- a/test/CodeGen/ARM/cse-libcalls.ll
+++ b/test/CodeGen/ARM/cse-libcalls.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm | grep {bl.\*__ltdf} | count 1
+; RUN: llc < %s -march=arm | grep "bl.*__ltdf" | count 1
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i386-apple-darwin8"
 
diff --git a/test/CodeGen/ARM/data-in-code-annotations.ll b/test/CodeGen/ARM/data-in-code-annotations.ll
new file mode 100644
index 0000000..a66a9d1
--- /dev/null
+++ b/test/CodeGen/ARM/data-in-code-annotations.ll
@@ -0,0 +1,42 @@
+; RUN: llc < %s -mtriple=armv7-apple-darwin | FileCheck %s
+
+define double @f1() nounwind {
+; CHECK: f1:
+; CHECK: .data_region
+; CHECK: .long 1413754129
+; CHECK: .long 1074340347
+; CHECK: .end_data_region
+  ret double 0x400921FB54442D11
+}
+
+
+define i32 @f2()  {
+; CHECK: f2:
+; CHECK: .data_region jt32
+; CHECK: .end_data_region
+
+entry:
+  switch i32 undef, label %return [
+    i32 1, label %sw.bb
+    i32 2, label %sw.bb6
+    i32 3, label %sw.bb13
+    i32 4, label %sw.bb20
+  ]
+
+sw.bb:                                            ; preds = %entry
+  br label %return
+
+sw.bb6:                                           ; preds = %entry
+  br label %return
+
+sw.bb13:                                          ; preds = %entry
+  br label %return
+
+sw.bb20:                                          ; preds = %entry
+  %div = sdiv i32 undef, undef
+  br label %return
+
+return:                                           ; preds = %sw.bb20, %sw.bb13, %sw.bb6, %sw.bb, %entry
+  %retval.0 = phi i32 [ %div, %sw.bb20 ], [ undef, %sw.bb13 ], [ undef, %sw.bb6 ], [ undef, %sw.bb ], [ 0, %entry ]
+  ret i32 %retval.0
+}
diff --git a/test/CodeGen/ARM/debug-info-branch-folding.ll b/test/CodeGen/ARM/debug-info-branch-folding.ll
index 9bdae43..4f4ff8e 100644
--- a/test/CodeGen/ARM/debug-info-branch-folding.ll
+++ b/test/CodeGen/ARM/debug-info-branch-folding.ll
@@ -3,16 +3,17 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-
 target triple = "thumbv7-apple-macosx10.6.7"
 
 ;CHECK: 	vadd.f32	q4, q8, q8
-;CHECK-NEXT: Ltmp
-;CHECK-NEXT: 	@DEBUG_VALUE: y <- Q4+0
-;CHECK-NEXT:    @DEBUG_VALUE: x <- Q4+0
+;CHECK-NEXT: Ltmp1
+
+;CHECK:@DEBUG_VALUE: x <- Q4+0
+;CHECK-NEXT:@DEBUG_VALUE: y <- Q4+0
 
 
 @.str = external constant [13 x i8]
 
 declare <4 x float> @test0001(float) nounwind readnone ssp
 
-define i32 @main(i32 %argc, i8** nocapture %argv) nounwind ssp {
+define i32 @main(i32 %argc, i8** nocapture %argv, i1 %cond) nounwind ssp {
 entry:
   br label %for.body9
 
@@ -21,7 +22,7 @@ for.body9:                                        ; preds = %for.body9, %entry
   tail call void @llvm.dbg.value(metadata !{<4 x float> %add19}, i64 0, metadata !27), !dbg !39
   %add20 = fadd <4 x float> undef, <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00>, !dbg !39
   tail call void @llvm.dbg.value(metadata !{<4 x float> %add20}, i64 0, metadata !28), !dbg !39
-  br i1 undef, label %for.end54, label %for.body9, !dbg !44
+  br i1 %cond, label %for.end54, label %for.body9, !dbg !44
 
 for.end54:                                        ; preds = %for.body9
   %tmp115 = extractelement <4 x float> %add19, i32 1
@@ -52,7 +53,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !7 = metadata !{i32 589860, metadata !2, metadata !"float", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
 !8 = metadata !{metadata !9}
 !9 = metadata !{i32 589857, i64 0, i64 3}         ; [ DW_TAG_subrange_type ]
-!10 = metadata !{i32 589870, i32 0, metadata !1, metadata !"main", metadata !"main", metadata !"", metadata !1, i32 59, metadata !11, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32, i8**)* @main, null} ; [ DW_TAG_subprogram ]
+!10 = metadata !{i32 589870, i32 0, metadata !1, metadata !"main", metadata !"main", metadata !"", metadata !1, i32 59, metadata !11, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32, i8**, i1)* @main, null} ; [ DW_TAG_subprogram ]
 !11 = metadata !{i32 589845, metadata !1, metadata !"", metadata !1, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !12, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
 !12 = metadata !{metadata !13}
 !13 = metadata !{i32 589860, metadata !2, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
diff --git a/test/CodeGen/ARM/divmod.ll b/test/CodeGen/ARM/divmod.ll
index 49c4103..7fbf8f4 100644
--- a/test/CodeGen/ARM/divmod.ll
+++ b/test/CodeGen/ARM/divmod.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=arm-apple-ios5.0 | FileCheck %s
+; RUN: llc < %s -mtriple=arm-apple-ios5.0 -mcpu=cortex-a8 | FileCheck %s
 
 define void @foo(i32 %x, i32 %y, i32* nocapture %P) nounwind ssp {
 entry:
@@ -56,3 +56,17 @@ bb1:
 
 declare i32 @llvm.objectsize.i32(i8*, i1) nounwind readnone
 declare i8* @__memset_chk(i8*, i32, i32, i32) nounwind
+
+; rdar://11714607
+define i32 @howmany(i32 %x, i32 %y) nounwind {
+entry:
+; CHECK: howmany:
+; CHECK: bl ___udivmodsi4
+; CHECK-NOT: ___udivsi3
+  %rem = urem i32 %x, %y
+  %div = udiv i32 %x, %y
+  %not.cmp = icmp ne i32 %rem, 0
+  %add = zext i1 %not.cmp to i32
+  %cond = add i32 %add, %div
+  ret i32 %cond
+}
diff --git a/test/CodeGen/ARM/fabss.ll b/test/CodeGen/ARM/fabss.ll
index 45c322d..bcb4ee7 100644
--- a/test/CodeGen/ARM/fabss.ll
+++ b/test/CodeGen/ARM/fabss.ll
@@ -1,12 +1,12 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s -check-prefix=NFP0
-; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -check-prefix=CORTEXA8
-; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s -check-prefix=CORTEXA9
+; RUN: llc < %s -mtriple=arm-apple-ios -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2
+; RUN: llc < %s -mtriple=arm-apple-ios -mattr=+neon | FileCheck %s -check-prefix=NFP0
+; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-a8 | FileCheck %s -check-prefix=CORTEXA8
+; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-a9 | FileCheck %s -check-prefix=CORTEXA9
 
 define float @test(float %a, float %b) {
 entry:
         %dum = fadd float %a, %b
-	%0 = tail call float @fabsf(float %dum)
+	%0 = tail call float @fabsf(float %dum) readnone
         %dum1 = fadd float %0, %b
 	ret float %dum1
 }
diff --git a/test/CodeGen/ARM/fast-isel-call-multi-reg-return.ll b/test/CodeGen/ARM/fast-isel-call-multi-reg-return.ll
new file mode 100644
index 0000000..14721a4
--- /dev/null
+++ b/test/CodeGen/ARM/fast-isel-call-multi-reg-return.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -O0 -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+
+; Fast-isel can't handle non-double multi-reg retvals.
+; This test just check to make sure we don't hit the assert in FinishCall.
+define <16 x i8> @foo() nounwind ssp {
+entry:
+  ret <16 x i8> zeroinitializer
+}
+
+define void @t1() nounwind ssp {
+entry:
+; ARM: @t1
+; THUMB: @t1
+  %call = call <16 x i8> @foo()
+  ret void
+}
diff --git a/test/CodeGen/ARM/fast-isel-call.ll b/test/CodeGen/ARM/fast-isel-call.ll
index dd460b2..edc805a 100644
--- a/test/CodeGen/ARM/fast-isel-call.ll
+++ b/test/CodeGen/ARM/fast-isel-call.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
 ; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -arm-long-calls | FileCheck %s --check-prefix=ARM-LONG
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -arm-long-calls | FileCheck %s --check-prefix=THUMB-LONG
 
 define i32 @t0(i1 zeroext %a) nounwind {
   %1 = zext i1 %a to i32
@@ -99,6 +101,11 @@ entry:
 ; ARM: uxtb r9, r12
 ; ARM: str r9, [sp, #4]
 ; ARM: bl _bar
+; ARM-LONG: @t10
+; ARM-LONG: movw lr, :lower16:L_bar$non_lazy_ptr
+; ARM-LONG: movt lr, :upper16:L_bar$non_lazy_ptr
+; ARM-LONG: ldr lr, [lr]
+; ARM-LONG: blx lr
 ; THUMB: @t10
 ; THUMB: movs r0, #0
 ; THUMB: movt r0, #0
@@ -121,8 +128,96 @@ entry:
 ; THUMB: uxtb.w r9, r12
 ; THUMB: str.w r9, [sp, #4]
 ; THUMB: bl _bar
+; THUMB-LONG: @t10
+; THUMB-LONG: movw lr, :lower16:L_bar$non_lazy_ptr
+; THUMB-LONG: movt lr, :upper16:L_bar$non_lazy_ptr
+; THUMB-LONG: ldr.w lr, [lr]
+; THUMB-LONG: blx lr
   %call = call i32 @bar(i8 zeroext 0, i8 zeroext -8, i8 zeroext -69, i8 zeroext 28, i8 zeroext 40, i8 zeroext -70)
   ret i32 0
 }
 
 declare i32 @bar(i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext)
+
+define i32 @bar0(i32 %i) nounwind {
+  ret i32 0
+}
+
+define void @foo3() uwtable {
+; ARM: movw    r0, #0
+; ARM: movw    r1, :lower16:_bar0
+; ARM: movt    r1, :upper16:_bar0
+; ARM: blx     r1
+; THUMB: movs    r0, #0
+; THUMB: movw    r1, :lower16:_bar0
+; THUMB: movt    r1, :upper16:_bar0
+; THUMB: blx     r1
+  %fptr = alloca i32 (i32)*, align 8
+  store i32 (i32)* @bar0, i32 (i32)** %fptr, align 8
+  %1 = load i32 (i32)** %fptr, align 8
+  %call = call i32 %1(i32 0)
+  ret void
+}
+
+define i32 @LibCall(i32 %a, i32 %b) {
+entry:
+; ARM: LibCall
+; ARM: bl ___udivsi3
+; ARM-LONG: LibCall
+; ARM-LONG: movw r2, :lower16:L___udivsi3$non_lazy_ptr
+; ARM-LONG: movt r2, :upper16:L___udivsi3$non_lazy_ptr
+; ARM-LONG: ldr r2, [r2]
+; ARM-LONG: blx r2
+; THUMB: LibCall
+; THUMB: bl ___udivsi3
+; THUMB-LONG: LibCall
+; THUMB-LONG: movw r2, :lower16:L___udivsi3$non_lazy_ptr
+; THUMB-LONG: movt r2, :upper16:L___udivsi3$non_lazy_ptr
+; THUMB-LONG: ldr r2, [r2]
+; THUMB-LONG: blx r2
+        %tmp1 = udiv i32 %a, %b         ; <i32> [#uses=1]
+        ret i32 %tmp1
+}
+
+define i32 @VarArg() nounwind {
+entry:
+  %i = alloca i32, align 4
+  %j = alloca i32, align 4
+  %k = alloca i32, align 4
+  %m = alloca i32, align 4
+  %n = alloca i32, align 4
+  %tmp = alloca i32, align 4
+  %0 = load i32* %i, align 4
+  %1 = load i32* %j, align 4
+  %2 = load i32* %k, align 4
+  %3 = load i32* %m, align 4
+  %4 = load i32* %n, align 4
+; ARM: VarArg
+; ARM: mov r7, sp
+; ARM: movw r0, #5
+; ARM: ldr r1, [r7, #-4]
+; ARM: ldr r2, [r7, #-8]
+; ARM: ldr r3, [r7, #-12]
+; ARM: ldr r9, [sp, #16]
+; ARM: ldr r12, [sp, #12]
+; ARM: str r9, [sp]
+; ARM: str r12, [sp, #4]
+; ARM: bl _CallVariadic
+; THUMB: mov r7, sp
+; THUMB: movs r0, #5
+; THUMB: movt r0, #0
+; THUMB: ldr r1, [sp, #28]
+; THUMB: ldr r2, [sp, #24]
+; THUMB: ldr r3, [sp, #20]
+; THUMB: ldr.w r9, [sp, #16]
+; THUMB: ldr.w r12, [sp, #12]
+; THUMB: str.w r9, [sp]
+; THUMB: str.w r12, [sp, #4]
+; THUMB: bl _CallVariadic
+  %call = call i32 (i32, ...)* @CallVariadic(i32 5, i32 %0, i32 %1, i32 %2, i32 %3, i32 %4)
+  store i32 %call, i32* %tmp, align 4
+  %5 = load i32* %tmp, align 4
+  ret i32 %5
+}
+
+declare i32 @CallVariadic(i32, ...)
diff --git a/test/CodeGen/ARM/fast-isel-frameaddr.ll b/test/CodeGen/ARM/fast-isel-frameaddr.ll
new file mode 100644
index 0000000..8f7b294
--- /dev/null
+++ b/test/CodeGen/ARM/fast-isel-frameaddr.ll
@@ -0,0 +1,100 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=armv7-apple-darwin | FileCheck %s --check-prefix=DARWIN-ARM
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=LINUX-ARM
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=thumbv7-apple-darwin | FileCheck %s --check-prefix=DARWIN-THUMB2
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=thumbv7-linux-gnueabi | FileCheck %s --check-prefix=LINUX-THUMB2
+
+define i8* @frameaddr_index0() nounwind {
+entry:
+; DARWIN-ARM: frameaddr_index0:
+; DARWIN-ARM: push {r7}
+; DARWIN-ARM: mov r7, sp
+; DARWIN-ARM: mov r0, r7
+
+; DARWIN-THUMB2: frameaddr_index0:
+; DARWIN-THUMB2: str r7, [sp, #-4]!
+; DARWIN-THUMB2: mov r7, sp
+; DARWIN-THUMB2: mov r0, r7
+
+; LINUX-ARM: frameaddr_index0:
+; LINUX-ARM: push {r11}
+; LINUX-ARM: mov r11, sp
+; LINUX-ARM: mov r0, r11
+
+; LINUX-THUMB2: frameaddr_index0:
+; LINUX-THUMB2: str r7, [sp, #-4]!
+; LINUX-THUMB2: mov r7, sp
+; LINUX-THUMB2: mov r0, r7
+
+  %0 = call i8* @llvm.frameaddress(i32 0)
+  ret i8* %0
+}
+
+define i8* @frameaddr_index1() nounwind {
+entry:
+; DARWIN-ARM: frameaddr_index1:
+; DARWIN-ARM: push {r7}
+; DARWIN-ARM: mov r7, sp
+; DARWIN-ARM: mov r0, r7
+; DARWIN-ARM: ldr r0, [r0]
+
+; DARWIN-THUMB2: frameaddr_index1:
+; DARWIN-THUMB2: str r7, [sp, #-4]!
+; DARWIN-THUMB2: mov r7, sp
+; DARWIN-THUMB2: mov r0, r7
+; DARWIN-THUMB2: ldr r0, [r0]
+
+; LINUX-ARM: frameaddr_index1:
+; LINUX-ARM: push {r11}
+; LINUX-ARM: mov r11, sp
+; LINUX-ARM: mov r0, r11
+; LINUX-ARM: ldr r0, [r0]
+
+; LINUX-THUMB2: frameaddr_index1:
+; LINUX-THUMB2: str r7, [sp, #-4]!
+; LINUX-THUMB2: mov r7, sp
+; LINUX-THUMB2: mov r0, r7
+; LINUX-THUMB2: ldr r0, [r0]
+
+  %0 = call i8* @llvm.frameaddress(i32 1)
+  ret i8* %0
+}
+
+define i8* @frameaddr_index3() nounwind {
+entry:
+; DARWIN-ARM: frameaddr_index3:
+; DARWIN-ARM: push {r7}
+; DARWIN-ARM: mov r7, sp
+; DARWIN-ARM: mov r0, r7
+; DARWIN-ARM: ldr r0, [r0]
+; DARWIN-ARM: ldr r0, [r0]
+; DARWIN-ARM: ldr r0, [r0]
+
+; DARWIN-THUMB2: frameaddr_index3:
+; DARWIN-THUMB2: str r7, [sp, #-4]!
+; DARWIN-THUMB2: mov r7, sp
+; DARWIN-THUMB2: mov r0, r7
+; DARWIN-THUMB2: ldr r0, [r0]
+; DARWIN-THUMB2: ldr r0, [r0]
+; DARWIN-THUMB2: ldr r0, [r0]
+
+; LINUX-ARM: frameaddr_index3:
+; LINUX-ARM: push {r11}
+; LINUX-ARM: mov r11, sp
+; LINUX-ARM: mov r0, r11
+; LINUX-ARM: ldr r0, [r0]
+; LINUX-ARM: ldr r0, [r0]
+; LINUX-ARM: ldr r0, [r0]
+
+; LINUX-THUMB2: frameaddr_index3:
+; LINUX-THUMB2: str r7, [sp, #-4]!
+; LINUX-THUMB2: mov r7, sp
+; LINUX-THUMB2: mov r0, r7
+; LINUX-THUMB2: ldr r0, [r0]
+; LINUX-THUMB2: ldr r0, [r0]
+; LINUX-THUMB2: ldr r0, [r0]
+
+  %0 = call i8* @llvm.frameaddress(i32 3)
+  ret i8* %0
+}
+
+declare i8* @llvm.frameaddress(i32) nounwind readnone
diff --git a/test/CodeGen/ARM/fast-isel-intrinsic.ll b/test/CodeGen/ARM/fast-isel-intrinsic.ll
index e6bdfa7..b73fcef 100644
--- a/test/CodeGen/ARM/fast-isel-intrinsic.ll
+++ b/test/CodeGen/ARM/fast-isel-intrinsic.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
 ; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -arm-long-calls | FileCheck %s --check-prefix=ARM-LONG
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -arm-long-calls | FileCheck %s --check-prefix=THUMB-LONG
 
 @message1 = global [60 x i8] c"The LLVM Compiler Infrastructure\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00", align 1
 @temp = common global [60 x i8] zeroinitializer, align 1
@@ -13,6 +15,11 @@ define void @t1() nounwind ssp {
 ; ARM: movw r2, #10
 ; ARM: uxtb r1, r1
 ; ARM: bl _memset
+; ARM-LONG: t1
+; ARM-LONG: movw r3, :lower16:L_memset$non_lazy_ptr
+; ARM-LONG: movt r3, :upper16:L_memset$non_lazy_ptr
+; ARM-LONG: ldr r3, [r3]
+; ARM-LONG: blx r3
 ; THUMB: t1
 ; THUMB: movw r0, :lower16:_message1
 ; THUMB: movt r0, :upper16:_message1
@@ -23,6 +30,11 @@ define void @t1() nounwind ssp {
 ; THUMB: movt r2, #0
 ; THUMB: uxtb r1, r1
 ; THUMB: bl _memset
+; THUMB-LONG: t1
+; THUMB-LONG: movw r3, :lower16:L_memset$non_lazy_ptr
+; THUMB-LONG: movt r3, :upper16:L_memset$non_lazy_ptr
+; THUMB-LONG: ldr r3, [r3]
+; THUMB-LONG: blx r3
   call void @llvm.memset.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @message1, i32 0, i32 5), i8 64, i32 10, i32 1, i1 false)
   ret void
 }
@@ -41,6 +53,11 @@ define void @t2() nounwind ssp {
 ; ARM: mov r0, r1
 ; ARM: ldr r1, [sp]                @ 4-byte Reload
 ; ARM: bl _memcpy
+; ARM-LONG: t2
+; ARM-LONG: movw r3, :lower16:L_memcpy$non_lazy_ptr
+; ARM-LONG: movt r3, :upper16:L_memcpy$non_lazy_ptr
+; ARM-LONG: ldr r3, [r3]
+; ARM-LONG: blx r3
 ; THUMB: t2
 ; THUMB: movw r0, :lower16:L_temp$non_lazy_ptr
 ; THUMB: movt r0, :upper16:L_temp$non_lazy_ptr
@@ -51,6 +68,11 @@ define void @t2() nounwind ssp {
 ; THUMB: movt r2, #0
 ; THUMB: mov r0, r1
 ; THUMB: bl _memcpy
+; THUMB-LONG: t2
+; THUMB-LONG: movw r3, :lower16:L_memcpy$non_lazy_ptr
+; THUMB-LONG: movt r3, :upper16:L_memcpy$non_lazy_ptr
+; THUMB-LONG: ldr r3, [r3]
+; THUMB-LONG: blx r3
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 4), i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 16), i32 17, i32 1, i1 false)
   ret void
 }
@@ -67,6 +89,11 @@ define void @t3() nounwind ssp {
 ; ARM: movw r2, #10
 ; ARM: mov r0, r1
 ; ARM: bl _memmove
+; ARM-LONG: t3
+; ARM-LONG: movw r3, :lower16:L_memmove$non_lazy_ptr
+; ARM-LONG: movt r3, :upper16:L_memmove$non_lazy_ptr
+; ARM-LONG: ldr r3, [r3]
+; ARM-LONG: blx r3
 ; THUMB: t3
 ; THUMB: movw r0, :lower16:L_temp$non_lazy_ptr
 ; THUMB: movt r0, :upper16:L_temp$non_lazy_ptr
@@ -77,6 +104,11 @@ define void @t3() nounwind ssp {
 ; THUMB: movt r2, #0
 ; THUMB: mov r0, r1
 ; THUMB: bl _memmove
+; THUMB-LONG: t3
+; THUMB-LONG: movw r3, :lower16:L_memmove$non_lazy_ptr
+; THUMB-LONG: movt r3, :upper16:L_memmove$non_lazy_ptr
+; THUMB-LONG: ldr r3, [r3]
+; THUMB-LONG: blx r3
   call void @llvm.memmove.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 4), i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 16), i32 10, i32 1, i1 false)
   ret void
 }
diff --git a/test/CodeGen/ARM/fast-isel-shifter.ll b/test/CodeGen/ARM/fast-isel-shifter.ll
new file mode 100644
index 0000000..111818b
--- /dev/null
+++ b/test/CodeGen/ARM/fast-isel-shifter.ll
@@ -0,0 +1,50 @@
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
+
+define i32 @shl() nounwind ssp {
+entry:
+; ARM: shl
+; ARM: lsl r0, r0, #2
+  %shl = shl i32 -1, 2
+  ret i32 %shl
+}
+
+define i32 @shl_reg(i32 %src1, i32 %src2) nounwind ssp {
+entry:
+; ARM: shl_reg
+; ARM: lsl r0, r0, r1
+  %shl = shl i32 %src1, %src2
+  ret i32 %shl
+}
+
+define i32 @lshr() nounwind ssp {
+entry:
+; ARM: lshr
+; ARM: lsr r0, r0, #2
+  %lshr = lshr i32 -1, 2
+  ret i32 %lshr
+}
+
+define i32 @lshr_reg(i32 %src1, i32 %src2) nounwind ssp {
+entry:
+; ARM: lshr_reg
+; ARM: lsr r0, r0, r1
+  %lshr = lshr i32 %src1, %src2
+  ret i32 %lshr
+}
+
+define i32 @ashr() nounwind ssp {
+entry:
+; ARM: ashr
+; ARM: asr r0, r0, #2
+  %ashr = ashr i32 -1, 2
+  ret i32 %ashr
+}
+
+define i32 @ashr_reg(i32 %src1, i32 %src2) nounwind ssp {
+entry:
+; ARM: ashr_reg
+; ARM: asr r0, r0, r1
+  %ashr = ashr i32 %src1, %src2
+  ret i32 %ashr
+}
+
diff --git a/test/CodeGen/ARM/fast-isel.ll b/test/CodeGen/ARM/fast-isel.ll
index 417e2d9..ecd5fe2 100644
--- a/test/CodeGen/ARM/fast-isel.ll
+++ b/test/CodeGen/ARM/fast-isel.ll
@@ -226,3 +226,15 @@ define i32 @urem_fold(i32 %a) nounwind {
   %rem = urem i32 %a, 32
   ret i32 %rem
 }
+
+define i32 @test7() noreturn nounwind  {
+entry:
+; ARM: @test7
+; THUMB: @test7
+; ARM: trap
+; THUMB: trap
+  tail call void @llvm.trap( )
+  unreachable
+}
+
+declare void @llvm.trap() nounwind
diff --git a/test/CodeGen/ARM/fcopysign.ll b/test/CodeGen/ARM/fcopysign.ll
index 27fa2b0..5511d24 100644
--- a/test/CodeGen/ARM/fcopysign.ll
+++ b/test/CodeGen/ARM/fcopysign.ll
@@ -11,7 +11,7 @@ entry:
 ; HARD: test1:
 ; HARD: vmov.i32 [[REG1:(d[0-9]+)]], #0x80000000
 ; HARD: vbsl [[REG1]], d
-  %0 = tail call float @copysignf(float %x, float %y) nounwind
+  %0 = tail call float @copysignf(float %x, float %y) nounwind readnone
   ret float %0
 }
 
@@ -25,7 +25,7 @@ entry:
 ; HARD: vmov.i32 [[REG2:(d[0-9]+)]], #0x80000000
 ; HARD: vshl.i64 [[REG2]], [[REG2]], #32
 ; HARD: vbsl [[REG2]], d1, d0
-  %0 = tail call double @copysign(double %x, double %y) nounwind
+  %0 = tail call double @copysign(double %x, double %y) nounwind readnone
   ret double %0
 }
 
@@ -36,7 +36,7 @@ entry:
 ; SOFT: vshl.i64 [[REG3]], [[REG3]], #32
 ; SOFT: vbsl [[REG3]],
   %0 = fmul double %x, %y
-  %1 = tail call double @copysign(double %0, double %z) nounwind
+  %1 = tail call double @copysign(double %0, double %z) nounwind readnone
   ret double %1
 }
 
diff --git a/test/CodeGen/ARM/floorf.ll b/test/CodeGen/ARM/floorf.ll
new file mode 100644
index 0000000..492fc36
--- /dev/null
+++ b/test/CodeGen/ARM/floorf.ll
@@ -0,0 +1,29 @@
+; RUN: llc -mtriple=arm-unknown-unknown < %s | FileCheck %s
+
+; CHECK: test1
+define float @test1() nounwind uwtable readnone ssp {
+; CHECK-NOT: floorf
+  %foo = call float @floorf(float 0x4000CCCCC0000000) nounwind readnone
+  ret float %foo
+}
+
+; CHECK: test2
+define float @test2() nounwind uwtable readnone ssp {
+; CHECK-NOT: ceilf
+  %foo = call float @ceilf(float 0x4000CCCCC0000000) nounwind readnone
+  ret float %foo
+}
+
+; CHECK: test3
+define float @test3() nounwind uwtable readnone ssp {
+; CHECK-NOT: truncf
+  %foo = call float @truncf(float 0x4000CCCCC0000000) nounwind readnone
+  ret float %foo
+}
+
+declare float @floorf(float) nounwind readnone
+declare float @ceilf(float) nounwind readnone
+declare float @truncf(float) nounwind readnone
+
+
+
diff --git a/test/CodeGen/ARM/fmuls.ll b/test/CodeGen/ARM/fmuls.ll
index bc118b8..3c3182b 100644
--- a/test/CodeGen/ARM/fmuls.ll
+++ b/test/CodeGen/ARM/fmuls.ll
@@ -21,3 +21,12 @@ entry:
 ; CORTEXA8: 	vmul.f32	d0, d1, d0
 ; CORTEXA9: test:
 ; CORTEXA9: 	vmul.f32	s{{.}}, s{{.}}, s{{.}}
+
+; VFP2: test2
+define float @test2(float %a) nounwind {
+; CHECK-NOT: mul
+; CHECK: mov pc, lr
+  %ret = fmul float %a, 1.0
+  ret float %ret
+}
+
diff --git a/test/CodeGen/ARM/fparith.ll b/test/CodeGen/ARM/fparith.ll
index ce6d6b2..40ea33b 100644
--- a/test/CodeGen/ARM/fparith.ll
+++ b/test/CodeGen/ARM/fparith.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s
+; RUN: llc < %s -mtriple=arm-apple-ios -mattr=+vfp2 | FileCheck %s
 
 define float @f1(float %a, float %b) {
 ;CHECK: f1:
@@ -84,7 +84,7 @@ define float @f11(float %a) {
 ;CHECK: f11:
 ;CHECK: bic
 entry:
-	%tmp1 = call float @fabsf( float %a )		; <float> [#uses=1]
+	%tmp1 = call float @fabsf( float %a ) readnone	; <float> [#uses=1]
 	ret float %tmp1
 }
 
@@ -94,7 +94,7 @@ define double @f12(double %a) {
 ;CHECK: f12:
 ;CHECK: vabs.f64
 entry:
-	%tmp1 = call double @fabs( double %a )		; <double> [#uses=1]
+	%tmp1 = call double @fabs( double %a ) readnone	; <double> [#uses=1]
 	ret double %tmp1
 }
 
diff --git a/test/CodeGen/ARM/fusedMAC.ll b/test/CodeGen/ARM/fusedMAC.ll
index 802d1b8..303d165 100644
--- a/test/CodeGen/ARM/fusedMAC.ll
+++ b/test/CodeGen/ARM/fusedMAC.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=armv7-eabi -mattr=+neon,+vfp4 | FileCheck %s
+; RUN: llc < %s -mtriple=armv7-eabi -mattr=+neon,+vfp4 -fp-contract=fast | FileCheck %s
 ; Check generated fused MAC and MLS.
 
 define double @fusedMACTest1(double %d1, double %d2, double %d3) {
@@ -138,8 +138,16 @@ entry:
 ; CHECK: vfms.f64
   %tmp1 = fsub double -0.0, %b
   %tmp2 = tail call double @llvm.fma.f64(double %a, double %tmp1, double %c) nounwind readnone
-  %tmp3 = fsub double -0.0, %tmp2
-  ret double %tmp3
+  ret double %tmp2
+}
+
+define float @test_fnms_f32(float %a, float %b, float* %c) nounwind readnone ssp {
+; CHECK: test_fnms_f32
+; CHECK: vfnms.f32
+  %tmp1 = load float* %c, align 4
+  %tmp2 = fsub float -0.0, %tmp1
+  %tmp3 = tail call float @llvm.fma.f32(float %a, float %b, float %tmp2) nounwind readnone
+  ret float %tmp3 
 }
 
 define double @test_fnms_f64(double %a, double %b, double %c) nounwind readnone ssp {
@@ -158,7 +166,8 @@ entry:
 ; CHECK: vfnms.f64
   %tmp1 = fsub double -0.0, %b
   %tmp2 = tail call double @llvm.fma.f64(double %a, double %tmp1, double %c) nounwind readnone
-  ret double %tmp2
+  %tmp3 = fsub double -0.0, %tmp2
+  ret double %tmp3
 }
 
 define double @test_fnma_f64(double %a, double %b, double %c) nounwind readnone ssp {
@@ -180,6 +189,36 @@ entry:
   ret double %tmp3
 }
 
+define float @test_fma_const_fold(float %a, float %b) nounwind {
+; CHECK: test_fma_const_fold
+; CHECK-NOT: vfma
+; CHECK-NOT: vmul
+; CHECK: vadd
+  %ret = call float @llvm.fma.f32(float %a, float 1.0, float %b)
+  ret float %ret
+}
+
+define float @test_fma_canonicalize(float %a, float %b) nounwind {
+; CHECK: test_fma_canonicalize
+; CHECK: vmov.f32 [[R1:s[0-9]+]], #2.000000e+00
+; CHECK: vfma.f32 {{s[0-9]+}}, {{s[0-9]+}}, [[R1]]
+  %ret = call float @llvm.fma.f32(float 2.0, float %a, float %b)
+  ret float %ret
+}
+
+; Check that very wide vector fma's can be split into legal fma's.
+define void @test_fma_v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float>* %p) nounwind readnone ssp {
+; CHECK: test_fma_v8f32
+; CHECK: vfma.f32
+; CHECK: vfma.f32
+entry:
+  %call = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c) nounwind readnone
+  store <8 x float> %call, <8 x float>* %p, align 16
+  ret void
+}
+
+
 declare float @llvm.fma.f32(float, float, float) nounwind readnone
 declare double @llvm.fma.f64(double, double, double) nounwind readnone
 declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
+declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
diff --git a/test/CodeGen/ARM/iabs.ll b/test/CodeGen/ARM/iabs.ll
index 89e309d..600a8c2 100644
--- a/test/CodeGen/ARM/iabs.ll
+++ b/test/CodeGen/ARM/iabs.ll
@@ -10,7 +10,25 @@ define i32 @test(i32 %a) {
         %b = icmp sgt i32 %a, -1
         %abs = select i1 %b, i32 %a, i32 %tmp1neg
         ret i32 %abs
-; CHECK:  movs r0, r0
+; CHECK:  cmp
 ; CHECK:  rsbmi r0, r0, #0
 ; CHECK:  bx lr
 }
+
+; rdar://11633193
+;; 3 instructions will be generated for abs(a-b):
+;;   subs
+;;   rsbmi
+;;   bx
+define i32 @test2(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK: test2
+; CHECK: subs
+; CHECK-NEXT: rsbmi
+; CHECK-NEXT: bx
+  %sub = sub nsw i32 %a, %b
+  %cmp = icmp sgt i32 %sub, -1
+  %sub1 = sub nsw i32 0, %sub
+  %cond = select i1 %cmp, i32 %sub, i32 %sub1
+  ret i32 %cond
+}
diff --git a/test/CodeGen/ARM/ldrd.ll b/test/CodeGen/ARM/ldrd.ll
index 3f8fd75..73b546d 100644
--- a/test/CodeGen/ARM/ldrd.ll
+++ b/test/CodeGen/ARM/ldrd.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=fast | FileCheck %s -check-prefix=A8
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-m3 -regalloc=fast | FileCheck %s -check-prefix=M3
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=fast -optimize-regalloc=0 | FileCheck %s -check-prefix=A8
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-m3 -regalloc=fast -optimize-regalloc=0 | FileCheck %s -check-prefix=M3
 ; rdar://6949835
 ; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=basic | FileCheck %s -check-prefix=BASIC
 ; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=greedy | FileCheck %s -check-prefix=GREEDY
@@ -18,7 +18,6 @@ entry:
 
 ; M3: t:
 ; M3-NOT: ldrd
-; M3: ldm.w r2, {r2, r3}
 
 	%0 = load i64** @b, align 4
 	%1 = load i64* %0, align 4
diff --git a/test/CodeGen/ARM/lsr-scale-addr-mode.ll b/test/CodeGen/ARM/lsr-scale-addr-mode.ll
index 8130019..0c8d387 100644
--- a/test/CodeGen/ARM/lsr-scale-addr-mode.ll
+++ b/test/CodeGen/ARM/lsr-scale-addr-mode.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm | grep lsl | grep -F {lsl #2\]}
+; RUN: llc < %s -march=arm | grep lsl | grep -F "lsl #2]"
 ; Should use scaled addressing mode.
 
 define void @sintzero(i32* %a) nounwind {
diff --git a/test/CodeGen/ARM/movt-movw-global.ll b/test/CodeGen/ARM/movt-movw-global.ll
index 991d728..bbedea1 100644
--- a/test/CodeGen/ARM/movt-movw-global.ll
+++ b/test/CodeGen/ARM/movt-movw-global.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -mtriple=armv7-eabi      | FileCheck %s -check-prefix=EABI
-; RUN: llc < %s -mtriple=armv7-apple-ios -relocation-model=dynamic-no-pic | FileCheck %s -check-prefix=IOS
-; RUN: llc < %s -mtriple=armv7-apple-ios -relocation-model=pic            | FileCheck %s -check-prefix=IOS-PIC
-; RUN: llc < %s -mtriple=armv7-apple-ios -relocation-model=static         | FileCheck %s -check-prefix=IOS-STATIC
+; RUN: llc < %s -verify-machineinstrs -mtriple=armv7-eabi      | FileCheck %s -check-prefix=EABI
+; RUN: llc < %s -verify-machineinstrs -mtriple=armv7-apple-ios -relocation-model=dynamic-no-pic | FileCheck %s -check-prefix=IOS
+; RUN: llc < %s -verify-machineinstrs -mtriple=armv7-apple-ios -relocation-model=pic            | FileCheck %s -check-prefix=IOS-PIC
+; RUN: llc < %s -verify-machineinstrs -mtriple=armv7-apple-ios -relocation-model=static         | FileCheck %s -check-prefix=IOS-STATIC
 
 @foo = common global i32 0
 
diff --git a/test/CodeGen/ARM/neon_div.ll b/test/CodeGen/ARM/neon_div.ll
index de48fee..4a82c36 100644
--- a/test/CodeGen/ARM/neon_div.ll
+++ b/test/CodeGen/ARM/neon_div.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -march=arm -mattr=+neon -pre-RA-sched=source | FileCheck %s
+; RUN: llc < %s -march=arm -mattr=+neon -pre-RA-sched=source -disable-post-ra | FileCheck %s
 
 define <8 x i8> @sdivi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ;CHECK: vrecpe.f32
-;CHECK: vrecpe.f32
 ;CHECK: vmovn.i32
+;CHECK: vrecpe.f32
 ;CHECK: vmovn.i32
 ;CHECK: vmovn.i16
 	%tmp1 = load <8 x i8>* %A
@@ -15,10 +15,10 @@ define <8 x i8> @sdivi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 define <8 x i8> @udivi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ;CHECK: vrecpe.f32
 ;CHECK: vrecps.f32
+;CHECK: vmovn.i32
 ;CHECK: vrecpe.f32
 ;CHECK: vrecps.f32
 ;CHECK: vmovn.i32
-;CHECK: vmovn.i32
 ;CHECK: vqmovun.s16
 	%tmp1 = load <8 x i8>* %A
 	%tmp2 = load <8 x i8>* %B
diff --git a/test/CodeGen/ARM/opt-shuff-tstore.ll b/test/CodeGen/ARM/opt-shuff-tstore.ll
index b4da552..df98e23 100644
--- a/test/CodeGen/ARM/opt-shuff-tstore.ll
+++ b/test/CodeGen/ARM/opt-shuff-tstore.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=cortex-a9 -mtriple=arm-linux-unknown -promote-elements -mattr=+neon < %s | FileCheck %s
+; RUN: llc -mcpu=cortex-a9 -mtriple=arm-linux-unknown -mattr=+neon < %s | FileCheck %s
 
 ; CHECK: func_4_8
 ; CHECK: vst1.32
diff --git a/test/CodeGen/ARM/pr13249.ll b/test/CodeGen/ARM/pr13249.ll
new file mode 100644
index 0000000..4bc8810
--- /dev/null
+++ b/test/CodeGen/ARM/pr13249.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -mtriple armv7--linux-gnueabi
+
+define arm_aapcscc i8* @__strtok_r_1c(i8* %arg, i8 signext %arg1, i8** nocapture %arg2) nounwind {
+bb:
+  br label %bb3
+
+bb3:                                              ; preds = %bb3, %bb
+  %tmp = phi i8* [ %tmp5, %bb3 ], [ %arg, %bb ]
+  %tmp4 = load i8* %tmp, align 1
+  %tmp5 = getelementptr inbounds i8* %tmp, i32 1
+  br i1 undef, label %bb3, label %bb7
+
+bb7:                                              ; preds = %bb13, %bb3
+  %tmp8 = phi i8 [ %tmp14, %bb13 ], [ %tmp4, %bb3 ]
+  %tmp9 = phi i8* [ %tmp12, %bb13 ], [ %tmp, %bb3 ]
+  %tmp10 = icmp ne i8 %tmp8, %arg1
+  %tmp12 = getelementptr inbounds i8* %tmp9, i32 1
+  br i1 %tmp10, label %bb13, label %bb15
+
+bb13:                                             ; preds = %bb7
+  %tmp14 = load i8* %tmp12, align 1
+  br label %bb7
+
+bb15:                                             ; preds = %bb7
+  store i8* %tmp9, i8** %arg2, align 4
+  ret i8* %tmp
+}
diff --git a/test/CodeGen/ARM/select.ll b/test/CodeGen/ARM/select.ll
index 3e07da8..418d4f3 100644
--- a/test/CodeGen/ARM/select.ll
+++ b/test/CodeGen/ARM/select.ll
@@ -113,3 +113,29 @@ entry:
   call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, [2 x i32], i32, float)*)(i8* undef, i8* undef, [2 x i32] %tmp493, i32 0, float 1.000000e+00) optsize
   ret void
 }
+
+; CHECK: f10
+define float @f10(i32 %a, i32 %b) nounwind uwtable readnone ssp {
+; CHECK-NOT: floatsisf
+  %1 = icmp eq i32 %a, %b
+  %2 = zext i1 %1 to i32
+  %3 = sitofp i32 %2 to float
+  ret float %3
+}
+
+; CHECK: f11
+define float @f11(i32 %a, i32 %b) nounwind uwtable readnone ssp {
+; CHECK-NOT: floatsisf
+  %1 = icmp eq i32 %a, %b
+  %2 = sitofp i1 %1 to float
+  ret float %2
+}
+
+; CHECK: f12
+define float @f12(i32 %a, i32 %b) nounwind uwtable readnone ssp {
+; CHECK-NOT: floatunsisf
+  %1 = icmp eq i32 %a, %b
+  %2 = uitofp i1 %1 to float
+  ret float %2
+}
+
diff --git a/test/CodeGen/ARM/smml.ll b/test/CodeGen/ARM/smml.ll
new file mode 100644
index 0000000..99df0d4
--- /dev/null
+++ b/test/CodeGen/ARM/smml.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s
+define i32 @f(i32 %a, i32 %b, i32 %c) nounwind readnone ssp {
+entry:
+; CHECK-NOT: smmls
+  %conv4 = zext i32 %a to i64
+  %conv1 = sext i32 %b to i64
+  %conv2 = sext i32 %c to i64
+  %mul = mul nsw i64 %conv2, %conv1
+  %shr5 = lshr i64 %mul, 32
+  %sub = sub nsw i64 %conv4, %shr5
+  %conv3 = trunc i64 %sub to i32
+  ret i32 %conv3
+}
diff --git a/test/CodeGen/ARM/str_pre-2.ll b/test/CodeGen/ARM/str_pre-2.ll
index 983ba45..5ce2bce 100644
--- a/test/CodeGen/ARM/str_pre-2.ll
+++ b/test/CodeGen/ARM/str_pre-2.ll
@@ -1,13 +1,12 @@
-; RUN: llc < %s -mtriple=armv6-linux-gnu -regalloc=basic | FileCheck %s
-
-; The greedy register allocator uses a single CSR here, invalidating the test.
+; RUN: llc < %s -mtriple=armv6-linux-gnu | FileCheck %s
 
 @b = external global i64*
 
 define i64 @t(i64 %a) nounwind readonly {
 entry:
-; CHECK: push {lr}
-; CHECK: pop {lr}
+; CHECK: push {r4, r5, lr}
+; CHECK: pop {r4, r5, pc}
+        call void asm sideeffect "", "~{r4},~{r5}"() nounwind
 	%0 = load i64** @b, align 4
 	%1 = load i64* %0, align 4
 	%2 = mul i64 %1, %a
diff --git a/test/CodeGen/ARM/str_pre.ll b/test/CodeGen/ARM/str_pre.ll
index e56e3f2..d8b3f0e 100644
--- a/test/CodeGen/ARM/str_pre.ll
+++ b/test/CodeGen/ARM/str_pre.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=arm | \
-; RUN:   grep {str.*\\!} | count 2
+; RUN:   grep "str.*\!" | count 2
 
 define void @test1(i32* %X, i32* %A, i32** %dest) {
         %B = load i32* %A               ; <i32> [#uses=1]
diff --git a/test/CodeGen/ARM/struct_byval.ll b/test/CodeGen/ARM/struct_byval.ll
new file mode 100644
index 0000000..99ba475
--- /dev/null
+++ b/test/CodeGen/ARM/struct_byval.ll
@@ -0,0 +1,46 @@
+; RUN: llc < %s -mtriple=armv7-apple-ios6.0 | FileCheck %s
+
+; rdar://9877866
+%struct.SmallStruct = type { i32, [8 x i32], [37 x i8] }
+%struct.LargeStruct = type { i32, [1001 x i8], [300 x i32] }
+
+define i32 @f() nounwind ssp {
+entry:
+; CHECK: f:
+; CHECK: ldr
+; CHECK: str
+; CHECK-NOT:bne
+  %st = alloca %struct.SmallStruct, align 4
+  %call = call i32 @e1(%struct.SmallStruct* byval %st)
+  ret i32 0
+}
+
+; Generate a loop for large struct byval
+define i32 @g() nounwind ssp {
+entry:
+; CHECK: g:
+; CHECK: ldr
+; CHECK: sub
+; CHECK: str
+; CHECK: bne
+  %st = alloca %struct.LargeStruct, align 4
+  %call = call i32 @e2(%struct.LargeStruct* byval %st)
+  ret i32 0
+}
+
+; Generate a loop using NEON instructions
+define i32 @h() nounwind ssp {
+entry:
+; CHECK: h:
+; CHECK: vld1
+; CHECK: sub
+; CHECK: vst1
+; CHECK: bne
+  %st = alloca %struct.LargeStruct, align 16
+  %call = call i32 @e3(%struct.LargeStruct* byval align 16 %st)
+  ret i32 0
+}
+
+declare i32 @e1(%struct.SmallStruct* nocapture byval %in) nounwind
+declare i32 @e2(%struct.LargeStruct* nocapture byval %in) nounwind
+declare i32 @e3(%struct.LargeStruct* nocapture byval align 16 %in) nounwind
diff --git a/test/CodeGen/ARM/sub-cmp-peephole.ll b/test/CodeGen/ARM/sub-cmp-peephole.ll
new file mode 100644
index 0000000..6fcbdee
--- /dev/null
+++ b/test/CodeGen/ARM/sub-cmp-peephole.ll
@@ -0,0 +1,65 @@
+; RUN: llc < %s -mtriple=arm-apple-darwin | FileCheck %s
+
+define i32 @f(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: f:
+; CHECK: subs
+; CHECK-NOT: cmp
+  %cmp = icmp sgt i32 %a, %b
+  %sub = sub nsw i32 %a, %b
+  %sub. = select i1 %cmp, i32 %sub, i32 0
+  ret i32 %sub.
+}
+
+define i32 @g(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: g:
+; CHECK: subs
+; CHECK-NOT: cmp
+  %cmp = icmp slt i32 %a, %b
+  %sub = sub nsw i32 %b, %a
+  %sub. = select i1 %cmp, i32 %sub, i32 0
+  ret i32 %sub.
+}
+
+define i32 @h(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: h:
+; CHECK: subs
+; CHECK-NOT: cmp
+  %cmp = icmp sgt i32 %a, 3
+  %sub = sub nsw i32 %a, 3
+  %sub. = select i1 %cmp, i32 %sub, i32 %b
+  ret i32 %sub.
+}
+
+; rdar://11725965
+define i32 @i(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK: i:
+; CHECK: subs
+; CHECK-NOT: cmp
+  %cmp = icmp ult i32 %a, %b
+  %sub = sub i32 %b, %a
+  %sub. = select i1 %cmp, i32 %sub, i32 0
+  ret i32 %sub.
+}
+; If CPSR is live-out, we can't remove cmp if there exists
+; a swapped sub.
+define i32 @j(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK: j:
+; CHECK: sub
+; CHECK: cmp
+  %cmp = icmp eq i32 %b, %a
+  %sub = sub nsw i32 %a, %b
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  %cmp2 = icmp sgt i32 %b, %a
+  %sel = select i1 %cmp2, i32 %sub, i32 %a
+  ret i32 %sel
+
+if.else:
+  ret i32 %sub
+}
diff --git a/test/CodeGen/ARM/sub.ll b/test/CodeGen/ARM/sub.ll
index 06ea703..474043a 100644
--- a/test/CodeGen/ARM/sub.ll
+++ b/test/CodeGen/ARM/sub.ll
@@ -36,3 +36,15 @@ entry:
   %sel = select i1 %cmp, i32 1, i32 %sub
   ret i32 %sel
 }
+
+; rdar://11726136
+define i32 @f5(i32 %x) {
+entry:
+; CHECK: f5
+; CHECK: movw r1, #65535
+; CHECK-NOT: movt
+; CHECK-NOT: add
+; CHECK: sub r0, r0, r1
+  %sub = add i32 %x, -65535
+  ret i32 %sub
+}
diff --git a/test/CodeGen/ARM/thread_pointer.ll b/test/CodeGen/ARM/thread_pointer.ll
index 3143387..c403fa5 100644
--- a/test/CodeGen/ARM/thread_pointer.ll
+++ b/test/CodeGen/ARM/thread_pointer.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi | \
-; RUN:     grep {__aeabi_read_tp}
+; RUN:     grep "__aeabi_read_tp"
 
 define i8* @test() {
 entry:
diff --git a/test/CodeGen/ARM/thumb2-it-block.ll b/test/CodeGen/ARM/thumb2-it-block.ll
index 28fd469..a25352c 100644
--- a/test/CodeGen/ARM/thumb2-it-block.ll
+++ b/test/CodeGen/ARM/thumb2-it-block.ll
@@ -3,10 +3,10 @@
 
 define i32 @test(i32 %a, i32 %b) {
 entry:
-; CHECK:        movs.w
+; CHECK:        cmp
 ; CHECK-NEXT:   it    mi
 ; CHECK-NEXT:   rsbmi
-; CHECK-NEXT:   movs.w
+; CHECK-NEXT:   cmp
 ; CHECK-NEXT:   it    mi
 ; CHECK-NEXT:   rsbmi
  %cmp1 = icmp slt i32 %a, 0
diff --git a/test/CodeGen/ARM/tls-models.ll b/test/CodeGen/ARM/tls-models.ll
new file mode 100644
index 0000000..a5f3c90
--- /dev/null
+++ b/test/CodeGen/ARM/tls-models.ll
@@ -0,0 +1,117 @@
+; RUN: llc -march=arm -mtriple=arm-linux-gnueabi < %s | FileCheck -check-prefix=CHECK-NONPIC %s
+; RUN: llc -march=arm -mtriple=arm-linux-gnueabi -relocation-model=pic < %s | FileCheck -check-prefix=CHECK-PIC %s
+
+
+@external_gd = external thread_local global i32
+@internal_gd = internal thread_local global i32 42
+
+@external_ld = external thread_local(localdynamic) global i32
+@internal_ld = internal thread_local(localdynamic) global i32 42
+
+@external_ie = external thread_local(initialexec) global i32
+@internal_ie = internal thread_local(initialexec) global i32 42
+
+@external_le = external thread_local(localexec) global i32
+@internal_le = internal thread_local(localexec) global i32 42
+
+; ----- no model specified -----
+
+define i32* @f1() {
+entry:
+  ret i32* @external_gd
+
+  ; Non-PIC code can use initial-exec, PIC code has to use general dynamic.
+  ; CHECK-NONPIC:   f1:
+  ; CHECK-NONPIC:   external_gd(gottpoff)
+  ; CHECK-PIC:      f1:
+  ; CHECK-PIC:      external_gd(tlsgd)
+}
+
+define i32* @f2() {
+entry:
+  ret i32* @internal_gd
+
+  ; Non-PIC code can use local exec, PIC code can use local dynamic,
+  ; but that is not implemented, so falls back to general dynamic.
+  ; CHECK-NONPIC:   f2:
+  ; CHECK-NONPIC:   internal_gd(tpoff)
+  ; CHECK-PIC:      f2:
+  ; CHECK-PIC:      internal_gd(tlsgd)
+}
+
+
+; ----- localdynamic specified -----
+
+define i32* @f3() {
+entry:
+  ret i32* @external_ld
+
+  ; Non-PIC code can use initial exec, PIC should use local dynamic,
+  ; but that is not implemented, so falls back to general dynamic.
+  ; CHECK-NONPIC:   f3:
+  ; CHECK-NONPIC:   external_ld(gottpoff)
+  ; CHECK-PIC:      f3:
+  ; CHECK-PIC:      external_ld(tlsgd)
+}
+
+define i32* @f4() {
+entry:
+  ret i32* @internal_ld
+
+  ; Non-PIC code can use local exec, PIC code can use local dynamic,
+  ; but that is not implemented, so it falls back to general dynamic.
+  ; CHECK-NONPIC:   f4:
+  ; CHECK-NONPIC:   internal_ld(tpoff)
+  ; CHECK-PIC:      f4:
+  ; CHECK-PIC:      internal_ld(tlsgd)
+}
+
+
+; ----- initialexec specified -----
+
+define i32* @f5() {
+entry:
+  ret i32* @external_ie
+
+  ; Non-PIC and PIC code will use initial exec as specified.
+  ; CHECK-NONPIC:   f5:
+  ; CHECK-NONPIC:   external_ie(gottpoff)
+  ; CHECK-PIC:      f5:
+  ; CHECK-PIC:      external_ie(gottpoff)
+}
+
+define i32* @f6() {
+entry:
+  ret i32* @internal_ie
+
+  ; Non-PIC code can use local exec, PIC code use initial exec as specified.
+  ; CHECK-NONPIC:   f6:
+  ; CHECK-NONPIC:   internal_ie(tpoff)
+  ; CHECK-PIC:      f6:
+  ; CHECK-PIC:      internal_ie(gottpoff)
+}
+
+
+; ----- localexec specified -----
+
+define i32* @f7() {
+entry:
+  ret i32* @external_le
+
+  ; Non-PIC and PIC code will use local exec as specified.
+  ; CHECK-NONPIC:   f7:
+  ; CHECK-NONPIC:   external_le(tpoff)
+  ; CHECK-PIC:      f7:
+  ; CHECK-PIC:      external_le(tpoff)
+}
+
+define i32* @f8() {
+entry:
+  ret i32* @internal_le
+
+  ; Non-PIC and PIC code will use local exec as specified.
+  ; CHECK-NONPIC:   f8:
+  ; CHECK-NONPIC:   internal_le(tpoff)
+  ; CHECK-PIC:      f8:
+  ; CHECK-PIC:      internal_le(tpoff)
+}
diff --git a/test/CodeGen/ARM/tls1.ll b/test/CodeGen/ARM/tls1.ll
index 1087094..ec4278c 100644
--- a/test/CodeGen/ARM/tls1.ll
+++ b/test/CodeGen/ARM/tls1.ll
@@ -1,9 +1,9 @@
 ; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi | \
-; RUN:     grep {i(tpoff)}
+; RUN:     grep "i(tpoff)"
 ; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi | \
-; RUN:     grep {__aeabi_read_tp}
+; RUN:     grep "__aeabi_read_tp"
 ; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi \
-; RUN:     -relocation-model=pic | grep {__tls_get_addr}
+; RUN:     -relocation-model=pic | grep "__tls_get_addr"
 
 
 @i = thread_local global i32 15		; <i32*> [#uses=2]
diff --git a/test/CodeGen/ARM/tls3.ll b/test/CodeGen/ARM/tls3.ll
index df7a4ca..e0e944f 100644
--- a/test/CodeGen/ARM/tls3.ll
+++ b/test/CodeGen/ARM/tls3.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi | \
-; RUN:     grep {tbss}
+; RUN:     grep "tbss"
 
 %struct.anon = type { i32, i32 }
 @teste = internal thread_local global %struct.anon zeroinitializer		; <%struct.anon*> [#uses=1]
diff --git a/test/CodeGen/ARM/twoaddrinstr.ll b/test/CodeGen/ARM/twoaddrinstr.ll
new file mode 100644
index 0000000..4e227dd
--- /dev/null
+++ b/test/CodeGen/ARM/twoaddrinstr.ll
@@ -0,0 +1,21 @@
+; Tests for the two-address instruction pass.
+; RUN: llc -march=arm -mcpu=cortex-a9 < %s | FileCheck %s
+
+define void @PR13378() nounwind {
+; This was orriginally a crasher trying to schedule the instructions.
+; CHECK:      PR13378:
+; CHECK:        vldmia
+; CHECK-NEXT:   vmov.f32
+; CHECK-NEXT:   vstmia
+; CHECK-NEXT:   vstmia
+; CHECK-NEXT:   vmov.f32
+; CHECK-NEXT:   vstmia
+
+entry:
+  %0 = load <4 x float>* undef
+  store <4 x float> zeroinitializer, <4 x float>* undef
+  store <4 x float> %0, <4 x float>* undef
+  %1 = insertelement <4 x float> %0, float 1.000000e+00, i32 3
+  store <4 x float> %1, <4 x float>* undef
+  unreachable
+}
diff --git a/test/CodeGen/ARM/unsafe-fsub.ll b/test/CodeGen/ARM/unsafe-fsub.ll
new file mode 100644
index 0000000..3a4477d
--- /dev/null
+++ b/test/CodeGen/ARM/unsafe-fsub.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=arm -mcpu=cortex-a9 < %s | FileCheck -check-prefix=SAFE %s
+; RUN: llc -march=arm -mcpu=cortex-a9 -enable-unsafe-fp-math < %s | FileCheck -check-prefix=FAST %s
+
+target triple = "armv7-apple-ios"
+
+; SAFE: test
+; FAST: test
+define float @test(float %x, float %y) {
+entry:
+; SAFE: vmul.f32
+; SAFE: vsub.f32
+; FAST: mov r0, #0
+  %0 = fmul float %x, %y
+  %1 = fsub float %0, %0
+  ret float %1
+}
+
+
diff --git a/test/CodeGen/ARM/vcnt.ll b/test/CodeGen/ARM/vcnt.ll
index 450f90d..9f55c24 100644
--- a/test/CodeGen/ARM/vcnt.ll
+++ b/test/CodeGen/ARM/vcnt.ll
@@ -1,79 +1,80 @@
 ; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; NB: this tests vcnt, vclz, and vcls
 
 define <8 x i8> @vcnt8(<8 x i8>* %A) nounwind {
 ;CHECK: vcnt8:
-;CHECK: vcnt.8
+;CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}}
 	%tmp1 = load <8 x i8>* %A
-	%tmp2 = call <8 x i8> @llvm.arm.neon.vcnt.v8i8(<8 x i8> %tmp1)
+	%tmp2 = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %tmp1)
 	ret <8 x i8> %tmp2
 }
 
 define <16 x i8> @vcntQ8(<16 x i8>* %A) nounwind {
 ;CHECK: vcntQ8:
-;CHECK: vcnt.8
+;CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}}
 	%tmp1 = load <16 x i8>* %A
-	%tmp2 = call <16 x i8> @llvm.arm.neon.vcnt.v16i8(<16 x i8> %tmp1)
+	%tmp2 = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %tmp1)
 	ret <16 x i8> %tmp2
 }
 
-declare <8 x i8>  @llvm.arm.neon.vcnt.v8i8(<8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm.neon.vcnt.v16i8(<16 x i8>) nounwind readnone
+declare <8 x i8>  @llvm.ctpop.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) nounwind readnone
 
 define <8 x i8> @vclz8(<8 x i8>* %A) nounwind {
 ;CHECK: vclz8:
-;CHECK: vclz.i8
+;CHECK: vclz.i8 {{d[0-9]+}}, {{d[0-9]+}}
 	%tmp1 = load <8 x i8>* %A
-	%tmp2 = call <8 x i8> @llvm.arm.neon.vclz.v8i8(<8 x i8> %tmp1)
+	%tmp2 = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %tmp1, i1 0)
 	ret <8 x i8> %tmp2
 }
 
 define <4 x i16> @vclz16(<4 x i16>* %A) nounwind {
 ;CHECK: vclz16:
-;CHECK: vclz.i16
+;CHECK: vclz.i16 {{d[0-9]+}}, {{d[0-9]+}}
 	%tmp1 = load <4 x i16>* %A
-	%tmp2 = call <4 x i16> @llvm.arm.neon.vclz.v4i16(<4 x i16> %tmp1)
+	%tmp2 = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %tmp1, i1 0)
 	ret <4 x i16> %tmp2
 }
 
 define <2 x i32> @vclz32(<2 x i32>* %A) nounwind {
 ;CHECK: vclz32:
-;CHECK: vclz.i32
+;CHECK: vclz.i32 {{d[0-9]+}}, {{d[0-9]+}}
 	%tmp1 = load <2 x i32>* %A
-	%tmp2 = call <2 x i32> @llvm.arm.neon.vclz.v2i32(<2 x i32> %tmp1)
+	%tmp2 = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %tmp1, i1 0)
 	ret <2 x i32> %tmp2
 }
 
 define <16 x i8> @vclzQ8(<16 x i8>* %A) nounwind {
 ;CHECK: vclzQ8:
-;CHECK: vclz.i8
+;CHECK: vclz.i8 {{q[0-9]+}}, {{q[0-9]+}}
 	%tmp1 = load <16 x i8>* %A
-	%tmp2 = call <16 x i8> @llvm.arm.neon.vclz.v16i8(<16 x i8> %tmp1)
+	%tmp2 = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %tmp1, i1 0)
 	ret <16 x i8> %tmp2
 }
 
 define <8 x i16> @vclzQ16(<8 x i16>* %A) nounwind {
 ;CHECK: vclzQ16:
-;CHECK: vclz.i16
+;CHECK: vclz.i16 {{q[0-9]+}}, {{q[0-9]+}}
 	%tmp1 = load <8 x i16>* %A
-	%tmp2 = call <8 x i16> @llvm.arm.neon.vclz.v8i16(<8 x i16> %tmp1)
+	%tmp2 = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %tmp1, i1 0)
 	ret <8 x i16> %tmp2
 }
 
 define <4 x i32> @vclzQ32(<4 x i32>* %A) nounwind {
 ;CHECK: vclzQ32:
-;CHECK: vclz.i32
+;CHECK: vclz.i32 {{q[0-9]+}}, {{q[0-9]+}}
 	%tmp1 = load <4 x i32>* %A
-	%tmp2 = call <4 x i32> @llvm.arm.neon.vclz.v4i32(<4 x i32> %tmp1)
+	%tmp2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %tmp1, i1 0)
 	ret <4 x i32> %tmp2
 }
 
-declare <8 x i8>  @llvm.arm.neon.vclz.v8i8(<8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm.neon.vclz.v4i16(<4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm.neon.vclz.v2i32(<2 x i32>) nounwind readnone
+declare <8 x i8>  @llvm.ctlz.v8i8(<8 x i8>, i1) nounwind readnone
+declare <4 x i16> @llvm.ctlz.v4i16(<4 x i16>, i1) nounwind readnone
+declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
 
-declare <16 x i8> @llvm.arm.neon.vclz.v16i8(<16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm.neon.vclz.v8i16(<8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm.neon.vclz.v4i32(<4 x i32>) nounwind readnone
+declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1) nounwind readnone
+declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1) nounwind readnone
+declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
 
 define <8 x i8> @vclss8(<8 x i8>* %A) nounwind {
 ;CHECK: vclss8:
diff --git a/test/CodeGen/ARM/vector-extend-narrow.ll b/test/CodeGen/ARM/vector-extend-narrow.ll
index 1ec36da..8fd3db2 100644
--- a/test/CodeGen/ARM/vector-extend-narrow.ll
+++ b/test/CodeGen/ARM/vector-extend-narrow.ll
@@ -20,7 +20,9 @@ define float @f(<4 x i16>* nocapture %in) {
 
 ; CHECK: g:
 define float @g(<4 x i8>* nocapture %in) {
-  ; CHECK: vldr
+; Note: vld1 here is reasonably important. Mixing VFP and NEON
+; instructions is bad on some cores
+  ; CHECK: vld1
   ; CHECK: vmovl.u8
   ; CHECK: vmovl.u16
   %1 = load <4 x i8>* %in
@@ -47,7 +49,9 @@ define <4 x i8> @h(<4 x float> %v) {
 
 ; CHECK: i:
 define <4 x i8> @i(<4 x i8>* %x) {
-  ; CHECK: vldr
+; Note: vld1 here is reasonably important. Mixing VFP and NEON
+; instructions is bad on some cores
+  ; CHECK: vld1
   ; CHECK: vmovl.s8
   ; CHECK: vmovl.s16
   ; CHECK: vrecpe
diff --git a/test/CodeGen/ARM/vfp.ll b/test/CodeGen/ARM/vfp.ll
index 49a6982..7a4b34f 100644
--- a/test/CodeGen/ARM/vfp.ll
+++ b/test/CodeGen/ARM/vfp.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=arm -mattr=+vfp2 -disable-post-ra | FileCheck %s
-; RUN: llc < %s -march=arm -mattr=+vfp2 -disable-post-ra -regalloc=basic | FileCheck %s
+; RUN: llc < %s -mtriple=arm-apple-ios -mattr=+vfp2 -disable-post-ra | FileCheck %s
+; RUN: llc < %s -mtriple=arm-apple-ios -mattr=+vfp2 -disable-post-ra -regalloc=basic | FileCheck %s
 
 define void @test(float* %P, double* %D) {
 	%A = load float* %P		; <float> [#uses=1]
@@ -17,11 +17,11 @@ define void @test_abs(float* %P, double* %D) {
 ;CHECK: test_abs:
 	%a = load float* %P		; <float> [#uses=1]
 ;CHECK: vabs.f32
-	%b = call float @fabsf( float %a )		; <float> [#uses=1]
+	%b = call float @fabsf( float %a ) readnone	; <float> [#uses=1]
 	store float %b, float* %P
 	%A = load double* %D		; <double> [#uses=1]
 ;CHECK: vabs.f64
-	%B = call double @fabs( double %A )		; <double> [#uses=1]
+	%B = call double @fabs( double %A ) readnone	; <double> [#uses=1]
 	store double %B, double* %D
 	ret void
 }
diff --git a/test/CodeGen/ARM/vlddup.ll b/test/CodeGen/ARM/vlddup.ll
index 61d73c1..c69473f 100644
--- a/test/CodeGen/ARM/vlddup.ll
+++ b/test/CodeGen/ARM/vlddup.ll
@@ -75,12 +75,12 @@ define <8 x i8> @vld2dupi8(i8* %A) nounwind {
         ret <8 x i8> %tmp5
 }
 
-define <4 x i16> @vld2dupi16(i16* %A) nounwind {
+define <4 x i16> @vld2dupi16(i8* %A) nounwind {
 ;CHECK: vld2dupi16:
 ;Check that a power-of-two alignment smaller than the total size of the memory
 ;being loaded is ignored.
 ;CHECK: vld2.16 {d16[], d17[]}, [r0]
-	%tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16(i16* %A, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+	%tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16(i8* %A, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
 	%tmp1 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 0
 	%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
 	%tmp3 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 1
@@ -94,7 +94,8 @@ define <4 x i16> @vld2dupi16_update(i16** %ptr) nounwind {
 ;CHECK: vld2dupi16_update:
 ;CHECK: vld2.16 {d16[], d17[]}, [r1]!
 	%A = load i16** %ptr
-	%tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16(i16* %A, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+        %A2 = bitcast i16* %A to i8*
+	%tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16(i8* %A2, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
 	%tmp1 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 0
 	%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
 	%tmp3 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 1
@@ -105,11 +106,11 @@ define <4 x i16> @vld2dupi16_update(i16** %ptr) nounwind {
 	ret <4 x i16> %tmp5
 }
 
-define <2 x i32> @vld2dupi32(i32* %A) nounwind {
+define <2 x i32> @vld2dupi32(i8* %A) nounwind {
 ;CHECK: vld2dupi32:
 ;Check the alignment value.  Max for this instruction is 64 bits:
 ;CHECK: vld2.32 {d16[], d17[]}, [r0, :64]
-	%tmp0 = tail call %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32(i32* %A, <2 x i32> undef, <2 x i32> undef, i32 0, i32 16)
+	%tmp0 = tail call %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %A, <2 x i32> undef, <2 x i32> undef, i32 0, i32 16)
 	%tmp1 = extractvalue %struct.__neon_int2x32x2_t %tmp0, 0
 	%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
 	%tmp3 = extractvalue %struct.__neon_int2x32x2_t %tmp0, 1
@@ -119,8 +120,8 @@ define <2 x i32> @vld2dupi32(i32* %A) nounwind {
 }
 
 declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
-declare %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16(i16*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
-declare %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32(i32*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
+declare %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
+declare %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
 
 %struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
 %struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
@@ -144,11 +145,11 @@ define <8 x i8> @vld3dupi8_update(i8** %ptr, i32 %inc) nounwind {
 	ret <8 x i8> %tmp8
 }
 
-define <4 x i16> @vld3dupi16(i16* %A) nounwind {
+define <4 x i16> @vld3dupi16(i8* %A) nounwind {
 ;CHECK: vld3dupi16:
 ;Check the (default) alignment value. VLD3 does not support alignment.
 ;CHECK: vld3.16 {d16[], d17[], d18[]}, [r0]
-	%tmp0 = tail call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i16* %A, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 8)
+	%tmp0 = tail call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8* %A, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 8)
 	%tmp1 = extractvalue %struct.__neon_int16x4x3_t %tmp0, 0
 	%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
 	%tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp0, 1
@@ -161,7 +162,7 @@ define <4 x i16> @vld3dupi16(i16* %A) nounwind {
 }
 
 declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
-declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i16*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
+declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
 
 %struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
 %struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
@@ -171,7 +172,8 @@ define <4 x i16> @vld4dupi16_update(i16** %ptr) nounwind {
 ;CHECK: vld4dupi16_update:
 ;CHECK: vld4.16 {d16[], d17[], d18[], d19[]}, [r1]!
 	%A = load i16** %ptr
-	%tmp0 = tail call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i16* %A, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 1)
+        %A2 = bitcast i16* %A to i8*
+	%tmp0 = tail call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8* %A2, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 1)
 	%tmp1 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 0
 	%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
 	%tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 1
@@ -188,12 +190,12 @@ define <4 x i16> @vld4dupi16_update(i16** %ptr) nounwind {
 	ret <4 x i16> %tmp11
 }
 
-define <2 x i32> @vld4dupi32(i32* %A) nounwind {
+define <2 x i32> @vld4dupi32(i8* %A) nounwind {
 ;CHECK: vld4dupi32:
 ;Check the alignment value.  An 8-byte alignment is allowed here even though
 ;it is smaller than the total size of the memory being loaded.
 ;CHECK: vld4.32 {d16[], d17[], d18[], d19[]}, [r0, :64]
-	%tmp0 = tail call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i32* %A, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 8)
+	%tmp0 = tail call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8* %A, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 8)
 	%tmp1 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 0
 	%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
 	%tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 1
@@ -208,5 +210,5 @@ define <2 x i32> @vld4dupi32(i32* %A) nounwind {
         ret <2 x i32> %tmp11
 }
 
-declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i16*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
-declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i32*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
+declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
+declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
diff --git a/test/CodeGen/ARM/vmul.ll b/test/CodeGen/ARM/vmul.ll
index 61d89bb..74628f0 100644
--- a/test/CodeGen/ARM/vmul.ll
+++ b/test/CodeGen/ARM/vmul.ll
@@ -525,3 +525,77 @@ define i16 @vmullWithInconsistentExtensions(<8 x i8> %vec) {
   %3 = extractelement <8 x i16> %2, i32 0
   ret i16 %3
 }
+
+; A constant build_vector created for a vmull with half-width elements must
+; not introduce illegal types. <rdar://problem/11324364>
+define void @vmull_buildvector() nounwind optsize ssp align 2 {
+; CHECK: vmull_buildvector
+entry:
+  br i1 undef, label %for.end179, label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.body
+
+for.cond.loopexit:                                ; preds = %for.body33, %for.body
+  br i1 undef, label %for.end179, label %for.body
+
+for.body:                                         ; preds = %for.cond.loopexit, %for.body.lr.ph
+  br i1 undef, label %for.cond.loopexit, label %for.body33.lr.ph
+
+for.body33.lr.ph:                                 ; preds = %for.body
+  %.sub = select i1 undef, i32 0, i32 undef
+  br label %for.body33
+
+for.body33:                                       ; preds = %for.body33, %for.body33.lr.ph
+  %add45 = add i32 undef, undef
+  %vld155 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* undef, i32 1)
+  %0 = load i32** undef, align 4
+  %shuffle.i250 = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> zeroinitializer
+  %1 = bitcast <1 x i64> %shuffle.i250 to <8 x i8>
+  %vmovl.i249 = zext <8 x i8> %1 to <8 x i16>
+  %shuffle.i246 = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> zeroinitializer
+  %shuffle.i240 = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> <i32 1>
+  %2 = bitcast <1 x i64> %shuffle.i240 to <8 x i8>
+  %3 = bitcast <16 x i8> undef to <2 x i64>
+  %vmovl.i237 = zext <8 x i8> undef to <8 x i16>
+  %shuffle.i234 = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> zeroinitializer
+  %shuffle.i226 = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> zeroinitializer
+  %vmovl.i225 = zext <8 x i8> undef to <8 x i16>
+  %mul.i223 = mul <8 x i16> %vmovl.i249, %vmovl.i249
+  %vshl_n = shl <8 x i16> %mul.i223, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+  %vqsub2.i216 = tail call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256>, <8 x i16> %vshl_n) nounwind
+  %mul.i209 = mul <8 x i16> undef, <i16 80, i16 80, i16 80, i16 80, i16 80, i16 80, i16 80, i16 80>
+  %vshr_n130 = lshr <8 x i16> undef, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %vshr_n134 = lshr <8 x i16> %mul.i209, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %sub.i205 = sub <8 x i16> <i16 80, i16 80, i16 80, i16 80, i16 80, i16 80, i16 80, i16 80>, %vshr_n130
+  %sub.i203 = sub <8 x i16> <i16 80, i16 80, i16 80, i16 80, i16 80, i16 80, i16 80, i16 80>, %vshr_n134
+  %add.i200 = add <8 x i16> %sub.i205, <i16 96, i16 96, i16 96, i16 96, i16 96, i16 96, i16 96, i16 96>
+  %add.i198 = add <8 x i16> %add.i200, %sub.i203
+  %mul.i194 = mul <8 x i16> %add.i198, %vmovl.i237
+  %mul.i191 = mul <8 x i16> %vshr_n130, undef
+  %add.i192 = add <8 x i16> %mul.i191, %mul.i194
+  %mul.i187 = mul <8 x i16> %vshr_n134, undef
+  %add.i188 = add <8 x i16> %mul.i187, %add.i192
+  %mul.i185 = mul <8 x i16> undef, undef
+  %add.i186 = add <8 x i16> %mul.i185, undef
+  %vrshr_n160 = tail call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %add.i188, <8 x i16> <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>)
+  %vrshr_n163 = tail call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %add.i186, <8 x i16> <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>)
+  %mul.i184 = mul <8 x i16> undef, %vrshr_n160
+  %mul.i181 = mul <8 x i16> undef, %vmovl.i225
+  %add.i182 = add <8 x i16> %mul.i181, %mul.i184
+  %vrshr_n170 = tail call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %add.i182, <8 x i16> <i16 -7, i16 -7, i16 -7, i16 -7, i16 -7, i16 -7, i16 -7, i16 -7>)
+  %vqmovn1.i180 = tail call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> %vrshr_n170) nounwind
+  %4 = bitcast <8 x i8> %vqmovn1.i180 to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %4, <1 x i64> undef, <2 x i32> <i32 0, i32 1>
+  %5 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+  store <16 x i8> %5, <16 x i8>* undef, align 16
+  %add177 = add nsw i32 undef, 16
+  br i1 undef, label %for.body33, label %for.cond.loopexit
+
+for.end179:                                       ; preds = %for.cond.loopexit, %entry
+  ret void
+}
+
+declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16>) nounwind readnone
diff --git a/test/CodeGen/ARM/vst3.ll b/test/CodeGen/ARM/vst3.ll
index e3372a0..f117ab2 100644
--- a/test/CodeGen/ARM/vst3.ll
+++ b/test/CodeGen/ARM/vst3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon -disable-arm-fast-isel -O0 | FileCheck %s
+; RUN: llc < %s -march=arm -mattr=+neon -fast-isel=0 -O0 | FileCheck %s
 
 define void @vst3i8(i8* %A, <8 x i8>* %B) nounwind {
 ;CHECK: vst3i8: