diff options
Diffstat (limited to 'test')
750 files changed, 37921 insertions, 1953 deletions
diff --git a/test/Analysis/BasicAA/noalias-geps.ll b/test/Analysis/BasicAA/noalias-geps.ll new file mode 100644 index 0000000..a93d778 --- /dev/null +++ b/test/Analysis/BasicAA/noalias-geps.ll @@ -0,0 +1,54 @@ +; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +; Check that geps with equal base offsets of noalias base pointers stay noalias. +define i32 @test(i32* %p, i16 %i) { + %pi = getelementptr i32* %p, i32 0 + %pi.next = getelementptr i32* %p, i32 1 + %b = icmp eq i16 %i, 0 + br i1 %b, label %bb1, label %bb2 + +bb1: + %f = getelementptr i32* %pi, i32 1 + %g = getelementptr i32* %pi.next, i32 1 + br label %bb3 +bb2: + %f2 = getelementptr i32* %pi, i32 1 + %g2 = getelementptr i32* %pi.next, i32 1 + br label %bb3 + +bb3: + %ptr_phi = phi i32* [ %f, %bb1 ], [ %f2, %bb2 ] + %ptr_phi2 = phi i32* [ %g, %bb1 ], [ %g2, %bb2 ] +; CHECK: NoAlias: i32* %f1, i32* %g1 + %f1 = getelementptr i32* %ptr_phi , i32 1 + %g1 = getelementptr i32* %ptr_phi2 , i32 1 + +ret i32 0 +} + +; Check that geps with equal indices of noalias base pointers stay noalias. 
+define i32 @test2([2 x i32]* %p, i32 %i) { + %pi = getelementptr [2 x i32]* %p, i32 0 + %pi.next = getelementptr [2 x i32]* %p, i32 1 + %b = icmp eq i32 %i, 0 + br i1 %b, label %bb1, label %bb2 + +bb1: + %f = getelementptr [2 x i32]* %pi, i32 1 + %g = getelementptr [2 x i32]* %pi.next, i32 1 + br label %bb3 +bb2: + %f2 = getelementptr [2 x i32]* %pi, i32 1 + %g2 = getelementptr [2 x i32]* %pi.next, i32 1 + br label %bb3 +bb3: + %ptr_phi = phi [2 x i32]* [ %f, %bb1 ], [ %f2, %bb2 ] + %ptr_phi2 = phi [2 x i32]* [ %g, %bb1 ], [ %g2, %bb2 ] +; CHECK: NoAlias: i32* %f1, i32* %g1 + %f1 = getelementptr [2 x i32]* %ptr_phi , i32 1, i32 %i + %g1 = getelementptr [2 x i32]* %ptr_phi2 , i32 1, i32 %i + +ret i32 0 +} diff --git a/test/Analysis/BasicAA/nocapture.ll b/test/Analysis/BasicAA/nocapture.ll index a8658ec..ffc0a09a 100644 --- a/test/Analysis/BasicAA/nocapture.ll +++ b/test/Analysis/BasicAA/nocapture.ll @@ -13,3 +13,24 @@ define i32 @test2() { ret i32 %c } +declare void @test3(i32** %p, i32* %q) nounwind + +define i32 @test4(i32* noalias nocapture %p) nounwind { +; CHECK: call void @test3 +; CHECK: store i32 0, i32* %p +; CHECK: store i32 1, i32* %x +; CHECK: %y = load i32* %p +; CHECK: ret i32 %y +entry: + %q = alloca i32* + ; Here test3 might store %p to %q. This doesn't violate %p's nocapture + ; attribute since the copy doesn't outlive the function. 
+ call void @test3(i32** %q, i32* %p) nounwind + store i32 0, i32* %p + %x = load i32** %q + ; This store might write to %p and so we can't eliminate the subsequent + ; load + store i32 1, i32* %x + %y = load i32* %p + ret i32 %y +} diff --git a/test/Analysis/BasicAA/phi-speculation.ll b/test/Analysis/BasicAA/phi-speculation.ll new file mode 100644 index 0000000..21c6592 --- /dev/null +++ b/test/Analysis/BasicAA/phi-speculation.ll @@ -0,0 +1,33 @@ +target datalayout = +"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s + +; ptr_phi and ptr2_phi do not alias. +; CHECK: NoAlias: i32* %ptr2_phi, i32* %ptr_phi + +define i32 @test_noalias(i32* %ptr2, i32 %count, i32* %coeff) { +entry: + %ptr = getelementptr inbounds i32* %ptr2, i64 1 + br label %while.body + +while.body: + %num = phi i32 [ %count, %entry ], [ %dec, %while.body ] + %ptr_phi = phi i32* [ %ptr, %entry ], [ %ptr_inc, %while.body ] + %ptr2_phi = phi i32* [ %ptr2, %entry ], [ %ptr2_inc, %while.body ] + %result.09 = phi i32 [ 0 , %entry ], [ %add, %while.body ] + %dec = add nsw i32 %num, -1 + %0 = load i32* %ptr_phi, align 4 + store i32 %0, i32* %ptr2_phi, align 4 + %1 = load i32* %coeff, align 4 + %2 = load i32* %ptr_phi, align 4 + %mul = mul nsw i32 %1, %2 + %add = add nsw i32 %mul, %result.09 + %tobool = icmp eq i32 %dec, 0 + %ptr_inc = getelementptr inbounds i32* %ptr_phi, i64 1 + %ptr2_inc = getelementptr inbounds i32* %ptr2_phi, i64 1 + br i1 %tobool, label %the_exit, label %while.body + +the_exit: + ret i32 %add +} diff --git a/test/Analysis/BranchProbabilityInfo/basic.ll b/test/Analysis/BranchProbabilityInfo/basic.ll index 74d06a1..08adfa8 100644 --- a/test/Analysis/BranchProbabilityInfo/basic.ll +++ b/test/Analysis/BranchProbabilityInfo/basic.ll @@ -88,3 +88,30 @@ exit: } !1 = metadata !{metadata 
!"branch_weights", i32 4, i32 4, i32 64, i32 4, i32 4} + +define i32 @test4(i32 %x) nounwind uwtable readnone ssp { +; CHECK: Printing analysis {{.*}} for function 'test4' +entry: + %conv = sext i32 %x to i64 + switch i64 %conv, label %return [ + i64 0, label %sw.bb + i64 1, label %sw.bb + i64 2, label %sw.bb + i64 5, label %sw.bb1 + ], !prof !2 +; CHECK: edge entry -> return probability is 7 / 85 +; CHECK: edge entry -> sw.bb probability is 14 / 85 +; CHECK: edge entry -> sw.bb1 probability is 64 / 85 + +sw.bb: + br label %return + +sw.bb1: + br label %return + +return: + %retval.0 = phi i32 [ 5, %sw.bb1 ], [ 1, %sw.bb ], [ 0, %entry ] + ret i32 %retval.0 +} + +!2 = metadata !{metadata !"branch_weights", i32 7, i32 6, i32 4, i32 4, i32 64} diff --git a/test/Analysis/CallGraph/do-nothing-intrinsic.ll b/test/Analysis/CallGraph/do-nothing-intrinsic.ll new file mode 100644 index 0000000..f28ad10 --- /dev/null +++ b/test/Analysis/CallGraph/do-nothing-intrinsic.ll @@ -0,0 +1,13 @@ +; RUN: opt < %s -basiccg +; PR13903 + +define void @main() { + invoke void @llvm.donothing() + to label %ret unwind label %unw +unw: + %tmp = landingpad i8 personality i8 0 cleanup + br label %ret +ret: + ret void +} +declare void @llvm.donothing() nounwind readnone diff --git a/test/Analysis/CostModel/X86/arith.ll b/test/Analysis/CostModel/X86/arith.ll new file mode 100644 index 0000000..37cca8d --- /dev/null +++ b/test/Analysis/CostModel/X86/arith.ll @@ -0,0 +1,42 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +define i32 @add(i32 %arg) { + ;CHECK: cost of 1 {{.*}} add + %A = add <4 x i32> undef, undef + ;CHECK: cost of 4 {{.*}} add + %B = add <8 x i32> undef, undef + ;CHECK: cost of 1 {{.*}} add + %C = add <2 x i64> 
undef, undef + ;CHECK: cost of 4 {{.*}} add + %D = add <4 x i64> undef, undef + ;CHECK: cost of 8 {{.*}} add + %E = add <8 x i64> undef, undef + ;CHECK: cost of 1 {{.*}} ret + ret i32 undef +} + + +define i32 @xor(i32 %arg) { + ;CHECK: cost of 1 {{.*}} xor + %A = xor <4 x i32> undef, undef + ;CHECK: cost of 1 {{.*}} xor + %B = xor <8 x i32> undef, undef + ;CHECK: cost of 1 {{.*}} xor + %C = xor <2 x i64> undef, undef + ;CHECK: cost of 1 {{.*}} xor + %D = xor <4 x i64> undef, undef + ;CHECK: cost of 1 {{.*}} ret + ret i32 undef +} + + +define i32 @fmul(i32 %arg) { + ;CHECK: cost of 1 {{.*}} fmul + %A = fmul <4 x float> undef, undef + ;CHECK: cost of 1 {{.*}} fmul + %B = fmul <8 x float> undef, undef + ret i32 undef +} diff --git a/test/Analysis/CostModel/X86/cast.ll b/test/Analysis/CostModel/X86/cast.ll new file mode 100644 index 0000000..75c97a7 --- /dev/null +++ b/test/Analysis/CostModel/X86/cast.ll @@ -0,0 +1,69 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +define i32 @add(i32 %arg) { + + ; -- Same size registeres -- + ;CHECK: cost of 1 {{.*}} zext + %A = zext <4 x i1> undef to <4 x i32> + ;CHECK: cost of 2 {{.*}} sext + %B = sext <4 x i1> undef to <4 x i32> + ;CHECK: cost of 0 {{.*}} trunc + %C = trunc <4 x i32> undef to <4 x i1> + + ; -- Different size registers -- + ;CHECK-NOT: cost of 1 {{.*}} zext + %D = zext <8 x i1> undef to <8 x i32> + ;CHECK-NOT: cost of 2 {{.*}} sext + %E = sext <8 x i1> undef to <8 x i32> + ;CHECK-NOT: cost of 2 {{.*}} trunc + %F = trunc <8 x i32> undef to <8 x i1> + + ; -- scalars -- + + ;CHECK: cost of 1 {{.*}} zext + %G = zext i1 undef to i32 + ;CHECK: cost of 0 {{.*}} trunc + %H = trunc i32 undef to i1 + + ;CHECK: cost of 1 {{.*}} ret + ret i32 undef +} + 
+define i32 @zext_sext(<8 x i1> %in) { + ;CHECK: cost of 6 {{.*}} zext + %Z = zext <8 x i1> %in to <8 x i32> + ;CHECK: cost of 9 {{.*}} sext + %S = sext <8 x i1> %in to <8 x i32> + + ;CHECK: cost of 1 {{.*}} sext + %A = sext <8 x i16> undef to <8 x i32> + ;CHECK: cost of 1 {{.*}} zext + %B = zext <8 x i16> undef to <8 x i32> + ;CHECK: cost of 1 {{.*}} sext + %C = sext <4 x i32> undef to <4 x i64> + + ;CHECK: cost of 1 {{.*}} zext + %D = zext <4 x i32> undef to <4 x i64> + ;CHECK: cost of 1 {{.*}} trunc + + %E = trunc <4 x i64> undef to <4 x i32> + ;CHECK: cost of 1 {{.*}} trunc + %F = trunc <8 x i32> undef to <8 x i16> + + ;CHECK: cost of 3 {{.*}} trunc + %G = trunc <8 x i64> undef to <8 x i32> + + ret i32 undef +} + +define i32 @masks(<8 x i1> %in) { + ;CHECK: cost of 6 {{.*}} zext + %Z = zext <8 x i1> %in to <8 x i32> + ;CHECK: cost of 9 {{.*}} sext + %S = sext <8 x i1> %in to <8 x i32> + ret i32 undef +} + diff --git a/test/Analysis/CostModel/X86/cmp.ll b/test/Analysis/CostModel/X86/cmp.ll new file mode 100644 index 0000000..f868bd1 --- /dev/null +++ b/test/Analysis/CostModel/X86/cmp.ll @@ -0,0 +1,42 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +define i32 @cmp(i32 %arg) { + ; -- floats -- + ;CHECK: cost of 1 {{.*}} fcmp + %A = fcmp olt <2 x float> undef, undef + ;CHECK: cost of 1 {{.*}} fcmp + %B = fcmp olt <4 x float> undef, undef + ;CHECK: cost of 1 {{.*}} fcmp + %C = fcmp olt <8 x float> undef, undef + ;CHECK: cost of 1 {{.*}} fcmp + %D = fcmp olt <2 x double> undef, undef + ;CHECK: cost of 1 {{.*}} fcmp + %E = fcmp olt <4 x double> undef, undef + + ; -- integers -- + + ;CHECK: cost of 1 {{.*}} icmp + %F = icmp eq <16 x i8> undef, undef + ;CHECK: cost of 1 {{.*}} icmp + %G = icmp eq 
<8 x i16> undef, undef + ;CHECK: cost of 1 {{.*}} icmp + %H = icmp eq <4 x i32> undef, undef + ;CHECK: cost of 1 {{.*}} icmp + %I = icmp eq <2 x i64> undef, undef + ;CHECK: cost of 4 {{.*}} icmp + %J = icmp eq <4 x i64> undef, undef + ;CHECK: cost of 4 {{.*}} icmp + %K = icmp eq <8 x i32> undef, undef + ;CHECK: cost of 4 {{.*}} icmp + %L = icmp eq <16 x i16> undef, undef + ;CHECK: cost of 4 {{.*}} icmp + %M = icmp eq <32 x i8> undef, undef + + ;CHECK: cost of 1 {{.*}} ret + ret i32 undef +} + + diff --git a/test/Analysis/CostModel/X86/i32.ll b/test/Analysis/CostModel/X86/i32.ll new file mode 100644 index 0000000..4015e0b --- /dev/null +++ b/test/Analysis/CostModel/X86/i32.ll @@ -0,0 +1,9 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=i386 -mcpu=corei7-avx | FileCheck %s + + +;CHECK: cost of 2 {{.*}} add +;CHECK: cost of 1 {{.*}} ret +define i32 @no_info(i32 %arg) { + %e = add i64 undef, undef + ret i32 undef +} diff --git a/test/Analysis/CostModel/X86/insert-extract-at-zero.ll b/test/Analysis/CostModel/X86/insert-extract-at-zero.ll new file mode 100644 index 0000000..87bf7c4 --- /dev/null +++ b/test/Analysis/CostModel/X86/insert-extract-at-zero.ll @@ -0,0 +1,40 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +define i32 @insert-extract-at-zero-idx(i32 %arg, float %fl) { + ;CHECK: cost of 0 {{.*}} extract + %A = extractelement <4 x float> undef, i32 0 + ;CHECK: cost of 1 {{.*}} extract + %B = extractelement <4 x i32> undef, i32 0 + ;CHECK: cost of 1 {{.*}} extract + %C = extractelement <4 x float> undef, i32 1 + + ;CHECK: cost of 0 {{.*}} extract + %D = extractelement <8 x float> undef, i32 0 + ;CHECK: cost of 1 {{.*}} extract + %E = extractelement <8 x float> undef, i32 1 + + ;CHECK: cost 
of 1 {{.*}} extract + %F = extractelement <8 x float> undef, i32 %arg + + ;CHECK: cost of 0 {{.*}} insert + %G = insertelement <4 x float> undef, float %fl, i32 0 + ;CHECK: cost of 1 {{.*}} insert + %H = insertelement <4 x float> undef, float %fl, i32 1 + ;CHECK: cost of 1 {{.*}} insert + %I = insertelement <4 x i32> undef, i32 %arg, i32 0 + + ;CHECK: cost of 0 {{.*}} insert + %J = insertelement <4 x double> undef, double undef, i32 0 + + ;CHECK: cost of 0 {{.*}} insert + %K = insertelement <8 x double> undef, double undef, i32 4 + ;CHECK: cost of 0 {{.*}} insert + %L = insertelement <16 x double> undef, double undef, i32 8 + ;CHECK: cost of 1 {{.*}} insert + %M = insertelement <16 x double> undef, double undef, i32 9 + ret i32 0 +} + diff --git a/test/Analysis/CostModel/X86/lit.local.cfg b/test/Analysis/CostModel/X86/lit.local.cfg new file mode 100644 index 0000000..a8ad0f1 --- /dev/null +++ b/test/Analysis/CostModel/X86/lit.local.cfg @@ -0,0 +1,6 @@ +config.suffixes = ['.ll', '.c', '.cpp'] + +targets = set(config.root.targets_to_build.split()) +if not 'X86' in targets: + config.unsupported = True + diff --git a/test/Analysis/CostModel/X86/loop_v2.ll b/test/Analysis/CostModel/X86/loop_v2.ll new file mode 100644 index 0000000..260a606 --- /dev/null +++ b/test/Analysis/CostModel/X86/loop_v2.ll @@ -0,0 +1,43 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.9.0" + +define i32 @foo(i32* nocapture %A) nounwind uwtable readonly ssp { +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi <2 x i32> [ zeroinitializer, %vector.ph ], [ %12, %vector.body ] + %0 = getelementptr inbounds i32* %A, 
i64 %index + %1 = bitcast i32* %0 to <2 x i32>* + %2 = load <2 x i32>* %1, align 4 + %3 = sext <2 x i32> %2 to <2 x i64> + ;CHECK: cost of 1 {{.*}} extract + %4 = extractelement <2 x i64> %3, i32 0 + %5 = getelementptr inbounds i32* %A, i64 %4 + ;CHECK: cost of 1 {{.*}} extract + %6 = extractelement <2 x i64> %3, i32 1 + %7 = getelementptr inbounds i32* %A, i64 %6 + %8 = load i32* %5, align 4, !tbaa !0 + ;CHECK: cost of 1 {{.*}} insert + %9 = insertelement <2 x i32> undef, i32 %8, i32 0 + %10 = load i32* %7, align 4, !tbaa !0 + ;CHECK: cost of 1 {{.*}} insert + %11 = insertelement <2 x i32> %9, i32 %10, i32 1 + %12 = add nsw <2 x i32> %11, %vec.phi + %index.next = add i64 %index, 2 + %13 = icmp eq i64 %index.next, 192 + br i1 %13, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + %14 = extractelement <2 x i32> %12, i32 0 + %15 = extractelement <2 x i32> %12, i32 1 + %16 = add i32 %14, %15 + ret i32 %16 +} + +!0 = metadata !{metadata !"int", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} diff --git a/test/Analysis/CostModel/X86/tiny.ll b/test/Analysis/CostModel/X86/tiny.ll new file mode 100644 index 0000000..cc7b443 --- /dev/null +++ b/test/Analysis/CostModel/X86/tiny.ll @@ -0,0 +1,11 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +;CHECK: cost of 1 {{.*}} add +;CHECK: cost of 1 {{.*}} ret +define i32 @no_info(i32 %arg) { + %e = add i32 %arg, %arg + ret i32 %e +} diff --git a/test/Analysis/CostModel/X86/vectorized-loop.ll b/test/Analysis/CostModel/X86/vectorized-loop.ll new file mode 100644 index 0000000..7919a9c --- /dev/null +++ b/test/Analysis/CostModel/X86/vectorized-loop.ll @@ -0,0 +1,78 @@ +; 
RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +define i32 @foo(i32* noalias nocapture %A, i32* noalias nocapture %B, i32 %start, i32 %end) nounwind uwtable ssp { +entry: + ;CHECK: cost of 1 {{.*}} icmp + %cmp7 = icmp slt i32 %start, %end + br i1 %cmp7, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + ;CHECK: cost of 1 {{.*}} sext + %0 = sext i32 %start to i64 + %1 = sub i32 %end, %start + %2 = zext i32 %1 to i64 + %end.idx = add i64 %2, %0 + ;CHECK: cost of 1 {{.*}} add + %n.vec = and i64 %2, 4294967288 + %end.idx.rnd.down = add i64 %n.vec, %0 + ;CHECK: cost of 1 {{.*}} icmp + %cmp.zero = icmp eq i64 %n.vec, 0 + br i1 %cmp.zero, label %middle.block, label %vector.body + +vector.body: ; preds = %for.body.lr.ph, %vector.body + %index = phi i64 [ %index.next, %vector.body ], [ %0, %for.body.lr.ph ] + %3 = add i64 %index, 2 + %4 = getelementptr inbounds i32* %B, i64 %3 + ;CHECK: cost of 0 {{.*}} bitcast + %5 = bitcast i32* %4 to <8 x i32>* + ;CHECK: cost of 1 {{.*}} load + %6 = load <8 x i32>* %5, align 4 + ;CHECK: cost of 4 {{.*}} mul + %7 = mul nsw <8 x i32> %6, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> + %8 = getelementptr inbounds i32* %A, i64 %index + %9 = bitcast i32* %8 to <8 x i32>* + %10 = load <8 x i32>* %9, align 4 + ;CHECK: cost of 4 {{.*}} add + %11 = add nsw <8 x i32> %10, %7 + ;CHECK: cost of 1 {{.*}} store + store <8 x i32> %11, <8 x i32>* %9, align 4 + %index.next = add i64 %index, 8 + %12 = icmp eq i64 %index.next, %end.idx.rnd.down + ;CHECK: cost of 1 {{.*}} br + br i1 %12, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body, %for.body.lr.ph + %cmp.n = icmp eq i64 %end.idx, %end.idx.rnd.down + br i1 %cmp.n, label 
%for.end, label %for.body + +for.body: ; preds = %middle.block, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %end.idx.rnd.down, %middle.block ] + %13 = add nsw i64 %indvars.iv, 2 + %arrayidx = getelementptr inbounds i32* %B, i64 %13 + ;CHECK: cost of 1 {{.*}} load + %14 = load i32* %arrayidx, align 4, !tbaa !0 + ;CHECK: cost of 1 {{.*}} mul + %mul = mul nsw i32 %14, 5 + %arrayidx2 = getelementptr inbounds i32* %A, i64 %indvars.iv + ;CHECK: cost of 1 {{.*}} load + %15 = load i32* %arrayidx2, align 4, !tbaa !0 + %add3 = add nsw i32 %15, %mul + store i32 %add3, i32* %arrayidx2, align 4, !tbaa !0 + %indvars.iv.next = add i64 %indvars.iv, 1 + ;CHECK: cost of 0 {{.*}} trunc + %16 = trunc i64 %indvars.iv.next to i32 + %cmp = icmp slt i32 %16, %end + ;CHECK: cost of 1 {{.*}} br + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %middle.block, %for.body, %entry + ;CHECK: cost of 1 {{.*}} ret + ret i32 undef +} + +!0 = metadata !{metadata !"int", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} diff --git a/test/Analysis/LoopDependenceAnalysis/lit.local.cfg b/test/Analysis/CostModel/lit.local.cfg index 19eebc0..19eebc0 100644 --- a/test/Analysis/LoopDependenceAnalysis/lit.local.cfg +++ b/test/Analysis/CostModel/lit.local.cfg diff --git a/test/Analysis/CostModel/no_info.ll b/test/Analysis/CostModel/no_info.ll new file mode 100644 index 0000000..d20d56b --- /dev/null +++ b/test/Analysis/CostModel/no_info.ll @@ -0,0 +1,15 @@ +; RUN: opt < %s -cost-model -analyze | FileCheck %s + +; The cost model does not have any target information so it can't make a decision. +; Notice that OPT does not read the triple information from the module itself, only through the command line. 
+ +; This info ignored: +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +;CHECK: Unknown cost {{.*}} add +;CHECK: Unknown cost {{.*}} ret +define i32 @no_info(i32 %arg) { + %e = add i32 %arg, %arg + ret i32 %e +} diff --git a/test/Analysis/DependenceAnalysis/Banerjee.ll b/test/Analysis/DependenceAnalysis/Banerjee.ll new file mode 100644 index 0000000..8865ee9 --- /dev/null +++ b/test/Analysis/DependenceAnalysis/Banerjee.ll @@ -0,0 +1,595 @@ +; RUN: opt < %s -analyze -basicaa -da | FileCheck %s + +; ModuleID = 'Banerjee.bc' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.6.0" + + +;; for (long int i = 1; i <= 10; i++) +;; for (long int j = 1; j <= 10; j++) { +;; A[10*i + j] = ... +;; ... = A[10*i + j - 1]; + +define void @banerjee0(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %entry, %for.inc7 + %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc7 ] + %i.03 = phi i64 [ 1, %entry ], [ %inc8, %for.inc7 ] + br label %for.body3 + +for.body3: ; preds = %for.cond1.preheader, %for.body3 + %j.02 = phi i64 [ 1, %for.cond1.preheader ], [ %inc, %for.body3 ] + %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ] + %mul = mul nsw i64 %i.03, 10 + %add = add nsw i64 %mul, %j.02 + %arrayidx = getelementptr inbounds i64* %A, i64 %add + store i64 0, i64* %arrayidx, align 8 + %mul4 = mul nsw i64 %i.03, 10 + %add5 = add nsw i64 %mul4, %j.02 + %sub = add nsw i64 %add5, -1 + %arrayidx6 = getelementptr inbounds i64* %A, i64 %sub + %0 = load i64* %arrayidx6, align 8 +; CHECK: da analyze - flow [<= <>]! 
+ %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1 + store i64 %0, i64* %B.addr.11, align 8 + %inc = add nsw i64 %j.02, 1 + %exitcond = icmp ne i64 %inc, 11 + br i1 %exitcond, label %for.body3, label %for.inc7 + +for.inc7: ; preds = %for.body3 + %scevgep = getelementptr i64* %B.addr.04, i64 10 + %inc8 = add nsw i64 %i.03, 1 + %exitcond5 = icmp ne i64 %inc8, 11 + br i1 %exitcond5, label %for.cond1.preheader, label %for.end9 + +for.end9: ; preds = %for.inc7 + ret void +} + + +;; for (long int i = 1; i <= n; i++) +;; for (long int j = 1; j <= m; j++) { +;; A[10*i + j] = ... +;; ... = A[10*i + j - 1]; + +define void @banerjee1(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp { +entry: + %cmp4 = icmp sgt i64 %n, 0 + br i1 %cmp4, label %for.cond1.preheader.preheader, label %for.end9 + +for.cond1.preheader.preheader: ; preds = %entry + %0 = add i64 %n, 1 + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.inc7 + %B.addr.06 = phi i64* [ %B.addr.1.lcssa, %for.inc7 ], [ %B, %for.cond1.preheader.preheader ] + %i.05 = phi i64 [ %inc8, %for.inc7 ], [ 1, %for.cond1.preheader.preheader ] + %1 = add i64 %m, 1 + %cmp21 = icmp sgt i64 %m, 0 + br i1 %cmp21, label %for.body3.preheader, label %for.inc7 + +for.body3.preheader: ; preds = %for.cond1.preheader + br label %for.body3 + +for.body3: ; preds = %for.body3.preheader, %for.body3 + %j.03 = phi i64 [ %inc, %for.body3 ], [ 1, %for.body3.preheader ] + %B.addr.12 = phi i64* [ %incdec.ptr, %for.body3 ], [ %B.addr.06, %for.body3.preheader ] + %mul = mul nsw i64 %i.05, 10 + %add = add nsw i64 %mul, %j.03 + %arrayidx = getelementptr inbounds i64* %A, i64 %add + store i64 0, i64* %arrayidx, align 8 + %mul4 = mul nsw i64 %i.05, 10 + %add5 = add nsw i64 %mul4, %j.03 + %sub = add nsw i64 %add5, -1 + %arrayidx6 = getelementptr inbounds i64* %A, i64 %sub + %2 = load i64* %arrayidx6, align 8 +; CHECK: da analyze - flow [* <>]! 
+ %incdec.ptr = getelementptr inbounds i64* %B.addr.12, i64 1 + store i64 %2, i64* %B.addr.12, align 8 + %inc = add nsw i64 %j.03, 1 + %exitcond = icmp eq i64 %inc, %1 + br i1 %exitcond, label %for.inc7.loopexit, label %for.body3 + +for.inc7.loopexit: ; preds = %for.body3 + %scevgep = getelementptr i64* %B.addr.06, i64 %m + br label %for.inc7 + +for.inc7: ; preds = %for.inc7.loopexit, %for.cond1.preheader + %B.addr.1.lcssa = phi i64* [ %B.addr.06, %for.cond1.preheader ], [ %scevgep, %for.inc7.loopexit ] + %inc8 = add nsw i64 %i.05, 1 + %exitcond7 = icmp eq i64 %inc8, %0 + br i1 %exitcond7, label %for.end9.loopexit, label %for.cond1.preheader + +for.end9.loopexit: ; preds = %for.inc7 + br label %for.end9 + +for.end9: ; preds = %for.end9.loopexit, %entry + ret void +} + + +;; for (long int i = 0; i < 10; i++) +;; for (long int j = 0; j < 10; j++) { +;; A[10*i + j] = 0; +;; *B++ = A[10*i + j + 100]; + +define void @banerjee2(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %entry, %for.inc8 + %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc8 ] + %i.03 = phi i64 [ 0, %entry ], [ %inc9, %for.inc8 ] + br label %for.body3 + +for.body3: ; preds = %for.cond1.preheader, %for.body3 + %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ] + %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ] + %mul = mul nsw i64 %i.03, 10 + %add = add nsw i64 %mul, %j.02 + %arrayidx = getelementptr inbounds i64* %A, i64 %add + store i64 0, i64* %arrayidx, align 8 + %mul4 = mul nsw i64 %i.03, 10 + %add5 = add nsw i64 %mul4, %j.02 + %add6 = add nsw i64 %add5, 100 + %arrayidx7 = getelementptr inbounds i64* %A, i64 %add6 + %0 = load i64* %arrayidx7, align 8 +; CHECK: da analyze - none! 
+ %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1 + store i64 %0, i64* %B.addr.11, align 8 + %inc = add nsw i64 %j.02, 1 + %exitcond = icmp ne i64 %inc, 10 + br i1 %exitcond, label %for.body3, label %for.inc8 + +for.inc8: ; preds = %for.body3 + %scevgep = getelementptr i64* %B.addr.04, i64 10 + %inc9 = add nsw i64 %i.03, 1 + %exitcond5 = icmp ne i64 %inc9, 10 + br i1 %exitcond5, label %for.cond1.preheader, label %for.end10 + +for.end10: ; preds = %for.inc8 + ret void +} + + +;; for (long int i = 0; i < 10; i++) +;; for (long int j = 0; j < 10; j++) { +;; A[10*i + j] = ... +;; ... = A[10*i + j + 99]; + +define void @banerjee3(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %entry, %for.inc8 + %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc8 ] + %i.03 = phi i64 [ 0, %entry ], [ %inc9, %for.inc8 ] + br label %for.body3 + +for.body3: ; preds = %for.cond1.preheader, %for.body3 + %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ] + %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ] + %mul = mul nsw i64 %i.03, 10 + %add = add nsw i64 %mul, %j.02 + %arrayidx = getelementptr inbounds i64* %A, i64 %add + store i64 0, i64* %arrayidx, align 8 + %mul4 = mul nsw i64 %i.03, 10 + %add5 = add nsw i64 %mul4, %j.02 + %add6 = add nsw i64 %add5, 99 + %arrayidx7 = getelementptr inbounds i64* %A, i64 %add6 + %0 = load i64* %arrayidx7, align 8 +; CHECK: da analyze - flow [> >]! 
+ %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1 + store i64 %0, i64* %B.addr.11, align 8 + %inc = add nsw i64 %j.02, 1 + %exitcond = icmp ne i64 %inc, 10 + br i1 %exitcond, label %for.body3, label %for.inc8 + +for.inc8: ; preds = %for.body3 + %scevgep = getelementptr i64* %B.addr.04, i64 10 + %inc9 = add nsw i64 %i.03, 1 + %exitcond5 = icmp ne i64 %inc9, 10 + br i1 %exitcond5, label %for.cond1.preheader, label %for.end10 + +for.end10: ; preds = %for.inc8 + ret void +} + + +;; for (long int i = 0; i < 10; i++) +;; for (long int j = 0; j < 10; j++) { +;; A[10*i + j] = ... +;; ... = A[10*i + j - 100]; + +define void @banerjee4(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %entry, %for.inc7 + %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc7 ] + %i.03 = phi i64 [ 0, %entry ], [ %inc8, %for.inc7 ] + br label %for.body3 + +for.body3: ; preds = %for.cond1.preheader, %for.body3 + %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ] + %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ] + %mul = mul nsw i64 %i.03, 10 + %add = add nsw i64 %mul, %j.02 + %arrayidx = getelementptr inbounds i64* %A, i64 %add + store i64 0, i64* %arrayidx, align 8 + %mul4 = mul nsw i64 %i.03, 10 + %add5 = add nsw i64 %mul4, %j.02 + %sub = add nsw i64 %add5, -100 + %arrayidx6 = getelementptr inbounds i64* %A, i64 %sub + %0 = load i64* %arrayidx6, align 8 +; CHECK: da analyze - none! 
+ %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1 + store i64 %0, i64* %B.addr.11, align 8 + %inc = add nsw i64 %j.02, 1 + %exitcond = icmp ne i64 %inc, 10 + br i1 %exitcond, label %for.body3, label %for.inc7 + +for.inc7: ; preds = %for.body3 + %scevgep = getelementptr i64* %B.addr.04, i64 10 + %inc8 = add nsw i64 %i.03, 1 + %exitcond5 = icmp ne i64 %inc8, 10 + br i1 %exitcond5, label %for.cond1.preheader, label %for.end9 + +for.end9: ; preds = %for.inc7 + ret void +} + + +;; for (long int i = 0; i < 10; i++) +;; for (long int j = 0; j < 10; j++) { +;; A[10*i + j] = ... +;; ... = A[10*i + j - 99]; + +define void @banerjee5(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %entry, %for.inc7 + %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc7 ] + %i.03 = phi i64 [ 0, %entry ], [ %inc8, %for.inc7 ] + br label %for.body3 + +for.body3: ; preds = %for.cond1.preheader, %for.body3 + %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ] + %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ] + %mul = mul nsw i64 %i.03, 10 + %add = add nsw i64 %mul, %j.02 + %arrayidx = getelementptr inbounds i64* %A, i64 %add + store i64 0, i64* %arrayidx, align 8 + %mul4 = mul nsw i64 %i.03, 10 + %add5 = add nsw i64 %mul4, %j.02 + %sub = add nsw i64 %add5, -99 + %arrayidx6 = getelementptr inbounds i64* %A, i64 %sub + %0 = load i64* %arrayidx6, align 8 +; CHECK: da analyze - flow [< <]! 
+ %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1 + store i64 %0, i64* %B.addr.11, align 8 + %inc = add nsw i64 %j.02, 1 + %exitcond = icmp ne i64 %inc, 10 + br i1 %exitcond, label %for.body3, label %for.inc7 + +for.inc7: ; preds = %for.body3 + %scevgep = getelementptr i64* %B.addr.04, i64 10 + %inc8 = add nsw i64 %i.03, 1 + %exitcond5 = icmp ne i64 %inc8, 10 + br i1 %exitcond5, label %for.cond1.preheader, label %for.end9 + +for.end9: ; preds = %for.inc7 + ret void +} + + +;; for (long int i = 0; i < 10; i++) +;; for (long int j = 0; j < 10; j++) { +;; A[10*i + j] = ... +;; ... = A[10*i + j + 9]; + +define void @banerjee6(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %entry, %for.inc8 + %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc8 ] + %i.03 = phi i64 [ 0, %entry ], [ %inc9, %for.inc8 ] + br label %for.body3 + +for.body3: ; preds = %for.cond1.preheader, %for.body3 + %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ] + %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ] + %mul = mul nsw i64 %i.03, 10 + %add = add nsw i64 %mul, %j.02 + %arrayidx = getelementptr inbounds i64* %A, i64 %add + store i64 0, i64* %arrayidx, align 8 + %mul4 = mul nsw i64 %i.03, 10 + %add5 = add nsw i64 %mul4, %j.02 + %add6 = add nsw i64 %add5, 9 + %arrayidx7 = getelementptr inbounds i64* %A, i64 %add6 + %0 = load i64* %arrayidx7, align 8 +; CHECK: da analyze - flow [=> <>]! 
+ %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1 + store i64 %0, i64* %B.addr.11, align 8 + %inc = add nsw i64 %j.02, 1 + %exitcond = icmp ne i64 %inc, 10 + br i1 %exitcond, label %for.body3, label %for.inc8 + +for.inc8: ; preds = %for.body3 + %scevgep = getelementptr i64* %B.addr.04, i64 10 + %inc9 = add nsw i64 %i.03, 1 + %exitcond5 = icmp ne i64 %inc9, 10 + br i1 %exitcond5, label %for.cond1.preheader, label %for.end10 + +for.end10: ; preds = %for.inc8 + ret void +} + + +;; for (long int i = 0; i < 10; i++) +;; for (long int j = 0; j < 10; j++) { +;; A[10*i + j] = ... +;; ... = A[10*i + j + 10]; + +define void @banerjee7(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %entry, %for.inc8 + %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc8 ] + %i.03 = phi i64 [ 0, %entry ], [ %inc9, %for.inc8 ] + br label %for.body3 + +for.body3: ; preds = %for.cond1.preheader, %for.body3 + %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ] + %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ] + %mul = mul nsw i64 %i.03, 10 + %add = add nsw i64 %mul, %j.02 + %arrayidx = getelementptr inbounds i64* %A, i64 %add + store i64 0, i64* %arrayidx, align 8 + %mul4 = mul nsw i64 %i.03, 10 + %add5 = add nsw i64 %mul4, %j.02 + %add6 = add nsw i64 %add5, 10 + %arrayidx7 = getelementptr inbounds i64* %A, i64 %add6 + %0 = load i64* %arrayidx7, align 8 +; CHECK: da analyze - flow [> <=]! 
+ %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1 + store i64 %0, i64* %B.addr.11, align 8 + %inc = add nsw i64 %j.02, 1 + %exitcond = icmp ne i64 %inc, 10 + br i1 %exitcond, label %for.body3, label %for.inc8 + +for.inc8: ; preds = %for.body3 + %scevgep = getelementptr i64* %B.addr.04, i64 10 + %inc9 = add nsw i64 %i.03, 1 + %exitcond5 = icmp ne i64 %inc9, 10 + br i1 %exitcond5, label %for.cond1.preheader, label %for.end10 + +for.end10: ; preds = %for.inc8 + ret void +} + + +;; for (long int i = 0; i < 10; i++) +;; for (long int j = 0; j < 10; j++) { +;; A[10*i + j] = ... +;; ... = A[10*i + j + 11]; + +define void @banerjee8(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %entry, %for.inc8 + %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc8 ] + %i.03 = phi i64 [ 0, %entry ], [ %inc9, %for.inc8 ] + br label %for.body3 + +for.body3: ; preds = %for.cond1.preheader, %for.body3 + %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ] + %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ] + %mul = mul nsw i64 %i.03, 10 + %add = add nsw i64 %mul, %j.02 + %arrayidx = getelementptr inbounds i64* %A, i64 %add + store i64 0, i64* %arrayidx, align 8 + %mul4 = mul nsw i64 %i.03, 10 + %add5 = add nsw i64 %mul4, %j.02 + %add6 = add nsw i64 %add5, 11 + %arrayidx7 = getelementptr inbounds i64* %A, i64 %add6 + %0 = load i64* %arrayidx7, align 8 +; CHECK: da analyze - flow [> <>]! 
+ %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1 + store i64 %0, i64* %B.addr.11, align 8 + %inc = add nsw i64 %j.02, 1 + %exitcond = icmp ne i64 %inc, 10 + br i1 %exitcond, label %for.body3, label %for.inc8 + +for.inc8: ; preds = %for.body3 + %scevgep = getelementptr i64* %B.addr.04, i64 10 + %inc9 = add nsw i64 %i.03, 1 + %exitcond5 = icmp ne i64 %inc9, 10 + br i1 %exitcond5, label %for.cond1.preheader, label %for.end10 + +for.end10: ; preds = %for.inc8 + ret void +} + + +;; for (long int i = 0; i < 20; i++) +;; for (long int j = 0; j < 20; j++) { +;; A[30*i + 500*j] = ... +;; ... = A[i - 500*j + 11]; + +define void @banerjee9(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %entry, %for.inc8 + %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc8 ] + %i.03 = phi i64 [ 0, %entry ], [ %inc9, %for.inc8 ] + br label %for.body3 + +for.body3: ; preds = %for.cond1.preheader, %for.body3 + %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ] + %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ] + %mul = mul nsw i64 %i.03, 30 + %mul4 = mul nsw i64 %j.02, 500 + %add = add nsw i64 %mul, %mul4 + %arrayidx = getelementptr inbounds i64* %A, i64 %add + store i64 0, i64* %arrayidx, align 8 + %0 = mul i64 %j.02, -500 + %sub = add i64 %i.03, %0 + %add6 = add nsw i64 %sub, 11 + %arrayidx7 = getelementptr inbounds i64* %A, i64 %add6 + %1 = load i64* %arrayidx7, align 8 +; CHECK: da analyze - flow [<= =|<]! 
+ %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1 + store i64 %1, i64* %B.addr.11, align 8 + %inc = add nsw i64 %j.02, 1 + %exitcond = icmp ne i64 %inc, 20 + br i1 %exitcond, label %for.body3, label %for.inc8 + +for.inc8: ; preds = %for.body3 + %scevgep = getelementptr i64* %B.addr.04, i64 20 + %inc9 = add nsw i64 %i.03, 1 + %exitcond5 = icmp ne i64 %inc9, 20 + br i1 %exitcond5, label %for.cond1.preheader, label %for.end10 + +for.end10: ; preds = %for.inc8 + ret void +} + + +;; for (long int i = 0; i < 20; i++) +;; for (long int j = 0; j < 20; j++) { +;; A[i + 500*j] = ... +;; ... = A[i - 500*j + 11]; + +define void @banerjee10(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %entry, %for.inc7 + %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc7 ] + %i.03 = phi i64 [ 0, %entry ], [ %inc8, %for.inc7 ] + br label %for.body3 + +for.body3: ; preds = %for.cond1.preheader, %for.body3 + %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ] + %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ] + %mul = mul nsw i64 %j.02, 500 + %add = add nsw i64 %i.03, %mul + %arrayidx = getelementptr inbounds i64* %A, i64 %add + store i64 0, i64* %arrayidx, align 8 + %0 = mul i64 %j.02, -500 + %sub = add i64 %i.03, %0 + %add5 = add nsw i64 %sub, 11 + %arrayidx6 = getelementptr inbounds i64* %A, i64 %add5 + %1 = load i64* %arrayidx6, align 8 +; CHECK: da analyze - flow [<> =]! 
+ %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1 + store i64 %1, i64* %B.addr.11, align 8 + %inc = add nsw i64 %j.02, 1 + %exitcond = icmp ne i64 %inc, 20 + br i1 %exitcond, label %for.body3, label %for.inc7 + +for.inc7: ; preds = %for.body3 + %scevgep = getelementptr i64* %B.addr.04, i64 20 + %inc8 = add nsw i64 %i.03, 1 + %exitcond5 = icmp ne i64 %inc8, 20 + br i1 %exitcond5, label %for.cond1.preheader, label %for.end9 + +for.end9: ; preds = %for.inc7 + ret void +} + + +;; for (long int i = 0; i < 20; i++) +;; for (long int j = 0; j < 20; j++) { +;; A[300*i + j] = ... +;; ... = A[250*i - j + 11]; + +define void @banerjee11(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %entry, %for.inc7 + %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc7 ] + %i.03 = phi i64 [ 0, %entry ], [ %inc8, %for.inc7 ] + br label %for.body3 + +for.body3: ; preds = %for.cond1.preheader, %for.body3 + %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ] + %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ] + %mul = mul nsw i64 %i.03, 300 + %add = add nsw i64 %mul, %j.02 + %arrayidx = getelementptr inbounds i64* %A, i64 %add + store i64 0, i64* %arrayidx, align 8 + %mul4 = mul nsw i64 %i.03, 250 + %sub = sub nsw i64 %mul4, %j.02 + %add5 = add nsw i64 %sub, 11 + %arrayidx6 = getelementptr inbounds i64* %A, i64 %add5 + %0 = load i64* %arrayidx6, align 8 +; CHECK: da analyze - flow [<= <>]! 
+ %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1 + store i64 %0, i64* %B.addr.11, align 8 + %inc = add nsw i64 %j.02, 1 + %exitcond = icmp ne i64 %inc, 20 + br i1 %exitcond, label %for.body3, label %for.inc7 + +for.inc7: ; preds = %for.body3 + %scevgep = getelementptr i64* %B.addr.04, i64 20 + %inc8 = add nsw i64 %i.03, 1 + %exitcond5 = icmp ne i64 %inc8, 20 + br i1 %exitcond5, label %for.cond1.preheader, label %for.end9 + +for.end9: ; preds = %for.inc7 + ret void +} + + +;; for (long int i = 0; i < 20; i++) +;; for (long int j = 0; j < 20; j++) { +;; A[100*i + j] = ... +;; ... = A[100*i - j + 11]; + +define void @banerjee12(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %entry, %for.inc7 + %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc7 ] + %i.03 = phi i64 [ 0, %entry ], [ %inc8, %for.inc7 ] + br label %for.body3 + +for.body3: ; preds = %for.cond1.preheader, %for.body3 + %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ] + %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ] + %mul = mul nsw i64 %i.03, 100 + %add = add nsw i64 %mul, %j.02 + %arrayidx = getelementptr inbounds i64* %A, i64 %add + store i64 0, i64* %arrayidx, align 8 + %mul4 = mul nsw i64 %i.03, 100 + %sub = sub nsw i64 %mul4, %j.02 + %add5 = add nsw i64 %sub, 11 + %arrayidx6 = getelementptr inbounds i64* %A, i64 %add5 + %0 = load i64* %arrayidx6, align 8 +; CHECK: da analyze - flow [= <>]! 
+ %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1 + store i64 %0, i64* %B.addr.11, align 8 + %inc = add nsw i64 %j.02, 1 + %exitcond = icmp ne i64 %inc, 20 + br i1 %exitcond, label %for.body3, label %for.inc7 + +for.inc7: ; preds = %for.body3 + %scevgep = getelementptr i64* %B.addr.04, i64 20 + %inc8 = add nsw i64 %i.03, 1 + %exitcond5 = icmp ne i64 %inc8, 20 + br i1 %exitcond5, label %for.cond1.preheader, label %for.end9 + +for.end9: ; preds = %for.inc7 + ret void +} diff --git a/test/Analysis/DependenceAnalysis/Coupled.ll b/test/Analysis/DependenceAnalysis/Coupled.ll new file mode 100644 index 0000000..60163fe --- /dev/null +++ b/test/Analysis/DependenceAnalysis/Coupled.ll @@ -0,0 +1,509 @@ +; RUN: opt < %s -analyze -basicaa -da | FileCheck %s + +; ModuleID = 'Coupled.bc' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.6.0" + + +;; for (long int i = 0; i < 50; i++) +;; A[i][i] = ... +;; ... = A[i + 10][i + 9] + +define void @couple0([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %arrayidx1 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02 + store i32 %conv, i32* %arrayidx1, align 4 + %add = add nsw i64 %i.02, 9 + %add2 = add nsw i64 %i.02, 10 + %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 %add2, i64 %add + %0 = load i32* %arrayidx4, align 4 +; CHECK: da analyze - none! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add nsw i64 %i.02, 1 + %cmp = icmp slt i64 %inc, 50 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long int i = 0; i < 50; i++) +;; A[i][i] = ... +;; ... = A[i + 9][i + 9] + +define void @couple1([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %arrayidx1 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02 + store i32 %conv, i32* %arrayidx1, align 4 + %add = add nsw i64 %i.02, 9 + %add2 = add nsw i64 %i.02, 9 + %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 %add2, i64 %add + %0 = load i32* %arrayidx4, align 4 +; CHECK: da analyze - consistent flow [-9]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add nsw i64 %i.02, 1 + %cmp = icmp slt i64 %inc, 50 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long int i = 0; i < 50; i++) +;; A[3*i - 6][3*i - 6] = ... +;; ... 
= A[i][i] + +define void @couple2([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %mul = mul nsw i64 %i.02, 3 + %sub = add nsw i64 %mul, -6 + %mul1 = mul nsw i64 %i.02, 3 + %sub2 = add nsw i64 %mul1, -6 + %arrayidx3 = getelementptr inbounds [100 x i32]* %A, i64 %sub2, i64 %sub + store i32 %conv, i32* %arrayidx3, align 4 + %arrayidx5 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02 + %0 = load i32* %arrayidx5, align 4 +; CHECK: da analyze - flow [*|<]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add nsw i64 %i.02, 1 + %cmp = icmp slt i64 %inc, 50 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long int i = 0; i < 50; i++) +;; A[3*i - 6][3*i - 5] = ... +;; ... = A[i][i] + +define void @couple3([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %mul = mul nsw i64 %i.02, 3 + %sub = add nsw i64 %mul, -5 + %mul1 = mul nsw i64 %i.02, 3 + %sub2 = add nsw i64 %mul1, -6 + %arrayidx3 = getelementptr inbounds [100 x i32]* %A, i64 %sub2, i64 %sub + store i32 %conv, i32* %arrayidx3, align 4 + %arrayidx5 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02 + %0 = load i32* %arrayidx5, align 4 +; CHECK: da analyze - none! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add nsw i64 %i.02, 1 + %cmp = icmp slt i64 %inc, 50 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long int i = 0; i < 50; i++) +;; A[3*i - 6][3*i - n] = ... +;; ... = A[i][i] + +define void @couple4([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %mul = mul nsw i64 %i.02, 3 + %conv1 = sext i32 %n to i64 + %sub = sub nsw i64 %mul, %conv1 + %mul2 = mul nsw i64 %i.02, 3 + %sub3 = add nsw i64 %mul2, -6 + %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 %sub3, i64 %sub + store i32 %conv, i32* %arrayidx4, align 4 + %arrayidx6 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02 + %0 = load i32* %arrayidx6, align 4 +; CHECK: da analyze - flow [*|<]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add nsw i64 %i.02, 1 + %cmp = icmp slt i64 %inc, 50 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long int i = 0; i < 50; i++) +;; A[3*i - n + 1][3*i - n] = ... +;; ... 
= A[i][i] + +define void @couple5([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %mul = mul nsw i64 %i.02, 3 + %conv1 = sext i32 %n to i64 + %sub = sub nsw i64 %mul, %conv1 + %mul2 = mul nsw i64 %i.02, 3 + %conv3 = sext i32 %n to i64 + %sub4 = sub nsw i64 %mul2, %conv3 + %add = add nsw i64 %sub4, 1 + %arrayidx5 = getelementptr inbounds [100 x i32]* %A, i64 %add, i64 %sub + store i32 %conv, i32* %arrayidx5, align 4 + %arrayidx7 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02 + %0 = load i32* %arrayidx7, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add nsw i64 %i.02, 1 + %cmp = icmp slt i64 %inc, 50 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long int i = 0; i < 50; i++) +;; A[i][3*i - 6] = ... +;; ... = A[i][i] + +define void @couple6([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %mul = mul nsw i64 %i.02, 3 + %sub = add nsw i64 %mul, -6 + %arrayidx1 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %sub + store i32 %conv, i32* %arrayidx1, align 4 + %arrayidx3 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02 + %0 = load i32* %arrayidx3, align 4 +; CHECK: da analyze - flow [=|<]! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add nsw i64 %i.02, 1 + %cmp = icmp slt i64 %inc, 50 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long int i = 0; i < 50; i++) +;; A[i][3*i - 5] = ... +;; ... = A[i][i] + +define void @couple7([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %mul = mul nsw i64 %i.02, 3 + %sub = add nsw i64 %mul, -5 + %arrayidx1 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %sub + store i32 %conv, i32* %arrayidx1, align 4 + %arrayidx3 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02 + %0 = load i32* %arrayidx3, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add nsw i64 %i.02, 1 + %cmp = icmp slt i64 %inc, 50 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long int i = 0; i <= 15; i++) +;; A[3*i - 18][3 - i] = ... +;; ... = A[i][i] + +define void @couple8([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %sub = sub nsw i64 3, %i.02 + %mul = mul nsw i64 %i.02, 3 + %sub1 = add nsw i64 %mul, -18 + %arrayidx2 = getelementptr inbounds [100 x i32]* %A, i64 %sub1, i64 %sub + store i32 %conv, i32* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02 + %0 = load i32* %arrayidx4, align 4 +; CHECK: da analyze - none! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add nsw i64 %i.02, 1 + %cmp = icmp slt i64 %inc, 16 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long int i = 0; i <= 15; i++) +;; A[3*i - 18][2 - i] = ... +;; ... = A[i][i] + +define void @couple9([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %sub = sub nsw i64 2, %i.02 + %mul = mul nsw i64 %i.02, 3 + %sub1 = add nsw i64 %mul, -18 + %arrayidx2 = getelementptr inbounds [100 x i32]* %A, i64 %sub1, i64 %sub + store i32 %conv, i32* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02 + %0 = load i32* %arrayidx4, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add nsw i64 %i.02, 1 + %cmp = icmp slt i64 %inc, 16 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long int i = 0; i <= 15; i++) +;; A[3*i - 18][6 - i] = ... +;; ... 
= A[i][i] + +define void @couple10([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %sub = sub nsw i64 6, %i.02 + %mul = mul nsw i64 %i.02, 3 + %sub1 = add nsw i64 %mul, -18 + %arrayidx2 = getelementptr inbounds [100 x i32]* %A, i64 %sub1, i64 %sub + store i32 %conv, i32* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02 + %0 = load i32* %arrayidx4, align 4 +; CHECK: da analyze - flow [>] splitable! +; CHECK: da analyze - split level = 1, iteration = 3! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add nsw i64 %i.02, 1 + %cmp = icmp slt i64 %inc, 16 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long int i = 0; i <= 15; i++) +;; A[3*i - 18][18 - i] = ... +;; ... = A[i][i] + +define void @couple11([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %sub = sub nsw i64 18, %i.02 + %mul = mul nsw i64 %i.02, 3 + %sub1 = add nsw i64 %mul, -18 + %arrayidx2 = getelementptr inbounds [100 x i32]* %A, i64 %sub1, i64 %sub + store i32 %conv, i32* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02 + %0 = load i32* %arrayidx4, align 4 +; CHECK: da analyze - flow [=|<] splitable! +; CHECK: da analyze - split level = 1, iteration = 9! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add nsw i64 %i.02, 1 + %cmp = icmp slt i64 %inc, 16 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long int i = 0; i <= 12; i++) +;; A[3*i - 18][22 - i] = ... +;; ... = A[i][i] + +define void @couple12([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %sub = sub nsw i64 22, %i.02 + %mul = mul nsw i64 %i.02, 3 + %sub1 = add nsw i64 %mul, -18 + %arrayidx2 = getelementptr inbounds [100 x i32]* %A, i64 %sub1, i64 %sub + store i32 %conv, i32* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02 + %0 = load i32* %arrayidx4, align 4 +; CHECK: da analyze - flow [<] splitable! +; CHECK: da analyze - split level = 1, iteration = 11! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add nsw i64 %i.02, 1 + %cmp = icmp slt i64 %inc, 13 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long int i = 0; i < 12; i++) +;; A[3*i - 18][22 - i] = ... +;; ... 
= A[i][i] + +define void @couple13([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %sub = sub nsw i64 22, %i.02 + %mul = mul nsw i64 %i.02, 3 + %sub1 = add nsw i64 %mul, -18 + %arrayidx2 = getelementptr inbounds [100 x i32]* %A, i64 %sub1, i64 %sub + store i32 %conv, i32* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02 + %0 = load i32* %arrayidx4, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add nsw i64 %i.02, 1 + %cmp = icmp slt i64 %inc, 12 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long int i = 0; i < 100; i++) +;; A[3*i - 18][18 - i][i] = ... +;; ... = A[i][i][i] + +define void @couple14([100 x [100 x i32]]* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %sub = sub nsw i64 18, %i.02 + %mul = mul nsw i64 %i.02, 3 + %sub1 = add nsw i64 %mul, -18 + %arrayidx3 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 %sub1, i64 %sub, i64 %i.02 + store i32 %conv, i32* %arrayidx3, align 4 + %arrayidx6 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 %i.02, i64 %i.02, i64 %i.02 + %0 = load i32* %arrayidx6, align 4 +; CHECK: da analyze - flow [=|<] splitable! +; CHECK: da analyze - split level = 1, iteration = 9! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add nsw i64 %i.02, 1 + %cmp = icmp slt i64 %inc, 100 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long int i = 0; i < 100; i++) +;; A[3*i - 18][22 - i][i] = ... +;; ... = A[i][i][i] + +define void @couple15([100 x [100 x i32]]* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %sub = sub nsw i64 22, %i.02 + %mul = mul nsw i64 %i.02, 3 + %sub1 = add nsw i64 %mul, -18 + %arrayidx3 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 %sub1, i64 %sub, i64 %i.02 + store i32 %conv, i32* %arrayidx3, align 4 + %arrayidx6 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 %i.02, i64 %i.02, i64 %i.02 + %0 = load i32* %arrayidx6, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add nsw i64 %i.02, 1 + %cmp = icmp slt i64 %inc, 100 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} diff --git a/test/Analysis/DependenceAnalysis/ExactRDIV.ll b/test/Analysis/DependenceAnalysis/ExactRDIV.ll new file mode 100644 index 0000000..aa5d254 --- /dev/null +++ b/test/Analysis/DependenceAnalysis/ExactRDIV.ll @@ -0,0 +1,508 @@ +; RUN: opt < %s -analyze -basicaa -da | FileCheck %s + +; ModuleID = 'ExactRDIV.bc' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.6.0" + + +;; for (long int i = 0; i < 10; i++) +;; A[4*i + 10] = ... +;; for (long int j = 0; j < 10; j++) +;; ... 
= A[2*j + 1]; + +define void @rdiv0(i32* %A, i32* %B) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.03 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %conv = trunc i64 %i.03 to i32 + %mul = shl nsw i64 %i.03, 2 + %add = add nsw i64 %mul, 10 + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv, i32* %arrayidx, align 4 + %inc = add nsw i64 %i.03, 1 + %cmp = icmp slt i64 %inc, 10 + br i1 %cmp, label %for.body, label %for.body4 + +for.body4: ; preds = %for.body4, %for.body + %j.02 = phi i64 [ %inc9, %for.body4 ], [ 0, %for.body ] + %B.addr.01 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.body ] + %mul5 = shl nsw i64 %j.02, 1 + %add64 = or i64 %mul5, 1 + %arrayidx7 = getelementptr inbounds i32* %A, i64 %add64 + %0 = load i32* %arrayidx7, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc9 = add nsw i64 %j.02, 1 + %cmp2 = icmp slt i64 %inc9, 10 + br i1 %cmp2, label %for.body4, label %for.end10 + +for.end10: ; preds = %for.body4 + ret void +} + + +;; for (long int i = 0; i < 5; i++) +;; A[11*i - 45] = ... +;; for (long int j = 0; j < 10; j++) +;; ... 
= A[j]; + +define void @rdiv1(i32* %A, i32* %B) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.03 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %conv = trunc i64 %i.03 to i32 + %mul = mul nsw i64 %i.03, 11 + %sub = add nsw i64 %mul, -45 + %arrayidx = getelementptr inbounds i32* %A, i64 %sub + store i32 %conv, i32* %arrayidx, align 4 + %inc = add nsw i64 %i.03, 1 + %cmp = icmp slt i64 %inc, 5 + br i1 %cmp, label %for.body, label %for.body4 + +for.body4: ; preds = %for.body4, %for.body + %j.02 = phi i64 [ %inc7, %for.body4 ], [ 0, %for.body ] + %B.addr.01 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.body ] + %arrayidx5 = getelementptr inbounds i32* %A, i64 %j.02 + %0 = load i32* %arrayidx5, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc7 = add nsw i64 %j.02, 1 + %cmp2 = icmp slt i64 %inc7, 10 + br i1 %cmp2, label %for.body4, label %for.end8 + +for.end8: ; preds = %for.body4 + ret void +} + + +;; for (long int i = 0; i <= 5; i++) +;; A[11*i - 45] = ... +;; for (long int j = 0; j < 10; j++) +;; ... = A[j]; + +define void @rdiv2(i32* %A, i32* %B) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.03 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %conv = trunc i64 %i.03 to i32 + %mul = mul nsw i64 %i.03, 11 + %sub = add nsw i64 %mul, -45 + %arrayidx = getelementptr inbounds i32* %A, i64 %sub + store i32 %conv, i32* %arrayidx, align 4 + %inc = add nsw i64 %i.03, 1 + %cmp = icmp slt i64 %inc, 6 + br i1 %cmp, label %for.body, label %for.body4 + +for.body4: ; preds = %for.body4, %for.body + %j.02 = phi i64 [ %inc7, %for.body4 ], [ 0, %for.body ] + %B.addr.01 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.body ] + %arrayidx5 = getelementptr inbounds i32* %A, i64 %j.02 + %0 = load i32* %arrayidx5, align 4 +; CHECK: da analyze - none! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc7 = add nsw i64 %j.02, 1 + %cmp2 = icmp slt i64 %inc7, 10 + br i1 %cmp2, label %for.body4, label %for.end8 + +for.end8: ; preds = %for.body4 + ret void +} + + +;; for (long int i = 0; i < 5; i++) +;; A[11*i - 45] = ... +;; for (long int j = 0; j <= 10; j++) +;; ... = A[j]; + +define void @rdiv3(i32* %A, i32* %B) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.03 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %conv = trunc i64 %i.03 to i32 + %mul = mul nsw i64 %i.03, 11 + %sub = add nsw i64 %mul, -45 + %arrayidx = getelementptr inbounds i32* %A, i64 %sub + store i32 %conv, i32* %arrayidx, align 4 + %inc = add nsw i64 %i.03, 1 + %cmp = icmp slt i64 %inc, 5 + br i1 %cmp, label %for.body, label %for.body4 + +for.body4: ; preds = %for.body4, %for.body + %j.02 = phi i64 [ %inc7, %for.body4 ], [ 0, %for.body ] + %B.addr.01 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.body ] + %arrayidx5 = getelementptr inbounds i32* %A, i64 %j.02 + %0 = load i32* %arrayidx5, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc7 = add nsw i64 %j.02, 1 + %cmp2 = icmp slt i64 %inc7, 11 + br i1 %cmp2, label %for.body4, label %for.end8 + +for.end8: ; preds = %for.body4 + ret void +} + + +;; for (long int i = 0; i <= 5; i++) +;; A[11*i - 45] = ... +;; for (long int j = 0; j <= 10; j++) +;; ... 
= A[j]; + +define void @rdiv4(i32* %A, i32* %B) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.03 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %conv = trunc i64 %i.03 to i32 + %mul = mul nsw i64 %i.03, 11 + %sub = add nsw i64 %mul, -45 + %arrayidx = getelementptr inbounds i32* %A, i64 %sub + store i32 %conv, i32* %arrayidx, align 4 + %inc = add nsw i64 %i.03, 1 + %cmp = icmp slt i64 %inc, 6 + br i1 %cmp, label %for.body, label %for.body4 + +for.body4: ; preds = %for.body4, %for.body + %j.02 = phi i64 [ %inc7, %for.body4 ], [ 0, %for.body ] + %B.addr.01 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.body ] + %arrayidx5 = getelementptr inbounds i32* %A, i64 %j.02 + %0 = load i32* %arrayidx5, align 4 +; CHECK: da analyze - flow! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc7 = add nsw i64 %j.02, 1 + %cmp2 = icmp slt i64 %inc7, 11 + br i1 %cmp2, label %for.body4, label %for.end8 + +for.end8: ; preds = %for.body4 + ret void +} + + +;; for (long int i = 0; i < 5; i++) +;; A[-11*i + 45] = ... +;; for (long int j = 0; j < 10; j++) +;; ... = A[-j]; + +define void @rdiv5(i32* %A, i32* %B) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.03 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %conv = trunc i64 %i.03 to i32 + %mul = mul nsw i64 %i.03, -11 + %add = add nsw i64 %mul, 45 + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv, i32* %arrayidx, align 4 + %inc = add nsw i64 %i.03, 1 + %cmp = icmp slt i64 %inc, 5 + br i1 %cmp, label %for.body, label %for.body4 + +for.body4: ; preds = %for.body4, %for.body + %j.02 = phi i64 [ %inc7, %for.body4 ], [ 0, %for.body ] + %B.addr.01 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.body ] + %sub = sub nsw i64 0, %j.02 + %arrayidx5 = getelementptr inbounds i32* %A, i64 %sub + %0 = load i32* %arrayidx5, align 4 +; CHECK: da analyze - none! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc7 = add nsw i64 %j.02, 1 + %cmp2 = icmp slt i64 %inc7, 10 + br i1 %cmp2, label %for.body4, label %for.end8 + +for.end8: ; preds = %for.body4 + ret void +} + + +;; for (long int i = 0; i <= 5; i++) +;; A[-11*i + 45] = ... +;; for (long int j = 0; j < 10; j++) +;; ... = A[-j]; + +define void @rdiv6(i32* %A, i32* %B) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.03 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %conv = trunc i64 %i.03 to i32 + %mul = mul nsw i64 %i.03, -11 + %add = add nsw i64 %mul, 45 + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv, i32* %arrayidx, align 4 + %inc = add nsw i64 %i.03, 1 + %cmp = icmp slt i64 %inc, 6 + br i1 %cmp, label %for.body, label %for.body4 + +for.body4: ; preds = %for.body4, %for.body + %j.02 = phi i64 [ %inc7, %for.body4 ], [ 0, %for.body ] + %B.addr.01 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.body ] + %sub = sub nsw i64 0, %j.02 + %arrayidx5 = getelementptr inbounds i32* %A, i64 %sub + %0 = load i32* %arrayidx5, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc7 = add nsw i64 %j.02, 1 + %cmp2 = icmp slt i64 %inc7, 10 + br i1 %cmp2, label %for.body4, label %for.end8 + +for.end8: ; preds = %for.body4 + ret void +} + + +;; for (long int i = 0; i < 5; i++) +;; A[-11*i + 45] = ... +;; for (long int j = 0; j <= 10; j++) +;; ... 
= A[-j]; + +define void @rdiv7(i32* %A, i32* %B) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.03 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %conv = trunc i64 %i.03 to i32 + %mul = mul nsw i64 %i.03, -11 + %add = add nsw i64 %mul, 45 + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv, i32* %arrayidx, align 4 + %inc = add nsw i64 %i.03, 1 + %cmp = icmp slt i64 %inc, 5 + br i1 %cmp, label %for.body, label %for.body4 + +for.body4: ; preds = %for.body4, %for.body + %j.02 = phi i64 [ %inc7, %for.body4 ], [ 0, %for.body ] + %B.addr.01 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.body ] + %sub = sub nsw i64 0, %j.02 + %arrayidx5 = getelementptr inbounds i32* %A, i64 %sub + %0 = load i32* %arrayidx5, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc7 = add nsw i64 %j.02, 1 + %cmp2 = icmp slt i64 %inc7, 11 + br i1 %cmp2, label %for.body4, label %for.end8 + +for.end8: ; preds = %for.body4 + ret void +} + + +;; for (long int i = 0; i <= 5; i++) +;; A[-11*i + 45] = ... +;; for (long int j = 0; j <= 10; j++) +;; ... 
= A[-j]; + +define void @rdiv8(i32* %A, i32* %B) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.03 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %conv = trunc i64 %i.03 to i32 + %mul = mul nsw i64 %i.03, -11 + %add = add nsw i64 %mul, 45 + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv, i32* %arrayidx, align 4 + %inc = add nsw i64 %i.03, 1 + %cmp = icmp slt i64 %inc, 6 + br i1 %cmp, label %for.body, label %for.body4 + +for.body4: ; preds = %for.body4, %for.body + %j.02 = phi i64 [ %inc7, %for.body4 ], [ 0, %for.body ] + %B.addr.01 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.body ] + %sub = sub nsw i64 0, %j.02 + %arrayidx5 = getelementptr inbounds i32* %A, i64 %sub + %0 = load i32* %arrayidx5, align 4 +; CHECK: da analyze - flow! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc7 = add nsw i64 %j.02, 1 + %cmp2 = icmp slt i64 %inc7, 11 + br i1 %cmp2, label %for.body4, label %for.end8 + +for.end8: ; preds = %for.body4 + ret void +} + + +;; for (long int i = 0; i < 5; i++) +;; for (long int j = 0; j < 10; j++) +;; A[11*i - j] = ... +;; ... 
= A[45]; + +define void @rdiv9(i32* %A, i32* %B) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc5, %entry + %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc5 ] + %i.03 = phi i64 [ 0, %entry ], [ %inc6, %for.inc5 ] + br label %for.body3 + +for.body3: ; preds = %for.body3, %for.cond1.preheader + %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ] + %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ] + %conv = trunc i64 %i.03 to i32 + %mul = mul nsw i64 %i.03, 11 + %sub = sub nsw i64 %mul, %j.02 + %arrayidx = getelementptr inbounds i32* %A, i64 %sub + store i32 %conv, i32* %arrayidx, align 4 + %arrayidx4 = getelementptr inbounds i32* %A, i64 45 + %0 = load i32* %arrayidx4, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1 + store i32 %0, i32* %B.addr.11, align 4 + %inc = add nsw i64 %j.02, 1 + %cmp2 = icmp slt i64 %inc, 10 + br i1 %cmp2, label %for.body3, label %for.inc5 + +for.inc5: ; preds = %for.body3 + %inc6 = add nsw i64 %i.03, 1 + %cmp = icmp slt i64 %inc6, 5 + br i1 %cmp, label %for.cond1.preheader, label %for.end7 + +for.end7: ; preds = %for.inc5 + ret void +} + + +;; for (long int i = 0; i < 5; i++) +;; for (long int j = 0; j <= 10; j++) +;; A[11*i - j] = ... +;; ... 
= A[45]; + +define void @rdiv10(i32* %A, i32* %B) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc5, %entry + %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc5 ] + %i.03 = phi i64 [ 0, %entry ], [ %inc6, %for.inc5 ] + br label %for.body3 + +for.body3: ; preds = %for.body3, %for.cond1.preheader + %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ] + %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ] + %conv = trunc i64 %i.03 to i32 + %mul = mul nsw i64 %i.03, 11 + %sub = sub nsw i64 %mul, %j.02 + %arrayidx = getelementptr inbounds i32* %A, i64 %sub + store i32 %conv, i32* %arrayidx, align 4 + %arrayidx4 = getelementptr inbounds i32* %A, i64 45 + %0 = load i32* %arrayidx4, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1 + store i32 %0, i32* %B.addr.11, align 4 + %inc = add nsw i64 %j.02, 1 + %cmp2 = icmp slt i64 %inc, 10 + br i1 %cmp2, label %for.body3, label %for.inc5 + +for.inc5: ; preds = %for.body3 + %inc6 = add nsw i64 %i.03, 1 + %cmp = icmp slt i64 %inc6, 6 + br i1 %cmp, label %for.cond1.preheader, label %for.end7 + +for.end7: ; preds = %for.inc5 + ret void +} + + +;; for (long int i = 0; i <= 5; i++) +;; for (long int j = 0; j <= 10; j++) +;; A[11*i - j] = ... +;; ... 
= A[45]; + +define void @rdiv11(i32* %A, i32* %B) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc5, %entry + %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc5 ] + %i.03 = phi i64 [ 0, %entry ], [ %inc6, %for.inc5 ] + br label %for.body3 + +for.body3: ; preds = %for.body3, %for.cond1.preheader + %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ] + %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ] + %conv = trunc i64 %i.03 to i32 + %mul = mul nsw i64 %i.03, 11 + %sub = sub nsw i64 %mul, %j.02 + %arrayidx = getelementptr inbounds i32* %A, i64 %sub + store i32 %conv, i32* %arrayidx, align 4 + %arrayidx4 = getelementptr inbounds i32* %A, i64 45 + %0 = load i32* %arrayidx4, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1 + store i32 %0, i32* %B.addr.11, align 4 + %inc = add nsw i64 %j.02, 1 + %cmp2 = icmp slt i64 %inc, 11 + br i1 %cmp2, label %for.body3, label %for.inc5 + +for.inc5: ; preds = %for.body3 + %inc6 = add nsw i64 %i.03, 1 + %cmp = icmp slt i64 %inc6, 5 + br i1 %cmp, label %for.cond1.preheader, label %for.end7 + +for.end7: ; preds = %for.inc5 + ret void +} + + +;; for (long int i = 0; i < 5; i++) +;; for (long int j = 0; j < 10; j++) +;; A[11*i - j] = ... +;; ... 
= A[45]; + +define void @rdiv12(i32* %A, i32* %B) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc5, %entry + %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc5 ] + %i.03 = phi i64 [ 0, %entry ], [ %inc6, %for.inc5 ] + br label %for.body3 + +for.body3: ; preds = %for.body3, %for.cond1.preheader + %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ] + %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ] + %conv = trunc i64 %i.03 to i32 + %mul = mul nsw i64 %i.03, 11 + %sub = sub nsw i64 %mul, %j.02 + %arrayidx = getelementptr inbounds i32* %A, i64 %sub + store i32 %conv, i32* %arrayidx, align 4 + %arrayidx4 = getelementptr inbounds i32* %A, i64 45 + %0 = load i32* %arrayidx4, align 4 +; CHECK: da analyze - flow [* *|<]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1 + store i32 %0, i32* %B.addr.11, align 4 + %inc = add nsw i64 %j.02, 1 + %cmp2 = icmp slt i64 %inc, 11 + br i1 %cmp2, label %for.body3, label %for.inc5 + +for.inc5: ; preds = %for.body3 + %inc6 = add nsw i64 %i.03, 1 + %cmp = icmp slt i64 %inc6, 6 + br i1 %cmp, label %for.cond1.preheader, label %for.end7 + +for.end7: ; preds = %for.inc5 + ret void +} diff --git a/test/Analysis/DependenceAnalysis/ExactSIV.ll b/test/Analysis/DependenceAnalysis/ExactSIV.ll new file mode 100644 index 0000000..71e0502 --- /dev/null +++ b/test/Analysis/DependenceAnalysis/ExactSIV.ll @@ -0,0 +1,428 @@ +; RUN: opt < %s -analyze -basicaa -da | FileCheck %s + +; ModuleID = 'ExactSIV.bc' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.6.0" + + +;; for (long unsigned i = 0; i < 10; i++) { +;; A[i + 10] = ... +;; ... 
= A[2*i + 1]; + +define void @exact0(i32* %A, i32* %B) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %add = add i64 %i.02, 10 + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv, i32* %arrayidx, align 4 + %mul = shl i64 %i.02, 1 + %add13 = or i64 %mul, 1 + %arrayidx2 = getelementptr inbounds i32* %A, i64 %add13 + %0 = load i32* %arrayidx2, align 4 +; CHECK: da analyze - flow [<=|<]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 10 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long unsigned i = 0; i < 10; i++) { +;; A[4*i + 10] = ... +;; ... = A[2*i + 1]; + +define void @exact1(i32* %A, i32* %B) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %mul = shl i64 %i.02, 2 + %add = add i64 %mul, 10 + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv, i32* %arrayidx, align 4 + %mul1 = shl i64 %i.02, 1 + %add23 = or i64 %mul1, 1 + %arrayidx3 = getelementptr inbounds i32* %A, i64 %add23 + %0 = load i32* %arrayidx3, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 10 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long unsigned i = 0; i < 10; i++) { +;; A[6*i] = ... +;; ... 
= A[i + 60]; + +define void @exact2(i32* %A, i32* %B) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %mul = mul i64 %i.02, 6 + %arrayidx = getelementptr inbounds i32* %A, i64 %mul + store i32 %conv, i32* %arrayidx, align 4 + %add = add i64 %i.02, 60 + %arrayidx1 = getelementptr inbounds i32* %A, i64 %add + %0 = load i32* %arrayidx1, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 10 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long unsigned i = 0; i <= 10; i++) { +;; A[6*i] = ... +;; ... = A[i + 60]; + +define void @exact3(i32* %A, i32* %B) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %mul = mul i64 %i.02, 6 + %arrayidx = getelementptr inbounds i32* %A, i64 %mul + store i32 %conv, i32* %arrayidx, align 4 + %add = add i64 %i.02, 60 + %arrayidx1 = getelementptr inbounds i32* %A, i64 %add + %0 = load i32* %arrayidx1, align 4 +; CHECK: da analyze - flow [>]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 11 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long unsigned i = 0; i < 12; i++) { +;; A[6*i] = ... +;; ... 
= A[i + 60]; + +define void @exact4(i32* %A, i32* %B) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %mul = mul i64 %i.02, 6 + %arrayidx = getelementptr inbounds i32* %A, i64 %mul + store i32 %conv, i32* %arrayidx, align 4 + %add = add i64 %i.02, 60 + %arrayidx1 = getelementptr inbounds i32* %A, i64 %add + %0 = load i32* %arrayidx1, align 4 +; CHECK: da analyze - flow [>]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 12 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long unsigned i = 0; i <= 12; i++) { +;; A[6*i] = ... +;; ... = A[i + 60]; + +define void @exact5(i32* %A, i32* %B) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %mul = mul i64 %i.02, 6 + %arrayidx = getelementptr inbounds i32* %A, i64 %mul + store i32 %conv, i32* %arrayidx, align 4 + %add = add i64 %i.02, 60 + %arrayidx1 = getelementptr inbounds i32* %A, i64 %add + %0 = load i32* %arrayidx1, align 4 +; CHECK: da analyze - flow [=>|<]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 13 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long unsigned i = 0; i < 18; i++) { +;; A[6*i] = ... +;; ... 
= A[i + 60]; + +define void @exact6(i32* %A, i32* %B) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %mul = mul i64 %i.02, 6 + %arrayidx = getelementptr inbounds i32* %A, i64 %mul + store i32 %conv, i32* %arrayidx, align 4 + %add = add i64 %i.02, 60 + %arrayidx1 = getelementptr inbounds i32* %A, i64 %add + %0 = load i32* %arrayidx1, align 4 +; CHECK: da analyze - flow [=>|<]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 18 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long unsigned i = 0; i <= 18; i++) { +;; A[6*i] = ... +;; ... = A[i + 60]; + +define void @exact7(i32* %A, i32* %B) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %mul = mul i64 %i.02, 6 + %arrayidx = getelementptr inbounds i32* %A, i64 %mul + store i32 %conv, i32* %arrayidx, align 4 + %add = add i64 %i.02, 60 + %arrayidx1 = getelementptr inbounds i32* %A, i64 %add + %0 = load i32* %arrayidx1, align 4 +; CHECK: da analyze - flow [*|<]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 19 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long unsigned i = 0; i < 10; i++) { +;; A[-6*i] = ... +;; ... 
= A[-i - 60]; + +define void @exact8(i32* %A, i32* %B) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %mul = mul i64 %i.02, -6 + %arrayidx = getelementptr inbounds i32* %A, i64 %mul + store i32 %conv, i32* %arrayidx, align 4 + %sub1 = sub i64 -60, %i.02 + %arrayidx2 = getelementptr inbounds i32* %A, i64 %sub1 + %0 = load i32* %arrayidx2, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 10 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long unsigned i = 0; i <= 10; i++) { +;; A[-6*i] = ... +;; ... = A[-i - 60]; + +define void @exact9(i32* %A, i32* %B) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %mul = mul i64 %i.02, -6 + %arrayidx = getelementptr inbounds i32* %A, i64 %mul + store i32 %conv, i32* %arrayidx, align 4 + %sub1 = sub i64 -60, %i.02 + %arrayidx2 = getelementptr inbounds i32* %A, i64 %sub1 + %0 = load i32* %arrayidx2, align 4 +; CHECK: da analyze - flow [>]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 11 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long unsigned i = 0; i < 12; i++) { +;; A[-6*i] = ... +;; ... 
= A[-i - 60]; + +define void @exact10(i32* %A, i32* %B) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %mul = mul i64 %i.02, -6 + %arrayidx = getelementptr inbounds i32* %A, i64 %mul + store i32 %conv, i32* %arrayidx, align 4 + %sub1 = sub i64 -60, %i.02 + %arrayidx2 = getelementptr inbounds i32* %A, i64 %sub1 + %0 = load i32* %arrayidx2, align 4 +; CHECK: da analyze - flow [>]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 12 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long unsigned i = 0; i <= 12; i++) { +;; A[-6*i] = ... +;; ... = A[-i - 60]; + +define void @exact11(i32* %A, i32* %B) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %mul = mul i64 %i.02, -6 + %arrayidx = getelementptr inbounds i32* %A, i64 %mul + store i32 %conv, i32* %arrayidx, align 4 + %sub1 = sub i64 -60, %i.02 + %arrayidx2 = getelementptr inbounds i32* %A, i64 %sub1 + %0 = load i32* %arrayidx2, align 4 +; CHECK: da analyze - flow [=>|<]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 13 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long unsigned i = 0; i < 18; i++) { +;; A[-6*i] = ... +;; ... 
= A[-i - 60]; + +define void @exact12(i32* %A, i32* %B) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %mul = mul i64 %i.02, -6 + %arrayidx = getelementptr inbounds i32* %A, i64 %mul + store i32 %conv, i32* %arrayidx, align 4 + %sub1 = sub i64 -60, %i.02 + %arrayidx2 = getelementptr inbounds i32* %A, i64 %sub1 + %0 = load i32* %arrayidx2, align 4 +; CHECK: da analyze - flow [=>|<]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 18 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long unsigned i = 0; i <= 18; i++) { +;; A[-6*i] = ... +;; ... = A[-i - 60]; + +define void @exact13(i32* %A, i32* %B) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %mul = mul i64 %i.02, -6 + %arrayidx = getelementptr inbounds i32* %A, i64 %mul + store i32 %conv, i32* %arrayidx, align 4 + %sub1 = sub i64 -60, %i.02 + %arrayidx2 = getelementptr inbounds i32* %A, i64 %sub1 + %0 = load i32* %arrayidx2, align 4 +; CHECK: da analyze - flow [*|<]! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 19 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} diff --git a/test/Analysis/DependenceAnalysis/GCD.ll b/test/Analysis/DependenceAnalysis/GCD.ll new file mode 100644 index 0000000..94c93a8 --- /dev/null +++ b/test/Analysis/DependenceAnalysis/GCD.ll @@ -0,0 +1,597 @@ +; RUN: opt < %s -analyze -basicaa -da | FileCheck %s + +; ModuleID = 'GCD.bc' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.6.0" + + +;; for (long int i = 0; i < 100; i++) +;; for (long int j = 0; j < 100; j++) +;; A[2*i - 4*j] = ... +;; ... = A[6*i + 8*j]; + +define void @gcd0(i32* %A, i32* %B) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %entry, %for.inc8 + %B.addr.04 = phi i32* [ %B, %entry ], [ %scevgep, %for.inc8 ] + %i.03 = phi i64 [ 0, %entry ], [ %inc9, %for.inc8 ] + br label %for.body3 + +for.body3: ; preds = %for.cond1.preheader, %for.body3 + %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ] + %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ] + %conv = trunc i64 %i.03 to i32 + %mul = shl nsw i64 %i.03, 1 + %mul4 = shl nsw i64 %j.02, 2 + %sub = sub nsw i64 %mul, %mul4 + %arrayidx = getelementptr inbounds i32* %A, i64 %sub + store i32 %conv, i32* %arrayidx, align 4 + %mul5 = mul nsw i64 %i.03, 6 + %mul6 = shl nsw i64 %j.02, 3 + %add = add nsw i64 %mul5, %mul6 + %arrayidx7 = getelementptr inbounds i32* %A, i64 %add + %0 = load i32* %arrayidx7, align 4 +; CHECK: da analyze - flow [=> *|<]! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1 + store i32 %0, i32* %B.addr.11, align 4 + %inc = add nsw i64 %j.02, 1 + %exitcond = icmp ne i64 %inc, 100 + br i1 %exitcond, label %for.body3, label %for.inc8 + +for.inc8: ; preds = %for.body3 + %scevgep = getelementptr i32* %B.addr.04, i64 100 + %inc9 = add nsw i64 %i.03, 1 + %exitcond5 = icmp ne i64 %inc9, 100 + br i1 %exitcond5, label %for.cond1.preheader, label %for.end10 + +for.end10: ; preds = %for.inc8 + ret void +} + + +;; for (long int i = 0; i < 100; i++) +;; for (long int j = 0; j < 100; j++) +;; A[2*i - 4*j] = ... +;; ... = A[6*i + 8*j + 1]; + +define void @gcd1(i32* %A, i32* %B) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %entry, %for.inc9 + %B.addr.04 = phi i32* [ %B, %entry ], [ %scevgep, %for.inc9 ] + %i.03 = phi i64 [ 0, %entry ], [ %inc10, %for.inc9 ] + br label %for.body3 + +for.body3: ; preds = %for.cond1.preheader, %for.body3 + %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ] + %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ] + %conv = trunc i64 %i.03 to i32 + %mul = shl nsw i64 %i.03, 1 + %mul4 = shl nsw i64 %j.02, 2 + %sub = sub nsw i64 %mul, %mul4 + %arrayidx = getelementptr inbounds i32* %A, i64 %sub + store i32 %conv, i32* %arrayidx, align 4 + %mul5 = mul nsw i64 %i.03, 6 + %mul6 = shl nsw i64 %j.02, 3 + %add = add nsw i64 %mul5, %mul6 + %add7 = or i64 %add, 1 + %arrayidx8 = getelementptr inbounds i32* %A, i64 %add7 + %0 = load i32* %arrayidx8, align 4 +; CHECK: da analyze - none! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1 + store i32 %0, i32* %B.addr.11, align 4 + %inc = add nsw i64 %j.02, 1 + %exitcond = icmp ne i64 %inc, 100 + br i1 %exitcond, label %for.body3, label %for.inc9 + +for.inc9: ; preds = %for.body3 + %scevgep = getelementptr i32* %B.addr.04, i64 100 + %inc10 = add nsw i64 %i.03, 1 + %exitcond5 = icmp ne i64 %inc10, 100 + br i1 %exitcond5, label %for.cond1.preheader, label %for.end11 + +for.end11: ; preds = %for.inc9 + ret void +} + + +;; for (long int i = 0; i < 100; i++) +;; for (long int j = 0; j < 100; j++) +;; A[2*i - 4*j + 1] = ... +;; ... = A[6*i + 8*j]; + +define void @gcd2(i32* %A, i32* %B) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %entry, %for.inc9 + %B.addr.04 = phi i32* [ %B, %entry ], [ %scevgep, %for.inc9 ] + %i.03 = phi i64 [ 0, %entry ], [ %inc10, %for.inc9 ] + br label %for.body3 + +for.body3: ; preds = %for.cond1.preheader, %for.body3 + %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ] + %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ] + %conv = trunc i64 %i.03 to i32 + %mul = shl nsw i64 %i.03, 1 + %mul4 = shl nsw i64 %j.02, 2 + %sub = sub nsw i64 %mul, %mul4 + %add5 = or i64 %sub, 1 + %arrayidx = getelementptr inbounds i32* %A, i64 %add5 + store i32 %conv, i32* %arrayidx, align 4 + %mul5 = mul nsw i64 %i.03, 6 + %mul6 = shl nsw i64 %j.02, 3 + %add7 = add nsw i64 %mul5, %mul6 + %arrayidx8 = getelementptr inbounds i32* %A, i64 %add7 + %0 = load i32* %arrayidx8, align 4 +; CHECK: da analyze - none! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1 + store i32 %0, i32* %B.addr.11, align 4 + %inc = add nsw i64 %j.02, 1 + %exitcond = icmp ne i64 %inc, 100 + br i1 %exitcond, label %for.body3, label %for.inc9 + +for.inc9: ; preds = %for.body3 + %scevgep = getelementptr i32* %B.addr.04, i64 100 + %inc10 = add nsw i64 %i.03, 1 + %exitcond6 = icmp ne i64 %inc10, 100 + br i1 %exitcond6, label %for.cond1.preheader, label %for.end11 + +for.end11: ; preds = %for.inc9 + ret void +} + + +;; for (long int i = 0; i < 100; i++) +;; for (long int j = 0; j < 100; j++) +;; A[i + 2*j] = ... +;; ... = A[i + 2*j - 1]; + +define void @gcd3(i32* %A, i32* %B) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %entry, %for.inc7 + %B.addr.04 = phi i32* [ %B, %entry ], [ %scevgep, %for.inc7 ] + %i.03 = phi i64 [ 0, %entry ], [ %inc8, %for.inc7 ] + br label %for.body3 + +for.body3: ; preds = %for.cond1.preheader, %for.body3 + %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ] + %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ] + %conv = trunc i64 %i.03 to i32 + %mul = shl nsw i64 %j.02, 1 + %add = add nsw i64 %i.03, %mul + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv, i32* %arrayidx, align 4 + %mul4 = shl nsw i64 %j.02, 1 + %add5 = add nsw i64 %i.03, %mul4 + %sub = add nsw i64 %add5, -1 + %arrayidx6 = getelementptr inbounds i32* %A, i64 %sub + %0 = load i32* %arrayidx6, align 4 +; CHECK: da analyze - flow [<> *]! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1 + store i32 %0, i32* %B.addr.11, align 4 + %inc = add nsw i64 %j.02, 1 + %exitcond = icmp ne i64 %inc, 100 + br i1 %exitcond, label %for.body3, label %for.inc7 + +for.inc7: ; preds = %for.body3 + %scevgep = getelementptr i32* %B.addr.04, i64 100 + %inc8 = add nsw i64 %i.03, 1 + %exitcond5 = icmp ne i64 %inc8, 100 + br i1 %exitcond5, label %for.cond1.preheader, label %for.end9 + +for.end9: ; preds = %for.inc7 + ret void +} + + +;; void gcd4(int *A, int *B, long int M, long int N) { +;; for (long int i = 0; i < 100; i++) +;; for (long int j = 0; j < 100; j++) { +;; A[5*i + 10*j*M + 9*M*N] = i; +;; *B++ = A[15*i + 20*j*M - 21*N*M + 4]; + +define void @gcd4(i32* %A, i32* %B, i64 %M, i64 %N) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %entry, %for.inc17 + %B.addr.04 = phi i32* [ %B, %entry ], [ %scevgep, %for.inc17 ] + %i.03 = phi i64 [ 0, %entry ], [ %inc18, %for.inc17 ] + br label %for.body3 + +for.body3: ; preds = %for.cond1.preheader, %for.body3 + %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ] + %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ] + %conv = trunc i64 %i.03 to i32 + %mul = mul nsw i64 %i.03, 5 + %mul4 = mul nsw i64 %j.02, 10 + %mul5 = mul nsw i64 %mul4, %M + %add = add nsw i64 %mul, %mul5 + %mul6 = mul nsw i64 %M, 9 + %mul7 = mul nsw i64 %mul6, %N + %add8 = add nsw i64 %add, %mul7 + %arrayidx = getelementptr inbounds i32* %A, i64 %add8 + store i32 %conv, i32* %arrayidx, align 4 + %mul9 = mul nsw i64 %i.03, 15 + %mul10 = mul nsw i64 %j.02, 20 + %mul11 = mul nsw i64 %mul10, %M + %add12 = add nsw i64 %mul9, %mul11 + %mul13 = mul nsw i64 %N, 21 + %mul14 = mul nsw i64 %mul13, %M + %sub = sub nsw i64 %add12, %mul14 + %add15 = add nsw i64 %sub, 4 + %arrayidx16 = getelementptr inbounds i32* %A, i64 %add15 + %0 = load i32* %arrayidx16, align 4 +; CHECK: da analyze - none! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1 + store i32 %0, i32* %B.addr.11, align 4 + %inc = add nsw i64 %j.02, 1 + %exitcond = icmp ne i64 %inc, 100 + br i1 %exitcond, label %for.body3, label %for.inc17 + +for.inc17: ; preds = %for.body3 + %scevgep = getelementptr i32* %B.addr.04, i64 100 + %inc18 = add nsw i64 %i.03, 1 + %exitcond5 = icmp ne i64 %inc18, 100 + br i1 %exitcond5, label %for.cond1.preheader, label %for.end19 + +for.end19: ; preds = %for.inc17 + ret void +} + + +;; void gcd5(int *A, int *B, long int M, long int N) { +;; for (long int i = 0; i < 100; i++) +;; for (long int j = 0; j < 100; j++) { +;; A[5*i + 10*j*M + 9*M*N] = i; +;; *B++ = A[15*i + 20*j*M - 21*N*M + 5]; + +define void @gcd5(i32* %A, i32* %B, i64 %M, i64 %N) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %entry, %for.inc17 + %B.addr.04 = phi i32* [ %B, %entry ], [ %scevgep, %for.inc17 ] + %i.03 = phi i64 [ 0, %entry ], [ %inc18, %for.inc17 ] + br label %for.body3 + +for.body3: ; preds = %for.cond1.preheader, %for.body3 + %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ] + %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ] + %conv = trunc i64 %i.03 to i32 + %mul = mul nsw i64 %i.03, 5 + %mul4 = mul nsw i64 %j.02, 10 + %mul5 = mul nsw i64 %mul4, %M + %add = add nsw i64 %mul, %mul5 + %mul6 = mul nsw i64 %M, 9 + %mul7 = mul nsw i64 %mul6, %N + %add8 = add nsw i64 %add, %mul7 + %arrayidx = getelementptr inbounds i32* %A, i64 %add8 + store i32 %conv, i32* %arrayidx, align 4 + %mul9 = mul nsw i64 %i.03, 15 + %mul10 = mul nsw i64 %j.02, 20 + %mul11 = mul nsw i64 %mul10, %M + %add12 = add nsw i64 %mul9, %mul11 + %mul13 = mul nsw i64 %N, 21 + %mul14 = mul nsw i64 %mul13, %M + %sub = sub nsw i64 %add12, %mul14 + %add15 = add nsw i64 %sub, 5 + %arrayidx16 = getelementptr inbounds i32* %A, i64 %add15 + %0 = load i32* %arrayidx16, align 4 +; CHECK: da analyze - flow [<> *]! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1 + store i32 %0, i32* %B.addr.11, align 4 + %inc = add nsw i64 %j.02, 1 + %exitcond = icmp ne i64 %inc, 100 + br i1 %exitcond, label %for.body3, label %for.inc17 + +for.inc17: ; preds = %for.body3 + %scevgep = getelementptr i32* %B.addr.04, i64 100 + %inc18 = add nsw i64 %i.03, 1 + %exitcond5 = icmp ne i64 %inc18, 100 + br i1 %exitcond5, label %for.cond1.preheader, label %for.end19 + +for.end19: ; preds = %for.inc17 + ret void +} + + +;; void gcd6(long int n, int A[][n], int *B) { +;; for (long int i = 0; i < n; i++) +;; for (long int j = 0; j < n; j++) { +;; A[2*i][4*j] = i; +;; *B++ = A[8*i][6*j + 1]; + +define void @gcd6(i64 %n, i32* %A, i32* %B) nounwind uwtable ssp { +entry: + %cmp4 = icmp sgt i64 %n, 0 + br i1 %cmp4, label %for.cond1.preheader.preheader, label %for.end12 + +for.cond1.preheader.preheader: ; preds = %entry + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.inc10 + %i.06 = phi i64 [ %inc11, %for.inc10 ], [ 0, %for.cond1.preheader.preheader ] + %B.addr.05 = phi i32* [ %B.addr.1.lcssa, %for.inc10 ], [ %B, %for.cond1.preheader.preheader ] + %cmp21 = icmp sgt i64 %n, 0 + br i1 %cmp21, label %for.body3.preheader, label %for.inc10 + +for.body3.preheader: ; preds = %for.cond1.preheader + br label %for.body3 + +for.body3: ; preds = %for.body3.preheader, %for.body3 + %j.03 = phi i64 [ %inc, %for.body3 ], [ 0, %for.body3.preheader ] + %B.addr.12 = phi i32* [ %incdec.ptr, %for.body3 ], [ %B.addr.05, %for.body3.preheader ] + %conv = trunc i64 %i.06 to i32 + %mul = shl nsw i64 %j.03, 2 + %mul4 = shl nsw i64 %i.06, 1 + %0 = mul nsw i64 %mul4, %n + %arrayidx.sum = add i64 %0, %mul + %arrayidx5 = getelementptr inbounds i32* %A, i64 %arrayidx.sum + store i32 %conv, i32* %arrayidx5, align 4 + %mul6 = mul nsw i64 %j.03, 6 + %add7 = or i64 %mul6, 1 + %mul7 = shl nsw i64 %i.06, 3 + %1 = mul nsw i64 %mul7, %n + %arrayidx8.sum = add i64 %1, %add7 + 
%arrayidx9 = getelementptr inbounds i32* %A, i64 %arrayidx8.sum + %2 = load i32* %arrayidx9, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.12, i64 1 + store i32 %2, i32* %B.addr.12, align 4 + %inc = add nsw i64 %j.03, 1 + %exitcond = icmp ne i64 %inc, %n + br i1 %exitcond, label %for.body3, label %for.inc10.loopexit + +for.inc10.loopexit: ; preds = %for.body3 + %scevgep = getelementptr i32* %B.addr.05, i64 %n + br label %for.inc10 + +for.inc10: ; preds = %for.inc10.loopexit, %for.cond1.preheader + %B.addr.1.lcssa = phi i32* [ %B.addr.05, %for.cond1.preheader ], [ %scevgep, %for.inc10.loopexit ] + %inc11 = add nsw i64 %i.06, 1 + %exitcond8 = icmp ne i64 %inc11, %n + br i1 %exitcond8, label %for.cond1.preheader, label %for.end12.loopexit + +for.end12.loopexit: ; preds = %for.inc10 + br label %for.end12 + +for.end12: ; preds = %for.end12.loopexit, %entry + ret void +} + + +;; void gcd7(int n, int A[][n], int *B) { +;; for (int i = 0; i < n; i++) +;; for (int j = 0; j < n; j++) { +;; A[2*i][4*j] = i; +;; *B++ = A[8*i][6*j + 1]; + +define void @gcd7(i32 %n, i32* %A, i32* %B) nounwind uwtable ssp { +entry: + %0 = zext i32 %n to i64 + %cmp4 = icmp sgt i32 %n, 0 + br i1 %cmp4, label %for.cond1.preheader.preheader, label %for.end15 + +for.cond1.preheader.preheader: ; preds = %entry + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.inc13 + %indvars.iv8 = phi i64 [ 0, %for.cond1.preheader.preheader ], [ %indvars.iv.next9, %for.inc13 ] + %B.addr.05 = phi i32* [ %B.addr.1.lcssa, %for.inc13 ], [ %B, %for.cond1.preheader.preheader ] + %1 = add i32 %n, -1 + %2 = zext i32 %1 to i64 + %3 = add i64 %2, 1 + %cmp21 = icmp sgt i32 %n, 0 + br i1 %cmp21, label %for.body3.preheader, label %for.inc13 + +for.body3.preheader: ; preds = %for.cond1.preheader + br label %for.body3 + +for.body3: ; preds = %for.body3.preheader, %for.body3 + %indvars.iv = phi i64 [ 0, %for.body3.preheader ], [ 
%indvars.iv.next, %for.body3 ] + %B.addr.12 = phi i32* [ %incdec.ptr, %for.body3 ], [ %B.addr.05, %for.body3.preheader ] + %4 = trunc i64 %indvars.iv to i32 + %mul = shl nsw i32 %4, 2 + %idxprom = sext i32 %mul to i64 + %5 = trunc i64 %indvars.iv8 to i32 + %mul4 = shl nsw i32 %5, 1 + %idxprom5 = sext i32 %mul4 to i64 + %6 = mul nsw i64 %idxprom5, %0 + %arrayidx.sum = add i64 %6, %idxprom + %arrayidx6 = getelementptr inbounds i32* %A, i64 %arrayidx.sum + %7 = trunc i64 %indvars.iv8 to i32 + store i32 %7, i32* %arrayidx6, align 4 + %8 = trunc i64 %indvars.iv to i32 + %mul7 = mul nsw i32 %8, 6 + %add7 = or i32 %mul7, 1 + %idxprom8 = sext i32 %add7 to i64 + %9 = trunc i64 %indvars.iv8 to i32 + %mul9 = shl nsw i32 %9, 3 + %idxprom10 = sext i32 %mul9 to i64 + %10 = mul nsw i64 %idxprom10, %0 + %arrayidx11.sum = add i64 %10, %idxprom8 + %arrayidx12 = getelementptr inbounds i32* %A, i64 %arrayidx11.sum + %11 = load i32* %arrayidx12, align 4 +; CHECK: da analyze - flow [* *|<]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.12, i64 1 + store i32 %11, i32* %B.addr.12, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.body3, label %for.inc13.loopexit + +for.inc13.loopexit: ; preds = %for.body3 + %scevgep = getelementptr i32* %B.addr.05, i64 %3 + br label %for.inc13 + +for.inc13: ; preds = %for.inc13.loopexit, %for.cond1.preheader + %B.addr.1.lcssa = phi i32* [ %B.addr.05, %for.cond1.preheader ], [ %scevgep, %for.inc13.loopexit ] + %indvars.iv.next9 = add i64 %indvars.iv8, 1 + %lftr.wideiv10 = trunc i64 %indvars.iv.next9 to i32 + %exitcond11 = icmp ne i32 %lftr.wideiv10, %n + br i1 %exitcond11, label %for.cond1.preheader, label %for.end15.loopexit + +for.end15.loopexit: ; preds = %for.inc13 + br label %for.end15 + +for.end15: ; preds = %for.end15.loopexit, %entry + ret void +} + + +;; void gcd8(int n, int *A, int *B) { +;; for (int i = 0; i < n; 
i++) +;; for (int j = 0; j < n; j++) { +;; A[n*2*i + 4*j] = i; +;; *B++ = A[n*8*i + 6*j + 1]; + +define void @gcd8(i32 %n, i32* %A, i32* %B) nounwind uwtable ssp { +entry: + %cmp4 = icmp sgt i32 %n, 0 + br i1 %cmp4, label %for.cond1.preheader.preheader, label %for.end15 + +for.cond1.preheader.preheader: ; preds = %entry + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.inc13 + %i.06 = phi i32 [ %inc14, %for.inc13 ], [ 0, %for.cond1.preheader.preheader ] + %B.addr.05 = phi i32* [ %B.addr.1.lcssa, %for.inc13 ], [ %B, %for.cond1.preheader.preheader ] + %0 = add i32 %n, -1 + %1 = zext i32 %0 to i64 + %2 = add i64 %1, 1 + %cmp21 = icmp sgt i32 %n, 0 + br i1 %cmp21, label %for.body3.preheader, label %for.inc13 + +for.body3.preheader: ; preds = %for.cond1.preheader + br label %for.body3 + +for.body3: ; preds = %for.body3.preheader, %for.body3 + %indvars.iv = phi i64 [ 0, %for.body3.preheader ], [ %indvars.iv.next, %for.body3 ] + %B.addr.12 = phi i32* [ %incdec.ptr, %for.body3 ], [ %B.addr.05, %for.body3.preheader ] + %mul = shl nsw i32 %n, 1 + %mul4 = mul nsw i32 %mul, %i.06 + %3 = trunc i64 %indvars.iv to i32 + %mul5 = shl nsw i32 %3, 2 + %add = add nsw i32 %mul4, %mul5 + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds i32* %A, i64 %idxprom + store i32 %i.06, i32* %arrayidx, align 4 + %mul6 = shl nsw i32 %n, 3 + %mul7 = mul nsw i32 %mul6, %i.06 + %4 = trunc i64 %indvars.iv to i32 + %mul8 = mul nsw i32 %4, 6 + %add9 = add nsw i32 %mul7, %mul8 + %add10 = or i32 %add9, 1 + %idxprom11 = sext i32 %add10 to i64 + %arrayidx12 = getelementptr inbounds i32* %A, i64 %idxprom11 + %5 = load i32* %arrayidx12, align 4 +; CHECK: da analyze - none! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.12, i64 1 + store i32 %5, i32* %B.addr.12, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.body3, label %for.inc13.loopexit + +for.inc13.loopexit: ; preds = %for.body3 + %scevgep = getelementptr i32* %B.addr.05, i64 %2 + br label %for.inc13 + +for.inc13: ; preds = %for.inc13.loopexit, %for.cond1.preheader + %B.addr.1.lcssa = phi i32* [ %B.addr.05, %for.cond1.preheader ], [ %scevgep, %for.inc13.loopexit ] + %inc14 = add nsw i32 %i.06, 1 + %exitcond7 = icmp ne i32 %inc14, %n + br i1 %exitcond7, label %for.cond1.preheader, label %for.end15.loopexit + +for.end15.loopexit: ; preds = %for.inc13 + br label %for.end15 + +for.end15: ; preds = %for.end15.loopexit, %entry + ret void +} + + +;; void gcd9(unsigned n, int A[][n], int *B) { +;; for (unsigned i = 0; i < n; i++) +;; for (unsigned j = 0; j < n; j++) { +;; A[2*i][4*j] = i; +;; *B++ = A[8*i][6*j + 1]; + +define void @gcd9(i32 %n, i32* %A, i32* %B) nounwind uwtable ssp { +entry: + %0 = zext i32 %n to i64 + %cmp4 = icmp eq i32 %n, 0 + br i1 %cmp4, label %for.end15, label %for.cond1.preheader.preheader + +for.cond1.preheader.preheader: ; preds = %entry + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.inc13 + %indvars.iv8 = phi i64 [ 0, %for.cond1.preheader.preheader ], [ %indvars.iv.next9, %for.inc13 ] + %B.addr.05 = phi i32* [ %B.addr.1.lcssa, %for.inc13 ], [ %B, %for.cond1.preheader.preheader ] + %1 = add i32 %n, -1 + %2 = zext i32 %1 to i64 + %3 = add i64 %2, 1 + %cmp21 = icmp eq i32 %n, 0 + br i1 %cmp21, label %for.inc13, label %for.body3.preheader + +for.body3.preheader: ; preds = %for.cond1.preheader + br label %for.body3 + +for.body3: ; preds = %for.body3.preheader, %for.body3 + %indvars.iv = phi i64 [ 0, %for.body3.preheader ], [ %indvars.iv.next, %for.body3 ] + %B.addr.12 = phi i32* 
[ %incdec.ptr, %for.body3 ], [ %B.addr.05, %for.body3.preheader ] + %4 = trunc i64 %indvars.iv to i32 + %mul = shl i32 %4, 2 + %idxprom = zext i32 %mul to i64 + %5 = trunc i64 %indvars.iv8 to i32 + %mul4 = shl i32 %5, 1 + %idxprom5 = zext i32 %mul4 to i64 + %6 = mul nsw i64 %idxprom5, %0 + %arrayidx.sum = add i64 %6, %idxprom + %arrayidx6 = getelementptr inbounds i32* %A, i64 %arrayidx.sum + %7 = trunc i64 %indvars.iv8 to i32 + store i32 %7, i32* %arrayidx6, align 4 + %8 = trunc i64 %indvars.iv to i32 + %mul7 = mul i32 %8, 6 + %add7 = or i32 %mul7, 1 + %idxprom8 = zext i32 %add7 to i64 + %9 = trunc i64 %indvars.iv8 to i32 + %mul9 = shl i32 %9, 3 + %idxprom10 = zext i32 %mul9 to i64 + %10 = mul nsw i64 %idxprom10, %0 + %arrayidx11.sum = add i64 %10, %idxprom8 + %arrayidx12 = getelementptr inbounds i32* %A, i64 %arrayidx11.sum + %11 = load i32* %arrayidx12, align 4 +; CHECK: da analyze - flow [* *|<]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.12, i64 1 + store i32 %11, i32* %B.addr.12, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.body3, label %for.inc13.loopexit + +for.inc13.loopexit: ; preds = %for.body3 + %scevgep = getelementptr i32* %B.addr.05, i64 %3 + br label %for.inc13 + +for.inc13: ; preds = %for.inc13.loopexit, %for.cond1.preheader + %B.addr.1.lcssa = phi i32* [ %B.addr.05, %for.cond1.preheader ], [ %scevgep, %for.inc13.loopexit ] + %indvars.iv.next9 = add i64 %indvars.iv8, 1 + %lftr.wideiv10 = trunc i64 %indvars.iv.next9 to i32 + %exitcond11 = icmp ne i32 %lftr.wideiv10, %n + br i1 %exitcond11, label %for.cond1.preheader, label %for.end15.loopexit + +for.end15.loopexit: ; preds = %for.inc13 + br label %for.end15 + +for.end15: ; preds = %for.end15.loopexit, %entry + ret void +} diff --git a/test/Analysis/DependenceAnalysis/Preliminary.ll b/test/Analysis/DependenceAnalysis/Preliminary.ll new file mode 100644 index 
0000000..3ef63fd --- /dev/null +++ b/test/Analysis/DependenceAnalysis/Preliminary.ll @@ -0,0 +1,469 @@ +; RUN: opt < %s -analyze -basicaa -indvars -da | FileCheck %s + +; This series of tests is more interesting when debugging is enabled. + +; ModuleID = 'Preliminary.bc' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.6.0" + + +;; may alias +;; int p0(int n, int *A, int *B) { +;; A[0] = n; +;; return B[1]; + +define i32 @p0(i32 %n, i32* %A, i32* %B) nounwind uwtable ssp { +entry: + store i32 %n, i32* %A, align 4 + %arrayidx1 = getelementptr inbounds i32* %B, i64 1 + %0 = load i32* %arrayidx1, align 4 +; CHECK: da analyze - confused! + ret i32 %0 +} + + +;; no alias +;; int p1(int n, int *restrict A, int *restrict B) { +;; A[0] = n; +;; return B[1]; + +define i32 @p1(i32 %n, i32* noalias %A, i32* noalias %B) nounwind uwtable ssp { +entry: + store i32 %n, i32* %A, align 4 + %arrayidx1 = getelementptr inbounds i32* %B, i64 1 + %0 = load i32* %arrayidx1, align 4 +; CHECK: da analyze - none! + ret i32 %0 +} + +;; check loop nesting levels +;; for (long int i = 0; i < n; i++) +;; for (long int j = 0; j < n; j++) +;; for (long int k = 0; k < n; k++) +;; A[i][j][k] = ... +;; for (long int k = 0; k < n; k++) +;; ... 
= A[i + 3][j + 2][k + 1]; + +define void @p2(i64 %n, [100 x [100 x i64]]* %A, i64* %B) nounwind uwtable ssp { +entry: + %cmp10 = icmp sgt i64 %n, 0 + br i1 %cmp10, label %for.cond1.preheader, label %for.end26 + +for.cond1.preheader: ; preds = %for.inc24, %entry + %B.addr.012 = phi i64* [ %B.addr.1.lcssa, %for.inc24 ], [ %B, %entry ] + %i.011 = phi i64 [ %inc25, %for.inc24 ], [ 0, %entry ] + %cmp26 = icmp sgt i64 %n, 0 + br i1 %cmp26, label %for.cond4.preheader, label %for.inc24 + +for.cond4.preheader: ; preds = %for.inc21, %for.cond1.preheader + %B.addr.18 = phi i64* [ %B.addr.2.lcssa, %for.inc21 ], [ %B.addr.012, %for.cond1.preheader ] + %j.07 = phi i64 [ %inc22, %for.inc21 ], [ 0, %for.cond1.preheader ] + %cmp51 = icmp sgt i64 %n, 0 + br i1 %cmp51, label %for.body6, label %for.cond10.loopexit + +for.body6: ; preds = %for.body6, %for.cond4.preheader + %k.02 = phi i64 [ %inc, %for.body6 ], [ 0, %for.cond4.preheader ] + %arrayidx8 = getelementptr inbounds [100 x [100 x i64]]* %A, i64 %i.011, i64 %j.07, i64 %k.02 + store i64 %i.011, i64* %arrayidx8, align 8 + %inc = add nsw i64 %k.02, 1 + %cmp5 = icmp slt i64 %inc, %n + br i1 %cmp5, label %for.body6, label %for.cond10.loopexit + +for.cond10.loopexit: ; preds = %for.body6, %for.cond4.preheader + %cmp113 = icmp sgt i64 %n, 0 + br i1 %cmp113, label %for.body12, label %for.inc21 + +for.body12: ; preds = %for.body12, %for.cond10.loopexit + %k9.05 = phi i64 [ %inc19, %for.body12 ], [ 0, %for.cond10.loopexit ] + %B.addr.24 = phi i64* [ %incdec.ptr, %for.body12 ], [ %B.addr.18, %for.cond10.loopexit ] + %add = add nsw i64 %k9.05, 1 + %add13 = add nsw i64 %j.07, 2 + %add14 = add nsw i64 %i.011, 3 + %arrayidx17 = getelementptr inbounds [100 x [100 x i64]]* %A, i64 %add14, i64 %add13, i64 %add + %0 = load i64* %arrayidx17, align 8 +; CHECK: da analyze - flow [-3 -2]! 
+ %incdec.ptr = getelementptr inbounds i64* %B.addr.24, i64 1 + store i64 %0, i64* %B.addr.24, align 8 + %inc19 = add nsw i64 %k9.05, 1 + %cmp11 = icmp slt i64 %inc19, %n + br i1 %cmp11, label %for.body12, label %for.inc21 + +for.inc21: ; preds = %for.body12, %for.cond10.loopexit + %B.addr.2.lcssa = phi i64* [ %B.addr.18, %for.cond10.loopexit ], [ %incdec.ptr, %for.body12 ] + %inc22 = add nsw i64 %j.07, 1 + %cmp2 = icmp slt i64 %inc22, %n + br i1 %cmp2, label %for.cond4.preheader, label %for.inc24 + +for.inc24: ; preds = %for.inc21, %for.cond1.preheader + %B.addr.1.lcssa = phi i64* [ %B.addr.012, %for.cond1.preheader ], [ %B.addr.2.lcssa, %for.inc21 ] + %inc25 = add nsw i64 %i.011, 1 + %cmp = icmp slt i64 %inc25, %n + br i1 %cmp, label %for.cond1.preheader, label %for.end26 + +for.end26: ; preds = %for.inc24, %entry + ret void +} + + +;; classify subscripts +;; for (long int i = 0; i < n; i++) +;; for (long int j = 0; j < n; j++) +;; for (long int k = 0; k < n; k++) +;; for (long int l = 0; l < n; l++) +;; for (long int m = 0; m < n; m++) +;; for (long int o = 0; o < n; o++) +;; for (long int p = 0; p < n; p++) +;; for (long int q = 0; q < n; q++) +;; for (long int r = 0; r < n; r++) +;; for (long int s = 0; s < n; s++) +;; for (long int u = 0; u < n; u++) +;; for (long int t = 0; t < n; t++) { +;; A[i - 3] [j] [2] [k-1] [2*l + 1] [m] [p + q] [r + s] = ... +;; ... 
= A[i + 3] [2] [u] [1-k] [3*l - 1] [o] [1 + n] [t + 2]; + +define void @p3(i64 %n, [100 x [100 x [100 x [100 x [100 x [100 x [100 x i64]]]]]]]* %A, i64* %B) nounwind uwtable ssp { +entry: + %cmp44 = icmp sgt i64 %n, 0 + br i1 %cmp44, label %for.cond1.preheader, label %for.end90 + +for.cond1.preheader: ; preds = %for.inc88, %entry + %B.addr.046 = phi i64* [ %B.addr.1.lcssa, %for.inc88 ], [ %B, %entry ] + %i.045 = phi i64 [ %inc89, %for.inc88 ], [ 0, %entry ] + %cmp240 = icmp sgt i64 %n, 0 + br i1 %cmp240, label %for.cond4.preheader, label %for.inc88 + +for.cond4.preheader: ; preds = %for.inc85, %for.cond1.preheader + %B.addr.142 = phi i64* [ %B.addr.2.lcssa, %for.inc85 ], [ %B.addr.046, %for.cond1.preheader ] + %j.041 = phi i64 [ %inc86, %for.inc85 ], [ 0, %for.cond1.preheader ] + %cmp536 = icmp sgt i64 %n, 0 + br i1 %cmp536, label %for.cond7.preheader, label %for.inc85 + +for.cond7.preheader: ; preds = %for.inc82, %for.cond4.preheader + %B.addr.238 = phi i64* [ %B.addr.3.lcssa, %for.inc82 ], [ %B.addr.142, %for.cond4.preheader ] + %k.037 = phi i64 [ %inc83, %for.inc82 ], [ 0, %for.cond4.preheader ] + %cmp832 = icmp sgt i64 %n, 0 + br i1 %cmp832, label %for.cond10.preheader, label %for.inc82 + +for.cond10.preheader: ; preds = %for.inc79, %for.cond7.preheader + %B.addr.334 = phi i64* [ %B.addr.4.lcssa, %for.inc79 ], [ %B.addr.238, %for.cond7.preheader ] + %l.033 = phi i64 [ %inc80, %for.inc79 ], [ 0, %for.cond7.preheader ] + %cmp1128 = icmp sgt i64 %n, 0 + br i1 %cmp1128, label %for.cond13.preheader, label %for.inc79 + +for.cond13.preheader: ; preds = %for.inc76, %for.cond10.preheader + %B.addr.430 = phi i64* [ %B.addr.5.lcssa, %for.inc76 ], [ %B.addr.334, %for.cond10.preheader ] + %m.029 = phi i64 [ %inc77, %for.inc76 ], [ 0, %for.cond10.preheader ] + %cmp1424 = icmp sgt i64 %n, 0 + br i1 %cmp1424, label %for.cond16.preheader, label %for.inc76 + +for.cond16.preheader: ; preds = %for.inc73, %for.cond13.preheader + %B.addr.526 = phi i64* [ %B.addr.6.lcssa, %for.inc73 
], [ %B.addr.430, %for.cond13.preheader ] + %o.025 = phi i64 [ %inc74, %for.inc73 ], [ 0, %for.cond13.preheader ] + %cmp1720 = icmp sgt i64 %n, 0 + br i1 %cmp1720, label %for.cond19.preheader, label %for.inc73 + +for.cond19.preheader: ; preds = %for.inc70, %for.cond16.preheader + %B.addr.622 = phi i64* [ %B.addr.7.lcssa, %for.inc70 ], [ %B.addr.526, %for.cond16.preheader ] + %p.021 = phi i64 [ %inc71, %for.inc70 ], [ 0, %for.cond16.preheader ] + %cmp2016 = icmp sgt i64 %n, 0 + br i1 %cmp2016, label %for.cond22.preheader, label %for.inc70 + +for.cond22.preheader: ; preds = %for.inc67, %for.cond19.preheader + %B.addr.718 = phi i64* [ %B.addr.8.lcssa, %for.inc67 ], [ %B.addr.622, %for.cond19.preheader ] + %q.017 = phi i64 [ %inc68, %for.inc67 ], [ 0, %for.cond19.preheader ] + %cmp2312 = icmp sgt i64 %n, 0 + br i1 %cmp2312, label %for.cond25.preheader, label %for.inc67 + +for.cond25.preheader: ; preds = %for.inc64, %for.cond22.preheader + %B.addr.814 = phi i64* [ %B.addr.9.lcssa, %for.inc64 ], [ %B.addr.718, %for.cond22.preheader ] + %r.013 = phi i64 [ %inc65, %for.inc64 ], [ 0, %for.cond22.preheader ] + %cmp268 = icmp sgt i64 %n, 0 + br i1 %cmp268, label %for.cond28.preheader, label %for.inc64 + +for.cond28.preheader: ; preds = %for.inc61, %for.cond25.preheader + %B.addr.910 = phi i64* [ %B.addr.10.lcssa, %for.inc61 ], [ %B.addr.814, %for.cond25.preheader ] + %s.09 = phi i64 [ %inc62, %for.inc61 ], [ 0, %for.cond25.preheader ] + %cmp294 = icmp sgt i64 %n, 0 + br i1 %cmp294, label %for.cond31.preheader, label %for.inc61 + +for.cond31.preheader: ; preds = %for.inc58, %for.cond28.preheader + %u.06 = phi i64 [ %inc59, %for.inc58 ], [ 0, %for.cond28.preheader ] + %B.addr.105 = phi i64* [ %B.addr.11.lcssa, %for.inc58 ], [ %B.addr.910, %for.cond28.preheader ] + %cmp321 = icmp sgt i64 %n, 0 + br i1 %cmp321, label %for.body33, label %for.inc58 + +for.body33: ; preds = %for.body33, %for.cond31.preheader + %t.03 = phi i64 [ %inc, %for.body33 ], [ 0, %for.cond31.preheader ] + 
%B.addr.112 = phi i64* [ %incdec.ptr, %for.body33 ], [ %B.addr.105, %for.cond31.preheader ] + %add = add nsw i64 %r.013, %s.09 + %add34 = add nsw i64 %p.021, %q.017 + %mul = shl nsw i64 %l.033, 1 + %add3547 = or i64 %mul, 1 + %sub = add nsw i64 %k.037, -1 + %sub36 = add nsw i64 %i.045, -3 + %arrayidx43 = getelementptr inbounds [100 x [100 x [100 x [100 x [100 x [100 x [100 x i64]]]]]]]* %A, i64 %sub36, i64 %j.041, i64 2, i64 %sub, i64 %add3547, i64 %m.029, i64 %add34, i64 %add + store i64 %i.045, i64* %arrayidx43, align 8 + %add44 = add nsw i64 %t.03, 2 + %add45 = add nsw i64 %n, 1 + %mul46 = mul nsw i64 %l.033, 3 + %sub47 = add nsw i64 %mul46, -1 + %sub48 = sub nsw i64 1, %k.037 + %add49 = add nsw i64 %i.045, 3 + %arrayidx57 = getelementptr inbounds [100 x [100 x [100 x [100 x [100 x [100 x [100 x i64]]]]]]]* %A, i64 %add49, i64 2, i64 %u.06, i64 %sub48, i64 %sub47, i64 %o.025, i64 %add45, i64 %add44 + %0 = load i64* %arrayidx57, align 8 +; CHECK: da analyze - flow [-6 * * => * * * * * * * *] splitable! +; CHECK: da analyze - split level = 3, iteration = 1! 
+ %incdec.ptr = getelementptr inbounds i64* %B.addr.112, i64 1 + store i64 %0, i64* %B.addr.112, align 8 + %inc = add nsw i64 %t.03, 1 + %cmp32 = icmp slt i64 %inc, %n + br i1 %cmp32, label %for.body33, label %for.inc58 + +for.inc58: ; preds = %for.body33, %for.cond31.preheader + %B.addr.11.lcssa = phi i64* [ %B.addr.105, %for.cond31.preheader ], [ %incdec.ptr, %for.body33 ] + %inc59 = add nsw i64 %u.06, 1 + %cmp29 = icmp slt i64 %inc59, %n + br i1 %cmp29, label %for.cond31.preheader, label %for.inc61 + +for.inc61: ; preds = %for.inc58, %for.cond28.preheader + %B.addr.10.lcssa = phi i64* [ %B.addr.910, %for.cond28.preheader ], [ %B.addr.11.lcssa, %for.inc58 ] + %inc62 = add nsw i64 %s.09, 1 + %cmp26 = icmp slt i64 %inc62, %n + br i1 %cmp26, label %for.cond28.preheader, label %for.inc64 + +for.inc64: ; preds = %for.inc61, %for.cond25.preheader + %B.addr.9.lcssa = phi i64* [ %B.addr.814, %for.cond25.preheader ], [ %B.addr.10.lcssa, %for.inc61 ] + %inc65 = add nsw i64 %r.013, 1 + %cmp23 = icmp slt i64 %inc65, %n + br i1 %cmp23, label %for.cond25.preheader, label %for.inc67 + +for.inc67: ; preds = %for.inc64, %for.cond22.preheader + %B.addr.8.lcssa = phi i64* [ %B.addr.718, %for.cond22.preheader ], [ %B.addr.9.lcssa, %for.inc64 ] + %inc68 = add nsw i64 %q.017, 1 + %cmp20 = icmp slt i64 %inc68, %n + br i1 %cmp20, label %for.cond22.preheader, label %for.inc70 + +for.inc70: ; preds = %for.inc67, %for.cond19.preheader + %B.addr.7.lcssa = phi i64* [ %B.addr.622, %for.cond19.preheader ], [ %B.addr.8.lcssa, %for.inc67 ] + %inc71 = add nsw i64 %p.021, 1 + %cmp17 = icmp slt i64 %inc71, %n + br i1 %cmp17, label %for.cond19.preheader, label %for.inc73 + +for.inc73: ; preds = %for.inc70, %for.cond16.preheader + %B.addr.6.lcssa = phi i64* [ %B.addr.526, %for.cond16.preheader ], [ %B.addr.7.lcssa, %for.inc70 ] + %inc74 = add nsw i64 %o.025, 1 + %cmp14 = icmp slt i64 %inc74, %n + br i1 %cmp14, label %for.cond16.preheader, label %for.inc76 + +for.inc76: ; preds = %for.inc73, 
%for.cond13.preheader + %B.addr.5.lcssa = phi i64* [ %B.addr.430, %for.cond13.preheader ], [ %B.addr.6.lcssa, %for.inc73 ] + %inc77 = add nsw i64 %m.029, 1 + %cmp11 = icmp slt i64 %inc77, %n + br i1 %cmp11, label %for.cond13.preheader, label %for.inc79 + +for.inc79: ; preds = %for.inc76, %for.cond10.preheader + %B.addr.4.lcssa = phi i64* [ %B.addr.334, %for.cond10.preheader ], [ %B.addr.5.lcssa, %for.inc76 ] + %inc80 = add nsw i64 %l.033, 1 + %cmp8 = icmp slt i64 %inc80, %n + br i1 %cmp8, label %for.cond10.preheader, label %for.inc82 + +for.inc82: ; preds = %for.inc79, %for.cond7.preheader + %B.addr.3.lcssa = phi i64* [ %B.addr.238, %for.cond7.preheader ], [ %B.addr.4.lcssa, %for.inc79 ] + %inc83 = add nsw i64 %k.037, 1 + %cmp5 = icmp slt i64 %inc83, %n + br i1 %cmp5, label %for.cond7.preheader, label %for.inc85 + +for.inc85: ; preds = %for.inc82, %for.cond4.preheader + %B.addr.2.lcssa = phi i64* [ %B.addr.142, %for.cond4.preheader ], [ %B.addr.3.lcssa, %for.inc82 ] + %inc86 = add nsw i64 %j.041, 1 + %cmp2 = icmp slt i64 %inc86, %n + br i1 %cmp2, label %for.cond4.preheader, label %for.inc88 + +for.inc88: ; preds = %for.inc85, %for.cond1.preheader + %B.addr.1.lcssa = phi i64* [ %B.addr.046, %for.cond1.preheader ], [ %B.addr.2.lcssa, %for.inc85 ] + %inc89 = add nsw i64 %i.045, 1 + %cmp = icmp slt i64 %inc89, %n + br i1 %cmp, label %for.cond1.preheader, label %for.end90 + +for.end90: ; preds = %for.inc88, %entry + ret void +} + + +;; cleanup around chars, shorts, ints +;;void p4(int *A, int *B, long int n) +;; for (char i = 0; i < n; i++) +;; A[i + 2] = ... +;; ... 
= A[i]; + +define void @p4(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + %cmp1 = icmp sgt i64 %n, 0 + br i1 %cmp1, label %for.body, label %for.end + +for.body: ; preds = %for.body, %entry + %i.03 = phi i8 [ %inc, %for.body ], [ 0, %entry ] + %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ] + %conv2 = sext i8 %i.03 to i32 + %conv3 = sext i8 %i.03 to i64 + %add = add i64 %conv3, 2 + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv2, i32* %arrayidx, align 4 + %idxprom4 = sext i8 %i.03 to i64 + %arrayidx5 = getelementptr inbounds i32* %A, i64 %idxprom4 + %0 = load i32* %arrayidx5, align 4 +; CHECK: da analyze - flow [*|<]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1 + store i32 %0, i32* %B.addr.02, align 4 + %inc = add i8 %i.03, 1 + %conv = sext i8 %inc to i64 + %cmp = icmp slt i64 %conv, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + +;;void p5(int *A, int *B, long int n) +;; for (short i = 0; i < n; i++) +;; A[i + 2] = ... +;; ... = A[i]; + +define void @p5(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + %cmp1 = icmp sgt i64 %n, 0 + br i1 %cmp1, label %for.body, label %for.end + +for.body: ; preds = %for.body, %entry + %i.03 = phi i16 [ %inc, %for.body ], [ 0, %entry ] + %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ] + %conv2 = sext i16 %i.03 to i32 + %conv3 = sext i16 %i.03 to i64 + %add = add i64 %conv3, 2 + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv2, i32* %arrayidx, align 4 + %idxprom4 = sext i16 %i.03 to i64 + %arrayidx5 = getelementptr inbounds i32* %A, i64 %idxprom4 + %0 = load i32* %arrayidx5, align 4 +; CHECK: da analyze - flow [*|<]! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1 + store i32 %0, i32* %B.addr.02, align 4 + %inc = add i16 %i.03, 1 + %conv = sext i16 %inc to i64 + %cmp = icmp slt i64 %conv, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + +;;void p6(int *A, int *B, long int n) +;; for (int i = 0; i < n; i++) +;; A[i + 2] = ... +;; ... = A[i]; + +define void @p6(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + %cmp1 = icmp sgt i64 %n, 0 + br i1 %cmp1, label %for.body, label %for.end + +for.body: ; preds = %for.body, %entry + %i.03 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ] + %add = add nsw i32 %i.03, 2 + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds i32* %A, i64 %idxprom + store i32 %i.03, i32* %arrayidx, align 4 + %idxprom2 = sext i32 %i.03 to i64 + %arrayidx3 = getelementptr inbounds i32* %A, i64 %idxprom2 + %0 = load i32* %arrayidx3, align 4 +; CHECK: da analyze - consistent flow [2]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1 + store i32 %0, i32* %B.addr.02, align 4 + %inc = add nsw i32 %i.03, 1 + %conv = sext i32 %inc to i64 + %cmp = icmp slt i64 %conv, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + +;;void p7(unsigned *A, unsigned *B, char n) +;; A[n] = ... +;; ... = A[n + 1]; + +define void @p7(i32* %A, i32* %B, i8 signext %n) nounwind uwtable ssp { +entry: + %idxprom = sext i8 %n to i64 + %arrayidx = getelementptr inbounds i32* %A, i64 %idxprom + store i32 0, i32* %arrayidx, align 4 + %conv = sext i8 %n to i64 + %add = add i64 %conv, 1 + %arrayidx2 = getelementptr inbounds i32* %A, i64 %add + %0 = load i32* %arrayidx2, align 4 +; CHECK: da analyze - none! + store i32 %0, i32* %B, align 4 + ret void +} + + + +;;void p8(unsigned *A, unsigned *B, short n) +;; A[n] = ... +;; ... 
= A[n + 1]; + +define void @p8(i32* %A, i32* %B, i16 signext %n) nounwind uwtable ssp { +entry: + %idxprom = sext i16 %n to i64 + %arrayidx = getelementptr inbounds i32* %A, i64 %idxprom + store i32 0, i32* %arrayidx, align 4 + %conv = sext i16 %n to i64 + %add = add i64 %conv, 1 + %arrayidx2 = getelementptr inbounds i32* %A, i64 %add + %0 = load i32* %arrayidx2, align 4 +; CHECK: da analyze - none! + store i32 %0, i32* %B, align 4 + ret void +} + + +;;void p9(unsigned *A, unsigned *B, int n) +;; A[n] = ... +;; ... = A[n + 1]; + +define void @p9(i32* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + %idxprom = sext i32 %n to i64 + %arrayidx = getelementptr inbounds i32* %A, i64 %idxprom + store i32 0, i32* %arrayidx, align 4 + %add = add nsw i32 %n, 1 + %idxprom1 = sext i32 %add to i64 + %arrayidx2 = getelementptr inbounds i32* %A, i64 %idxprom1 + %0 = load i32* %arrayidx2, align 4 +; CHECK: da analyze - none! + store i32 %0, i32* %B, align 4 + ret void +} + + +;;void p10(unsigned *A, unsigned *B, unsigned n) +;; A[n] = ... +;; ... = A[n + 1]; + +define void @p10(i32* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + %idxprom = zext i32 %n to i64 + %arrayidx = getelementptr inbounds i32* %A, i64 %idxprom + store i32 0, i32* %arrayidx, align 4 + %add = add i32 %n, 1 + %idxprom1 = zext i32 %add to i64 + %arrayidx2 = getelementptr inbounds i32* %A, i64 %idxprom1 + %0 = load i32* %arrayidx2, align 4 +; CHECK: da analyze - none! 
+ store i32 %0, i32* %B, align 4 + ret void +} diff --git a/test/Analysis/DependenceAnalysis/Propagating.ll b/test/Analysis/DependenceAnalysis/Propagating.ll new file mode 100644 index 0000000..076348c --- /dev/null +++ b/test/Analysis/DependenceAnalysis/Propagating.ll @@ -0,0 +1,467 @@ +; RUN: opt < %s -analyze -basicaa -da | FileCheck %s + +; ModuleID = 'Propagating.bc' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.6.0" + + +;; for (long int i = 0; i < 100; i++) +;; for (long int j = 0; j < 100; j++) +;; A[i + 1][i + j] = i; +;; *B++ = A[i][i + j]; + +define void @prop0([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc9, %entry + %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc9 ] + %i.03 = phi i64 [ 0, %entry ], [ %inc10, %for.inc9 ] + br label %for.body3 + +for.body3: ; preds = %for.body3, %for.cond1.preheader + %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ] + %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ] + %conv = trunc i64 %i.03 to i32 + %add = add nsw i64 %i.03, %j.02 + %add4 = add nsw i64 %i.03, 1 + %arrayidx5 = getelementptr inbounds [100 x i32]* %A, i64 %add4, i64 %add + store i32 %conv, i32* %arrayidx5, align 4 + %add6 = add nsw i64 %i.03, %j.02 + %arrayidx8 = getelementptr inbounds [100 x i32]* %A, i64 %i.03, i64 %add6 + %0 = load i32* %arrayidx8, align 4 +; CHECK: da analyze - consistent flow [1 -1]! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1 + store i32 %0, i32* %B.addr.11, align 4 + %inc = add nsw i64 %j.02, 1 + %cmp2 = icmp slt i64 %inc, 100 + br i1 %cmp2, label %for.body3, label %for.inc9 + +for.inc9: ; preds = %for.body3 + %inc10 = add nsw i64 %i.03, 1 + %cmp = icmp slt i64 %inc10, 100 + br i1 %cmp, label %for.cond1.preheader, label %for.end11 + +for.end11: ; preds = %for.inc9 + ret void +} + + +;; for (long int i = 0; i < 100; i++) +;; for (long int j = 0; j < 100; j++) +;; for (long int k = 0; k < 100; k++) +;; A[j - i][i + 1][j + k] = ... +;; ... = A[j - i][i][j + k]; + +define void @prop1([100 x [100 x i32]]* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc18, %entry + %B.addr.06 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc18 ] + %i.05 = phi i64 [ 0, %entry ], [ %inc19, %for.inc18 ] + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %for.inc15, %for.cond1.preheader + %B.addr.14 = phi i32* [ %B.addr.06, %for.cond1.preheader ], [ %incdec.ptr, %for.inc15 ] + %j.03 = phi i64 [ 0, %for.cond1.preheader ], [ %inc16, %for.inc15 ] + br label %for.body6 + +for.body6: ; preds = %for.body6, %for.cond4.preheader + %k.02 = phi i64 [ 0, %for.cond4.preheader ], [ %inc, %for.body6 ] + %B.addr.21 = phi i32* [ %B.addr.14, %for.cond4.preheader ], [ %incdec.ptr, %for.body6 ] + %conv = trunc i64 %i.05 to i32 + %add = add nsw i64 %j.03, %k.02 + %add7 = add nsw i64 %i.05, 1 + %sub = sub nsw i64 %j.03, %i.05 + %arrayidx9 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 %sub, i64 %add7, i64 %add + store i32 %conv, i32* %arrayidx9, align 4 + %add10 = add nsw i64 %j.03, %k.02 + %sub11 = sub nsw i64 %j.03, %i.05 + %arrayidx14 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 %sub11, i64 %i.05, i64 %add10 + %0 = load i32* %arrayidx14, align 4 +; CHECK: da analyze - consistent flow [1 1 -1]! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.21, i64 1 + store i32 %0, i32* %B.addr.21, align 4 + %inc = add nsw i64 %k.02, 1 + %cmp5 = icmp slt i64 %inc, 100 + br i1 %cmp5, label %for.body6, label %for.inc15 + +for.inc15: ; preds = %for.body6 + %inc16 = add nsw i64 %j.03, 1 + %cmp2 = icmp slt i64 %inc16, 100 + br i1 %cmp2, label %for.cond4.preheader, label %for.inc18 + +for.inc18: ; preds = %for.inc15 + %inc19 = add nsw i64 %i.05, 1 + %cmp = icmp slt i64 %inc19, 100 + br i1 %cmp, label %for.cond1.preheader, label %for.end20 + +for.end20: ; preds = %for.inc18 + ret void +} + + +;; for (long int i = 0; i < 100; i++) +;; for (long int j = 0; j < 100; j++) +;; A[i - 1][2*i] = ... +;; ... = A[i][i + j + 110]; + +define void @prop2([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc8, %entry + %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc8 ] + %i.03 = phi i64 [ 0, %entry ], [ %inc9, %for.inc8 ] + br label %for.body3 + +for.body3: ; preds = %for.body3, %for.cond1.preheader + %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ] + %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ] + %conv = trunc i64 %i.03 to i32 + %mul = shl nsw i64 %i.03, 1 + %sub = add nsw i64 %i.03, -1 + %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 %sub, i64 %mul + store i32 %conv, i32* %arrayidx4, align 4 + %add = add nsw i64 %i.03, %j.02 + %add5 = add nsw i64 %add, 110 + %arrayidx7 = getelementptr inbounds [100 x i32]* %A, i64 %i.03, i64 %add5 + %0 = load i32* %arrayidx7, align 4 +; CHECK: da analyze - none! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1 + store i32 %0, i32* %B.addr.11, align 4 + %inc = add nsw i64 %j.02, 1 + %cmp2 = icmp slt i64 %inc, 100 + br i1 %cmp2, label %for.body3, label %for.inc8 + +for.inc8: ; preds = %for.body3 + %inc9 = add nsw i64 %i.03, 1 + %cmp = icmp slt i64 %inc9, 100 + br i1 %cmp, label %for.cond1.preheader, label %for.end10 + +for.end10: ; preds = %for.inc8 + ret void +} + + +;; for (long int i = 0; i < 100; i++) +;; for (long int j = 0; j < 100; j++) +;; A[i][2*j + i] = ... +;; ... = A[i][2*j - i + 5]; + +define void @prop3([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc9, %entry + %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc9 ] + %i.03 = phi i64 [ 0, %entry ], [ %inc10, %for.inc9 ] + br label %for.body3 + +for.body3: ; preds = %for.body3, %for.cond1.preheader + %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ] + %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ] + %conv = trunc i64 %i.03 to i32 + %mul = shl nsw i64 %j.02, 1 + %add = add nsw i64 %mul, %i.03 + %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 %i.03, i64 %add + store i32 %conv, i32* %arrayidx4, align 4 + %mul5 = shl nsw i64 %j.02, 1 + %sub = sub nsw i64 %mul5, %i.03 + %add6 = add nsw i64 %sub, 5 + %arrayidx8 = getelementptr inbounds [100 x i32]* %A, i64 %i.03, i64 %add6 + %0 = load i32* %arrayidx8, align 4 +; CHECK: da analyze - none! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1 + store i32 %0, i32* %B.addr.11, align 4 + %inc = add nsw i64 %j.02, 1 + %cmp2 = icmp slt i64 %inc, 100 + br i1 %cmp2, label %for.body3, label %for.inc9 + +for.inc9: ; preds = %for.body3 + %inc10 = add nsw i64 %i.03, 1 + %cmp = icmp slt i64 %inc10, 100 + br i1 %cmp, label %for.cond1.preheader, label %for.end11 + +for.end11: ; preds = %for.inc9 + ret void +} + + +;; propagate Distance +;; for (long int i = 0; i < 100; i++) +;; for (long int j = 0; j < 100; j++) +;; A[i + 2][2*i + j + 1] = ... +;; ... = A[i][2*i + j]; + +define void @prop4([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc11, %entry + %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc11 ] + %i.03 = phi i64 [ 0, %entry ], [ %inc12, %for.inc11 ] + br label %for.body3 + +for.body3: ; preds = %for.body3, %for.cond1.preheader + %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ] + %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ] + %conv = trunc i64 %i.03 to i32 + %mul = shl nsw i64 %i.03, 1 + %add = add nsw i64 %mul, %j.02 + %add4 = add nsw i64 %add, 1 + %add5 = add nsw i64 %i.03, 2 + %arrayidx6 = getelementptr inbounds [100 x i32]* %A, i64 %add5, i64 %add4 + store i32 %conv, i32* %arrayidx6, align 4 + %mul7 = shl nsw i64 %i.03, 1 + %add8 = add nsw i64 %mul7, %j.02 + %arrayidx10 = getelementptr inbounds [100 x i32]* %A, i64 %i.03, i64 %add8 + %0 = load i32* %arrayidx10, align 4 +; CHECK: da analyze - consistent flow [2 -3]! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1 + store i32 %0, i32* %B.addr.11, align 4 + %inc = add nsw i64 %j.02, 1 + %cmp2 = icmp slt i64 %inc, 100 + br i1 %cmp2, label %for.body3, label %for.inc11 + +for.inc11: ; preds = %for.body3 + %inc12 = add nsw i64 %i.03, 1 + %cmp = icmp slt i64 %inc12, 100 + br i1 %cmp, label %for.cond1.preheader, label %for.end13 + +for.end13: ; preds = %for.inc11 + ret void +} + + +;; propagate Point +;; for (long int i = 0; i < 100; i++) +;; for (long int j = 0; j < 100; j++) +;; A[3*i - 18][22 - i][2*i + j] = ... +;; ... = A[i][i][3*i + j]; + +define void @prop5([100 x [100 x i32]]* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc13, %entry + %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc13 ] + %i.03 = phi i64 [ 0, %entry ], [ %inc14, %for.inc13 ] + br label %for.body3 + +for.body3: ; preds = %for.body3, %for.cond1.preheader + %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ] + %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ] + %conv = trunc i64 %i.03 to i32 + %mul = shl nsw i64 %i.03, 1 + %add = add nsw i64 %mul, %j.02 + %sub = sub nsw i64 22, %i.03 + %mul4 = mul nsw i64 %i.03, 3 + %sub5 = add nsw i64 %mul4, -18 + %arrayidx7 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 %sub5, i64 %sub, i64 %add + store i32 %conv, i32* %arrayidx7, align 4 + %mul8 = mul nsw i64 %i.03, 3 + %add9 = add nsw i64 %mul8, %j.02 + %arrayidx12 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 %i.03, i64 %i.03, i64 %add9 + %0 = load i32* %arrayidx12, align 4 +; CHECK: da analyze - flow [< -16] splitable! +; CHECK: da analyze - split level = 1, iteration = 11! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1 + store i32 %0, i32* %B.addr.11, align 4 + %inc = add nsw i64 %j.02, 1 + %cmp2 = icmp slt i64 %inc, 100 + br i1 %cmp2, label %for.body3, label %for.inc13 + +for.inc13: ; preds = %for.body3 + %inc14 = add nsw i64 %i.03, 1 + %cmp = icmp slt i64 %inc14, 100 + br i1 %cmp, label %for.cond1.preheader, label %for.end15 + +for.end15: ; preds = %for.inc13 + ret void +} + + +;; propagate Line +;; for (long int i = 0; i < 100; i++) +;; for (long int j = 0; j < 100; j++) +;; A[i + 1][4*i + j + 2] = ... +;; ... = A[2*i][8*i + j]; + +define void @prop6([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc12, %entry + %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc12 ] + %i.03 = phi i64 [ 0, %entry ], [ %inc13, %for.inc12 ] + br label %for.body3 + +for.body3: ; preds = %for.body3, %for.cond1.preheader + %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ] + %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ] + %conv = trunc i64 %i.03 to i32 + %mul = shl nsw i64 %i.03, 2 + %add = add nsw i64 %mul, %j.02 + %add4 = add nsw i64 %add, 2 + %add5 = add nsw i64 %i.03, 1 + %arrayidx6 = getelementptr inbounds [100 x i32]* %A, i64 %add5, i64 %add4 + store i32 %conv, i32* %arrayidx6, align 4 + %mul7 = shl nsw i64 %i.03, 3 + %add8 = add nsw i64 %mul7, %j.02 + %mul9 = shl nsw i64 %i.03, 1 + %arrayidx11 = getelementptr inbounds [100 x i32]* %A, i64 %mul9, i64 %add8 + %0 = load i32* %arrayidx11, align 4 +; CHECK: da analyze - flow [=> -2]! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1 + store i32 %0, i32* %B.addr.11, align 4 + %inc = add nsw i64 %j.02, 1 + %cmp2 = icmp slt i64 %inc, 100 + br i1 %cmp2, label %for.body3, label %for.inc12 + +for.inc12: ; preds = %for.body3 + %inc13 = add nsw i64 %i.03, 1 + %cmp = icmp slt i64 %inc13, 100 + br i1 %cmp, label %for.cond1.preheader, label %for.end14 + +for.end14: ; preds = %for.inc12 + ret void +} + + +;; for (long int i = 0; i < 100; i++) +;; for (long int j = 0; j < 100; j++) +;; A[2*i + 4][-5*i + j + 2] = ... +;; ... = A[-2*i + 20][5*i + j]; + +define void @prop7([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc14, %entry + %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc14 ] + %i.03 = phi i64 [ 0, %entry ], [ %inc15, %for.inc14 ] + br label %for.body3 + +for.body3: ; preds = %for.body3, %for.cond1.preheader + %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ] + %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ] + %conv = trunc i64 %i.03 to i32 + %mul = mul nsw i64 %i.03, -5 + %add = add nsw i64 %mul, %j.02 + %add4 = add nsw i64 %add, 2 + %mul5 = shl nsw i64 %i.03, 1 + %add6 = add nsw i64 %mul5, 4 + %arrayidx7 = getelementptr inbounds [100 x i32]* %A, i64 %add6, i64 %add4 + store i32 %conv, i32* %arrayidx7, align 4 + %mul8 = mul nsw i64 %i.03, 5 + %add9 = add nsw i64 %mul8, %j.02 + %mul10 = mul nsw i64 %i.03, -2 + %add11 = add nsw i64 %mul10, 20 + %arrayidx13 = getelementptr inbounds [100 x i32]* %A, i64 %add11, i64 %add9 + %0 = load i32* %arrayidx13, align 4 +; CHECK: da analyze - flow [* -38] splitable! +; CHECK: da analyze - split level = 1, iteration = 4! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1 + store i32 %0, i32* %B.addr.11, align 4 + %inc = add nsw i64 %j.02, 1 + %cmp2 = icmp slt i64 %inc, 100 + br i1 %cmp2, label %for.body3, label %for.inc14 + +for.inc14: ; preds = %for.body3 + %inc15 = add nsw i64 %i.03, 1 + %cmp = icmp slt i64 %inc15, 100 + br i1 %cmp, label %for.cond1.preheader, label %for.end16 + +for.end16: ; preds = %for.inc14 + ret void +} + + +;; for (long int i = 0; i < 100; i++) +;; for (long int j = 0; j < 100; j++) +;; A[4][j + 2] = ... +;; ... = A[-2*i + 4][5*i + j]; + +define void @prop8([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc10, %entry + %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc10 ] + %i.03 = phi i64 [ 0, %entry ], [ %inc11, %for.inc10 ] + br label %for.body3 + +for.body3: ; preds = %for.body3, %for.cond1.preheader + %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ] + %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ] + %conv = trunc i64 %i.03 to i32 + %add = add nsw i64 %j.02, 2 + %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 4, i64 %add + store i32 %conv, i32* %arrayidx4, align 4 + %mul = mul nsw i64 %i.03, 5 + %add5 = add nsw i64 %mul, %j.02 + %mul6 = mul nsw i64 %i.03, -2 + %add7 = add nsw i64 %mul6, 4 + %arrayidx9 = getelementptr inbounds [100 x i32]* %A, i64 %add7, i64 %add5 + %0 = load i32* %arrayidx9, align 4 +; CHECK: da analyze - flow [p<= 2]! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1 + store i32 %0, i32* %B.addr.11, align 4 + %inc = add nsw i64 %j.02, 1 + %cmp2 = icmp slt i64 %inc, 100 + br i1 %cmp2, label %for.body3, label %for.inc10 + +for.inc10: ; preds = %for.body3 + %inc11 = add nsw i64 %i.03, 1 + %cmp = icmp slt i64 %inc11, 100 + br i1 %cmp, label %for.cond1.preheader, label %for.end12 + +for.end12: ; preds = %for.inc10 + ret void +} + + +;; for (long int i = 0; i < 100; i++) +;; for (long int j = 0; j < 100; j++) +;; A[2*i + 4][5*i + j + 2] = ... +;; ... = A[4][j]; + +define void @prop9([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc10, %entry + %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc10 ] + %i.03 = phi i64 [ 0, %entry ], [ %inc11, %for.inc10 ] + br label %for.body3 + +for.body3: ; preds = %for.body3, %for.cond1.preheader + %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ] + %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ] + %conv = trunc i64 %i.03 to i32 + %mul = mul nsw i64 %i.03, 5 + %add = add nsw i64 %mul, %j.02 + %add4 = add nsw i64 %add, 2 + %mul5 = shl nsw i64 %i.03, 1 + %add6 = add nsw i64 %mul5, 4 + %arrayidx7 = getelementptr inbounds [100 x i32]* %A, i64 %add6, i64 %add4 + store i32 %conv, i32* %arrayidx7, align 4 + %arrayidx9 = getelementptr inbounds [100 x i32]* %A, i64 4, i64 %j.02 + %0 = load i32* %arrayidx9, align 4 +; CHECK: da analyze - flow [p<= 2]! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1 + store i32 %0, i32* %B.addr.11, align 4 + %inc = add nsw i64 %j.02, 1 + %cmp2 = icmp slt i64 %inc, 100 + br i1 %cmp2, label %for.body3, label %for.inc10 + +for.inc10: ; preds = %for.body3 + %inc11 = add nsw i64 %i.03, 1 + %cmp = icmp slt i64 %inc11, 100 + br i1 %cmp, label %for.cond1.preheader, label %for.end12 + +for.end12: ; preds = %for.inc10 + ret void +} diff --git a/test/Analysis/DependenceAnalysis/Separability.ll b/test/Analysis/DependenceAnalysis/Separability.ll new file mode 100644 index 0000000..d42d3cd --- /dev/null +++ b/test/Analysis/DependenceAnalysis/Separability.ll @@ -0,0 +1,267 @@ +; RUN: opt < %s -analyze -basicaa -da | FileCheck %s + +; ModuleID = 'Separability.bc' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.6.0" + + +;; for (long int i = 0; i < 50; i++) +;; for (long int j = 0; j < 50; j++) +;; for (long int k = 0; k < 50; k++) +;; for (long int l = 0; l < 50; l++) +;; A[n][i][j + k] = ... +;; ... 
= A[10][i + 10][2*j - l]; + +define void @sep0([100 x [100 x i32]]* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc22, %entry + %B.addr.08 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc22 ] + %i.07 = phi i64 [ 0, %entry ], [ %inc23, %for.inc22 ] + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %for.inc19, %for.cond1.preheader + %B.addr.16 = phi i32* [ %B.addr.08, %for.cond1.preheader ], [ %incdec.ptr, %for.inc19 ] + %j.05 = phi i64 [ 0, %for.cond1.preheader ], [ %inc20, %for.inc19 ] + br label %for.cond7.preheader + +for.cond7.preheader: ; preds = %for.inc16, %for.cond4.preheader + %B.addr.24 = phi i32* [ %B.addr.16, %for.cond4.preheader ], [ %incdec.ptr, %for.inc16 ] + %k.03 = phi i64 [ 0, %for.cond4.preheader ], [ %inc17, %for.inc16 ] + br label %for.body9 + +for.body9: ; preds = %for.body9, %for.cond7.preheader + %l.02 = phi i64 [ 0, %for.cond7.preheader ], [ %inc, %for.body9 ] + %B.addr.31 = phi i32* [ %B.addr.24, %for.cond7.preheader ], [ %incdec.ptr, %for.body9 ] + %conv = trunc i64 %i.07 to i32 + %add = add nsw i64 %j.05, %k.03 + %idxprom = sext i32 %n to i64 + %arrayidx11 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 %idxprom, i64 %i.07, i64 %add + store i32 %conv, i32* %arrayidx11, align 4 + %mul = shl nsw i64 %j.05, 1 + %sub = sub nsw i64 %mul, %l.02 + %add12 = add nsw i64 %i.07, 10 + %arrayidx15 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 10, i64 %add12, i64 %sub + %0 = load i32* %arrayidx15, align 4 +; CHECK: da analyze - flow [-10 * * *]! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.31, i64 1 + store i32 %0, i32* %B.addr.31, align 4 + %inc = add nsw i64 %l.02, 1 + %cmp8 = icmp slt i64 %inc, 50 + br i1 %cmp8, label %for.body9, label %for.inc16 + +for.inc16: ; preds = %for.body9 + %inc17 = add nsw i64 %k.03, 1 + %cmp5 = icmp slt i64 %inc17, 50 + br i1 %cmp5, label %for.cond7.preheader, label %for.inc19 + +for.inc19: ; preds = %for.inc16 + %inc20 = add nsw i64 %j.05, 1 + %cmp2 = icmp slt i64 %inc20, 50 + br i1 %cmp2, label %for.cond4.preheader, label %for.inc22 + +for.inc22: ; preds = %for.inc19 + %inc23 = add nsw i64 %i.07, 1 + %cmp = icmp slt i64 %inc23, 50 + br i1 %cmp, label %for.cond1.preheader, label %for.end24 + +for.end24: ; preds = %for.inc22 + ret void +} + + +;; for (long int i = 0; i < 50; i++) +;; for (long int j = 0; j < 50; j++) +;; for (long int k = 0; k < 50; k++) +;; for (long int l = 0; l < 50; l++) +;; A[i][i][j + k] = ... +;; ... = A[10][i + 10][2*j - l]; + +define void @sep1([100 x [100 x i32]]* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc22, %entry + %B.addr.08 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc22 ] + %i.07 = phi i64 [ 0, %entry ], [ %inc23, %for.inc22 ] + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %for.inc19, %for.cond1.preheader + %B.addr.16 = phi i32* [ %B.addr.08, %for.cond1.preheader ], [ %incdec.ptr, %for.inc19 ] + %j.05 = phi i64 [ 0, %for.cond1.preheader ], [ %inc20, %for.inc19 ] + br label %for.cond7.preheader + +for.cond7.preheader: ; preds = %for.inc16, %for.cond4.preheader + %B.addr.24 = phi i32* [ %B.addr.16, %for.cond4.preheader ], [ %incdec.ptr, %for.inc16 ] + %k.03 = phi i64 [ 0, %for.cond4.preheader ], [ %inc17, %for.inc16 ] + br label %for.body9 + +for.body9: ; preds = %for.body9, %for.cond7.preheader + %l.02 = phi i64 [ 0, %for.cond7.preheader ], [ %inc, %for.body9 ] + %B.addr.31 = phi i32* [ %B.addr.24, %for.cond7.preheader ], [ 
%incdec.ptr, %for.body9 ] + %conv = trunc i64 %i.07 to i32 + %add = add nsw i64 %j.05, %k.03 + %arrayidx11 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 %i.07, i64 %i.07, i64 %add + store i32 %conv, i32* %arrayidx11, align 4 + %mul = shl nsw i64 %j.05, 1 + %sub = sub nsw i64 %mul, %l.02 + %add12 = add nsw i64 %i.07, 10 + %arrayidx15 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 10, i64 %add12, i64 %sub + %0 = load i32* %arrayidx15, align 4 +; CHECK: da analyze - flow [> * * *]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.31, i64 1 + store i32 %0, i32* %B.addr.31, align 4 + %inc = add nsw i64 %l.02, 1 + %cmp8 = icmp slt i64 %inc, 50 + br i1 %cmp8, label %for.body9, label %for.inc16 + +for.inc16: ; preds = %for.body9 + %inc17 = add nsw i64 %k.03, 1 + %cmp5 = icmp slt i64 %inc17, 50 + br i1 %cmp5, label %for.cond7.preheader, label %for.inc19 + +for.inc19: ; preds = %for.inc16 + %inc20 = add nsw i64 %j.05, 1 + %cmp2 = icmp slt i64 %inc20, 50 + br i1 %cmp2, label %for.cond4.preheader, label %for.inc22 + +for.inc22: ; preds = %for.inc19 + %inc23 = add nsw i64 %i.07, 1 + %cmp = icmp slt i64 %inc23, 50 + br i1 %cmp, label %for.cond1.preheader, label %for.end24 + +for.end24: ; preds = %for.inc22 + ret void +} + + +;; for (long int i = 0; i < 50; i++) +;; for (long int j = 0; j < 50; j++) +;; for (long int k = 0; k < 50; k++) +;; for (long int l = 0; l < 50; l++) +;; A[i][i][i + k][l] = ... +;; ... 
= A[10][i + 10][j + k][l + 10]; + +define void @sep2([100 x [100 x [100 x i32]]]* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc26, %entry + %B.addr.08 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc26 ] + %i.07 = phi i64 [ 0, %entry ], [ %inc27, %for.inc26 ] + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %for.inc23, %for.cond1.preheader + %B.addr.16 = phi i32* [ %B.addr.08, %for.cond1.preheader ], [ %incdec.ptr, %for.inc23 ] + %j.05 = phi i64 [ 0, %for.cond1.preheader ], [ %inc24, %for.inc23 ] + br label %for.cond7.preheader + +for.cond7.preheader: ; preds = %for.inc20, %for.cond4.preheader + %B.addr.24 = phi i32* [ %B.addr.16, %for.cond4.preheader ], [ %incdec.ptr, %for.inc20 ] + %k.03 = phi i64 [ 0, %for.cond4.preheader ], [ %inc21, %for.inc20 ] + br label %for.body9 + +for.body9: ; preds = %for.body9, %for.cond7.preheader + %l.02 = phi i64 [ 0, %for.cond7.preheader ], [ %inc, %for.body9 ] + %B.addr.31 = phi i32* [ %B.addr.24, %for.cond7.preheader ], [ %incdec.ptr, %for.body9 ] + %conv = trunc i64 %i.07 to i32 + %add = add nsw i64 %i.07, %k.03 + %arrayidx12 = getelementptr inbounds [100 x [100 x [100 x i32]]]* %A, i64 %i.07, i64 %i.07, i64 %add, i64 %l.02 + store i32 %conv, i32* %arrayidx12, align 4 + %add13 = add nsw i64 %l.02, 10 + %add14 = add nsw i64 %j.05, %k.03 + %add15 = add nsw i64 %i.07, 10 + %arrayidx19 = getelementptr inbounds [100 x [100 x [100 x i32]]]* %A, i64 10, i64 %add15, i64 %add14, i64 %add13 + %0 = load i32* %arrayidx19, align 4 +; CHECK: da analyze - flow [> * * -10]! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.31, i64 1 + store i32 %0, i32* %B.addr.31, align 4 + %inc = add nsw i64 %l.02, 1 + %cmp8 = icmp slt i64 %inc, 50 + br i1 %cmp8, label %for.body9, label %for.inc20 + +for.inc20: ; preds = %for.body9 + %inc21 = add nsw i64 %k.03, 1 + %cmp5 = icmp slt i64 %inc21, 50 + br i1 %cmp5, label %for.cond7.preheader, label %for.inc23 + +for.inc23: ; preds = %for.inc20 + %inc24 = add nsw i64 %j.05, 1 + %cmp2 = icmp slt i64 %inc24, 50 + br i1 %cmp2, label %for.cond4.preheader, label %for.inc26 + +for.inc26: ; preds = %for.inc23 + %inc27 = add nsw i64 %i.07, 1 + %cmp = icmp slt i64 %inc27, 50 + br i1 %cmp, label %for.cond1.preheader, label %for.end28 + +for.end28: ; preds = %for.inc26 + ret void +} + + +;; for (long int i = 0; i < 50; i++) +;; for (long int j = 0; j < 50; j++) +;; for (long int k = 0; k < 50; k++) +;; for (long int l = 0; l < 50; l++) +;; A[i][i][i + k][l + k] = ... +;; ... = A[10][i + 10][j + k][l + 10]; + +define void @sep3([100 x [100 x [100 x i32]]]* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc27, %entry + %B.addr.08 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc27 ] + %i.07 = phi i64 [ 0, %entry ], [ %inc28, %for.inc27 ] + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %for.inc24, %for.cond1.preheader + %B.addr.16 = phi i32* [ %B.addr.08, %for.cond1.preheader ], [ %incdec.ptr, %for.inc24 ] + %j.05 = phi i64 [ 0, %for.cond1.preheader ], [ %inc25, %for.inc24 ] + br label %for.cond7.preheader + +for.cond7.preheader: ; preds = %for.inc21, %for.cond4.preheader + %B.addr.24 = phi i32* [ %B.addr.16, %for.cond4.preheader ], [ %incdec.ptr, %for.inc21 ] + %k.03 = phi i64 [ 0, %for.cond4.preheader ], [ %inc22, %for.inc21 ] + br label %for.body9 + +for.body9: ; preds = %for.body9, %for.cond7.preheader + %l.02 = phi i64 [ 0, %for.cond7.preheader ], [ %inc, %for.body9 ] + %B.addr.31 = phi i32* [ %B.addr.24, 
%for.cond7.preheader ], [ %incdec.ptr, %for.body9 ] + %conv = trunc i64 %i.07 to i32 + %add = add nsw i64 %l.02, %k.03 + %add10 = add nsw i64 %i.07, %k.03 + %arrayidx13 = getelementptr inbounds [100 x [100 x [100 x i32]]]* %A, i64 %i.07, i64 %i.07, i64 %add10, i64 %add + store i32 %conv, i32* %arrayidx13, align 4 + %add14 = add nsw i64 %l.02, 10 + %add15 = add nsw i64 %j.05, %k.03 + %add16 = add nsw i64 %i.07, 10 + %arrayidx20 = getelementptr inbounds [100 x [100 x [100 x i32]]]* %A, i64 10, i64 %add16, i64 %add15, i64 %add14 + %0 = load i32* %arrayidx20, align 4 +; CHECK: da analyze - flow [> * * *]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.31, i64 1 + store i32 %0, i32* %B.addr.31, align 4 + %inc = add nsw i64 %l.02, 1 + %cmp8 = icmp slt i64 %inc, 50 + br i1 %cmp8, label %for.body9, label %for.inc21 + +for.inc21: ; preds = %for.body9 + %inc22 = add nsw i64 %k.03, 1 + %cmp5 = icmp slt i64 %inc22, 50 + br i1 %cmp5, label %for.cond7.preheader, label %for.inc24 + +for.inc24: ; preds = %for.inc21 + %inc25 = add nsw i64 %j.05, 1 + %cmp2 = icmp slt i64 %inc25, 50 + br i1 %cmp2, label %for.cond4.preheader, label %for.inc27 + +for.inc27: ; preds = %for.inc24 + %inc28 = add nsw i64 %i.07, 1 + %cmp = icmp slt i64 %inc28, 50 + br i1 %cmp, label %for.cond1.preheader, label %for.end29 + +for.end29: ; preds = %for.inc27 + ret void +} diff --git a/test/Analysis/DependenceAnalysis/StrongSIV.ll b/test/Analysis/DependenceAnalysis/StrongSIV.ll new file mode 100644 index 0000000..be336c3 --- /dev/null +++ b/test/Analysis/DependenceAnalysis/StrongSIV.ll @@ -0,0 +1,342 @@ +; RUN: opt < %s -analyze -basicaa -indvars -da | FileCheck %s + +; ModuleID = 'StrongSIV.bc' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.6.0" + + +;; for (int i = 0; i < n; i++) +;; A[i + 2] = ... +;; ... 
= A[i]; + +define void @strong0(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + %cmp1 = icmp sgt i64 %n, 0 + br i1 %cmp1, label %for.body, label %for.end + +for.body: ; preds = %for.body, %entry + %i.03 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ] + %add = add nsw i32 %i.03, 2 + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds i32* %A, i64 %idxprom + store i32 %i.03, i32* %arrayidx, align 4 + %idxprom2 = sext i32 %i.03 to i64 + %arrayidx3 = getelementptr inbounds i32* %A, i64 %idxprom2 + %0 = load i32* %arrayidx3, align 4 +; CHECK: da analyze - consistent flow [2]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1 + store i32 %0, i32* %B.addr.02, align 4 + %inc = add nsw i32 %i.03, 1 + %conv = sext i32 %inc to i64 + %cmp = icmp slt i64 %conv, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + +;; for (long int i = 0; i < n; i++) +;; A[i + 2] = ... +;; ... = A[i]; + +define void @strong1(i32* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + %conv = sext i32 %n to i64 + %cmp1 = icmp sgt i32 %n, 0 + br i1 %cmp1, label %for.body, label %for.end + +for.body: ; preds = %for.body, %entry + %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ] + %conv2 = trunc i64 %i.03 to i32 + %add = add nsw i64 %i.03, 2 + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv2, i32* %arrayidx, align 4 + %arrayidx3 = getelementptr inbounds i32* %A, i64 %i.03 + %0 = load i32* %arrayidx3, align 4 +; CHECK: da analyze - consistent flow [2]! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1 + store i32 %0, i32* %B.addr.02, align 4 + %inc = add nsw i64 %i.03, 1 + %cmp = icmp slt i64 %inc, %conv + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + +;; for (long unsigned i = 0; i < n; i++) +;; A[i + 2] = ... +;; ... = A[i]; + +define void @strong2(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + %cmp1 = icmp eq i64 %n, 0 + br i1 %cmp1, label %for.end, label %for.body + +for.body: ; preds = %for.body, %entry + %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ] + %conv = trunc i64 %i.03 to i32 + %add = add i64 %i.03, 2 + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32* %A, i64 %i.03 + %0 = load i32* %arrayidx1, align 4 +; CHECK: da analyze - consistent flow [2]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1 + store i32 %0, i32* %B.addr.02, align 4 + %inc = add i64 %i.03, 1 + %cmp = icmp ult i64 %inc, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + +;; for (int i = 0; i < n; i++) +;; A[i + 2] = ... +;; ... = A[i]; + +define void @strong3(i32* %A, i32* %B, i32 %n) nounwind uwtable ssp { +entry: + %cmp1 = icmp sgt i32 %n, 0 + br i1 %cmp1, label %for.body, label %for.end + +for.body: ; preds = %for.body, %entry + %i.03 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ] + %add = add nsw i32 %i.03, 2 + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds i32* %A, i64 %idxprom + store i32 %i.03, i32* %arrayidx, align 4 + %idxprom1 = sext i32 %i.03 to i64 + %arrayidx2 = getelementptr inbounds i32* %A, i64 %idxprom1 + %0 = load i32* %arrayidx2, align 4 +; CHECK: da analyze - consistent flow [2]! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1 + store i32 %0, i32* %B.addr.02, align 4 + %inc = add nsw i32 %i.03, 1 + %cmp = icmp slt i32 %inc, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + +;; for (long unsigned i = 0; i < 19; i++) +;; A[i + 19] = ... +;; ... = A[i]; + +define void @strong4(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %add = add i64 %i.02, 19 + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32* %A, i64 %i.02 + %0 = load i32* %arrayidx1, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 19 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long unsigned i = 0; i < 20; i++) +;; A[i + 19] = ... +;; ... = A[i]; + +define void @strong5(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %add = add i64 %i.02, 19 + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32* %A, i64 %i.02 + %0 = load i32* %arrayidx1, align 4 +; CHECK: da analyze - consistent flow [19]! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 20 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long unsigned i = 0; i < 20; i++) +;; A[2*i + 6] = ... +;; ... = A[2*i]; + +define void @strong6(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %mul = shl i64 %i.02, 1 + %add = add i64 %mul, 6 + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv, i32* %arrayidx, align 4 + %mul1 = shl i64 %i.02, 1 + %arrayidx2 = getelementptr inbounds i32* %A, i64 %mul1 + %0 = load i32* %arrayidx2, align 4 +; CHECK: da analyze - consistent flow [3]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 20 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long unsigned i = 0; i < 20; i++) +;; A[2*i + 7] = ... +;; ... = A[2*i]; + +define void @strong7(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %mul = shl i64 %i.02, 1 + %add = add i64 %mul, 7 + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv, i32* %arrayidx, align 4 + %mul1 = shl i64 %i.02, 1 + %arrayidx2 = getelementptr inbounds i32* %A, i64 %mul1 + %0 = load i32* %arrayidx2, align 4 +; CHECK: da analyze - none! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 20 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long unsigned i = 0; i < 20; i++) +;; A[i + n] = ... +;; ... = A[i]; + +define void @strong8(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %add = add i64 %i.02, %n + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32* %A, i64 %i.02 + %0 = load i32* %arrayidx1, align 4 +; CHECK: da analyze - consistent flow [%n|<]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 20 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long unsigned i = 0; i < n; i++) +;; A[i + n] = ... +;; ... = A[i + 2*n]; + +define void @strong9(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + %cmp1 = icmp eq i64 %n, 0 + br i1 %cmp1, label %for.end, label %for.body + +for.body: ; preds = %for.body, %entry + %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ] + %conv = trunc i64 %i.03 to i32 + %add = add i64 %i.03, %n + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv, i32* %arrayidx, align 4 + %mul = shl i64 %n, 1 + %add1 = add i64 %i.03, %mul + %arrayidx2 = getelementptr inbounds i32* %A, i64 %add1 + %0 = load i32* %arrayidx2, align 4 +; CHECK: da analyze - none! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1 + store i32 %0, i32* %B.addr.02, align 4 + %inc = add i64 %i.03, 1 + %cmp = icmp ult i64 %inc, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + +;; for (long unsigned i = 0; i < 1000; i++) +;; A[n*i + 5] = ... +;; ... = A[n*i + 5]; + +define void @strong10(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %mul = mul i64 %i.02, %n + %add = add i64 %mul, 5 + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv, i32* %arrayidx, align 4 + %mul1 = mul i64 %i.02, %n + %add2 = add i64 %mul1, 5 + %arrayidx3 = getelementptr inbounds i32* %A, i64 %add2 + %0 = load i32* %arrayidx3, align 4 +; CHECK: da analyze - consistent flow [0|<]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 1000 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} diff --git a/test/Analysis/DependenceAnalysis/SymbolicRDIV.ll b/test/Analysis/DependenceAnalysis/SymbolicRDIV.ll new file mode 100644 index 0000000..2a1b4e7 --- /dev/null +++ b/test/Analysis/DependenceAnalysis/SymbolicRDIV.ll @@ -0,0 +1,312 @@ +; RUN: opt < %s -analyze -basicaa -da | FileCheck %s + +; ModuleID = 'SymbolicRDIV.bc' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.6.0" + + +;; for (long int i = 0; i < n1; i++) +;; A[2*i + n1] = ... +;; for (long int j = 0; j < n2; j++) +;; ... 
= A[3*j + 3*n1]; + +define void @symbolicrdiv0(i32* %A, i32* %B, i64 %n1, i64 %n2) nounwind uwtable ssp { +entry: + %cmp4 = icmp eq i64 %n1, 0 + br i1 %cmp4, label %for.cond1.preheader, label %for.body + +for.cond1.preheader: ; preds = %for.body, %entry + %cmp21 = icmp eq i64 %n2, 0 + br i1 %cmp21, label %for.end11, label %for.body4 + +for.body: ; preds = %for.body, %entry + %i.05 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %conv = trunc i64 %i.05 to i32 + %mul = shl nsw i64 %i.05, 1 + %add = add i64 %mul, %n1 + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv, i32* %arrayidx, align 4 + %inc = add nsw i64 %i.05, 1 + %cmp = icmp ult i64 %inc, %n1 + br i1 %cmp, label %for.body, label %for.cond1.preheader + +for.body4: ; preds = %for.body4, %for.cond1.preheader + %j.03 = phi i64 [ %inc10, %for.body4 ], [ 0, %for.cond1.preheader ] + %B.addr.02 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.cond1.preheader ] + %mul56 = add i64 %j.03, %n1 + %add7 = mul i64 %mul56, 3 + %arrayidx8 = getelementptr inbounds i32* %A, i64 %add7 + %0 = load i32* %arrayidx8, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1 + store i32 %0, i32* %B.addr.02, align 4 + %inc10 = add nsw i64 %j.03, 1 + %cmp2 = icmp ult i64 %inc10, %n2 + br i1 %cmp2, label %for.body4, label %for.end11 + +for.end11: ; preds = %for.body4, %for.cond1.preheader + ret void +} + + +;; for (long int i = 0; i < n1; i++) +;; A[2*i + 5*n2] = ... +;; for (long int j = 0; j < n2; j++) +;; ... 
= A[3*j + 2*n2]; + +define void @symbolicrdiv1(i32* %A, i32* %B, i64 %n1, i64 %n2) nounwind uwtable ssp { +entry: + %cmp4 = icmp eq i64 %n1, 0 + br i1 %cmp4, label %for.cond2.preheader, label %for.body + +for.cond2.preheader: ; preds = %for.body, %entry + %cmp31 = icmp eq i64 %n2, 0 + br i1 %cmp31, label %for.end12, label %for.body5 + +for.body: ; preds = %for.body, %entry + %i.05 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %conv = trunc i64 %i.05 to i32 + %mul = shl nsw i64 %i.05, 1 + %mul1 = mul i64 %n2, 5 + %add = add i64 %mul, %mul1 + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv, i32* %arrayidx, align 4 + %inc = add nsw i64 %i.05, 1 + %cmp = icmp ult i64 %inc, %n1 + br i1 %cmp, label %for.body, label %for.cond2.preheader + +for.body5: ; preds = %for.body5, %for.cond2.preheader + %j.03 = phi i64 [ %inc11, %for.body5 ], [ 0, %for.cond2.preheader ] + %B.addr.02 = phi i32* [ %incdec.ptr, %for.body5 ], [ %B, %for.cond2.preheader ] + %mul6 = mul nsw i64 %j.03, 3 + %mul7 = shl i64 %n2, 1 + %add8 = add i64 %mul6, %mul7 + %arrayidx9 = getelementptr inbounds i32* %A, i64 %add8 + %0 = load i32* %arrayidx9, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1 + store i32 %0, i32* %B.addr.02, align 4 + %inc11 = add nsw i64 %j.03, 1 + %cmp3 = icmp ult i64 %inc11, %n2 + br i1 %cmp3, label %for.body5, label %for.end12 + +for.end12: ; preds = %for.body5, %for.cond2.preheader + ret void +} + + +;; for (long int i = 0; i < n1; i++) +;; A[2*i - n2] = ... +;; for (long int j = 0; j < n2; j++) +;; ... 
= A[-j + 2*n1]; + +define void @symbolicrdiv2(i32* %A, i32* %B, i64 %n1, i64 %n2) nounwind uwtable ssp { +entry: + %cmp4 = icmp eq i64 %n1, 0 + br i1 %cmp4, label %for.cond1.preheader, label %for.body + +for.cond1.preheader: ; preds = %for.body, %entry + %cmp21 = icmp eq i64 %n2, 0 + br i1 %cmp21, label %for.end10, label %for.body4 + +for.body: ; preds = %for.body, %entry + %i.05 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %conv = trunc i64 %i.05 to i32 + %mul = shl nsw i64 %i.05, 1 + %sub = sub i64 %mul, %n2 + %arrayidx = getelementptr inbounds i32* %A, i64 %sub + store i32 %conv, i32* %arrayidx, align 4 + %inc = add nsw i64 %i.05, 1 + %cmp = icmp ult i64 %inc, %n1 + br i1 %cmp, label %for.body, label %for.cond1.preheader + +for.body4: ; preds = %for.body4, %for.cond1.preheader + %j.03 = phi i64 [ %inc9, %for.body4 ], [ 0, %for.cond1.preheader ] + %B.addr.02 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.cond1.preheader ] + %mul6 = shl i64 %n1, 1 + %add = sub i64 %mul6, %j.03 + %arrayidx7 = getelementptr inbounds i32* %A, i64 %add + %0 = load i32* %arrayidx7, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1 + store i32 %0, i32* %B.addr.02, align 4 + %inc9 = add nsw i64 %j.03, 1 + %cmp2 = icmp ult i64 %inc9, %n2 + br i1 %cmp2, label %for.body4, label %for.end10 + +for.end10: ; preds = %for.body4, %for.cond1.preheader + ret void +} + + +;; for (long int i = 0; i < n1; i++) +;; A[-i + n2] = ... +;; for (long int j = 0; j < n2; j++) +;; ... 
= A[j - n1]; + +define void @symbolicrdiv3(i32* %A, i32* %B, i64 %n1, i64 %n2) nounwind uwtable ssp { +entry: + %cmp4 = icmp eq i64 %n1, 0 + br i1 %cmp4, label %for.cond1.preheader, label %for.body + +for.cond1.preheader: ; preds = %for.body, %entry + %cmp21 = icmp eq i64 %n2, 0 + br i1 %cmp21, label %for.end9, label %for.body4 + +for.body: ; preds = %for.body, %entry + %i.05 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %conv = trunc i64 %i.05 to i32 + %add = sub i64 %n2, %i.05 + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv, i32* %arrayidx, align 4 + %inc = add nsw i64 %i.05, 1 + %cmp = icmp ult i64 %inc, %n1 + br i1 %cmp, label %for.body, label %for.cond1.preheader + +for.body4: ; preds = %for.body4, %for.cond1.preheader + %j.03 = phi i64 [ %inc8, %for.body4 ], [ 0, %for.cond1.preheader ] + %B.addr.02 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.cond1.preheader ] + %sub5 = sub i64 %j.03, %n1 + %arrayidx6 = getelementptr inbounds i32* %A, i64 %sub5 + %0 = load i32* %arrayidx6, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1 + store i32 %0, i32* %B.addr.02, align 4 + %inc8 = add nsw i64 %j.03, 1 + %cmp2 = icmp ult i64 %inc8, %n2 + br i1 %cmp2, label %for.body4, label %for.end9 + +for.end9: ; preds = %for.body4, %for.cond1.preheader + ret void +} + + +;; for (long int i = 0; i < n1; i++) +;; A[-i + 2*n1] = ... +;; for (long int j = 0; j < n2; j++) +;; ... 
= A[-j + n1]; + +define void @symbolicrdiv4(i32* %A, i32* %B, i64 %n1, i64 %n2) nounwind uwtable ssp { +entry: + %cmp4 = icmp eq i64 %n1, 0 + br i1 %cmp4, label %for.cond1.preheader, label %for.body + +for.cond1.preheader: ; preds = %for.body, %entry + %cmp21 = icmp eq i64 %n2, 0 + br i1 %cmp21, label %for.end10, label %for.body4 + +for.body: ; preds = %for.body, %entry + %i.05 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %conv = trunc i64 %i.05 to i32 + %mul = shl i64 %n1, 1 + %add = sub i64 %mul, %i.05 + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv, i32* %arrayidx, align 4 + %inc = add nsw i64 %i.05, 1 + %cmp = icmp ult i64 %inc, %n1 + br i1 %cmp, label %for.body, label %for.cond1.preheader + +for.body4: ; preds = %for.body4, %for.cond1.preheader + %j.03 = phi i64 [ %inc9, %for.body4 ], [ 0, %for.cond1.preheader ] + %B.addr.02 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.cond1.preheader ] + %add6 = sub i64 %n1, %j.03 + %arrayidx7 = getelementptr inbounds i32* %A, i64 %add6 + %0 = load i32* %arrayidx7, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1 + store i32 %0, i32* %B.addr.02, align 4 + %inc9 = add nsw i64 %j.03, 1 + %cmp2 = icmp ult i64 %inc9, %n2 + br i1 %cmp2, label %for.body4, label %for.end10 + +for.end10: ; preds = %for.body4, %for.cond1.preheader + ret void +} + + +;; for (long int i = 0; i < n1; i++) +;; A[-i + n2] = ... +;; for (long int j = 0; j < n2; j++) +;; ... 
= A[-j + 2*n2]; + +define void @symbolicrdiv5(i32* %A, i32* %B, i64 %n1, i64 %n2) nounwind uwtable ssp { +entry: + %cmp4 = icmp eq i64 %n1, 0 + br i1 %cmp4, label %for.cond1.preheader, label %for.body + +for.cond1.preheader: ; preds = %for.body, %entry + %cmp21 = icmp eq i64 %n2, 0 + br i1 %cmp21, label %for.end10, label %for.body4 + +for.body: ; preds = %for.body, %entry + %i.05 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %conv = trunc i64 %i.05 to i32 + %add = sub i64 %n2, %i.05 + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv, i32* %arrayidx, align 4 + %inc = add nsw i64 %i.05, 1 + %cmp = icmp ult i64 %inc, %n1 + br i1 %cmp, label %for.body, label %for.cond1.preheader + +for.body4: ; preds = %for.body4, %for.cond1.preheader + %j.03 = phi i64 [ %inc9, %for.body4 ], [ 0, %for.cond1.preheader ] + %B.addr.02 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.cond1.preheader ] + %mul = shl i64 %n2, 1 + %add6 = sub i64 %mul, %j.03 + %arrayidx7 = getelementptr inbounds i32* %A, i64 %add6 + %0 = load i32* %arrayidx7, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1 + store i32 %0, i32* %B.addr.02, align 4 + %inc9 = add nsw i64 %j.03, 1 + %cmp2 = icmp ult i64 %inc9, %n2 + br i1 %cmp2, label %for.body4, label %for.end10 + +for.end10: ; preds = %for.body4, %for.cond1.preheader + ret void +} + + +;; for (long int i = 0; i < n1; i++) +;; for (long int j = 0; j < n2; j++) +;; A[j -i + n2] = ... +;; ... 
= A[2*n2]; + +define void @symbolicrdiv6(i32* %A, i32* %B, i64 %n1, i64 %n2) nounwind uwtable ssp { +entry: + %cmp4 = icmp eq i64 %n1, 0 + br i1 %cmp4, label %for.end7, label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc5, %entry + %B.addr.06 = phi i32* [ %B.addr.1.lcssa, %for.inc5 ], [ %B, %entry ] + %i.05 = phi i64 [ %inc6, %for.inc5 ], [ 0, %entry ] + %cmp21 = icmp eq i64 %n2, 0 + br i1 %cmp21, label %for.inc5, label %for.body3 + +for.body3: ; preds = %for.body3, %for.cond1.preheader + %j.03 = phi i64 [ %inc, %for.body3 ], [ 0, %for.cond1.preheader ] + %B.addr.12 = phi i32* [ %incdec.ptr, %for.body3 ], [ %B.addr.06, %for.cond1.preheader ] + %conv = trunc i64 %i.05 to i32 + %sub = sub nsw i64 %j.03, %i.05 + %add = add i64 %sub, %n2 + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv, i32* %arrayidx, align 4 + %mul = shl i64 %n2, 1 + %arrayidx4 = getelementptr inbounds i32* %A, i64 %mul + %0 = load i32* %arrayidx4, align 4 +; CHECK: da analyze - none! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.12, i64 1 + store i32 %0, i32* %B.addr.12, align 4 + %inc = add nsw i64 %j.03, 1 + %cmp2 = icmp ult i64 %inc, %n2 + br i1 %cmp2, label %for.body3, label %for.inc5 + +for.inc5: ; preds = %for.body3, %for.cond1.preheader + %B.addr.1.lcssa = phi i32* [ %B.addr.06, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ] + %inc6 = add nsw i64 %i.05, 1 + %cmp = icmp ult i64 %inc6, %n1 + br i1 %cmp, label %for.cond1.preheader, label %for.end7 + +for.end7: ; preds = %for.inc5, %entry + ret void +} diff --git a/test/Analysis/DependenceAnalysis/SymbolicSIV.ll b/test/Analysis/DependenceAnalysis/SymbolicSIV.ll new file mode 100644 index 0000000..ee2343f --- /dev/null +++ b/test/Analysis/DependenceAnalysis/SymbolicSIV.ll @@ -0,0 +1,330 @@ +; RUN: opt < %s -analyze -basicaa -da | FileCheck %s + +; ModuleID = 'SymbolicSIV.bc' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.6.0" + + +;; for (long int i = 0; i < n; i++) +;; A[2*i + n] = ... +;; ... = A[3*i + 3*n]; + +define void @symbolicsiv0(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + %cmp1 = icmp eq i64 %n, 0 + br i1 %cmp1, label %for.end, label %for.body + +for.body: ; preds = %for.body, %entry + %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ] + %conv = trunc i64 %i.03 to i32 + %mul = shl nsw i64 %i.03, 1 + %add = add i64 %mul, %n + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv, i32* %arrayidx, align 4 + %mul14 = add i64 %i.03, %n + %add3 = mul i64 %mul14, 3 + %arrayidx4 = getelementptr inbounds i32* %A, i64 %add3 + %0 = load i32* %arrayidx4, align 4 +; CHECK: da analyze - none! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1 + store i32 %0, i32* %B.addr.02, align 4 + %inc = add nsw i64 %i.03, 1 + %cmp = icmp ult i64 %inc, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + +;; for (long int i = 0; i < n; i++) +;; A[2*i + 5*n] = ... +;; ... = A[3*i + 2*n]; + +define void @symbolicsiv1(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + %cmp1 = icmp eq i64 %n, 0 + br i1 %cmp1, label %for.end, label %for.body + +for.body: ; preds = %for.body, %entry + %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ] + %conv = trunc i64 %i.03 to i32 + %mul = shl nsw i64 %i.03, 1 + %mul1 = mul i64 %n, 5 + %add = add i64 %mul, %mul1 + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv, i32* %arrayidx, align 4 + %mul2 = mul nsw i64 %i.03, 3 + %mul3 = shl i64 %n, 1 + %add4 = add i64 %mul2, %mul3 + %arrayidx5 = getelementptr inbounds i32* %A, i64 %add4 + %0 = load i32* %arrayidx5, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1 + store i32 %0, i32* %B.addr.02, align 4 + %inc = add nsw i64 %i.03, 1 + %cmp = icmp ult i64 %inc, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + +;; for (long int i = 0; i < n; i++) +;; A[2*i - n] = ... +;; ... 
= A[-i + 2*n]; + +define void @symbolicsiv2(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + %cmp1 = icmp eq i64 %n, 0 + br i1 %cmp1, label %for.end, label %for.body + +for.body: ; preds = %for.body, %entry + %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ] + %conv = trunc i64 %i.03 to i32 + %mul = shl nsw i64 %i.03, 1 + %sub = sub i64 %mul, %n + %arrayidx = getelementptr inbounds i32* %A, i64 %sub + store i32 %conv, i32* %arrayidx, align 4 + %mul2 = shl i64 %n, 1 + %add = sub i64 %mul2, %i.03 + %arrayidx3 = getelementptr inbounds i32* %A, i64 %add + %0 = load i32* %arrayidx3, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1 + store i32 %0, i32* %B.addr.02, align 4 + %inc = add nsw i64 %i.03, 1 + %cmp = icmp ult i64 %inc, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + +;; for (long int i = 0; i < n; i++) +;; A[-2*i + n + 1] = ... +;; ... = A[i - 2*n]; + +define void @symbolicsiv3(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + %cmp1 = icmp eq i64 %n, 0 + br i1 %cmp1, label %for.end, label %for.body + +for.body: ; preds = %for.body, %entry + %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ] + %conv = trunc i64 %i.03 to i32 + %mul = mul nsw i64 %i.03, -2 + %add = add i64 %mul, %n + %add1 = add i64 %add, 1 + %arrayidx = getelementptr inbounds i32* %A, i64 %add1 + store i32 %conv, i32* %arrayidx, align 4 + %mul2 = shl i64 %n, 1 + %sub = sub i64 %i.03, %mul2 + %arrayidx3 = getelementptr inbounds i32* %A, i64 %sub + %0 = load i32* %arrayidx3, align 4 +; CHECK: da analyze - none! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1 + store i32 %0, i32* %B.addr.02, align 4 + %inc = add nsw i64 %i.03, 1 + %cmp = icmp ult i64 %inc, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + +;; for (long int i = 0; i < n; i++) +;; A[-2*i + 3*n] = ... +;; ... = A[-i + n]; + +define void @symbolicsiv4(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + %cmp1 = icmp eq i64 %n, 0 + br i1 %cmp1, label %for.end, label %for.body + +for.body: ; preds = %for.body, %entry + %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ] + %conv = trunc i64 %i.03 to i32 + %mul = mul nsw i64 %i.03, -2 + %mul1 = mul i64 %n, 3 + %add = add i64 %mul, %mul1 + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv, i32* %arrayidx, align 4 + %add2 = sub i64 %n, %i.03 + %arrayidx3 = getelementptr inbounds i32* %A, i64 %add2 + %0 = load i32* %arrayidx3, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1 + store i32 %0, i32* %B.addr.02, align 4 + %inc = add nsw i64 %i.03, 1 + %cmp = icmp ult i64 %inc, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + +;; for (long int i = 0; i < n; i++) +;; A[-2*i - 2*n] = ... +;; ... 
= A[-i - n]; + +define void @symbolicsiv5(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + %cmp1 = icmp eq i64 %n, 0 + br i1 %cmp1, label %for.end, label %for.body + +for.body: ; preds = %for.body, %entry + %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ] + %conv = trunc i64 %i.03 to i32 + %mul = mul nsw i64 %i.03, -2 + %mul1 = shl i64 %n, 1 + %sub = sub i64 %mul, %mul1 + %arrayidx = getelementptr inbounds i32* %A, i64 %sub + store i32 %conv, i32* %arrayidx, align 4 + %sub2 = sub nsw i64 0, %i.03 + %sub3 = sub i64 %sub2, %n + %arrayidx4 = getelementptr inbounds i32* %A, i64 %sub3 + %0 = load i32* %arrayidx4, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1 + store i32 %0, i32* %B.addr.02, align 4 + %inc = add nsw i64 %i.03, 1 + %cmp = icmp ult i64 %inc, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + +;; why doesn't SCEV package understand that n >= 0? +;;void weaktest(int *A, int *B, long unsigned n) +;; for (long unsigned i = 0; i < n; i++) +;; A[i + n + 1] = ... +;; ... = A[-i]; + +define void @weaktest(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + %cmp1 = icmp eq i64 %n, 0 + br i1 %cmp1, label %for.end, label %for.body + +for.body: ; preds = %for.body, %entry + %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ] + %conv = trunc i64 %i.03 to i32 + %add = add i64 %i.03, %n + %add1 = add i64 %add, 1 + %arrayidx = getelementptr inbounds i32* %A, i64 %add1 + store i32 %conv, i32* %arrayidx, align 4 + %sub = sub i64 0, %i.03 + %arrayidx2 = getelementptr inbounds i32* %A, i64 %sub + %0 = load i32* %arrayidx2, align 4 +; CHECK: da analyze - flow [*|<] splitable! +; CHECK: da analyze - split level = 1, iteration = ((0 smax (-1 + (-1 * %n))) /u 2)! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1 + store i32 %0, i32* %B.addr.02, align 4 + %inc = add i64 %i.03, 1 + %cmp = icmp ult i64 %inc, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + +;; void symbolicsiv6(int *A, int *B, long unsigned n, long unsigned N, long unsigned M) { +;; for (long int i = 0; i < n; i++) { +;; A[4*N*i + M] = i; +;; *B++ = A[4*N*i + 3*M + 1]; + +define void @symbolicsiv6(i32* %A, i32* %B, i64 %n, i64 %N, i64 %M) nounwind uwtable ssp { +entry: + %cmp1 = icmp eq i64 %n, 0 + br i1 %cmp1, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %i.03 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %for.body.preheader ] + %conv = trunc i64 %i.03 to i32 + %mul = shl i64 %N, 2 + %mul1 = mul i64 %mul, %i.03 + %add = add i64 %mul1, %M + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv, i32* %arrayidx, align 4 + %mul2 = shl i64 %N, 2 + %mul3 = mul i64 %mul2, %i.03 + %mul4 = mul i64 %M, 3 + %add5 = add i64 %mul3, %mul4 + %add6 = add i64 %add5, 1 + %arrayidx7 = getelementptr inbounds i32* %A, i64 %add6 + %0 = load i32* %arrayidx7, align 4 + %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1 +; CHECK: da analyze - none! 
+ store i32 %0, i32* %B.addr.02, align 4 + %inc = add nsw i64 %i.03, 1 + %exitcond = icmp ne i64 %inc, %n + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} + + +;; void symbolicsiv7(int *A, int *B, long unsigned n, long unsigned N, long unsigned M) { +;; for (long int i = 0; i < n; i++) { +;; A[2*N*i + M] = i; +;; *B++ = A[2*N*i - 3*M + 2]; + +define void @symbolicsiv7(i32* %A, i32* %B, i64 %n, i64 %N, i64 %M) nounwind uwtable ssp { +entry: + %cmp1 = icmp eq i64 %n, 0 + br i1 %cmp1, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %i.03 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %for.body.preheader ] + %conv = trunc i64 %i.03 to i32 + %mul = shl i64 %N, 1 + %mul1 = mul i64 %mul, %i.03 + %add = add i64 %mul1, %M + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv, i32* %arrayidx, align 4 + %mul2 = shl i64 %N, 1 + %mul3 = mul i64 %mul2, %i.03 + %0 = mul i64 %M, -3 + %sub = add i64 %mul3, %0 + %add5 = add i64 %sub, 2 + %arrayidx6 = getelementptr inbounds i32* %A, i64 %add5 + %1 = load i32* %arrayidx6, align 4 + %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1 +; CHECK: da analyze - flow [<>]! 
+ store i32 %1, i32* %B.addr.02, align 4 + %inc = add nsw i64 %i.03, 1 + %exitcond = icmp ne i64 %inc, %n + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} diff --git a/test/Analysis/DependenceAnalysis/WeakCrossingSIV.ll b/test/Analysis/DependenceAnalysis/WeakCrossingSIV.ll new file mode 100644 index 0000000..343e8f4 --- /dev/null +++ b/test/Analysis/DependenceAnalysis/WeakCrossingSIV.ll @@ -0,0 +1,220 @@ +; RUN: opt < %s -analyze -basicaa -da | FileCheck %s + +; ModuleID = 'WeakCrossingSIV.bc' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.6.0" + + +;; for (long unsigned i = 0; i < n; i++) +;; A[1 + n*i] = ... +;; ... = A[1 - n*i]; + +define void @weakcrossing0(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + %cmp1 = icmp eq i64 %n, 0 + br i1 %cmp1, label %for.end, label %for.body + +for.body: ; preds = %for.body, %entry + %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ] + %conv = trunc i64 %i.03 to i32 + %mul = mul i64 %i.03, %n + %add = add i64 %mul, 1 + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv, i32* %arrayidx, align 4 + %mul1 = mul i64 %i.03, %n + %sub = sub i64 1, %mul1 + %arrayidx2 = getelementptr inbounds i32* %A, i64 %sub + %0 = load i32* %arrayidx2, align 4 +; CHECK: da analyze - flow [0|<]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1 + store i32 %0, i32* %B.addr.02, align 4 + %inc = add i64 %i.03, 1 + %cmp = icmp ult i64 %inc, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + +;; for (long unsigned i = 0; i < n; i++) +;; A[n + i] = ... +;; ... 
= A[1 + n - i]; + +define void @weakcrossing1(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + %cmp1 = icmp eq i64 %n, 0 + br i1 %cmp1, label %for.end, label %for.body + +for.body: ; preds = %for.body, %entry + %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ] + %conv = trunc i64 %i.03 to i32 + %add = add i64 %i.03, %n + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv, i32* %arrayidx, align 4 + %add1 = add i64 %n, 1 + %sub = sub i64 %add1, %i.03 + %arrayidx2 = getelementptr inbounds i32* %A, i64 %sub + %0 = load i32* %arrayidx2, align 4 +; CHECK: da analyze - flow [<>] splitable! +; CHECK: da analyze - split level = 1, iteration = 0! + %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1 + store i32 %0, i32* %B.addr.02, align 4 + %inc = add i64 %i.03, 1 + %cmp = icmp ult i64 %inc, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + +;; for (long unsigned i = 0; i < 3; i++) +;; A[i] = ... +;; ... = A[6 - i]; + +define void @weakcrossing2(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %arrayidx = getelementptr inbounds i32* %A, i64 %i.02 + store i32 %conv, i32* %arrayidx, align 4 + %sub = sub i64 6, %i.02 + %arrayidx1 = getelementptr inbounds i32* %A, i64 %sub + %0 = load i32* %arrayidx1, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 3 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long unsigned i = 0; i < 4; i++) +;; A[i] = ... +;; ... 
= A[6 - i]; + +define void @weakcrossing3(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %arrayidx = getelementptr inbounds i32* %A, i64 %i.02 + store i32 %conv, i32* %arrayidx, align 4 + %sub = sub i64 6, %i.02 + %arrayidx1 = getelementptr inbounds i32* %A, i64 %sub + %0 = load i32* %arrayidx1, align 4 +; CHECK: da analyze - flow [0|<]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 4 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long unsigned i = 0; i < 10; i++) +;; A[i] = ... +;; ... = A[-6 - i]; + +define void @weakcrossing4(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %arrayidx = getelementptr inbounds i32* %A, i64 %i.02 + store i32 %conv, i32* %arrayidx, align 4 + %sub = sub i64 -6, %i.02 + %arrayidx1 = getelementptr inbounds i32* %A, i64 %sub + %0 = load i32* %arrayidx1, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 10 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long unsigned i = 0; i < n; i++) +;; A[3*i] = ... +;; ... 
= A[5 - 3*i]; + +define void @weakcrossing5(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + %cmp1 = icmp eq i64 %n, 0 + br i1 %cmp1, label %for.end, label %for.body + +for.body: ; preds = %for.body, %entry + %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ] + %conv = trunc i64 %i.03 to i32 + %mul = mul i64 %i.03, 3 + %arrayidx = getelementptr inbounds i32* %A, i64 %mul + store i32 %conv, i32* %arrayidx, align 4 + %0 = mul i64 %i.03, -3 + %sub = add i64 %0, 5 + %arrayidx2 = getelementptr inbounds i32* %A, i64 %sub + %1 = load i32* %arrayidx2, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1 + store i32 %1, i32* %B.addr.02, align 4 + %inc = add i64 %i.03, 1 + %cmp = icmp ult i64 %inc, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + +;; for (long unsigned i = 0; i < 4; i++) +;; A[i] = ... +;; ... = A[5 - i]; + +define void @weakcrossing6(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %arrayidx = getelementptr inbounds i32* %A, i64 %i.02 + store i32 %conv, i32* %arrayidx, align 4 + %sub = sub i64 5, %i.02 + %arrayidx1 = getelementptr inbounds i32* %A, i64 %sub + %0 = load i32* %arrayidx1, align 4 +; CHECK: da analyze - flow [<>] splitable! +; CHECK: da analyze - split level = 1, iteration = 2! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 4 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} diff --git a/test/Analysis/DependenceAnalysis/WeakZeroDstSIV.ll b/test/Analysis/DependenceAnalysis/WeakZeroDstSIV.ll new file mode 100644 index 0000000..a598716 --- /dev/null +++ b/test/Analysis/DependenceAnalysis/WeakZeroDstSIV.ll @@ -0,0 +1,212 @@ +; RUN: opt < %s -analyze -basicaa -da | FileCheck %s + +; ModuleID = 'WeakZeroDstSIV.bc' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.6.0" + + +;; for (long unsigned i = 0; i < 30; i++) +;; A[2*i + 10] = ... +;; ... = A[10]; + +define void @weakzerodst0(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %mul = shl i64 %i.02, 1 + %add = add i64 %mul, 10 + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32* %A, i64 10 + %0 = load i32* %arrayidx1, align 4 +; CHECK: da analyze - flow [p<=|<]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 30 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long unsigned i = 0; i < n; i++) +;; A[n*i + 10] = ... +;; ... 
= A[10]; + +define void @weakzerodst1(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + %cmp1 = icmp eq i64 %n, 0 + br i1 %cmp1, label %for.end, label %for.body + +for.body: ; preds = %for.body, %entry + %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ] + %conv = trunc i64 %i.03 to i32 + %mul = mul i64 %i.03, %n + %add = add i64 %mul, 10 + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 %conv, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32* %A, i64 10 + %0 = load i32* %arrayidx1, align 4 +; CHECK: da analyze - flow [p<=|<]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1 + store i32 %0, i32* %B.addr.02, align 4 + %inc = add i64 %i.03, 1 + %cmp = icmp ult i64 %inc, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + +;; for (long unsigned i = 0; i < 5; i++) +;; A[2*i] = ... +;; ... = A[10]; + +define void @weakzerodst2(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %mul = shl i64 %i.02, 1 + %arrayidx = getelementptr inbounds i32* %A, i64 %mul + store i32 %conv, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32* %A, i64 10 + %0 = load i32* %arrayidx1, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 5 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long unsigned i = 0; i < 6; i++) +;; A[2*i] = ... +;; ... 
= A[10]; + +define void @weakzerodst3(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %mul = shl i64 %i.02, 1 + %arrayidx = getelementptr inbounds i32* %A, i64 %mul + store i32 %conv, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32* %A, i64 10 + %0 = load i32* %arrayidx1, align 4 +; CHECK: da analyze - flow [=>p|<]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 6 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long unsigned i = 0; i < 7; i++) +;; A[2*i] = ... +;; ... = A[10]; + +define void @weakzerodst4(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %mul = shl i64 %i.02, 1 + %arrayidx = getelementptr inbounds i32* %A, i64 %mul + store i32 %conv, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32* %A, i64 10 + %0 = load i32* %arrayidx1, align 4 +; CHECK: da analyze - flow [*|<]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 7 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long unsigned i = 0; i < 7; i++) +;; A[2*i] = ... +;; ... 
= A[-10]; + +define void @weakzerodst5(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %mul = shl i64 %i.02, 1 + %arrayidx = getelementptr inbounds i32* %A, i64 %mul + store i32 %conv, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32* %A, i64 -10 + %0 = load i32* %arrayidx1, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 7 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long unsigned i = 0; i < n; i++) +;; A[3*i] = ... +;; ... = A[10]; + +define void @weakzerodst6(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + %cmp1 = icmp eq i64 %n, 0 + br i1 %cmp1, label %for.end, label %for.body + +for.body: ; preds = %for.body, %entry + %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ] + %conv = trunc i64 %i.03 to i32 + %mul = mul i64 %i.03, 3 + %arrayidx = getelementptr inbounds i32* %A, i64 %mul + store i32 %conv, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32* %A, i64 10 + %0 = load i32* %arrayidx1, align 4 +; CHECK: da analyze - none! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1 + store i32 %0, i32* %B.addr.02, align 4 + %inc = add i64 %i.03, 1 + %cmp = icmp ult i64 %inc, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} diff --git a/test/Analysis/DependenceAnalysis/WeakZeroSrcSIV.ll b/test/Analysis/DependenceAnalysis/WeakZeroSrcSIV.ll new file mode 100644 index 0000000..fd4f462 --- /dev/null +++ b/test/Analysis/DependenceAnalysis/WeakZeroSrcSIV.ll @@ -0,0 +1,212 @@ +; RUN: opt < %s -analyze -basicaa -da | FileCheck %s + +; ModuleID = 'WeakZeroSrcSIV.bc' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.6.0" + + +;; for (long unsigned i = 0; i < 30; i++) +;; A[10] = ... +;; ... = A[2*i + 10]; + +define void @weakzerosrc0(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %arrayidx = getelementptr inbounds i32* %A, i64 10 + store i32 %conv, i32* %arrayidx, align 4 + %mul = shl i64 %i.02, 1 + %add = add i64 %mul, 10 + %arrayidx1 = getelementptr inbounds i32* %A, i64 %add + %0 = load i32* %arrayidx1, align 4 +; CHECK: da analyze - flow [p<=|<]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 30 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long unsigned i = 0; i < n; i++) +;; A[10] = ... +;; ... 
= A[n*i + 10]; + +define void @weakzerosrc1(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + %cmp1 = icmp eq i64 %n, 0 + br i1 %cmp1, label %for.end, label %for.body + +for.body: ; preds = %for.body, %entry + %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ] + %conv = trunc i64 %i.03 to i32 + %arrayidx = getelementptr inbounds i32* %A, i64 10 + store i32 %conv, i32* %arrayidx, align 4 + %mul = mul i64 %i.03, %n + %add = add i64 %mul, 10 + %arrayidx1 = getelementptr inbounds i32* %A, i64 %add + %0 = load i32* %arrayidx1, align 4 +; CHECK: da analyze - flow [p<=|<]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1 + store i32 %0, i32* %B.addr.02, align 4 + %inc = add i64 %i.03, 1 + %cmp = icmp ult i64 %inc, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + +;; for (long unsigned i = 0; i < 5; i++) +;; A[10] = ... +;; ... = A[2*i]; + +define void @weakzerosrc2(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %arrayidx = getelementptr inbounds i32* %A, i64 10 + store i32 %conv, i32* %arrayidx, align 4 + %mul = shl i64 %i.02, 1 + %arrayidx1 = getelementptr inbounds i32* %A, i64 %mul + %0 = load i32* %arrayidx1, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 5 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long unsigned i = 0; i < 6; i++) +;; A[10] = ... +;; ... 
= A[2*i]; + +define void @weakzerosrc3(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %arrayidx = getelementptr inbounds i32* %A, i64 10 + store i32 %conv, i32* %arrayidx, align 4 + %mul = shl i64 %i.02, 1 + %arrayidx1 = getelementptr inbounds i32* %A, i64 %mul + %0 = load i32* %arrayidx1, align 4 +; CHECK: da analyze - flow [=>p|<]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 6 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long unsigned i = 0; i < 7; i++) +;; A[10] = ... +;; ... = A[2*i]; + +define void @weakzerosrc4(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %arrayidx = getelementptr inbounds i32* %A, i64 10 + store i32 %conv, i32* %arrayidx, align 4 + %mul = shl i64 %i.02, 1 + %arrayidx1 = getelementptr inbounds i32* %A, i64 %mul + %0 = load i32* %arrayidx1, align 4 +; CHECK: da analyze - flow [*|<]! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 7 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long unsigned i = 0; i < 7; i++) +;; A[-10] = ... +;; ... 
= A[2*i]; + +define void @weakzerosrc5(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ] + %conv = trunc i64 %i.02 to i32 + %arrayidx = getelementptr inbounds i32* %A, i64 -10 + store i32 %conv, i32* %arrayidx, align 4 + %mul = shl i64 %i.02, 1 + %arrayidx1 = getelementptr inbounds i32* %A, i64 %mul + %0 = load i32* %arrayidx1, align 4 +; CHECK: da analyze - none! + %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1 + store i32 %0, i32* %B.addr.01, align 4 + %inc = add i64 %i.02, 1 + %cmp = icmp ult i64 %inc, 7 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + + +;; for (long unsigned i = 0; i < n; i++) +;; A[10] = ... +;; ... = A[3*i]; + +define void @weakzerosrc6(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + %cmp1 = icmp eq i64 %n, 0 + br i1 %cmp1, label %for.end, label %for.body + +for.body: ; preds = %for.body, %entry + %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ] + %conv = trunc i64 %i.03 to i32 + %arrayidx = getelementptr inbounds i32* %A, i64 10 + store i32 %conv, i32* %arrayidx, align 4 + %mul = mul i64 %i.03, 3 + %arrayidx1 = getelementptr inbounds i32* %A, i64 %mul + %0 = load i32* %arrayidx1, align 4 +; CHECK: da analyze - none! 
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1 + store i32 %0, i32* %B.addr.02, align 4 + %inc = add i64 %i.03, 1 + %cmp = icmp ult i64 %inc, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} diff --git a/test/Analysis/DependenceAnalysis/ZIV.ll b/test/Analysis/DependenceAnalysis/ZIV.ll new file mode 100644 index 0000000..42b2389 --- /dev/null +++ b/test/Analysis/DependenceAnalysis/ZIV.ll @@ -0,0 +1,53 @@ +; RUN: opt < %s -analyze -basicaa -da | FileCheck %s + +; ModuleID = 'ZIV.bc' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.6.0" + + +;; A[n + 1] = ... +;; ... = A[1 + n]; + +define void @z0(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + %add = add i64 %n, 1 + %arrayidx = getelementptr inbounds i32* %A, i64 %add + store i32 0, i32* %arrayidx, align 4 + %add1 = add i64 %n, 1 + %arrayidx2 = getelementptr inbounds i32* %A, i64 %add1 + %0 = load i32* %arrayidx2, align 4 +; CHECK: da analyze - consistent flow! + store i32 %0, i32* %B, align 4 + ret void +} + + +;; A[n] = ... +;; ... = A[n + 1]; + +define void @z1(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp { +entry: + %arrayidx = getelementptr inbounds i32* %A, i64 %n + store i32 0, i32* %arrayidx, align 4 + %add = add i64 %n, 1 + %arrayidx1 = getelementptr inbounds i32* %A, i64 %add + %0 = load i32* %arrayidx1, align 4 +; CHECK: da analyze - none! + store i32 %0, i32* %B, align 4 + ret void +} + + +;; A[n] = ... +;; ... = A[m]; + +define void @z2(i32* %A, i32* %B, i64 %n, i64 %m) nounwind uwtable ssp { +entry: + %arrayidx = getelementptr inbounds i32* %A, i64 %n + store i32 0, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32* %A, i64 %m + %0 = load i32* %arrayidx1, align 4 +; CHECK: da analyze - flow! 
+ store i32 %0, i32* %B, align 4 + ret void +} diff --git a/test/Analysis/DependenceAnalysis/lit.local.cfg b/test/Analysis/DependenceAnalysis/lit.local.cfg new file mode 100644 index 0000000..c6106e4 --- /dev/null +++ b/test/Analysis/DependenceAnalysis/lit.local.cfg @@ -0,0 +1 @@ +config.suffixes = ['.ll'] diff --git a/test/Analysis/LoopDependenceAnalysis/alias.ll b/test/Analysis/LoopDependenceAnalysis/alias.ll deleted file mode 100644 index 78d0bf4..0000000 --- a/test/Analysis/LoopDependenceAnalysis/alias.ll +++ /dev/null @@ -1,44 +0,0 @@ -; RUN: opt < %s -analyze -basicaa -lda | FileCheck %s - -;; x[5] = x[6] // with x being a pointer passed as argument - -define void @f1(i32* nocapture %xptr) nounwind { -entry: - %x.ld.addr = getelementptr i32* %xptr, i64 6 - %x.st.addr = getelementptr i32* %xptr, i64 5 - br label %for.body - -for.body: - %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] - %x = load i32* %x.ld.addr - store i32 %x, i32* %x.st.addr -; CHECK: 0,1: dep - %i.next = add i64 %i, 1 - %exitcond = icmp eq i64 %i.next, 256 - br i1 %exitcond, label %for.end, label %for.body - -for.end: - ret void -} - -;; x[5] = x[6] // with x being an array on the stack - -define void @foo(...) 
nounwind { -entry: - %xptr = alloca [256 x i32], align 4 - %x.ld.addr = getelementptr [256 x i32]* %xptr, i64 0, i64 6 - %x.st.addr = getelementptr [256 x i32]* %xptr, i64 0, i64 5 - br label %for.body - -for.body: - %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] - %x = load i32* %x.ld.addr - store i32 %x, i32* %x.st.addr -; CHECK: 0,1: ind - %i.next = add i64 %i, 1 - %exitcond = icmp eq i64 %i.next, 256 - br i1 %exitcond, label %for.end, label %for.body - -for.end: - ret void -} diff --git a/test/Analysis/LoopDependenceAnalysis/siv-strong.ll b/test/Analysis/LoopDependenceAnalysis/siv-strong.ll deleted file mode 100644 index 401e466..0000000 --- a/test/Analysis/LoopDependenceAnalysis/siv-strong.ll +++ /dev/null @@ -1,110 +0,0 @@ -; RUN: opt < %s -analyze -basicaa -lda | FileCheck %s - -@x = common global [256 x i32] zeroinitializer, align 4 -@y = common global [256 x i32] zeroinitializer, align 4 - -;; for (i = 0; i < 256; i++) -;; x[i] = x[i] + y[i] - -define void @f1(...) nounwind { -entry: - br label %for.body - -for.body: - %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] - %y.addr = getelementptr [256 x i32]* @y, i64 0, i64 %i - %x.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i - %x = load i32* %x.addr ; 0 - %y = load i32* %y.addr ; 1 - %r = add i32 %y, %x - store i32 %r, i32* %x.addr ; 2 -; CHECK: 0,2: dep -; CHECK: 1,2: ind - %i.next = add i64 %i, 1 - %exitcond = icmp eq i64 %i.next, 256 - br i1 %exitcond, label %for.end, label %for.body - -for.end: - ret void -} - -;; for (i = 0; i < 256; i++) -;; x[i+1] = x[i] + y[i] - -define void @f2(...) 
nounwind { -entry: - br label %for.body - -for.body: - %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] - %y.ld.addr = getelementptr [256 x i32]* @y, i64 0, i64 %i - %x.ld.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i - %i.next = add i64 %i, 1 - %x.st.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i.next - %x = load i32* %x.ld.addr ; 0 - %y = load i32* %y.ld.addr ; 1 - %r = add i32 %y, %x - store i32 %r, i32* %x.st.addr ; 2 -; CHECK: 0,2: dep -; CHECK: 1,2: ind - %exitcond = icmp eq i64 %i.next, 256 - br i1 %exitcond, label %for.end, label %for.body - -for.end: - ret void -} - -;; for (i = 0; i < 10; i++) -;; x[i+20] = x[i] + y[i] - -define void @f3(...) nounwind { -entry: - br label %for.body - -for.body: - %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] - %y.ld.addr = getelementptr [256 x i32]* @y, i64 0, i64 %i - %x.ld.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i - %i.20 = add i64 %i, 20 - %x.st.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i.20 - %x = load i32* %x.ld.addr ; 0 - %y = load i32* %y.ld.addr ; 1 - %r = add i32 %y, %x - store i32 %r, i32* %x.st.addr ; 2 -; CHECK: 0,2: dep -; CHECK: 1,2: ind - %i.next = add i64 %i, 1 - %exitcond = icmp eq i64 %i.next, 10 - br i1 %exitcond, label %for.end, label %for.body - -for.end: - ret void -} - -;; for (i = 0; i < 10; i++) -;; x[10*i+1] = x[10*i] + y[i] - -define void @f4(...) 
nounwind { -entry: - br label %for.body - -for.body: - %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] - %i.10 = mul i64 %i, 10 - %y.ld.addr = getelementptr [256 x i32]* @y, i64 0, i64 %i.10 - %x.ld.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i.10 - %i.10.1 = add i64 %i.10, 1 - %x.st.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i.10.1 - %x = load i32* %x.ld.addr ; 0 - %y = load i32* %y.ld.addr ; 1 - %r = add i32 %y, %x - store i32 %r, i32* %x.st.addr ; 2 -; CHECK: 0,2: dep -; CHECK: 1,2: ind - %i.next = add i64 %i, 1 - %exitcond = icmp eq i64 %i.next, 10 - br i1 %exitcond, label %for.end, label %for.body - -for.end: - ret void -} diff --git a/test/Analysis/LoopDependenceAnalysis/siv-weak-crossing.ll b/test/Analysis/LoopDependenceAnalysis/siv-weak-crossing.ll deleted file mode 100644 index 9d0128c..0000000 --- a/test/Analysis/LoopDependenceAnalysis/siv-weak-crossing.ll +++ /dev/null @@ -1,118 +0,0 @@ -; RUN: opt < %s -analyze -basicaa -lda | FileCheck %s - -@x = common global [256 x i32] zeroinitializer, align 4 -@y = common global [256 x i32] zeroinitializer, align 4 - -;; for (i = 0; i < 256; i++) -;; x[i] = x[255 - i] + y[i] - -define void @f1(...) nounwind { -entry: - br label %for.body - -for.body: - %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] - %i.255 = sub i64 255, %i - %y.ld.addr = getelementptr [256 x i32]* @y, i64 0, i64 %i - %x.ld.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i.255 - %x.st.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i - %x = load i32* %x.ld.addr ; 0 - %y = load i32* %y.ld.addr ; 1 - %r = add i32 %y, %x - store i32 %r, i32* %x.st.addr ; 2 -; CHECK: 0,2: dep -; CHECK: 1,2: ind - %i.next = add i64 %i, 1 - %exitcond = icmp eq i64 %i.next, 256 - br i1 %exitcond, label %for.end, label %for.body - -for.end: - ret void -} - -;; for (i = 0; i < 100; i++) -;; x[i] = x[255 - i] + y[i] - -define void @f2(...) 
nounwind { -entry: - br label %for.body - -for.body: - %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] - %i.255 = sub i64 255, %i - %y.ld.addr = getelementptr [256 x i32]* @y, i64 0, i64 %i - %x.ld.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i.255 - %x.st.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i - %x = load i32* %x.ld.addr ; 0 - %y = load i32* %y.ld.addr ; 1 - %r = add i32 %y, %x - store i32 %r, i32* %x.st.addr ; 2 -; CHECK: 0,2: dep -; CHECK: 1,2: ind - %i.next = add i64 %i, 1 - %exitcond = icmp eq i64 %i.next, 100 - br i1 %exitcond, label %for.end, label %for.body - -for.end: - ret void -} - -;; // the first iteration (i=0) leads to an out-of-bounds access of x. as the -;; // result of this access is undefined, _any_ dependence result is safe. -;; for (i = 0; i < 256; i++) -;; x[i] = x[256 - i] + y[i] - -define void @f3(...) nounwind { -entry: - br label %for.body - -for.body: - %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] - %i.256 = sub i64 0, %i - %y.ld.addr = getelementptr [256 x i32]* @y, i64 0, i64 %i - %x.ld.addr = getelementptr [256 x i32]* @x, i64 1, i64 %i.256 - %x.st.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i - %x = load i32* %x.ld.addr ; 0 - %y = load i32* %y.ld.addr ; 1 - %r = add i32 %y, %x - store i32 %r, i32* %x.st.addr ; 2 -; CHECK: 0,2: dep -; CHECK: 1,2: - %i.next = add i64 %i, 1 - %exitcond = icmp eq i64 %i.next, 256 - br i1 %exitcond, label %for.end, label %for.body - -for.end: - ret void -} - -;; // slightly contrived but valid IR for the following loop, where all -;; // accesses in all iterations are within bounds. while this example's first -;; // (ZIV-)subscript is (0, 1), accesses are dependent. -;; for (i = 1; i < 256; i++) -;; x[i] = x[256 - i] + y[i] - -define void @f4(...) 
nounwind { -entry: - br label %for.body - -for.body: - %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] - %i.1 = add i64 1, %i - %i.256 = sub i64 -1, %i - %y.ld.addr = getelementptr [256 x i32]* @y, i64 0, i64 %i.1 - %x.ld.addr = getelementptr [256 x i32]* @x, i64 1, i64 %i.256 - %x.st.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i.1 - %x = load i32* %x.ld.addr ; 0 - %y = load i32* %y.ld.addr ; 1 - %r = add i32 %y, %x - store i32 %r, i32* %x.st.addr ; 2 -; CHECK: 0,2: dep -; CHECK: 1,2: ind - %i.next = add i64 %i, 1 - %exitcond = icmp eq i64 %i.next, 256 - br i1 %exitcond, label %for.end, label %for.body - -for.end: - ret void -} diff --git a/test/Analysis/LoopDependenceAnalysis/siv-weak-zero.ll b/test/Analysis/LoopDependenceAnalysis/siv-weak-zero.ll deleted file mode 100644 index 1c5ae4c..0000000 --- a/test/Analysis/LoopDependenceAnalysis/siv-weak-zero.ll +++ /dev/null @@ -1,56 +0,0 @@ -; RUN: opt < %s -analyze -basicaa -lda | FileCheck %s - -@x = common global [256 x i32] zeroinitializer, align 4 -@y = common global [256 x i32] zeroinitializer, align 4 - -;; for (i = 0; i < 256; i++) -;; x[i] = x[42] + y[i] - -define void @f1(...) nounwind { -entry: - %x.ld.addr = getelementptr [256 x i32]* @x, i64 0, i64 42 - br label %for.body - -for.body: - %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] - %x.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i - %y.addr = getelementptr [256 x i32]* @y, i64 0, i64 %i - %x = load i32* %x.ld.addr ; 0 - %y = load i32* %y.addr ; 1 - %r = add i32 %y, %x - store i32 %r, i32* %x.addr ; 2 -; CHECK: 0,2: dep -; CHECK: 1,2: ind - %i.next = add i64 %i, 1 - %exitcond = icmp eq i64 %i.next, 256 - br i1 %exitcond, label %for.end, label %for.body - -for.end: - ret void -} - -;; for (i = 0; i < 250; i++) -;; x[i] = x[255] + y[i] - -define void @f2(...) 
nounwind { -entry: - %x.ld.addr = getelementptr [256 x i32]* @x, i64 0, i64 255 - br label %for.body - -for.body: - %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] - %x.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i - %y.addr = getelementptr [256 x i32]* @y, i64 0, i64 %i - %x = load i32* %x.ld.addr ; 0 - %y = load i32* %y.addr ; 1 - %r = add i32 %y, %x - store i32 %r, i32* %x.addr ; 2 -; CHECK: 0,2: dep -; CHECK: 1,2: ind - %i.next = add i64 %i, 1 - %exitcond = icmp eq i64 %i.next, 250 - br i1 %exitcond, label %for.end, label %for.body - -for.end: - ret void -} diff --git a/test/Analysis/LoopDependenceAnalysis/ziv.ll b/test/Analysis/LoopDependenceAnalysis/ziv.ll deleted file mode 100644 index 645ae7f..0000000 --- a/test/Analysis/LoopDependenceAnalysis/ziv.ll +++ /dev/null @@ -1,63 +0,0 @@ -; RUN: opt < %s -analyze -basicaa -lda | FileCheck %s - -@x = common global [256 x i32] zeroinitializer, align 4 - -;; x[5] = x[6] - -define void @f1(...) nounwind { -entry: - br label %for.body - -for.body: - %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] - %x = load i32* getelementptr ([256 x i32]* @x, i32 0, i64 6) - store i32 %x, i32* getelementptr ([256 x i32]* @x, i32 0, i64 5) -; CHECK: 0,1: ind - %i.next = add i64 %i, 1 - %exitcond = icmp eq i64 %i.next, 256 - br i1 %exitcond, label %for.end, label %for.body - -for.end: - ret void -} - -;; x[c] = x[c+1] // with c being a loop-invariant constant - -define void @f2(i64 %c0) nounwind { -entry: - %c1 = add i64 %c0, 1 - %x.ld.addr = getelementptr [256 x i32]* @x, i64 0, i64 %c0 - %x.st.addr = getelementptr [256 x i32]* @x, i64 0, i64 %c1 - br label %for.body - -for.body: - %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] - %x = load i32* %x.ld.addr - store i32 %x, i32* %x.st.addr -; CHECK: 0,1: ind - %i.next = add i64 %i, 1 - %exitcond = icmp eq i64 %i.next, 256 - br i1 %exitcond, label %for.end, label %for.body - -for.end: - ret void -} - -;; x[6] = x[6] - -define void @f3(...) 
nounwind { -entry: - br label %for.body - -for.body: - %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] - %x = load i32* getelementptr ([256 x i32]* @x, i32 0, i64 6) - store i32 %x, i32* getelementptr ([256 x i32]* @x, i32 0, i64 6) -; CHECK: 0,1: dep - %i.next = add i64 %i, 1 - %exitcond = icmp eq i64 %i.next, 256 - br i1 %exitcond, label %for.end, label %for.body - -for.end: - ret void -} diff --git a/test/Analysis/Profiling/load-branch-weights-ifs.ll b/test/Analysis/Profiling/load-branch-weights-ifs.ll new file mode 100644 index 0000000..7ed090b --- /dev/null +++ b/test/Analysis/Profiling/load-branch-weights-ifs.ll @@ -0,0 +1,122 @@ +; RUN: opt -insert-edge-profiling -o %t1 < %s +; RUN: rm -f %t1.prof_data +; RUN: lli %defaultjit -load %llvmshlibdir/libprofile_rt%shlibext %t1 \ +; RUN: -llvmprof-output %t1.prof_data +; RUN: opt -profile-file %t1.prof_data -profile-metadata-loader -S -o - < %s \ +; RUN: | FileCheck %s +; RUN: rm -f %t1.prof_data + +; FIXME: profile_rt.dll could be built on win32. +; REQUIRES: loadable_module + +;; func_mod - Branch taken 6 times in 7. +define i32 @func_mod(i32 %N) nounwind uwtable { +entry: + %retval = alloca i32, align 4 + %N.addr = alloca i32, align 4 + store i32 %N, i32* %N.addr, align 4 + %0 = load i32* %N.addr, align 4 + %rem = srem i32 %0, 7 + %tobool = icmp ne i32 %rem, 0 + br i1 %tobool, label %if.then, label %if.else +; CHECK: br i1 %tobool, label %if.then, label %if.else, !prof !0 + +if.then: + store i32 1, i32* %retval + br label %return + +if.else: + store i32 0, i32* %retval + br label %return + +return: + %1 = load i32* %retval + ret i32 %1 +} + +;; func_const_true - conditional branch which 100% taken probability. 
+define i32 @func_const_true(i32 %N) nounwind uwtable { +entry: + %retval = alloca i32, align 4 + %N.addr = alloca i32, align 4 + store i32 %N, i32* %N.addr, align 4 + %0 = load i32* %N.addr, align 4 + %cmp = icmp eq i32 %0, 1 + br i1 %cmp, label %if.then, label %if.end +; CHECK: br i1 %cmp, label %if.then, label %if.end, !prof !1 + +if.then: + store i32 1, i32* %retval + br label %return + +if.end: + store i32 0, i32* %retval + br label %return + +return: + %1 = load i32* %retval + ret i32 %1 +} + +;; func_const_true - conditional branch which 100% not-taken probability. +define i32 @func_const_false(i32 %N) nounwind uwtable { +entry: + %retval = alloca i32, align 4 + %N.addr = alloca i32, align 4 + store i32 %N, i32* %N.addr, align 4 + %0 = load i32* %N.addr, align 4 + %cmp = icmp eq i32 %0, 1 + br i1 %cmp, label %if.then, label %if.end +; CHECK: br i1 %cmp, label %if.then, label %if.end, !prof !2 + +if.then: + store i32 1, i32* %retval + br label %return + +if.end: + store i32 0, i32* %retval + br label %return + +return: + %1 = load i32* %retval + ret i32 %1 +} + +define i32 @main(i32 %argc, i8** %argv) nounwind uwtable { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca i8**, align 8 + %loop = alloca i32, align 4 + store i32 0, i32* %retval + store i32 0, i32* %loop, align 4 + br label %for.cond + +for.cond: + %0 = load i32* %loop, align 4 + %cmp = icmp slt i32 %0, 7000 + br i1 %cmp, label %for.body, label %for.end +; CHECK: br i1 %cmp, label %for.body, label %for.end, !prof !3 + +for.body: + %1 = load i32* %loop, align 4 + %call = call i32 @func_mod(i32 %1) + br label %for.inc + +for.inc: + %2 = load i32* %loop, align 4 + %inc = add nsw i32 %2, 1 + store i32 %inc, i32* %loop, align 4 + br label %for.cond + +for.end: + %call1 = call i32 @func_const_true(i32 1) + %call2 = call i32 @func_const_false(i32 0) + ret i32 0 +} + +; CHECK: !0 = metadata !{metadata !"branch_weights", i32 6000, i32 1000} +; CHECK: !1 = 
metadata !{metadata !"branch_weights", i32 1, i32 0} +; CHECK: !2 = metadata !{metadata !"branch_weights", i32 0, i32 1} +; CHECK: !3 = metadata !{metadata !"branch_weights", i32 7000, i32 1} +; CHECK-NOT: !4 diff --git a/test/Analysis/Profiling/load-branch-weights-loops.ll b/test/Analysis/Profiling/load-branch-weights-loops.ll new file mode 100644 index 0000000..9d1925a --- /dev/null +++ b/test/Analysis/Profiling/load-branch-weights-loops.ll @@ -0,0 +1,188 @@ +; RUN: opt -insert-edge-profiling -o %t1 < %s +; RUN: rm -f %t1.prof_data +; RUN: lli %defaultjit -load %llvmshlibdir/libprofile_rt%shlibext %t1 \ +; RUN: -llvmprof-output %t1.prof_data +; RUN: opt -profile-file %t1.prof_data -profile-metadata-loader -S -o - < %s \ +; RUN: | FileCheck %s +; RUN: rm -f %t1.prof_data + +; FIXME: profile_rt.dll could be built on win32. +; REQUIRES: loadable_module + +;; func_for - Test branch probabilities for a vanilla for loop. +define i32 @func_for(i32 %N) nounwind uwtable { +entry: + %N.addr = alloca i32, align 4 + %ret = alloca i32, align 4 + %loop = alloca i32, align 4 + store i32 %N, i32* %N.addr, align 4 + store i32 0, i32* %ret, align 4 + store i32 0, i32* %loop, align 4 + br label %for.cond + +for.cond: + %0 = load i32* %loop, align 4 + %1 = load i32* %N.addr, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %for.body, label %for.end +; CHECK: br i1 %cmp, label %for.body, label %for.end, !prof !0 + +for.body: + %2 = load i32* %N.addr, align 4 + %3 = load i32* %ret, align 4 + %add = add nsw i32 %3, %2 + store i32 %add, i32* %ret, align 4 + br label %for.inc + +for.inc: + %4 = load i32* %loop, align 4 + %inc = add nsw i32 %4, 1 + store i32 %inc, i32* %loop, align 4 + br label %for.cond + +for.end: + %5 = load i32* %ret, align 4 + ret i32 %5 +} + +;; func_for_odd - Test branch probabilities for a for loop with a continue and +;; a break. 
+define i32 @func_for_odd(i32 %N) nounwind uwtable { +entry: + %N.addr = alloca i32, align 4 + %ret = alloca i32, align 4 + %loop = alloca i32, align 4 + store i32 %N, i32* %N.addr, align 4 + store i32 0, i32* %ret, align 4 + store i32 0, i32* %loop, align 4 + br label %for.cond + +for.cond: + %0 = load i32* %loop, align 4 + %1 = load i32* %N.addr, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %for.body, label %for.end +; CHECK: br i1 %cmp, label %for.body, label %for.end, !prof !1 + +for.body: + %2 = load i32* %loop, align 4 + %rem = srem i32 %2, 10 + %tobool = icmp ne i32 %rem, 0 + br i1 %tobool, label %if.then, label %if.end +; CHECK: br i1 %tobool, label %if.then, label %if.end, !prof !2 + +if.then: + br label %for.inc + +if.end: + %3 = load i32* %loop, align 4 + %cmp1 = icmp eq i32 %3, 500 + br i1 %cmp1, label %if.then2, label %if.end3 +; CHECK: br i1 %cmp1, label %if.then2, label %if.end3, !prof !3 + +if.then2: + br label %for.end + +if.end3: + %4 = load i32* %N.addr, align 4 + %5 = load i32* %ret, align 4 + %add = add nsw i32 %5, %4 + store i32 %add, i32* %ret, align 4 + br label %for.inc + +for.inc: + %6 = load i32* %loop, align 4 + %inc = add nsw i32 %6, 1 + store i32 %inc, i32* %loop, align 4 + br label %for.cond + +for.end: + %7 = load i32* %ret, align 4 + ret i32 %7 +} + +;; func_while - Test branch probability in a vanilla while loop. 
+define i32 @func_while(i32 %N) nounwind uwtable { +entry: + %N.addr = alloca i32, align 4 + %ret = alloca i32, align 4 + %loop = alloca i32, align 4 + store i32 %N, i32* %N.addr, align 4 + store i32 0, i32* %ret, align 4 + store i32 0, i32* %loop, align 4 + br label %while.cond + +while.cond: + %0 = load i32* %loop, align 4 + %1 = load i32* %N.addr, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %while.body, label %while.end +; CHECK: br i1 %cmp, label %while.body, label %while.end, !prof !0 + +while.body: + %2 = load i32* %N.addr, align 4 + %3 = load i32* %ret, align 4 + %add = add nsw i32 %3, %2 + store i32 %add, i32* %ret, align 4 + %4 = load i32* %loop, align 4 + %inc = add nsw i32 %4, 1 + store i32 %inc, i32* %loop, align 4 + br label %while.cond + +while.end: + %5 = load i32* %ret, align 4 + ret i32 %5 +} + +;; func_while - Test branch probability in a vanilla do-while loop. +define i32 @func_do_while(i32 %N) nounwind uwtable { +entry: + %N.addr = alloca i32, align 4 + %ret = alloca i32, align 4 + %loop = alloca i32, align 4 + store i32 %N, i32* %N.addr, align 4 + store i32 0, i32* %ret, align 4 + store i32 0, i32* %loop, align 4 + br label %do.body + +do.body: + %0 = load i32* %N.addr, align 4 + %1 = load i32* %ret, align 4 + %add = add nsw i32 %1, %0 + store i32 %add, i32* %ret, align 4 + %2 = load i32* %loop, align 4 + %inc = add nsw i32 %2, 1 + store i32 %inc, i32* %loop, align 4 + br label %do.cond + +do.cond: + %3 = load i32* %loop, align 4 + %4 = load i32* %N.addr, align 4 + %cmp = icmp slt i32 %3, %4 + br i1 %cmp, label %do.body, label %do.end +; CHECK: br i1 %cmp, label %do.body, label %do.end, !prof !4 + +do.end: + %5 = load i32* %ret, align 4 + ret i32 %5 +} + +define i32 @main(i32 %argc, i8** %argv) nounwind uwtable { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca i8**, align 8 + store i32 0, i32* %retval + %call = call i32 @func_for(i32 1000) + %call1 = call i32 @func_for_odd(i32 1000) 
+ %call2 = call i32 @func_while(i32 1000) + %call3 = call i32 @func_do_while(i32 1000) + ret i32 0 +} + +!0 = metadata !{metadata !"branch_weights", i32 1000, i32 1} +!1 = metadata !{metadata !"branch_weights", i32 501, i32 0} +!2 = metadata !{metadata !"branch_weights", i32 450, i32 51} +!3 = metadata !{metadata !"branch_weights", i32 1, i32 50} +!4 = metadata !{metadata !"branch_weights", i32 999, i32 1} +; CHECK-NOT: !5 diff --git a/test/Analysis/Profiling/load-branch-weights-switches.ll b/test/Analysis/Profiling/load-branch-weights-switches.ll new file mode 100644 index 0000000..5587c71 --- /dev/null +++ b/test/Analysis/Profiling/load-branch-weights-switches.ll @@ -0,0 +1,165 @@ +; RUN: opt -insert-edge-profiling -o %t1 < %s +; RUN: rm -f %t1.prof_data +; RUN: lli %defaultjit -load %llvmshlibdir/libprofile_rt%shlibext %t1 \ +; RUN: -llvmprof-output %t1.prof_data +; RUN: opt -profile-file %t1.prof_data -profile-metadata-loader -S -o - < %s \ +; RUN: | FileCheck %s +; RUN: rm -f %t1.prof_data + +; FIXME: profile_rt.dll could be built on win32. +; REQUIRES: loadable_module + +;; func_switch - Test branch probabilities for a switch instruction with an +;; even chance of taking each case (or no case). +define i32 @func_switch(i32 %N) nounwind uwtable { +entry: + %retval = alloca i32, align 4 + %N.addr = alloca i32, align 4 + store i32 %N, i32* %N.addr, align 4 + %0 = load i32* %N.addr, align 4 + %rem = srem i32 %0, 4 + switch i32 %rem, label %sw.epilog [ + i32 0, label %sw.bb + i32 1, label %sw.bb1 + i32 2, label %sw.bb2 + ] +; CHECK: ], !prof !0 + +sw.bb: + store i32 5, i32* %retval + br label %return + +sw.bb1: + store i32 6, i32* %retval + br label %return + +sw.bb2: + store i32 7, i32* %retval + br label %return + +sw.epilog: + store i32 8, i32* %retval + br label %return + +return: + %1 = load i32* %retval + ret i32 %1 +} + +;; func_switch_switch - Test branch probabilities in a switch-instruction that +;; leads to further switch instructions. 
The first-tier switch occludes some +;; possibilities in the second-tier switches, leading to some branches having a +;; 0 probability. +define i32 @func_switch_switch(i32 %N) nounwind uwtable { +entry: + %retval = alloca i32, align 4 + %N.addr = alloca i32, align 4 + store i32 %N, i32* %N.addr, align 4 + %0 = load i32* %N.addr, align 4 + %rem = srem i32 %0, 2 + switch i32 %rem, label %sw.default11 [ + i32 0, label %sw.bb + i32 1, label %sw.bb5 + ] +; CHECK: ], !prof !1 + +sw.bb: + %1 = load i32* %N.addr, align 4 + %rem1 = srem i32 %1, 4 + switch i32 %rem1, label %sw.default [ + i32 0, label %sw.bb2 + i32 1, label %sw.bb3 + i32 2, label %sw.bb4 + ] +; CHECK: ], !prof !2 + +sw.bb2: + store i32 5, i32* %retval + br label %return + +sw.bb3: + store i32 6, i32* %retval + br label %return + +sw.bb4: + store i32 7, i32* %retval + br label %return + +sw.default: + store i32 8, i32* %retval + br label %return + +sw.bb5: + %2 = load i32* %N.addr, align 4 + %rem6 = srem i32 %2, 4 + switch i32 %rem6, label %sw.default10 [ + i32 0, label %sw.bb7 + i32 1, label %sw.bb8 + i32 2, label %sw.bb9 + ] +; CHECK: ], !prof !3 + +sw.bb7: + store i32 9, i32* %retval + br label %return + +sw.bb8: + store i32 10, i32* %retval + br label %return + +sw.bb9: + store i32 11, i32* %retval + br label %return + +sw.default10: + store i32 12, i32* %retval + br label %return + +sw.default11: + store i32 13, i32* %retval + br label %return + +return: + %3 = load i32* %retval + ret i32 %3 +} + +define i32 @main(i32 %argc, i8** %argv) nounwind uwtable { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca i8**, align 8 + %loop = alloca i32, align 4 + store i32 0, i32* %retval + store i32 0, i32* %loop, align 4 + br label %for.cond + +for.cond: + %0 = load i32* %loop, align 4 + %cmp = icmp slt i32 %0, 4000 + br i1 %cmp, label %for.body, label %for.end +; CHECK: br i1 %cmp, label %for.body, label %for.end, !prof !4 + +for.body: + %1 = load i32* %loop, align 4 + 
%call = call i32 @func_switch(i32 %1) + %2 = load i32* %loop, align 4 + %call1 = call i32 @func_switch_switch(i32 %2) + br label %for.inc + +for.inc: + %3 = load i32* %loop, align 4 + %inc = add nsw i32 %3, 1 + store i32 %inc, i32* %loop, align 4 + br label %for.cond + +for.end: + ret i32 0 +} + +; CHECK: !0 = metadata !{metadata !"branch_weights", i32 1000, i32 1000, i32 1000, i32 1000} +; CHECK: !1 = metadata !{metadata !"branch_weights", i32 0, i32 2000, i32 2000} +; CHECK: !2 = metadata !{metadata !"branch_weights", i32 0, i32 1000, i32 0, i32 1000} +; CHECK: !3 = metadata !{metadata !"branch_weights", i32 1000, i32 0, i32 1000, i32 0} +; CHECK: !4 = metadata !{metadata !"branch_weights", i32 4000, i32 1} +; CHECK-NOT: !5 diff --git a/test/Assembler/2008-09-02-FunctionNotes2.ll b/test/Assembler/2008-09-02-FunctionNotes2.ll index 97351e2..47eb011 100644 --- a/test/Assembler/2008-09-02-FunctionNotes2.ll +++ b/test/Assembler/2008-09-02-FunctionNotes2.ll @@ -1,5 +1,5 @@ ; Test function notes -; RUN: not llvm-as %s -o /dev/null 2>&1 | grep "Attributes noinline alwaysinline are incompatible" +; RUN: not llvm-as %s -o /dev/null 2>&1 | grep "Attributes 'noinline and alwaysinline' are incompatible" define void @fn1() alwaysinline noinline { ret void } diff --git a/test/Assembler/global-addrspace-forwardref.ll b/test/Assembler/global-addrspace-forwardref.ll new file mode 100644 index 0000000..f0f094a --- /dev/null +++ b/test/Assembler/global-addrspace-forwardref.ll @@ -0,0 +1,8 @@ +; RUN: llvm-as < %s | llvm-dis | FileCheck %s + +; Make sure the address space of forward decls is preserved + +; CHECK: @a2 = global i8 addrspace(1)* @a +; CHECK: @a = addrspace(1) global i8 0 +@a2 = global i8 addrspace(1)* @a +@a = addrspace(1) global i8 0 diff --git a/test/Assembler/invalid-fwdref1.ll b/test/Assembler/invalid-fwdref1.ll new file mode 100644 index 0000000..ef8b16c --- /dev/null +++ b/test/Assembler/invalid-fwdref1.ll @@ -0,0 +1,4 @@ +; RUN: not llvm-as %s -disable-output 
2>&1 | grep "invalid forward reference to function as global value!" + +define i8* @test1() { ret i8* @test1a } +define void @test1a() { } diff --git a/test/Bindings/Ocaml/ipo_opts.ml b/test/Bindings/Ocaml/ipo_opts.ml index 3a36231..d4537e4 100644 --- a/test/Bindings/Ocaml/ipo_opts.ml +++ b/test/Bindings/Ocaml/ipo_opts.ml @@ -43,10 +43,10 @@ let test_transforms () = ignore (build_ret (build_call fn [| |] "" b) b); end; - let td = TargetData.create (target_triple m) in + let td = DataLayout.create (target_triple m) in ignore (PassManager.create () - ++ TargetData.add td + ++ DataLayout.add td ++ add_argument_promotion ++ add_constant_merge ++ add_dead_arg_elimination @@ -63,7 +63,7 @@ let test_transforms () = ++ PassManager.run_module m ++ PassManager.dispose); - TargetData.dispose td + DataLayout.dispose td (*===-- Driver ------------------------------------------------------------===*) diff --git a/test/Bindings/Ocaml/scalar_opts.ml b/test/Bindings/Ocaml/scalar_opts.ml index 34a7a6a..0760dad 100644 --- a/test/Bindings/Ocaml/scalar_opts.ml +++ b/test/Bindings/Ocaml/scalar_opts.ml @@ -38,10 +38,10 @@ let test_transforms () = let fn = define_function "fn" fty m in ignore (build_ret_void (builder_at_end context (entry_block fn))); - let td = TargetData.create (target_triple m) in + let td = DataLayout.create (target_triple m) in ignore (PassManager.create_function m - ++ TargetData.add td + ++ DataLayout.add td ++ add_verifier ++ add_constant_propagation ++ add_sccp @@ -78,7 +78,7 @@ let test_transforms () = ++ PassManager.finalize ++ PassManager.dispose); - TargetData.dispose td + DataLayout.dispose td (*===-- Driver ------------------------------------------------------------===*) diff --git a/test/Bindings/Ocaml/target.ml b/test/Bindings/Ocaml/target.ml index 1b6b71e..7a35a790 100644 --- a/test/Bindings/Ocaml/target.ml +++ b/test/Bindings/Ocaml/target.ml @@ -33,10 +33,10 @@ let m = create_module context filename (*===-- Target Data 
-------------------------------------------------------===*) let test_target_data () = - let td = TargetData.create (target_triple m) in + let td = DataLayout.create (target_triple m) in let sty = struct_type context [| i32_type; i64_type |] in - ignore (TargetData.as_string td); + ignore (DataLayout.as_string td); ignore (byte_order td); ignore (pointer_size td); ignore (intptr_type td); @@ -49,7 +49,7 @@ let test_target_data () = ignore (element_at_offset td sty (Int64.of_int 1)); ignore (offset_of_element td sty 1); - TargetData.dispose td + DataLayout.dispose td (*===-- Driver ------------------------------------------------------------===*) diff --git a/test/Bindings/Ocaml/vmcore.ml b/test/Bindings/Ocaml/vmcore.ml index b8eb6d3..61be4b77 100644 --- a/test/Bindings/Ocaml/vmcore.ml +++ b/test/Bindings/Ocaml/vmcore.ml @@ -113,14 +113,14 @@ let test_constants () = ignore (define_global "const_int_string" c m); insist (i32_type = type_of c); - (* RUN: grep 'const_string.*"cruel\00world"' < %t.ll + (* RUN: grep 'const_string.*"cruel\\00world"' < %t.ll *) group "string"; let c = const_string context "cruel\000world" in ignore (define_global "const_string" c m); insist ((array_type i8_type 11) = type_of c); - (* RUN: grep 'const_stringz.*"hi\00again\00"' < %t.ll + (* RUN: grep 'const_stringz.*"hi\\00again\\00"' < %t.ll *) group "stringz"; let c = const_stringz context "hi\000again" in @@ -187,7 +187,7 @@ let test_constants () = ignore (define_global "const_all_ones" c m); group "pointer null"; begin - (* RUN: grep "const_pointer_null = global i64* null" < %t.ll + (* RUN: grep "const_pointer_null = global i64\* null" < %t.ll *) let c = const_pointer_null (pointer_type i64_type) in ignore (define_global "const_pointer_null" c m); @@ -542,7 +542,7 @@ let test_users () = (*===-- Aliases -----------------------------------------------------------===*) let test_aliases () = - (* RUN: grep "@alias = alias i32* @aliasee" < %t.ll + (* RUN: grep "@alias = alias i32\* @aliasee" 
< %t.ll *) let v = declare_global i32_type "aliasee" m in ignore (add_alias m (pointer_type i32_type) v "alias") @@ -554,7 +554,7 @@ let test_functions () = let ty = function_type i32_type [| i32_type; i64_type |] in let ty2 = function_type i8_type [| i8_type; i64_type |] in - (* RUN: grep "declare i32 @Fn1\(i32, i64\)" < %t.ll + (* RUN: grep 'declare i32 @Fn1(i32, i64)' < %t.ll *) begin group "declare"; insist (None = lookup_function "Fn1" m); @@ -935,7 +935,7 @@ let test_builder () = group "malloc/free"; begin (* RUN: grep "call.*@malloc(i32 ptrtoint" < %t.ll - * RUN: grep "call.*@free(i8*" < %t.ll + * RUN: grep "call.*@free(i8\*" < %t.ll * RUN: grep "call.*@malloc(i32 %" < %t.ll *) let bb1 = append_block context "MallocBlock1" fn in @@ -947,7 +947,7 @@ let test_builder () = end; group "indirectbr"; begin - (* RUN: grep "indirectbr i8* blockaddress(@X7, %IBRBlock2), [label %IBRBlock2, label %IBRBlock3]" < %t.ll + (* RUN: grep "indirectbr i8\* blockaddress(@X7, %IBRBlock2), \[label %IBRBlock2, label %IBRBlock3\]" < %t.ll *) let bb1 = append_block context "IBRBlock1" fn in @@ -1054,10 +1054,10 @@ let test_builder () = (* RUN: grep "%build_alloca = alloca i32" < %t.ll * RUN: grep "%build_array_alloca = alloca i32, i32 %P2" < %t.ll - * RUN: grep "%build_load = load i32* %build_array_alloca" < %t.ll - * RUN: grep "store i32 %P2, i32* %build_alloca" < %t.ll - * RUN: grep "%build_gep = getelementptr i32* %build_array_alloca, i32 %P2" < %t.ll - * RUN: grep "%build_in_bounds_gep = getelementptr inbounds i32* %build_array_alloca, i32 %P2" < %t.ll + * RUN: grep "%build_load = load i32\* %build_array_alloca" < %t.ll + * RUN: grep "store i32 %P2, i32\* %build_alloca" < %t.ll + * RUN: grep "%build_gep = getelementptr i32\* %build_array_alloca, i32 %P2" < %t.ll + * RUN: grep "%build_in_bounds_gep = getelementptr inbounds i32\* %build_array_alloca, i32 %P2" < %t.ll * RUN: grep "%build_struct_gep = getelementptr inbounds.*%build_alloca2, i32 0, i32 1" < %t.ll *) let alloca = 
build_alloca i32_type "build_alloca" b in @@ -1106,14 +1106,14 @@ let test_builder () = * RUN: grep "%build_fptrunc2 = fptrunc double %build_sitofp to float" < %t.ll * RUN: grep "%build_fpext = fpext float %build_fptrunc to double" < %t.ll * RUN: grep "%build_fpext2 = fpext float %build_fptrunc to double" < %t.ll - * RUN: grep "%build_inttoptr = inttoptr i32 %P1 to i8*" < %t.ll - * RUN: grep "%build_ptrtoint = ptrtoint i8* %build_inttoptr to i64" < %t.ll - * RUN: grep "%build_ptrtoint2 = ptrtoint i8* %build_inttoptr to i64" < %t.ll + * RUN: grep "%build_inttoptr = inttoptr i32 %P1 to i8\*" < %t.ll + * RUN: grep "%build_ptrtoint = ptrtoint i8\* %build_inttoptr to i64" < %t.ll + * RUN: grep "%build_ptrtoint2 = ptrtoint i8\* %build_inttoptr to i64" < %t.ll * RUN: grep "%build_bitcast = bitcast i64 %build_ptrtoint to double" < %t.ll * RUN: grep "%build_bitcast2 = bitcast i64 %build_ptrtoint to double" < %t.ll * RUN: grep "%build_bitcast3 = bitcast i64 %build_ptrtoint to double" < %t.ll * RUN: grep "%build_bitcast4 = bitcast i64 %build_ptrtoint to double" < %t.ll - * RUN: grep "%build_pointercast = bitcast i8* %build_inttoptr to i16*" < %t.ll + * RUN: grep "%build_pointercast = bitcast i8\* %build_inttoptr to i16*" < %t.ll *) let inst28 = build_trunc p1 i8_type "build_trunc" atentry in let inst29 = build_zext inst28 i32_type "build_zext" atentry in @@ -1148,7 +1148,7 @@ let test_builder () = * RUN: grep "%build_fcmp_false = fcmp false float %F1, %F2" < %t.ll * RUN: grep "%build_fcmp_true = fcmp true float %F2, %F1" < %t.ll * RUN: grep "%build_is_null.*= icmp eq.*%X0,.*null" < %t.ll - * RUN: grep "%build_is_not_null = icmp ne i8* %X1, null" < %t.ll + * RUN: grep "%build_is_not_null = icmp ne i8\* %X1, null" < %t.ll * RUN: grep "%build_ptrdiff" < %t.ll *) ignore (build_icmp Icmp.Ne p1 p2 "build_icmp_ne" atentry); @@ -1167,7 +1167,7 @@ let test_builder () = group "miscellaneous"; begin (* RUN: grep "%build_call = tail call cc63 i32 @.*(i32 signext %P2, i32 %P1)" < %t.ll * 
RUN: grep "%build_select = select i1 %build_icmp, i32 %P1, i32 %P2" < %t.ll - * RUN: grep "%build_va_arg = va_arg i8** null, i32" < %t.ll + * RUN: grep "%build_va_arg = va_arg i8\*\* null, i32" < %t.ll * RUN: grep "%build_extractelement = extractelement <4 x i32> %Vec1, i32 %P2" < %t.ll * RUN: grep "%build_insertelement = insertelement <4 x i32> %Vec1, i32 %P1, i32 %P2" < %t.ll * RUN: grep "%build_shufflevector = shufflevector <4 x i32> %Vec1, <4 x i32> %Vec2, <4 x i32> <i32 1, i32 1, i32 0, i32 0>" < %t.ll @@ -1240,8 +1240,8 @@ let test_builder () = end; group "dbg"; begin - (* RUN: grep "%dbg = add i32 %P1, %P2, !dbg !1" < %t.ll - * RUN: grep "!1 = metadata !{i32 2, i32 3, metadata !2, metadata !2}" < %t.ll + (* RUN: grep '%dbg = add i32 %P1, %P2, !dbg !1' < %t.ll + * RUN: grep '!1 = metadata !{i32 2, i32 3, metadata !2, metadata !2}' < %t.ll *) insist ((current_debug_location atentry) = None); diff --git a/test/Bitcode/blockaddress.ll b/test/Bitcode/blockaddress.ll index b9f3341..8ac54be 100644 --- a/test/Bitcode/blockaddress.ll +++ b/test/Bitcode/blockaddress.ll @@ -28,3 +28,18 @@ here: end: ret void } + +; PR13895 +define void @doitagain(i8** nocapture %pptr) { +; CHECK: define void @doitagain +entry: + br label %here + +here: + store i8* blockaddress(@doit, %here), i8** %pptr, align 8 +; CHECK: blockaddress(@doit, %here) + br label %end + +end: + ret void +} diff --git a/test/Bitcode/function-encoding-rel-operands.ll b/test/Bitcode/function-encoding-rel-operands.ll new file mode 100644 index 0000000..aedb0c3 --- /dev/null +++ b/test/Bitcode/function-encoding-rel-operands.ll @@ -0,0 +1,49 @@ +; Basic sanity test to check that instruction operands are encoded with +; relative IDs. 
+; RUN: llvm-as < %s | llvm-bcanalyzer -dump | FileCheck %s + +; CHECK: FUNCTION_BLOCK +; CHECK: INST_BINOP {{.*}}op0=1 op1=1 +; CHECK: INST_BINOP {{.*}}op0=1 op1=1 +; CHECK: INST_BINOP {{.*}}op0=1 op1=1 +; CHECK: INST_RET {{.*}}op0=1 +define i32 @test_int_binops(i32 %a) nounwind { +entry: + %0 = add i32 %a, %a + %1 = sub i32 %0, %0 + %2 = mul i32 %1, %1 + ret i32 %2 +} + + +; CHECK: FUNCTION_BLOCK +; CHECK: INST_CAST {{.*}}op0=1 +; CHECK: INST_BINOP {{.*}}op0=1 op1=1 +; CHECK: INST_BINOP {{.*}}op0=1 op1=1 +; CHECK: INST_BINOP {{.*}}op0=1 op1=1 +; CHECK: INST_BINOP {{.*}}op0=1 op1=1 +; CHECK: INST_RET {{.*}}op0=1 +define double @test_float_binops(i32 %a) nounwind { + %1 = sitofp i32 %a to double + %2 = fadd double %1, %1 + %3 = fsub double %2, %2 + %4 = fmul double %3, %3 + %5 = fdiv double %4, %4 + ret double %5 +} + + +; CHECK: FUNCTION_BLOCK +; skip checking operands of INST_INBOUNDS_GEP since that depends on ordering +; between literals and the formal parameters. +; CHECK: INST_INBOUNDS_GEP {{.*}} +; CHECK: INST_LOAD {{.*}}op0=1 {{.*}} +; CHECK: INST_CMP2 op0=1 {{.*}} +; CHECK: INST_RET {{.*}}op0=1 +define i1 @test_load(i32 %a, {i32, i32}* %ptr) nounwind { +entry: + %0 = getelementptr inbounds {i32, i32}* %ptr, i32 %a, i32 0 + %1 = load i32* %0 + %2 = icmp eq i32 %1, %a + ret i1 %2 +} diff --git a/test/BugPoint/crash-narrowfunctiontest.ll b/test/BugPoint/crash-narrowfunctiontest.ll index d080d9d..c812836 100644 --- a/test/BugPoint/crash-narrowfunctiontest.ll +++ b/test/BugPoint/crash-narrowfunctiontest.ll @@ -2,6 +2,7 @@ ; ; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashcalls -silence-passes > /dev/null ; REQUIRES: loadable_module +; XFAIL: lto_on_osx define i32 @foo() { ret i32 1 } diff --git a/test/BugPoint/metadata.ll b/test/BugPoint/metadata.ll index 0eda566..6dc9574 100644 --- a/test/BugPoint/metadata.ll +++ b/test/BugPoint/metadata.ll @@ -1,6 +1,7 @@ ; RUN: bugpoint -load 
%llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashcalls -silence-passes > /dev/null ; RUN: llvm-dis %t-reduced-simplified.bc -o - | FileCheck %s ; REQUIRES: loadable_module +; XFAIL: lto_on_osx ; Bugpoint should keep the call's metadata attached to the call. diff --git a/test/BugPoint/remove_arguments_test.ll b/test/BugPoint/remove_arguments_test.ll index 29a03b8..5a45f84 100644 --- a/test/BugPoint/remove_arguments_test.ll +++ b/test/BugPoint/remove_arguments_test.ll @@ -1,6 +1,7 @@ ; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashcalls -silence-passes ; RUN: llvm-dis %t-reduced-simplified.bc -o - | FileCheck %s ; REQUIRES: loadable_module +; XFAIL: lto_on_osx ; Test to make sure that arguments are removed from the function if they are ; unnecessary. And clean up any types that that frees up too. diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 991cc9d..e10a532 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -7,6 +7,11 @@ configure_lit_site_cfg( ${CMAKE_CURRENT_BINARY_DIR}/Unit/lit.site.cfg ) +# Don't include check-llvm into check-all without LLVM_BUILD_TOOLS. 
+if(NOT LLVM_BUILD_TOOLS) + set(EXCLUDE_FROM_ALL ON) +endif() + add_lit_testsuite(check-llvm "Running the LLVM regression tests" ${CMAKE_CURRENT_BINARY_DIR} PARAMS llvm_site_config=${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg @@ -14,10 +19,16 @@ add_lit_testsuite(check-llvm "Running the LLVM regression tests" DEPENDS UnitTests BugpointPasses LLVMHello llc lli llvm-ar llvm-as - llvm-diff + llvm-bcanalyzer llvm-diff llvm-dis llvm-extract llvm-dwarfdump - llvm-link llvm-mc llvm-nm llvm-objdump llvm-readobj + llvm-link + llvm-mc + llvm-mcmarkup + llvm-nm + llvm-objdump + llvm-readobj macho-dump opt + profile_rt-shared FileCheck count not yaml2obj ) diff --git a/test/CodeGen/ARM/2010-10-19-mc-elf-objheader.ll b/test/CodeGen/ARM/2010-10-19-mc-elf-objheader.ll index 99db637..36d1575 100644 --- a/test/CodeGen/ARM/2010-10-19-mc-elf-objheader.ll +++ b/test/CodeGen/ARM/2010-10-19-mc-elf-objheader.ll @@ -13,12 +13,12 @@ ; BASIC-NEXT: 0x00000000 ; BASIC-NEXT: 0x00000000 ; BASIC-NEXT: 0x0000003c -; BASIC-NEXT: 0x00000020 +; BASIC-NEXT: 0x00000022 ; BASIC-NEXT: 0x00000000 ; BASIC-NEXT: 0x00000000 ; BASIC-NEXT: 0x00000001 ; BASIC-NEXT: 0x00000000 -; BASIC-NEXT: '411f0000 00616561 62690001 15000000 06020801 09011401 15011703 18011901' +; BASIC-NEXT: '41210000 00616561 62690001 17000000 060a0741 08010902 14011501 17031801 1901' ; CORTEXA8: .ARM.attributes ; CORTEXA8-NEXT: 0x70000003 diff --git a/test/CodeGen/ARM/2010-12-07-PEIBug.ll b/test/CodeGen/ARM/2010-12-07-PEIBug.ll index 770ad44..4879f4e 100644 --- a/test/CodeGen/ARM/2010-12-07-PEIBug.ll +++ b/test/CodeGen/ARM/2010-12-07-PEIBug.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 | FileCheck %s +; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a9 | FileCheck %s ; rdar://8728956 define hidden void @foo() nounwind ssp { diff --git a/test/CodeGen/ARM/2011-06-16-TailCallByVal.ll b/test/CodeGen/ARM/2011-06-16-TailCallByVal.ll index 3e78c46..101a913 100644 --- 
a/test/CodeGen/ARM/2011-06-16-TailCallByVal.ll +++ b/test/CodeGen/ARM/2011-06-16-TailCallByVal.ll @@ -1,4 +1,9 @@ ; RUN: llc < %s -arm-tail-calls=1 | FileCheck %s + +; tail call inside a function where byval argument is splitted between +; registers and stack is currently unsupported. +; XFAIL: * + target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32" target triple = "thumbv7-apple-ios" diff --git a/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll b/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll index 42b1491..6e0ef96 100644 --- a/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll +++ b/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll @@ -9,8 +9,8 @@ entry: } ; Trigger multiple NEON stores. -; CHECK: vstmia -; CHECK-NEXT: vstmia +; CHECK: vst1.64 +; CHECK-NEXT: vst1.64 define void @f_0_40(i8* nocapture %c) nounwind optsize { entry: call void @llvm.memset.p0i8.i64(i8* %c, i8 0, i64 40, i32 16, i1 false) diff --git a/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll b/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll index 89c01d5..f9ede74 100644 --- a/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll +++ b/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll @@ -8,12 +8,12 @@ define void @test_sqrt(<4 x float>* %X) nounwind { ; CHECK: movw r1, :lower16:{{.*}} ; CHECK: movt r1, :upper16:{{.*}} -; CHECK: vldmia r1 +; CHECK: vld1.64 {{.*}}, [r1, :128] ; CHECK: vsqrt.f32 {{s[0-9]+}}, {{s[0-9]+}} ; CHECK: vsqrt.f32 {{s[0-9]+}}, {{s[0-9]+}} ; CHECK: vsqrt.f32 {{s[0-9]+}}, {{s[0-9]+}} ; CHECK: vsqrt.f32 {{s[0-9]+}}, {{s[0-9]+}} -; CHECK: vstmia {{.*}} +; CHECK: vst1.64 {{.*}} L.entry: %0 = load <4 x float>* @A, align 16 @@ -31,21 +31,21 @@ define void @test_cos(<4 x float>* %X) nounwind { ; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}} ; CHECK: movt [[reg0]], :upper16:{{.*}} -; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}} +; CHECK: vld1.64 -; CHECK: {{[mov|vmov.32]}} r0, +; CHECK: {{v?mov(.32)?}} r0, ; CHECK: 
bl {{.*}}cosf -; CHECK: {{[mov|vmov.32]}} r0, +; CHECK: {{v?mov(.32)?}} r0, ; CHECK: bl {{.*}}cosf -; CHECK: {{[mov|vmov.32]}} r0, +; CHECK: {{v?mov(.32)?}} r0, ; CHECK: bl {{.*}}cosf -; CHECK: {{[mov|vmov.32]}} r0, +; CHECK: {{v?mov(.32)?}} r0, ; CHECK: bl {{.*}}cosf -; CHECK: vstmia {{.*}} +; CHECK: vst1.64 L.entry: %0 = load <4 x float>* @A, align 16 @@ -62,21 +62,21 @@ define void @test_exp(<4 x float>* %X) nounwind { ; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}} ; CHECK: movt [[reg0]], :upper16:{{.*}} -; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}} +; CHECK: vld1.64 -; CHECK: {{[mov|vmov.32]}} r0, +; CHECK: {{v?mov(.32)?}} r0, ; CHECK: bl {{.*}}expf -; CHECK: {{[mov|vmov.32]}} r0, +; CHECK: {{v?mov(.32)?}} r0, ; CHECK: bl {{.*}}expf -; CHECK: {{[mov|vmov.32]}} r0, +; CHECK: {{v?mov(.32)?}} r0, ; CHECK: bl {{.*}}expf -; CHECK: {{[mov|vmov.32]}} r0, +; CHECK: {{v?mov(.32)?}} r0, ; CHECK: bl {{.*}}expf -; CHECK: vstmia {{.*}} +; CHECK: vst1.64 L.entry: %0 = load <4 x float>* @A, align 16 @@ -93,21 +93,21 @@ define void @test_exp2(<4 x float>* %X) nounwind { ; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}} ; CHECK: movt [[reg0]], :upper16:{{.*}} -; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}} +; CHECK: vld1.64 -; CHECK: {{[mov|vmov.32]}} r0, +; CHECK: {{v?mov(.32)?}} r0, ; CHECK: bl {{.*}}exp2f -; CHECK: {{[mov|vmov.32]}} r0, +; CHECK: {{v?mov(.32)?}} r0, ; CHECK: bl {{.*}}exp2f -; CHECK: {{[mov|vmov.32]}} r0, +; CHECK: {{v?mov(.32)?}} r0, ; CHECK: bl {{.*}}exp2f -; CHECK: {{[mov|vmov.32]}} r0, +; CHECK: {{v?mov(.32)?}} r0, ; CHECK: bl {{.*}}exp2f -; CHECK: vstmia {{.*}} +; CHECK: vst1.64 L.entry: %0 = load <4 x float>* @A, align 16 @@ -124,21 +124,21 @@ define void @test_log10(<4 x float>* %X) nounwind { ; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}} ; CHECK: movt [[reg0]], :upper16:{{.*}} -; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}} +; CHECK: vld1.64 -; CHECK: {{[mov|vmov.32]}} r0, +; CHECK: {{v?mov(.32)?}} r0, ; CHECK: bl {{.*}}log10f -; CHECK: {{[mov|vmov.32]}} r0, +; 
CHECK: {{v?mov(.32)?}} r0, ; CHECK: bl {{.*}}log10f -; CHECK: {{[mov|vmov.32]}} r0, +; CHECK: {{v?mov(.32)?}} r0, ; CHECK: bl {{.*}}log10f -; CHECK: {{[mov|vmov.32]}} r0, +; CHECK: {{v?mov(.32)?}} r0, ; CHECK: bl {{.*}}log10f -; CHECK: vstmia {{.*}} +; CHECK: vst1.64 L.entry: %0 = load <4 x float>* @A, align 16 @@ -155,21 +155,21 @@ define void @test_log(<4 x float>* %X) nounwind { ; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}} ; CHECK: movt [[reg0]], :upper16:{{.*}} -; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}} +; CHECK: vld1.64 -; CHECK: {{[mov|vmov.32]}} r0, +; CHECK: {{v?mov(.32)?}} r0, ; CHECK: bl {{.*}}logf -; CHECK: {{[mov|vmov.32]}} r0, +; CHECK: {{v?mov(.32)?}} r0, ; CHECK: bl {{.*}}logf -; CHECK: {{[mov|vmov.32]}} r0, +; CHECK: {{v?mov(.32)?}} r0, ; CHECK: bl {{.*}}logf -; CHECK: {{[mov|vmov.32]}} r0, +; CHECK: {{v?mov(.32)?}} r0, ; CHECK: bl {{.*}}logf -; CHECK: vstmia {{.*}} +; CHECK: vst1.64 L.entry: %0 = load <4 x float>* @A, align 16 @@ -186,21 +186,21 @@ define void @test_log2(<4 x float>* %X) nounwind { ; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}} ; CHECK: movt [[reg0]], :upper16:{{.*}} -; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}} +; CHECK: vld1.64 -; CHECK: {{[mov|vmov.32]}} r0, +; CHECK: {{v?mov(.32)?}} r0, ; CHECK: bl {{.*}}log2f -; CHECK: {{[mov|vmov.32]}} r0, +; CHECK: {{v?mov(.32)?}} r0, ; CHECK: bl {{.*}}log2f -; CHECK: {{[mov|vmov.32]}} r0, +; CHECK: {{v?mov(.32)?}} r0, ; CHECK: bl {{.*}}log2f -; CHECK: {{[mov|vmov.32]}} r0, +; CHECK: {{v?mov(.32)?}} r0, ; CHECK: bl {{.*}}log2f -; CHECK: vstmia {{.*}} +; CHECK: vst1.64 L.entry: %0 = load <4 x float>* @A, align 16 @@ -218,21 +218,21 @@ define void @test_pow(<4 x float>* %X) nounwind { ; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}} ; CHECK: movt [[reg0]], :upper16:{{.*}} -; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}} +; CHECK: vld1.64 -; CHECK: {{[mov|vmov.32]}} r0, +; CHECK: {{v?mov(.32)?}} r0, ; CHECK: bl {{.*}}powf -; CHECK: {{[mov|vmov.32]}} r0, +; CHECK: {{v?mov(.32)?}} r0, ; CHECK: bl 
{{.*}}powf -; CHECK: {{[mov|vmov.32]}} r0, +; CHECK: {{v?mov(.32)?}} r0, ; CHECK: bl {{.*}}powf -; CHECK: {{[mov|vmov.32]}} r0, +; CHECK: {{v?mov(.32)?}} r0, ; CHECK: bl {{.*}}powf -; CHECK: vstmia {{.*}} +; CHECK: vst1.64 L.entry: @@ -252,10 +252,10 @@ define void @test_powi(<4 x float>* %X) nounwind { ; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}} ; CHECK: movt [[reg0]], :upper16:{{.*}} -; CHECK: vldmia [[reg0]], {{.*}} +; CHECK: vld1.64 {{.*}}, :128 ; CHECK: vmul.f32 {{.*}} -; CHECK: vstmia {{.*}} +; CHECK: vst1.64 L.entry: @@ -275,21 +275,21 @@ define void @test_sin(<4 x float>* %X) nounwind { ; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}} ; CHECK: movt [[reg0]], :upper16:{{.*}} -; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}} +; CHECK: vld1.64 -; CHECK: {{[mov|vmov.32]}} r0, +; CHECK: {{v?mov(.32)?}} r0, ; CHECK: bl {{.*}}sinf -; CHECK: {{[mov|vmov.32]}} r0, +; CHECK: {{v?mov(.32)?}} r0, ; CHECK: bl {{.*}}sinf -; CHECK: {{[mov|vmov.32]}} r0, +; CHECK: {{v?mov(.32)?}} r0, ; CHECK: bl {{.*}}sinf -; CHECK: {{[mov|vmov.32]}} r0, +; CHECK: {{v?mov(.32)?}} r0, ; CHECK: bl {{.*}}sinf -; CHECK: vstmia {{.*}} +; CHECK: vst1.64 L.entry: %0 = load <4 x float>* @A, align 16 @@ -300,3 +300,34 @@ L.entry: declare <4 x float> @llvm.sin.v4f32(<4 x float>) nounwind readonly +define void @test_floor(<4 x float>* %X) nounwind { + +; CHECK: test_floor: + +; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}} +; CHECK: movt [[reg0]], :upper16:{{.*}} +; CHECK: vld1.64 + +; CHECK: {{v?mov(.32)?}} r0, +; CHECK: bl {{.*}}floorf + +; CHECK: {{v?mov(.32)?}} r0, +; CHECK: bl {{.*}}floorf + +; CHECK: {{v?mov(.32)?}} r0, +; CHECK: bl {{.*}}floorf + +; CHECK: {{v?mov(.32)?}} r0, +; CHECK: bl {{.*}}floorf + +; CHECK: vst1.64 + +L.entry: + %0 = load <4 x float>* @A, align 16 + %1 = call <4 x float> @llvm.floor.v4f32(<4 x float> %0) + store <4 x float> %1, <4 x float>* %X, align 16 + ret void +} + +declare <4 x float> @llvm.floor.v4f32(<4 x float>) nounwind readonly + diff --git 
a/test/CodeGen/ARM/2012-05-04-vmov.ll b/test/CodeGen/ARM/2012-05-04-vmov.ll new file mode 100644 index 0000000..d52ef2c --- /dev/null +++ b/test/CodeGen/ARM/2012-05-04-vmov.ll @@ -0,0 +1,11 @@ +; RUN: llc -O1 -march=arm -mcpu=cortex-a9 < %s | FileCheck -check-prefix=A9-CHECK %s +; RUN: llc -O1 -march=arm -mcpu=swift < %s | FileCheck -check-prefix=SWIFT-CHECK %s +; Check that swift doesn't use vmov.32. <rdar://problem/10453003>. + +define <2 x i32> @testuvec(<2 x i32> %A, <2 x i32> %B) nounwind { +entry: + %div = udiv <2 x i32> %A, %B + ret <2 x i32> %div +; A9-CHECK: vmov.32 +; SWIFT-CHECK-NOT: vmov.32 +} diff --git a/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll b/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll new file mode 100644 index 0000000..dd67843 --- /dev/null +++ b/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll @@ -0,0 +1,14 @@ +; RUN: llc -march=arm -mcpu=swift < %s | FileCheck %s +; <rdar://problem/10451892> + +define void @f(i32 %x, i32* %p) nounwind ssp { +entry: +; CHECK-NOT: vdup.32 + %vecinit.i = insertelement <2 x i32> undef, i32 %x, i32 0 + %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %x, i32 1 + %0 = bitcast i32* %p to i8* + tail call void @llvm.arm.neon.vst1.v2i32(i8* %0, <2 x i32> %vecinit1.i, i32 4) + ret void +} + +declare void @llvm.arm.neon.vst1.v2i32(i8*, <2 x i32>, i32) nounwind diff --git a/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll b/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll new file mode 100644 index 0000000..ec7f72d --- /dev/null +++ b/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll @@ -0,0 +1,129 @@ +; RUN: llc < %s -mcpu=cortex-a8 -march=thumb +; Test that this doesn't crash. 
+; <rdar://problem/12183003> + +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" +target triple = "thumbv7-apple-ios5.1.0" + +declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8*, i32) nounwind readonly + +declare void @llvm.arm.neon.vst1.v16i8(i8*, <16 x i8>, i32) nounwind + +define void @findEdges(i8*) nounwind ssp { + %2 = icmp sgt i32 undef, 0 + br i1 %2, label %5, label %3 + +; <label>:3 ; preds = %5, %1 + %4 = phi i8* [ %0, %1 ], [ %19, %5 ] + ret void + +; <label>:5 ; preds = %5, %1 + %6 = phi i8* [ %19, %5 ], [ %0, %1 ] + %7 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* null, i32 1) + %8 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %7, 0 + %9 = getelementptr inbounds i8* null, i32 3 + %10 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %9, i32 1) + %11 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %10, 2 + %12 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %6, i32 1) + %13 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %12, 0 + %14 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %12, 1 + %15 = getelementptr inbounds i8* %6, i32 3 + %16 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %15, i32 1) + %17 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %16, 1 + %18 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %16, 2 + %19 = getelementptr inbounds i8* %6, i32 48 + %20 = bitcast <16 x i8> %13 to <2 x i64> + %21 = bitcast <16 x i8> %8 to <2 x i64> + %22 = bitcast <16 x i8> %14 to <2 x i64> + %23 = shufflevector <2 x i64> %22, <2 x i64> undef, <1 x i32> zeroinitializer + %24 = bitcast <1 x i64> %23 to <8 x i8> + %25 = zext <8 x i8> %24 to <8 x i16> + %26 = sub <8 x i16> zeroinitializer, %25 + %27 = bitcast <16 x i8> %17 to <2 x i64> + %28 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %26) nounwind 
+ %29 = mul <8 x i16> %28, %28 + %30 = add <8 x i16> zeroinitializer, %29 + %31 = tail call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> undef, <8 x i16> %30) nounwind + %32 = bitcast <16 x i8> %11 to <2 x i64> + %33 = shufflevector <2 x i64> %32, <2 x i64> undef, <1 x i32> zeroinitializer + %34 = bitcast <1 x i64> %33 to <8 x i8> + %35 = zext <8 x i8> %34 to <8 x i16> + %36 = sub <8 x i16> %35, zeroinitializer + %37 = bitcast <16 x i8> %18 to <2 x i64> + %38 = shufflevector <2 x i64> %37, <2 x i64> undef, <1 x i32> zeroinitializer + %39 = bitcast <1 x i64> %38 to <8 x i8> + %40 = zext <8 x i8> %39 to <8 x i16> + %41 = sub <8 x i16> zeroinitializer, %40 + %42 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %36) nounwind + %43 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %41) nounwind + %44 = mul <8 x i16> %42, %42 + %45 = mul <8 x i16> %43, %43 + %46 = add <8 x i16> %45, %44 + %47 = tail call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %31, <8 x i16> %46) nounwind + %48 = bitcast <8 x i16> %47 to <2 x i64> + %49 = shufflevector <2 x i64> %48, <2 x i64> undef, <1 x i32> zeroinitializer + %50 = bitcast <1 x i64> %49 to <4 x i16> + %51 = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %50, <4 x i16> undef) nounwind + %52 = tail call <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32> %51, <4 x i32> <i32 -6, i32 -6, i32 -6, i32 -6>) + %53 = bitcast <4 x i16> %52 to <1 x i64> + %54 = shufflevector <1 x i64> %53, <1 x i64> undef, <2 x i32> <i32 0, i32 1> + %55 = bitcast <2 x i64> %54 to <8 x i16> + %56 = tail call <8 x i8> @llvm.arm.neon.vshiftn.v8i8(<8 x i16> %55, <8 x i16> <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>) + %57 = shufflevector <2 x i64> %20, <2 x i64> undef, <1 x i32> <i32 1> + %58 = bitcast <1 x i64> %57 to <8 x i8> + %59 = zext <8 x i8> %58 to <8 x i16> + %60 = sub <8 x i16> zeroinitializer, %59 + %61 = shufflevector <2 x i64> %21, <2 x i64> undef, <1 x i32> <i32 1> + %62 = bitcast <1 x i64> %61 to 
<8 x i8> + %63 = zext <8 x i8> %62 to <8 x i16> + %64 = sub <8 x i16> %63, zeroinitializer + %65 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %60) nounwind + %66 = mul <8 x i16> %65, %65 + %67 = add <8 x i16> zeroinitializer, %66 + %68 = shufflevector <2 x i64> %27, <2 x i64> undef, <1 x i32> <i32 1> + %69 = bitcast <1 x i64> %68 to <8 x i8> + %70 = zext <8 x i8> %69 to <8 x i16> + %71 = sub <8 x i16> zeroinitializer, %70 + %72 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> undef) nounwind + %73 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %71) nounwind + %74 = mul <8 x i16> %72, %72 + %75 = mul <8 x i16> %73, %73 + %76 = add <8 x i16> %75, %74 + %77 = shufflevector <2 x i64> %32, <2 x i64> undef, <1 x i32> <i32 1> + %78 = bitcast <1 x i64> %77 to <8 x i8> + %79 = zext <8 x i8> %78 to <8 x i16> + %80 = sub <8 x i16> %79, zeroinitializer + %81 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %80) nounwind + %82 = mul <8 x i16> %81, %81 + %83 = add <8 x i16> zeroinitializer, %82 + %84 = tail call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %76, <8 x i16> %83) nounwind + %85 = tail call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %67, <8 x i16> %84) nounwind + %86 = bitcast <8 x i16> %85 to <2 x i64> + %87 = shufflevector <2 x i64> %86, <2 x i64> undef, <1 x i32> <i32 1> + %88 = bitcast <1 x i64> %87 to <4 x i16> + %89 = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %88, <4 x i16> undef) nounwind + %90 = tail call <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32> %89, <4 x i32> <i32 -6, i32 -6, i32 -6, i32 -6>) + %91 = bitcast <4 x i16> %90 to <1 x i64> + %92 = shufflevector <1 x i64> undef, <1 x i64> %91, <2 x i32> <i32 0, i32 1> + %93 = bitcast <2 x i64> %92 to <8 x i16> + %94 = tail call <8 x i8> @llvm.arm.neon.vshiftn.v8i8(<8 x i16> %93, <8 x i16> <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>) + %95 = bitcast <8 x i8> %56 to <1 x i64> + %96 = bitcast <8 x i8> %94 to <1 x i64> + 
%97 = shufflevector <1 x i64> %95, <1 x i64> %96, <2 x i32> <i32 0, i32 1> + %98 = bitcast <2 x i64> %97 to <16 x i8> + tail call void @llvm.arm.neon.vst1.v16i8(i8* null, <16 x i8> %98, i32 1) + %99 = icmp slt i32 undef, undef + br i1 %99, label %5, label %3 +} + +declare <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vshiftn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone + +declare <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone + +declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone + +declare <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +declare <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16>) nounwind readnone diff --git a/test/CodeGen/ARM/2012-08-30-select.ll b/test/CodeGen/ARM/2012-08-30-select.ll new file mode 100644 index 0000000..8471be5 --- /dev/null +++ b/test/CodeGen/ARM/2012-08-30-select.ll @@ -0,0 +1,18 @@ +; RUN: llc < %s -mtriple=thumbv7-apple-ios | FileCheck %s +; rdar://12201387 + +;CHECK: select_s_v_v +;CHECK: it ne +;CHECK-NEXT: vmovne.i32 +;CHECK: bx +define <16 x i8> @select_s_v_v(i32 %avail, i8* %bar) { +entry: + %vld1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %bar, i32 1) + %and = and i32 %avail, 1 + %tobool = icmp eq i32 %and, 0 + %vld1. = select i1 %tobool, <16 x i8> %vld1, <16 x i8> zeroinitializer + ret <16 x i8> %vld1. +} + +declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* , i32 ) + diff --git a/test/CodeGen/ARM/2012-09-18-ARMv4ISelBug.ll b/test/CodeGen/ARM/2012-09-18-ARMv4ISelBug.ll new file mode 100644 index 0000000..e761ffe --- /dev/null +++ b/test/CodeGen/ARM/2012-09-18-ARMv4ISelBug.ll @@ -0,0 +1,11 @@ +; RUN: llc < %s -march=arm -mcpu=arm7tdmi | FileCheck %s + +; movw is only legal for V6T2 and later. 
+; rdar://12300648 + +define i32 @t(i32 %x) { +; CHECK: t: +; CHECK-NOT: movw + %tmp = add i32 %x, -65535 + ret i32 %tmp +} diff --git a/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv.ll b/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv.ll new file mode 100644 index 0000000..7576609 --- /dev/null +++ b/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv.ll @@ -0,0 +1,11 @@ +; RUN: llc < %s -march=arm -mcpu=cortex-a8 2>&1 | FileCheck %s + +; Check for error message: +; CHECK: non-trivial scalar-to-vector conversion, possible invalid constraint for vector type + +define void @f() nounwind ssp { + %1 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } asm "vldm $4, { ${0:q}, ${1:q}, ${2:q}, ${3:q} }", "=r,=r,=r,=r,r"(i64* undef) nounwind, !srcloc !0 + ret void +} + +!0 = metadata !{i32 318437} diff --git a/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv2.ll b/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv2.ll new file mode 100644 index 0000000..6fa1391 --- /dev/null +++ b/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv2.ll @@ -0,0 +1,11 @@ +; RUN: llc < %s -march=arm -mcpu=cortex-a8 2>&1 | FileCheck %s + +; Check for error message: +; CHECK: scalar-to-vector conversion failed, possible invalid constraint for vector type + +define hidden void @f(i32* %corr, i32 %order) nounwind ssp { + tail call void asm sideeffect "vst1.s32 { ${1:q}, ${2:q} }, [$0]", "r,{q0},{q1}"(i32* %corr, <2 x i64>* undef, <2 x i64>* undef) nounwind, !srcloc !0 + ret void +} + +!0 = metadata !{i32 257} diff --git a/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll b/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll new file mode 100644 index 0000000..b5f6d31 --- /dev/null +++ b/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll @@ -0,0 +1,56 @@ +; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi | FileCheck %s +; Test that we correctly use registers and align elements when using va_arg + +%struct_t = type { double, double, double } +@static_val = 
constant %struct_t { double 1.0, double 2.0, double 3.0 } + +declare void @llvm.va_start(i8*) nounwind +declare void @llvm.va_end(i8*) nounwind + +; CHECK: test_byval_8_bytes_alignment: +define void @test_byval_8_bytes_alignment(i32 %i, ...) { +entry: +; CHECK: stm r0, {r1, r2, r3} + %g = alloca i8* + %g1 = bitcast i8** %g to i8* + call void @llvm.va_start(i8* %g1) + +; CHECK: add [[REG:(r[0-9]+)|(lr)]], {{(r[0-9]+)|(lr)}}, #7 +; CHECK: bfc [[REG]], #0, #3 + %0 = va_arg i8** %g, double + call void @llvm.va_end(i8* %g1) + + ret void +} + +; CHECK: main: +; CHECK: ldm r0, {r2, r3} +define i32 @main() { +entry: + call void (i32, ...)* @test_byval_8_bytes_alignment(i32 555, %struct_t* byval @static_val) + ret i32 0 +} + +declare void @f(double); + +; CHECK: test_byval_8_bytes_alignment_fixed_arg: +; CHECK-NOT: str r1 +; CHECK: str r3, [sp, #12] +; CHECK: str r2, [sp, #8] +; CHECK-NOT: str r1 +define void @test_byval_8_bytes_alignment_fixed_arg(i32 %n1, %struct_t* byval %val) nounwind { +entry: + %a = getelementptr inbounds %struct_t* %val, i32 0, i32 0 + %0 = load double* %a + call void (double)* @f(double %0) + ret void +} + +; CHECK: main_fixed_arg: +; CHECK: ldm r0, {r2, r3} +define i32 @main_fixed_arg() { +entry: + call void (i32, %struct_t*)* @test_byval_8_bytes_alignment_fixed_arg(i32 555, %struct_t* byval @static_val) + ret i32 0 +} + diff --git a/test/CodeGen/ARM/2012-10-04-FixedFrame-vs-byval.ll b/test/CodeGen/ARM/2012-10-04-FixedFrame-vs-byval.ll new file mode 100644 index 0000000..478048d --- /dev/null +++ b/test/CodeGen/ARM/2012-10-04-FixedFrame-vs-byval.ll @@ -0,0 +1,19 @@ +; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi | FileCheck %s + +@.str = private unnamed_addr constant [12 x i8] c"val.a = %f\0A\00" +%struct_t = type { double, double, double } +@static_val = constant %struct_t { double 1.0, double 2.0, double 3.0 } + +declare i32 @printf(i8*, ...) 
+ +; CHECK: test_byval_usage_scheduling: +; CHECK: str r3, [sp, #12] +; CHECK: str r2, [sp, #8] +; CHECK: vldr d16, [sp, #8] +define void @test_byval_usage_scheduling(i32 %n1, i32 %n2, %struct_t* byval %val) nounwind { +entry: + %a = getelementptr inbounds %struct_t* %val, i32 0, i32 0 + %0 = load double* %a + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8]* @.str, i32 0, i32 0), double %0) + ret void +} diff --git a/test/CodeGen/ARM/2012-10-04-LDRB_POST_IMM-Crash.ll b/test/CodeGen/ARM/2012-10-04-LDRB_POST_IMM-Crash.ll new file mode 100644 index 0000000..f239510 --- /dev/null +++ b/test/CodeGen/ARM/2012-10-04-LDRB_POST_IMM-Crash.ll @@ -0,0 +1,16 @@ +; RUN: llc < %s -mtriple=armv7-none-linux- | FileCheck %s +; Check that LDRB_POST_IMM instruction emitted properly. + +%my_struct_t = type { i8, i8, i8, i8, i8 } +@main.val = private unnamed_addr constant %my_struct_t { i8 1, i8 2, i8 3, i8 4, i8 5 } + +declare void @f(i32 %n1, i32 %n2, i32 %n3, %my_struct_t* byval %val); + +; CHECK: main: +define i32 @main() nounwind { +entry: +; CHECK: ldrb {{(r[0-9]+)}}, {{(\[r[0-9]+\])}}, #1 + call void @f(i32 555, i32 555, i32 555, %my_struct_t* byval @main.val) + ret i32 0 +} + diff --git a/test/CodeGen/ARM/2012-10-18-PR14099-ByvalFrameAddress.ll b/test/CodeGen/ARM/2012-10-18-PR14099-ByvalFrameAddress.ll new file mode 100644 index 0000000..fcc6a7f --- /dev/null +++ b/test/CodeGen/ARM/2012-10-18-PR14099-ByvalFrameAddress.ll @@ -0,0 +1,29 @@ +; RUN: llc < %s -mtriple=armv7-linux-gnueabi | FileCheck %s + +%struct.s = type { [4 x i32] } +@v = constant %struct.s zeroinitializer; + +declare void @f(%struct.s* %p); + +; CHECK: t: +define void @t(i32 %a, %struct.s* byval %s) nounwind { +entry: + +; Here we need to only check proper start address of restored %s argument. 
+; CHECK: sub sp, sp, #16 +; CHECK: push {r11, lr} +; CHECK: add r0, sp, #12 +; CHECK: stm r0, {r1, r2, r3} +; CHECK: add r0, sp, #12 +; CHECK-NEXT: bl f + call void @f(%struct.s* %s) + ret void +} + +; CHECK: caller: +define void @caller() { + +; CHECK: ldm r0, {r1, r2, r3} + call void @t(i32 0, %struct.s* @v); + ret void +} diff --git a/test/CodeGen/ARM/a15-mla.ll b/test/CodeGen/ARM/a15-mla.ll new file mode 100644 index 0000000..25f6de4 --- /dev/null +++ b/test/CodeGen/ARM/a15-mla.ll @@ -0,0 +1,12 @@ +; RUN: llc < %s -march=arm -float-abi=hard -mcpu=cortex-a15 -mattr=+neon,+neonfp | FileCheck %s + +; This test checks that the VMLxForwarting feature is disabled for A15. +; CHECK: fun_a +define <4 x i32> @fun_a(<4 x i32> %x, <4 x i32> %y) nounwind{ + %1 = add <4 x i32> %x, %y +; CHECK-NOT: vmul +; CHECK: vmla + %2 = mul <4 x i32> %1, %1 + %3 = add <4 x i32> %y, %2 + ret <4 x i32> %3 +} diff --git a/test/CodeGen/ARM/a15.ll b/test/CodeGen/ARM/a15.ll new file mode 100644 index 0000000..6f816c1 --- /dev/null +++ b/test/CodeGen/ARM/a15.ll @@ -0,0 +1,6 @@ +; RUN: llc < %s -mcpu=cortex-a15 | FileCheck %s + +; CHECK: a +define i32 @a(i32 %x) { + ret i32 %x; +} diff --git a/test/CodeGen/ARM/atomic-op.ll b/test/CodeGen/ARM/atomic-op.ll index 8967730..6e6b363 100644 --- a/test/CodeGen/ARM/atomic-op.ll +++ b/test/CodeGen/ARM/atomic-op.ll @@ -159,3 +159,13 @@ entry: store i8 %3, i8* %old ret void } + +; CHECK: func4 +; This function should not need to use callee-saved registers. 
+; rdar://problem/12203728 +; CHECK-NOT: r4 +define i32 @func4(i32* %p) nounwind optsize ssp { +entry: + %0 = atomicrmw add i32* %p, i32 1 monotonic + ret i32 %0 +} diff --git a/test/CodeGen/ARM/atomicrmw_minmax.ll b/test/CodeGen/ARM/atomicrmw_minmax.ll new file mode 100644 index 0000000..69f1384 --- /dev/null +++ b/test/CodeGen/ARM/atomicrmw_minmax.ll @@ -0,0 +1,21 @@ +; RUN: llc -march=arm -mcpu=cortex-a9 < %s | FileCheck %s + +; CHECK: max: +define i32 @max(i8 %ctx, i32* %ptr, i32 %val) +{ +; CHECK: ldrex +; CHECK: cmp [[old:r[0-9]*]], [[val:r[0-9]*]] +; CHECK: movhi {{r[0-9]*}}, [[old]] + %old = atomicrmw umax i32* %ptr, i32 %val monotonic + ret i32 %old +} + +; CHECK: min: +define i32 @min(i8 %ctx, i32* %ptr, i32 %val) +{ +; CHECK: ldrex +; CHECK: cmp [[old:r[0-9]*]], [[val:r[0-9]*]] +; CHECK: movlo {{r[0-9]*}}, [[old]] + %old = atomicrmw umin i32* %ptr, i32 %val monotonic + ret i32 %old +} diff --git a/test/CodeGen/ARM/avoid-cpsr-rmw.ll b/test/CodeGen/ARM/avoid-cpsr-rmw.ll index 1b385ab..96e83dd 100644 --- a/test/CodeGen/ARM/avoid-cpsr-rmw.ll +++ b/test/CodeGen/ARM/avoid-cpsr-rmw.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a9 | FileCheck %s +; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=swift | FileCheck %s ; Avoid some 's' 16-bit instruction which partially update CPSR (and add false ; dependency) when it isn't dependent on last CPSR defining instruction. 
; rdar://8928208 diff --git a/test/CodeGen/ARM/call-noret-minsize.ll b/test/CodeGen/ARM/call-noret-minsize.ll new file mode 100644 index 0000000..df3c19e --- /dev/null +++ b/test/CodeGen/ARM/call-noret-minsize.ll @@ -0,0 +1,27 @@ +; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=cortex-a8 | FileCheck %s -check-prefix=ARM +; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=swift | FileCheck %s -check-prefix=SWIFT +; rdar://12348580 + +define void @t1() noreturn minsize nounwind ssp { +entry: +; ARM: t1: +; ARM: bl _bar + +; SWIFT: t1: +; SWIFT: bl _bar + tail call void @bar() noreturn nounwind + unreachable +} + +define void @t2() noreturn minsize nounwind ssp { +entry: +; ARM: t2: +; ARM: bl _t1 + +; SWIFT: t2: +; SWIFT: bl _t1 + tail call void @t1() noreturn nounwind + unreachable +} + +declare void @bar() noreturn diff --git a/test/CodeGen/ARM/call-noret.ll b/test/CodeGen/ARM/call-noret.ll new file mode 100644 index 0000000..27062dc --- /dev/null +++ b/test/CodeGen/ARM/call-noret.ll @@ -0,0 +1,31 @@ +; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=cortex-a8 | FileCheck %s -check-prefix=ARM +; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=swift | FileCheck %s -check-prefix=SWIFT +; rdar://8979299 + +define void @t1() noreturn nounwind ssp { +entry: +; ARM: t1: +; ARM: mov lr, pc +; ARM: b _bar + +; SWIFT: t1: +; SWIFT: mov lr, pc +; SWIFT: b _bar + tail call void @bar() noreturn nounwind + unreachable +} + +define void @t2() noreturn nounwind ssp { +entry: +; ARM: t2: +; ARM: mov lr, pc +; ARM: b _t1 + +; SWIFT: t2: +; SWIFT: mov lr, pc +; SWIFT: b _t1 + tail call void @t1() noreturn nounwind + unreachable +} + +declare void @bar() noreturn diff --git a/test/CodeGen/ARM/carry.ll b/test/CodeGen/ARM/carry.ll index f84774d..bf51cd6 100644 --- a/test/CodeGen/ARM/carry.ll +++ b/test/CodeGen/ARM/carry.ll @@ -45,3 +45,16 @@ entry: %0 = sub nsw i64 0, %x ret i64 %0 } + +; rdar://12559385 +define i64 @f5(i32 %vi) { +entry: +; CHECK: f5: +; CHECK: movw [[REG:r[0-9]+]], #36102 +; 
CHECK: sbc r{{[0-9]+}}, r{{[0-9]+}}, [[REG]] + %v0 = zext i32 %vi to i64 + %v1 = xor i64 %v0, -155057456198619 + %v4 = add i64 %v1, 155057456198619 + %v5 = add i64 %v4, %v1 + ret i64 %v5 +} diff --git a/test/CodeGen/ARM/coalesce-subregs.ll b/test/CodeGen/ARM/coalesce-subregs.ll index fb0f4c6..3ba9475 100644 --- a/test/CodeGen/ARM/coalesce-subregs.ll +++ b/test/CodeGen/ARM/coalesce-subregs.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mcpu=cortex-a9 | FileCheck %s +; RUN: llc < %s -mcpu=cortex-a9 -verify-coalescing -verify-machineinstrs | FileCheck %s target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" target triple = "thumbv7-apple-ios0.0.0" @@ -66,3 +66,295 @@ do.end: ; preds = %do.body declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8*, i32) nounwind readonly declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind + +; CHECK: f3 +; This function has lane insertions that span basic blocks. +; The trivial REG_SEQUENCE lowering can't handle that, but the coalescer can. 
+; +; void f3(float *p, float *q) { +; float32x2_t x; +; x[1] = p[3]; +; if (q) +; x[0] = q[0] + q[1]; +; else +; x[0] = p[2]; +; vst1_f32(p+4, x); +; } +; +; CHECK-NOT: vmov +; CHECK-NOT: vorr +define void @f3(float* %p, float* %q) nounwind ssp { +entry: + %arrayidx = getelementptr inbounds float* %p, i32 3 + %0 = load float* %arrayidx, align 4 + %vecins = insertelement <2 x float> undef, float %0, i32 1 + %tobool = icmp eq float* %q, null + br i1 %tobool, label %if.else, label %if.then + +if.then: ; preds = %entry + %1 = load float* %q, align 4 + %arrayidx2 = getelementptr inbounds float* %q, i32 1 + %2 = load float* %arrayidx2, align 4 + %add = fadd float %1, %2 + %vecins3 = insertelement <2 x float> %vecins, float %add, i32 0 + br label %if.end + +if.else: ; preds = %entry + %arrayidx4 = getelementptr inbounds float* %p, i32 2 + %3 = load float* %arrayidx4, align 4 + %vecins5 = insertelement <2 x float> %vecins, float %3, i32 0 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %x.0 = phi <2 x float> [ %vecins3, %if.then ], [ %vecins5, %if.else ] + %add.ptr = getelementptr inbounds float* %p, i32 4 + %4 = bitcast float* %add.ptr to i8* + tail call void @llvm.arm.neon.vst1.v2f32(i8* %4, <2 x float> %x.0, i32 4) + ret void +} + +declare void @llvm.arm.neon.vst1.v2f32(i8*, <2 x float>, i32) nounwind +declare <2 x float> @llvm.arm.neon.vld1.v2f32(i8*, i32) nounwind readonly + +; CHECK: f4 +; This function inserts a lane into a fully defined vector. +; The destination lane isn't read, so the subregs can coalesce. 
+; CHECK-NOT: vmov +; CHECK-NOT: vorr +define void @f4(float* %p, float* %q) nounwind ssp { +entry: + %0 = bitcast float* %p to i8* + %vld1 = tail call <2 x float> @llvm.arm.neon.vld1.v2f32(i8* %0, i32 4) + %tobool = icmp eq float* %q, null + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + %1 = load float* %q, align 4 + %arrayidx1 = getelementptr inbounds float* %q, i32 1 + %2 = load float* %arrayidx1, align 4 + %add = fadd float %1, %2 + %vecins = insertelement <2 x float> %vld1, float %add, i32 1 + br label %if.end + +if.end: ; preds = %entry, %if.then + %x.0 = phi <2 x float> [ %vecins, %if.then ], [ %vld1, %entry ] + tail call void @llvm.arm.neon.vst1.v2f32(i8* %0, <2 x float> %x.0, i32 4) + ret void +} + +; CHECK: f5 +; Coalesce vector lanes through phis. +; CHECK: vmov.f32 {{.*}}, #1.0 +; CHECK-NOT: vmov +; CHECK-NOT: vorr +; CHECK: %if.end +; We may leave the last insertelement in the if.end block. +; It is inserting the %add value into a dead lane, but %add causes interference +; in the entry block, and we don't do dead lane checks across basic blocks. 
+define void @f5(float* %p, float* %q) nounwind ssp { +entry: + %0 = bitcast float* %p to i8* + %vld1 = tail call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %0, i32 4) + %vecext = extractelement <4 x float> %vld1, i32 0 + %vecext1 = extractelement <4 x float> %vld1, i32 1 + %vecext2 = extractelement <4 x float> %vld1, i32 2 + %vecext3 = extractelement <4 x float> %vld1, i32 3 + %add = fadd float %vecext3, 1.000000e+00 + %tobool = icmp eq float* %q, null + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + %arrayidx = getelementptr inbounds float* %q, i32 1 + %1 = load float* %arrayidx, align 4 + %add4 = fadd float %vecext, %1 + %2 = load float* %q, align 4 + %add6 = fadd float %vecext1, %2 + %arrayidx7 = getelementptr inbounds float* %q, i32 2 + %3 = load float* %arrayidx7, align 4 + %add8 = fadd float %vecext2, %3 + br label %if.end + +if.end: ; preds = %entry, %if.then + %a.0 = phi float [ %add4, %if.then ], [ %vecext, %entry ] + %b.0 = phi float [ %add6, %if.then ], [ %vecext1, %entry ] + %c.0 = phi float [ %add8, %if.then ], [ %vecext2, %entry ] + %vecinit = insertelement <4 x float> undef, float %a.0, i32 0 + %vecinit9 = insertelement <4 x float> %vecinit, float %b.0, i32 1 + %vecinit10 = insertelement <4 x float> %vecinit9, float %c.0, i32 2 + %vecinit11 = insertelement <4 x float> %vecinit10, float %add, i32 3 + tail call void @llvm.arm.neon.vst1.v4f32(i8* %0, <4 x float> %vecinit11, i32 4) + ret void +} + +declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly + +declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind + +; CHECK: pr13999 +define void @pr13999() nounwind readonly { +entry: + br i1 true, label %outer_loop, label %loop.end + +outer_loop: + %d = phi double [ 0.0, %entry ], [ %add, %after_inner_loop ] + %0 = insertelement <2 x double> <double 0.0, double 0.0>, double %d, i32 0 + br i1 undef, label %after_inner_loop, label %inner_loop + +inner_loop: + br i1 true, label %after_inner_loop, 
label %inner_loop + +after_inner_loop: + %1 = phi <2 x double> [ %0, %outer_loop ], [ <double 0.0, double 0.0>, +%inner_loop ] + %2 = extractelement <2 x double> %1, i32 1 + %add = fadd double 1.0, %2 + br i1 false, label %loop.end, label %outer_loop + +loop.end: + %d.end = phi double [ 0.0, %entry ], [ %add, %after_inner_loop ] + ret void +} + +; CHECK: pr14078 +define arm_aapcs_vfpcc i32 @pr14078(i8* nocapture %arg, i8* nocapture %arg1, i32 %arg2) nounwind uwtable readonly { +bb: + br i1 undef, label %bb31, label %bb3 + +bb3: ; preds = %bb12, %bb + %tmp = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> zeroinitializer + %tmp4 = bitcast <1 x i64> %tmp to <2 x float> + %tmp5 = shufflevector <2 x float> %tmp4, <2 x float> undef, <4 x i32> zeroinitializer + %tmp6 = bitcast <4 x float> %tmp5 to <2 x i64> + %tmp7 = shufflevector <2 x i64> %tmp6, <2 x i64> undef, <1 x i32> zeroinitializer + %tmp8 = bitcast <1 x i64> %tmp7 to <2 x float> + %tmp9 = tail call <2 x float> @baz(<2 x float> <float 0xFFFFFFFFE0000000, float 0.000000e+00>, <2 x float> %tmp8, <2 x float> zeroinitializer) nounwind + br i1 undef, label %bb10, label %bb12 + +bb10: ; preds = %bb3 + %tmp11 = load <4 x float>* undef, align 8 + br label %bb12 + +bb12: ; preds = %bb10, %bb3 + %tmp13 = shufflevector <2 x float> %tmp9, <2 x float> zeroinitializer, <2 x i32> <i32 0, i32 2> + %tmp14 = bitcast <2 x float> %tmp13 to <1 x i64> + %tmp15 = shufflevector <1 x i64> %tmp14, <1 x i64> zeroinitializer, <2 x i32> <i32 0, i32 1> + %tmp16 = bitcast <2 x i64> %tmp15 to <4 x float> + %tmp17 = fmul <4 x float> zeroinitializer, %tmp16 + %tmp18 = bitcast <4 x float> %tmp17 to <2 x i64> + %tmp19 = shufflevector <2 x i64> %tmp18, <2 x i64> undef, <1 x i32> zeroinitializer + %tmp20 = bitcast <1 x i64> %tmp19 to <2 x float> + %tmp21 = tail call <2 x float> @baz67(<2 x float> %tmp20, <2 x float> undef) nounwind + %tmp22 = tail call <2 x float> @baz67(<2 x float> %tmp21, <2 x float> %tmp21) nounwind + %tmp23 = 
shufflevector <2 x float> %tmp22, <2 x float> undef, <4 x i32> zeroinitializer + %tmp24 = bitcast <4 x float> %tmp23 to <2 x i64> + %tmp25 = shufflevector <2 x i64> %tmp24, <2 x i64> undef, <1 x i32> zeroinitializer + %tmp26 = bitcast <1 x i64> %tmp25 to <2 x float> + %tmp27 = extractelement <2 x float> %tmp26, i32 0 + %tmp28 = fcmp olt float %tmp27, 0.000000e+00 + %tmp29 = select i1 %tmp28, i32 0, i32 undef + %tmp30 = icmp ult i32 undef, %arg2 + br i1 %tmp30, label %bb3, label %bb31 + +bb31: ; preds = %bb12, %bb + %tmp32 = phi i32 [ 1, %bb ], [ %tmp29, %bb12 ] + ret i32 %tmp32 +} + +declare <2 x float> @baz(<2 x float>, <2 x float>, <2 x float>) nounwind readnone + +declare <2 x float> @baz67(<2 x float>, <2 x float>) nounwind readnone + +%struct.wombat.5 = type { %struct.quux, %struct.quux, %struct.quux, %struct.quux } +%struct.quux = type { <4 x float> } + +; CHECK: pr14079 +define linkonce_odr arm_aapcs_vfpcc %struct.wombat.5 @pr14079(i8* nocapture %arg, i8* nocapture %arg1, i8* nocapture %arg2) nounwind uwtable inlinehint { +bb: + %tmp = shufflevector <2 x i64> zeroinitializer, <2 x i64> undef, <1 x i32> zeroinitializer + %tmp3 = bitcast <1 x i64> %tmp to <2 x float> + %tmp4 = shufflevector <2 x float> %tmp3, <2 x float> zeroinitializer, <2 x i32> <i32 1, i32 3> + %tmp5 = shufflevector <2 x float> %tmp4, <2 x float> undef, <2 x i32> <i32 1, i32 3> + %tmp6 = bitcast <2 x float> %tmp5 to <1 x i64> + %tmp7 = shufflevector <1 x i64> undef, <1 x i64> %tmp6, <2 x i32> <i32 0, i32 1> + %tmp8 = bitcast <2 x i64> %tmp7 to <4 x float> + %tmp9 = shufflevector <2 x i64> zeroinitializer, <2 x i64> undef, <1 x i32> <i32 1> + %tmp10 = bitcast <1 x i64> %tmp9 to <2 x float> + %tmp11 = shufflevector <2 x float> %tmp10, <2 x float> undef, <2 x i32> <i32 0, i32 2> + %tmp12 = shufflevector <2 x float> %tmp11, <2 x float> undef, <2 x i32> <i32 0, i32 2> + %tmp13 = bitcast <2 x float> %tmp12 to <1 x i64> + %tmp14 = shufflevector <1 x i64> %tmp13, <1 x i64> undef, <2 x i32> <i32 0, 
i32 1> + %tmp15 = bitcast <2 x i64> %tmp14 to <4 x float> + %tmp16 = insertvalue %struct.wombat.5 undef, <4 x float> %tmp8, 1, 0 + %tmp17 = insertvalue %struct.wombat.5 %tmp16, <4 x float> %tmp15, 2, 0 + %tmp18 = insertvalue %struct.wombat.5 %tmp17, <4 x float> undef, 3, 0 + ret %struct.wombat.5 %tmp18 +} + +; CHECK: adjustCopiesBackFrom +; The shuffle in if.else3 must be preserved even though adjustCopiesBackFrom +; is tempted to remove it. +; CHECK: %if.else3 +; CHECK: vorr d +define internal void @adjustCopiesBackFrom(<2 x i64>* noalias nocapture sret %agg.result, <2 x i64> %in) { +entry: + %0 = extractelement <2 x i64> %in, i32 0 + %cmp = icmp slt i64 %0, 1 + %.in = select i1 %cmp, <2 x i64> <i64 0, i64 undef>, <2 x i64> %in + %1 = extractelement <2 x i64> %in, i32 1 + %cmp1 = icmp slt i64 %1, 1 + br i1 %cmp1, label %if.then2, label %if.else3 + +if.then2: ; preds = %entry + %2 = insertelement <2 x i64> %.in, i64 0, i32 1 + br label %if.end4 + +if.else3: ; preds = %entry + %3 = shufflevector <2 x i64> %.in, <2 x i64> %in, <2 x i32> <i32 0, i32 3> + br label %if.end4 + +if.end4: ; preds = %if.else3, %if.then2 + %result.2 = phi <2 x i64> [ %2, %if.then2 ], [ %3, %if.else3 ] + store <2 x i64> %result.2, <2 x i64>* %agg.result, align 128 + ret void +} + +; <rdar://problem/12758887> +; RegisterCoalescer::updateRegDefsUses() could visit an instruction more than +; once under rare circumstances. When widening a register from QPR to DTriple +; with the original virtual register in dsub_1_dsub_2, the double rewrite would +; produce an invalid sub-register. +; +; This is because dsub_1_dsub_2 is not an idempotent sub-register index. +; It will translate %vr:dsub_0 -> %vr:dsub_1. 
+define hidden fastcc void @radar12758887() nounwind optsize ssp { +entry: + br i1 undef, label %for.body, label %for.end70 + +for.body: ; preds = %for.end, %entry + br i1 undef, label %for.body29, label %for.end + +for.body29: ; preds = %for.body29, %for.body + %0 = load <2 x double>* null, align 1 + %splat40 = shufflevector <2 x double> %0, <2 x double> undef, <2 x i32> zeroinitializer + %mul41 = fmul <2 x double> undef, %splat40 + %add42 = fadd <2 x double> undef, %mul41 + %splat44 = shufflevector <2 x double> %0, <2 x double> undef, <2 x i32> <i32 1, i32 1> + %mul45 = fmul <2 x double> undef, %splat44 + %add46 = fadd <2 x double> undef, %mul45 + br i1 undef, label %for.end, label %for.body29 + +for.end: ; preds = %for.body29, %for.body + %accumR2.0.lcssa = phi <2 x double> [ zeroinitializer, %for.body ], [ %add42, %for.body29 ] + %accumI2.0.lcssa = phi <2 x double> [ zeroinitializer, %for.body ], [ %add46, %for.body29 ] + %1 = shufflevector <2 x double> %accumI2.0.lcssa, <2 x double> undef, <2 x i32> <i32 1, i32 0> + %add58 = fadd <2 x double> undef, %1 + %mul61 = fmul <2 x double> %add58, undef + %add63 = fadd <2 x double> undef, %mul61 + %add64 = fadd <2 x double> undef, %add63 + %add67 = fadd <2 x double> undef, %add64 + store <2 x double> %add67, <2 x double>* undef, align 1 + br i1 undef, label %for.end70, label %for.body + +for.end70: ; preds = %for.end, %entry + ret void +} diff --git a/test/CodeGen/ARM/constants.ll b/test/CodeGen/ARM/constants.ll index f4c1b5a..3baa103 100644 --- a/test/CodeGen/ARM/constants.ll +++ b/test/CodeGen/ARM/constants.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=armv4t-unknown-linux-gnueabi -disable-cgp-branch-opts | FileCheck %s +; RUN: llc < %s -mtriple=armv4t-unknown-linux-gnueabi -disable-cgp-branch-opts -verify-machineinstrs | FileCheck %s define i32 @f1() { ; CHECK: f1 @@ -45,6 +45,16 @@ r: ret void } +define i32 @f8() nounwind { +; Check that constant propagation through (i32)-1 => (float)Nan => (i32)-1 +; gives expected 
result +; CHECK: f8 +; CHECK: mvn r0, #0 + %tmp0 = bitcast i32 -1 to float + %tmp1 = bitcast float %tmp0 to i32 + ret i32 %tmp1 +} + %t1 = type { <3 x float>, <3 x float> } @const1 = global %t1 { <3 x float> zeroinitializer, diff --git a/test/CodeGen/ARM/crash-shufflevector.ll b/test/CodeGen/ARM/crash-shufflevector.ll new file mode 100644 index 0000000..bdc0e0e --- /dev/null +++ b/test/CodeGen/ARM/crash-shufflevector.ll @@ -0,0 +1,10 @@ +; RUN: llc < %s -mtriple=armv7 + +declare void @g(<16 x i8>) +define void @f(<4 x i8> %param1, <4 x i8> %param2) { + %y1 = shufflevector <4 x i8> %param1, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %y2 = shufflevector <4 x i8> %param2, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %z = shufflevector <16 x i8> %y1, <16 x i8> %y2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19> + call void @g(<16 x i8> %z) + ret void +}
\ No newline at end of file diff --git a/test/CodeGen/ARM/darwin-section-order.ll b/test/CodeGen/ARM/darwin-section-order.ll new file mode 100644 index 0000000..701028c --- /dev/null +++ b/test/CodeGen/ARM/darwin-section-order.ll @@ -0,0 +1,21 @@ +; RUN: llc < %s -mtriple=armv7-apple-darwin | FileCheck %s + +; CHECK: .section __TEXT,__text,regular,pure_instructions +; CHECK: .section __TEXT,myprecious +; CHECK: .section __TEXT,__textcoal_nt,coalesced,pure_instructions +; CHECK: .section __TEXT,__const_coal,coalesced +; CHECK: .section __TEXT,__picsymbolstub4,symbol_stubs,none,16 +; CHECK: .section __TEXT,__StaticInit,regular,pure_instructions + + +define void @normal() nounwind readnone { +; CHECK: .section __TEXT,__text,regular,pure_instructions +; CHECK: _normal: + ret void +} + +define void @special() nounwind readnone section "__TEXT,myprecious" { +; CHECK: .section __TEXT,myprecious +; CHECK: _special: + ret void +} diff --git a/test/CodeGen/ARM/deps-fix.ll b/test/CodeGen/ARM/deps-fix.ll new file mode 100644 index 0000000..288697a --- /dev/null +++ b/test/CodeGen/ARM/deps-fix.ll @@ -0,0 +1,22 @@ +; RUN: llc < %s -march=arm -mcpu=cortex-a9 -mattr=+neon,+neonfp -float-abi=hard -mtriple armv7-linux-gnueabi | FileCheck %s + +;; This test checks that the ExecutionDepsFix pass performs the domain changes +;; even when some dependencies are propagated through implicit definitions. 
+ +; CHECK: fun_a +define <4 x float> @fun_a(<4 x float> %in, <4 x float> %x, float %y) nounwind { +; CHECK: vext +; CHECK: vext +; CHECK: vadd.f32 + %1 = insertelement <4 x float> %in, float %y, i32 0 + %2 = fadd <4 x float> %1, %x + ret <4 x float> %2 +} +; CHECK: fun_b +define <4 x i32> @fun_b(<4 x i32> %in, <4 x i32> %x, i32 %y) nounwind { +; CHECK: vmov.32 +; CHECK: vadd.i32 + %1 = insertelement <4 x i32> %in, i32 %y, i32 0 + %2 = add <4 x i32> %1, %x + ret <4 x i32> %2 +} diff --git a/test/CodeGen/ARM/div.ll b/test/CodeGen/ARM/div.ll index 3d29e05..82cfca1 100644 --- a/test/CodeGen/ARM/div.ll +++ b/test/CodeGen/ARM/div.ll @@ -1,9 +1,13 @@ -; RUN: llc < %s -mtriple=arm-apple-darwin | FileCheck %s -check-prefix=CHECK-ARM +; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-a8 | FileCheck %s -check-prefix=CHECK-ARM +; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=swift | FileCheck %s -check-prefix=CHECK-SWIFT define i32 @f1(i32 %a, i32 %b) { entry: ; CHECK-ARM: f1 ; CHECK-ARM: __divsi3 + +; CHECK-SWIFT: f1 +; CHECK-SWIFT: sdiv %tmp1 = sdiv i32 %a, %b ; <i32> [#uses=1] ret i32 %tmp1 } @@ -12,6 +16,9 @@ define i32 @f2(i32 %a, i32 %b) { entry: ; CHECK-ARM: f2 ; CHECK-ARM: __udivsi3 + +; CHECK-SWIFT: f2 +; CHECK-SWIFT: udiv %tmp1 = udiv i32 %a, %b ; <i32> [#uses=1] ret i32 %tmp1 } @@ -20,6 +27,10 @@ define i32 @f3(i32 %a, i32 %b) { entry: ; CHECK-ARM: f3 ; CHECK-ARM: __modsi3 + +; CHECK-SWIFT: f3 +; CHECK-SWIFT: sdiv +; CHECK-SWIFT: mls %tmp1 = srem i32 %a, %b ; <i32> [#uses=1] ret i32 %tmp1 } @@ -28,6 +39,10 @@ define i32 @f4(i32 %a, i32 %b) { entry: ; CHECK-ARM: f4 ; CHECK-ARM: __umodsi3 + +; CHECK-SWIFT: f4 +; CHECK-SWIFT: udiv +; CHECK-SWIFT: mls %tmp1 = urem i32 %a, %b ; <i32> [#uses=1] ret i32 %tmp1 } diff --git a/test/CodeGen/ARM/divmod.ll b/test/CodeGen/ARM/divmod.ll index 7fbf8f4..577f8aa 100644 --- a/test/CodeGen/ARM/divmod.ll +++ b/test/CodeGen/ARM/divmod.ll @@ -1,10 +1,18 @@ -; RUN: llc < %s -mtriple=arm-apple-ios5.0 -mcpu=cortex-a8 | FileCheck %s +; RUN: 
llc < %s -mtriple=arm-apple-ios5.0 -mcpu=cortex-a8 | FileCheck %s -check-prefix=A8 +; RUN: llc < %s -mtriple=arm-apple-ios5.0 -mcpu=swift | FileCheck %s -check-prefix=SWIFT + +; rdar://12481395 define void @foo(i32 %x, i32 %y, i32* nocapture %P) nounwind ssp { entry: -; CHECK: foo: -; CHECK: bl ___divmodsi4 -; CHECK-NOT: bl ___divmodsi4 +; A8: foo: +; A8: bl ___divmodsi4 +; A8-NOT: bl ___divmodsi4 + +; SWIFT: foo: +; SWIFT: sdiv +; SWIFT: mls +; SWIFT-NOT: bl __divmodsi4 %div = sdiv i32 %x, %y store i32 %div, i32* %P, align 4 %rem = srem i32 %x, %y @@ -15,9 +23,14 @@ entry: define void @bar(i32 %x, i32 %y, i32* nocapture %P) nounwind ssp { entry: -; CHECK: bar: -; CHECK: bl ___udivmodsi4 -; CHECK-NOT: bl ___udivmodsi4 +; A8: bar: +; A8: bl ___udivmodsi4 +; A8-NOT: bl ___udivmodsi4 + +; SWIFT: bar: +; SWIFT: udiv +; SWIFT: mls +; SWIFT-NOT: bl __udivmodsi4 %div = udiv i32 %x, %y store i32 %div, i32* %P, align 4 %rem = urem i32 %x, %y @@ -32,14 +45,18 @@ entry: define void @do_indent(i32 %cols) nounwind { entry: -; CHECK: do_indent: +; A8: do_indent: +; SWIFT: do_indent: %0 = load i32* @flags, align 4 %1 = and i32 %0, 67108864 %2 = icmp eq i32 %1, 0 br i1 %2, label %bb1, label %bb bb: -; CHECK: bl ___divmodsi4 +; A8: bl ___divmodsi4 +; SWIFT: sdiv +; SWIFT: mls +; SWIFT-NOT: bl __divmodsi4 %3 = load i32* @tabsize, align 4 %4 = srem i32 %cols, %3 %5 = sdiv i32 %cols, %3 @@ -60,9 +77,14 @@ declare i8* @__memset_chk(i8*, i32, i32, i32) nounwind ; rdar://11714607 define i32 @howmany(i32 %x, i32 %y) nounwind { entry: -; CHECK: howmany: -; CHECK: bl ___udivmodsi4 -; CHECK-NOT: ___udivsi3 +; A8: howmany: +; A8: bl ___udivmodsi4 +; A8-NOT: ___udivsi3 + +; SWIFT: howmany: +; SWIFT: udiv +; SWIFT: mls +; SWIFT-NOT: bl __udivmodsi4 %rem = urem i32 %x, %y %div = udiv i32 %x, %y %not.cmp = icmp ne i32 %rem, 0 diff --git a/test/CodeGen/ARM/domain-conv-vmovs.ll b/test/CodeGen/ARM/domain-conv-vmovs.ll new file mode 100644 index 0000000..a5c4114 --- /dev/null +++ 
b/test/CodeGen/ARM/domain-conv-vmovs.ll @@ -0,0 +1,100 @@ +; RUN: llc -verify-machineinstrs -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a9 -mattr=+neon,+neonfp -float-abi=hard < %s | FileCheck %s + +define <2 x float> @test_vmovs_via_vext_lane0to0(float %arg, <2 x float> %in) { +; CHECK: test_vmovs_via_vext_lane0to0: + %vec = insertelement <2 x float> %in, float %arg, i32 0 + %res = fadd <2 x float> %vec, %vec + +; CHECK: vext.32 d1, d1, d0, #1 +; CHECK: vext.32 d1, d1, d1, #1 +; CHECK: vadd.f32 {{d[0-9]+}}, d1, d1 + + ret <2 x float> %res +} + +define <2 x float> @test_vmovs_via_vext_lane0to1(float %arg, <2 x float> %in) { +; CHECK: test_vmovs_via_vext_lane0to1: + %vec = insertelement <2 x float> %in, float %arg, i32 1 + %res = fadd <2 x float> %vec, %vec + +; CHECK: vext.32 d1, d1, d1, #1 +; CHECK: vext.32 d1, d1, d0, #1 +; CHECK: vadd.f32 {{d[0-9]+}}, d1, d1 + + ret <2 x float> %res +} + +define <2 x float> @test_vmovs_via_vext_lane1to0(float, float %arg, <2 x float> %in) { +; CHECK: test_vmovs_via_vext_lane1to0: + %vec = insertelement <2 x float> %in, float %arg, i32 0 + %res = fadd <2 x float> %vec, %vec + +; CHECK: vext.32 d1, d1, d1, #1 +; CHECK: vext.32 d1, d0, d1, #1 +; CHECK: vadd.f32 {{d[0-9]+}}, d1, d1 + + ret <2 x float> %res +} + +define <2 x float> @test_vmovs_via_vext_lane1to1(float, float %arg, <2 x float> %in) { +; CHECK: test_vmovs_via_vext_lane1to1: + %vec = insertelement <2 x float> %in, float %arg, i32 1 + %res = fadd <2 x float> %vec, %vec + +; CHECK: vext.32 d1, d0, d1, #1 +; CHECK: vext.32 d1, d1, d1, #1 +; CHECK: vadd.f32 {{d[0-9]+}}, d1, d1 + + ret <2 x float> %res +} + + +define float @test_vmovs_via_vdup(float, float %ret, float %lhs, float %rhs) { +; CHECK: test_vmovs_via_vdup: + + ; Do an operation (which will end up NEON because of +neonfp) to convince the + ; execution-domain pass that NEON is a good thing to use. 
+ %res = fadd float %ret, %ret + ; It makes sense for LLVM to do the addition in d0 here, because it's going + ; to be returned. This means it will want a "vmov s0, s1": +; CHECK: vdup.32 d0, d0[1] + + ret float %res +} + +declare float @llvm.sqrt.f32(float) + +declare void @bar() + +; This is a comp +define float @test_ineligible(float, float %in) { +; CHECK: test_ineligible: + + %sqrt = call float @llvm.sqrt.f32(float %in) + %val = fadd float %sqrt, %sqrt + + ; This call forces a move from a callee-saved register to the return-reg. That + ; move is not eligible for conversion to a d-register instructions because the + ; use-def chains would be messed up. Primarily a compile-test (we used to + ; internal fault). + call void @bar() +; CHECL: bl bar +; CHECK: vext.32 +; CHECK: vext.32 + ret float %val +} + +define i32 @test_vmovs_no_sreg(i32 %in) { +; CHECK: test_vmovs_no_sreg: + + ; Check that the movement to and from GPRs takes place in the NEON domain. +; CHECK: vmov.32 d + %x = bitcast i32 %in to float + + %res = fadd float %x, %x + +; CHECK: vmov.32 r{{[0-9]+}}, d + %resi = bitcast float %res to i32 + + ret i32 %resi +} diff --git a/test/CodeGen/ARM/fabss.ll b/test/CodeGen/ARM/fabss.ll index bcb4ee7..46c2f1c 100644 --- a/test/CodeGen/ARM/fabss.ll +++ b/test/CodeGen/ARM/fabss.ll @@ -14,12 +14,12 @@ entry: declare float @fabsf(float) ; VFP2: test: -; VFP2: vabs.f32 s1, s1 +; VFP2: vabs.f32 s2, s2 ; NFP1: test: ; NFP1: vabs.f32 d1, d1 ; NFP0: test: -; NFP0: vabs.f32 s1, s1 +; NFP0: vabs.f32 s2, s2 ; CORTEXA8: test: ; CORTEXA8: vadd.f32 [[D1:d[0-9]+]] diff --git a/test/CodeGen/ARM/fadds.ll b/test/CodeGen/ARM/fadds.ll index e35103c..48ef5ed 100644 --- a/test/CodeGen/ARM/fadds.ll +++ b/test/CodeGen/ARM/fadds.ll @@ -10,14 +10,14 @@ entry: } ; VFP2: test: -; VFP2: vadd.f32 s0, s1, s0 +; VFP2: vadd.f32 s ; NFP1: test: -; NFP1: vadd.f32 d0, d1, d0 +; NFP1: vadd.f32 d ; NFP0: test: -; NFP0: vadd.f32 s0, s1, s0 +; NFP0: vadd.f32 s ; CORTEXA8: test: -; CORTEXA8: vadd.f32 
d0, d1, d0 +; CORTEXA8: vadd.f32 d ; CORTEXA9: test: ; CORTEXA9: vadd.f32 s{{.}}, s{{.}}, s{{.}} diff --git a/test/CodeGen/ARM/fast-isel-pic.ll b/test/CodeGen/ARM/fast-isel-pic.ll new file mode 100644 index 0000000..867d53f --- /dev/null +++ b/test/CodeGen/ARM/fast-isel-pic.ll @@ -0,0 +1,61 @@ +; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB +; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=pic -mtriple=arm-apple-ios | FileCheck %s --check-prefix=ARM +; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARMv7 +; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=pic -mtriple=thumbv7-none-linux-gnueabi | FileCheck %s --check-prefix=THUMB-ELF +; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=pic -mtriple=armv7-none-linux-gnueabi | FileCheck %s --check-prefix=ARMv7-ELF + +@g = global i32 0, align 4 + +define i32 @LoadGV() { +entry: +; THUMB: LoadGV +; THUMB: movw [[reg0:r[0-9]+]], +; THUMB: movt [[reg0]], +; THUMB: add [[reg0]], pc +; THUMB-ELF: LoadGV +; THUMB-ELF: ldr.n r[[reg0:[0-9]+]], +; THUMB-ELF: ldr.n r[[reg1:[0-9]+]], +; THUMB-ELF: ldr r[[reg0]], [r[[reg1]], r[[reg0]]] +; ARM: LoadGV +; ARM: ldr [[reg1:r[0-9]+]], +; ARM: add [[reg1]], pc, [[reg1]] +; ARMv7: LoadGV +; ARMv7: movw [[reg2:r[0-9]+]], +; ARMv7: movt [[reg2]], +; ARMv7: add [[reg2]], pc, [[reg2]] +; ARMv7-ELF: LoadGV +; ARMv7-ELF: ldr r[[reg2:[0-9]+]], +; ARMv7-ELF: ldr r[[reg3:[0-9]+]], +; ARMv7-ELF: ldr r[[reg2]], [r[[reg3]], r[[reg2]]] + %tmp = load i32* @g + ret i32 %tmp +} + +@i = external global i32 + +define i32 @LoadIndirectSymbol() { +entry: +; THUMB: LoadIndirectSymbol +; THUMB: movw r[[reg3:[0-9]+]], +; THUMB: movt r[[reg3]], +; THUMB: add r[[reg3]], pc +; THUMB: ldr r[[reg3]], [r[[reg3]]] +; THUMB-ELF: LoadIndirectSymbol +; THUMB-ELF: ldr.n r[[reg3:[0-9]+]], +; THUMB-ELF: ldr.n r[[reg4:[0-9]+]], +; THUMB-ELF: ldr r[[reg3]], [r[[reg4]], 
r[[reg3]]] +; ARM: LoadIndirectSymbol +; ARM: ldr [[reg4:r[0-9]+]], +; ARM: ldr [[reg4]], [pc, [[reg4]]] +; ARMv7: LoadIndirectSymbol +; ARMv7: movw r[[reg5:[0-9]+]], +; ARMv7: movt r[[reg5]], +; ARMv7: add r[[reg5]], pc, r[[reg5]] +; ARMv7: ldr r[[reg5]], [r[[reg5]]] +; ARMv7-ELF: LoadIndirectSymbol +; ARMv7-ELF: ldr r[[reg5:[0-9]+]], +; ARMv7-ELF: ldr r[[reg6:[0-9]+]], +; ARMv7-ELF: ldr r[[reg5]], [r[[reg6]], r[[reg5]]] + %tmp = load i32* @i + ret i32 %tmp +} diff --git a/test/CodeGen/ARM/fast-isel.ll b/test/CodeGen/ARM/fast-isel.ll index ecd5fe2..41fda41 100644 --- a/test/CodeGen/ARM/fast-isel.ll +++ b/test/CodeGen/ARM/fast-isel.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM ; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB +; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM-STRICT-ALIGN +; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB-STRICT-ALIGN ; Very basic fast-isel functionality. 
define i32 @add(i32 %a, i32 %b) nounwind { @@ -238,3 +240,67 @@ entry: } declare void @llvm.trap() nounwind + +define void @unaligned_i16_store(i16 %x, i16* %y) nounwind { +entry: +; ARM-STRICT-ALIGN: @unaligned_i16_store +; ARM-STRICT-ALIGN: strb +; ARM-STRICT-ALIGN: strb + +; THUMB-STRICT-ALIGN: @unaligned_i16_store +; THUMB-STRICT-ALIGN: strb +; THUMB-STRICT-ALIGN: strb + + store i16 %x, i16* %y, align 1 + ret void +} + +define i16 @unaligned_i16_load(i16* %x) nounwind { +entry: +; ARM-STRICT-ALIGN: @unaligned_i16_load +; ARM-STRICT-ALIGN: ldrb +; ARM-STRICT-ALIGN: ldrb + +; THUMB-STRICT-ALIGN: @unaligned_i16_load +; THUMB-STRICT-ALIGN: ldrb +; THUMB-STRICT-ALIGN: ldrb + + %0 = load i16* %x, align 1 + ret i16 %0 +} + +define void @unaligned_i32_store(i32 %x, i32* %y) nounwind { +entry: +; ARM-STRICT-ALIGN: @unaligned_i32_store +; ARM-STRICT-ALIGN: strb +; ARM-STRICT-ALIGN: strb +; ARM-STRICT-ALIGN: strb +; ARM-STRICT-ALIGN: strb + +; THUMB-STRICT-ALIGN: @unaligned_i32_store +; THUMB-STRICT-ALIGN: strb +; THUMB-STRICT-ALIGN: strb +; THUMB-STRICT-ALIGN: strb +; THUMB-STRICT-ALIGN: strb + + store i32 %x, i32* %y, align 1 + ret void +} + +define i32 @unaligned_i32_load(i32* %x) nounwind { +entry: +; ARM-STRICT-ALIGN: @unaligned_i32_load +; ARM-STRICT-ALIGN: ldrb +; ARM-STRICT-ALIGN: ldrb +; ARM-STRICT-ALIGN: ldrb +; ARM-STRICT-ALIGN: ldrb + +; THUMB-STRICT-ALIGN: @unaligned_i32_load +; THUMB-STRICT-ALIGN: ldrb +; THUMB-STRICT-ALIGN: ldrb +; THUMB-STRICT-ALIGN: ldrb +; THUMB-STRICT-ALIGN: ldrb + + %0 = load i32* %x, align 1 + ret i32 %0 +} diff --git a/test/CodeGen/ARM/fdivs.ll b/test/CodeGen/ARM/fdivs.ll index 31c1ca9..8fab002 100644 --- a/test/CodeGen/ARM/fdivs.ll +++ b/test/CodeGen/ARM/fdivs.ll @@ -10,14 +10,14 @@ entry: } ; VFP2: test: -; VFP2: vdiv.f32 s0, s1, s0 +; VFP2: vdiv.f32 s0, s2, s0 ; NFP1: test: -; NFP1: vdiv.f32 s0, s1, s0 +; NFP1: vdiv.f32 s0, s2, s0 ; NFP0: test: -; NFP0: vdiv.f32 s0, s1, s0 +; NFP0: vdiv.f32 s0, s2, s0 ; CORTEXA8: test: -; 
CORTEXA8: vdiv.f32 s0, s1, s0 +; CORTEXA8: vdiv.f32 s0, s2, s0 ; CORTEXA9: test: ; CORTEXA9: vdiv.f32 s{{.}}, s{{.}}, s{{.}} diff --git a/test/CodeGen/ARM/fmuls.ll b/test/CodeGen/ARM/fmuls.ll index 3c3182b..1566a92 100644 --- a/test/CodeGen/ARM/fmuls.ll +++ b/test/CodeGen/ARM/fmuls.ll @@ -10,15 +10,15 @@ entry: } ; VFP2: test: -; VFP2: vmul.f32 s0, s1, s0 +; VFP2: vmul.f32 s ; NFP1: test: -; NFP1: vmul.f32 d0, d1, d0 +; NFP1: vmul.f32 d ; NFP0: test: -; NFP0: vmul.f32 s0, s1, s0 +; NFP0: vmul.f32 s ; CORTEXA8: test: -; CORTEXA8: vmul.f32 d0, d1, d0 +; CORTEXA8: vmul.f32 d ; CORTEXA9: test: ; CORTEXA9: vmul.f32 s{{.}}, s{{.}}, s{{.}} diff --git a/test/CodeGen/ARM/fp-fast.ll b/test/CodeGen/ARM/fp-fast.ll new file mode 100644 index 0000000..ec57187 --- /dev/null +++ b/test/CodeGen/ARM/fp-fast.ll @@ -0,0 +1,60 @@ +; RUN: llc -march=arm -mcpu=cortex-a9 -mattr=+vfp4 -enable-unsafe-fp-math < %s | FileCheck %s + +; CHECK: test1 +define float @test1(float %x) { +; CHECK-NOT: vfma +; CHECK: vmul.f32 +; CHECK-NOT: vfma + %t1 = fmul float %x, 3.0 + %t2 = call float @llvm.fma.f32(float %x, float 2.0, float %t1) + ret float %t2 +} + +; CHECK: test2 +define float @test2(float %x, float %y) { +; CHECK-NOT: vmul +; CHECK: vfma.f32 +; CHECK-NOT: vmul + %t1 = fmul float %x, 3.0 + %t2 = call float @llvm.fma.f32(float %t1, float 2.0, float %y) + ret float %t2 +} + +; CHECK: test3 +define float @test3(float %x, float %y) { +; CHECK-NOT: vfma +; CHECK: vadd.f32 +; CHECK-NOT: vfma + %t2 = call float @llvm.fma.f32(float %x, float 1.0, float %y) + ret float %t2 +} + +; CHECK: test4 +define float @test4(float %x, float %y) { +; CHECK-NOT: vfma +; CHECK: vsub.f32 +; CHECK-NOT: vfma + %t2 = call float @llvm.fma.f32(float %x, float -1.0, float %y) + ret float %t2 +} + +; CHECK: test5 +define float @test5(float %x) { +; CHECK-NOT: vfma +; CHECK: vmul.f32 +; CHECK-NOT: vfma + %t2 = call float @llvm.fma.f32(float %x, float 2.0, float %x) + ret float %t2 +} + +; CHECK: test6 +define float 
@test6(float %x) { +; CHECK-NOT: vfma +; CHECK: vmul.f32 +; CHECK-NOT: vfma + %t1 = fsub float -0.0, %x + %t2 = call float @llvm.fma.f32(float %x, float 5.0, float %t1) + ret float %t2 +} + +declare float @llvm.fma.f32(float, float, float) diff --git a/test/CodeGen/ARM/fp_convert.ll b/test/CodeGen/ARM/fp_convert.ll index 7002cec..44298b9 100644 --- a/test/CodeGen/ARM/fp_convert.ll +++ b/test/CodeGen/ARM/fp_convert.ll @@ -31,7 +31,7 @@ define float @test3(i32 %a, i32 %b) { ; VFP2: test3: ; VFP2: vcvt.f32.u32 s{{.}}, s{{.}} ; NEON: test3: -; NEON: vcvt.f32.u32 d0, d0 +; NEON: vcvt.f32.u32 d entry: %0 = add i32 %a, %b %1 = uitofp i32 %0 to float @@ -42,7 +42,7 @@ define float @test4(i32 %a, i32 %b) { ; VFP2: test4: ; VFP2: vcvt.f32.s32 s{{.}}, s{{.}} ; NEON: test4: -; NEON: vcvt.f32.s32 d0, d0 +; NEON: vcvt.f32.s32 d entry: %0 = add i32 %a, %b %1 = sitofp i32 %0 to float diff --git a/test/CodeGen/ARM/fsubs.ll b/test/CodeGen/ARM/fsubs.ll index bea8d5f..f039e74 100644 --- a/test/CodeGen/ARM/fsubs.ll +++ b/test/CodeGen/ARM/fsubs.ll @@ -8,6 +8,6 @@ entry: ret float %0 } -; VFP2: vsub.f32 s0, s1, s0 -; NFP1: vsub.f32 d0, d1, d0 -; NFP0: vsub.f32 s0, s1, s0 +; VFP2: vsub.f32 s +; NFP1: vsub.f32 d +; NFP0: vsub.f32 s diff --git a/test/CodeGen/ARM/ifcvt1.ll b/test/CodeGen/ARM/ifcvt1.ll index cd870bb..fd83144 100644 --- a/test/CodeGen/ARM/ifcvt1.ll +++ b/test/CodeGen/ARM/ifcvt1.ll @@ -1,17 +1,21 @@ -; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s +; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -check-prefix=A8 +; RUN: llc < %s -march=arm -mcpu=swift | FileCheck %s -check-prefix=SWIFT define i32 @t1(i32 %a, i32 %b) { -; CHECK: t1: +; A8: t1: +; SWIFT: t1: %tmp2 = icmp eq i32 %a, 0 br i1 %tmp2, label %cond_false, label %cond_true cond_true: -; CHECK: subeq r0, r1, #1 +; A8: subeq r0, r1, #1 +; SWIFT: sub r0, r1, #1 %tmp5 = add i32 %b, 1 ret i32 %tmp5 cond_false: -; CHECK: addne r0, r1, #1 +; A8: addne r0, r1, #1 +; SWIFT: addne r0, r1, #1 %tmp7 = add i32 %b, 
-1 ret i32 %tmp7 } diff --git a/test/CodeGen/ARM/ifcvt12.ll b/test/CodeGen/ARM/ifcvt12.ll new file mode 100644 index 0000000..77bdca5 --- /dev/null +++ b/test/CodeGen/ARM/ifcvt12.ll @@ -0,0 +1,15 @@ +; RUN: llc < %s -mtriple=arm-apple-darwin -mcpu=cortex-a8 | FileCheck %s +define i32 @f1(i32 %a, i32 %b, i32 %c) { +; CHECK: f1: +; CHECK: mlsne r0, r0, r1, r2 + %tmp1 = icmp eq i32 %a, 0 + br i1 %tmp1, label %cond_false, label %cond_true + +cond_true: + %tmp2 = mul i32 %a, %b + %tmp3 = sub i32 %c, %tmp2 + ret i32 %tmp3 + +cond_false: + ret i32 %a +} diff --git a/test/CodeGen/ARM/ifcvt5.ll b/test/CodeGen/ARM/ifcvt5.ll index 95f5c97..5081791 100644 --- a/test/CodeGen/ARM/ifcvt5.ll +++ b/test/CodeGen/ARM/ifcvt5.ll @@ -1,4 +1,6 @@ -; RUN: llc < %s -mtriple=armv7-apple-ios | FileCheck %s +; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=cortex-a8 | FileCheck %s -check-prefix=A8 +; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=swift | FileCheck %s -check-prefix=SWIFT +; rdar://8402126 @x = external global i32* ; <i32**> [#uses=1] @@ -10,8 +12,12 @@ entry: } define i32 @t1(i32 %a, i32 %b) { -; CHECK: t1: -; CHECK: poplt {r7, pc} +; A8: t1: +; A8: poplt {r7, pc} + +; SWIFT: t1: +; SWIFT: pop {r7, pc} +; SWIFT: pop {r7, pc} entry: %tmp1 = icmp sgt i32 %a, 10 ; <i1> [#uses=1] br i1 %tmp1, label %cond_true, label %UnifiedReturnBlock diff --git a/test/CodeGen/ARM/indirectbr-2.ll b/test/CodeGen/ARM/indirectbr-2.ll new file mode 100644 index 0000000..084f520 --- /dev/null +++ b/test/CodeGen/ARM/indirectbr-2.ll @@ -0,0 +1,46 @@ +; RUN: llc < %s -O0 -relocation-model=pic -mtriple=thumbv7-apple-ios | FileCheck %s +; <rdar://problem/12529625> + +@foo = global i32 34879, align 4 +@DWJumpTable2808 = global [2 x i32] [i32 sub (i32 ptrtoint (i8* blockaddress(@func, %14) to i32), i32 ptrtoint (i8* blockaddress(@func, %4) to i32)), i32 sub (i32 ptrtoint (i8* blockaddress(@func, %13) to i32), i32 ptrtoint (i8* blockaddress(@func, %4) to i32))] +@0 = internal constant [45 x i8] c"func XXXXXXXXXXX 
:: bb xxxxxxxxxxxxxxxxxxxx\0A\00" + +; The indirect branch has the two destinations as successors. The lone PHI +; statement shouldn't be implicitly defined. + +; CHECK: func: +; CHECK: Ltmp1: @ Block address taken +; CHECK-NOT: @ implicit-def: R0 +; CHECK: @ 4-byte Reload + +define i32 @func() nounwind ssp { + %1 = alloca i32, align 4 + %2 = load i32* @foo, align 4 + %3 = icmp eq i32 %2, 34879 + br label %4 + +; <label>:4 ; preds = %0 + %5 = zext i1 %3 to i32 + %6 = mul i32 %5, 287 + %7 = add i32 %6, 2 + %8 = getelementptr [2 x i32]* @DWJumpTable2808, i32 0, i32 %5 + %9 = load i32* %8 + %10 = add i32 %9, ptrtoint (i8* blockaddress(@func, %4) to i32) + %11 = inttoptr i32 %10 to i8* + %12 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([45 x i8]* @0, i32 0, i32 0)) + indirectbr i8* %11, [label %13, label %14] + +; <label>:13 ; preds = %4 + %tmp14 = phi i32 [ %7, %4 ] + store i32 23958, i32* @foo, align 4 + %tmp15 = load i32* %1, align 4 + %tmp16 = icmp eq i32 %tmp15, 0 + %tmp17 = zext i1 %tmp16 to i32 + %tmp21 = add i32 %tmp17, %tmp14 + ret i32 %tmp21 + +; <label>:14 ; preds = %4 + ret i32 42 +} + +declare i32 @printf(i8*, ...) diff --git a/test/CodeGen/ARM/integer_insertelement.ll b/test/CodeGen/ARM/integer_insertelement.ll new file mode 100644 index 0000000..1d72afe --- /dev/null +++ b/test/CodeGen/ARM/integer_insertelement.ll @@ -0,0 +1,35 @@ +; RUN: llc %s -o - -march=arm -mattr=+neon | FileCheck %s + +; This test checks that when inserting one (integer) element into a vector, +; the vector is not spuriously copied. "vorr dX, dY, dY" is the way of moving +; one DPR to another that we check for. 
+ +; CHECK: @f +; CHECK-NOT: vorr d +; CHECK: vmov.32 d +; CHECK-NOT: vorr d +; CHECK: mov pc, lr +define <4 x i32> @f(<4 x i32> %in) { + %1 = insertelement <4 x i32> %in, i32 255, i32 3 + ret <4 x i32> %1 +} + +; CHECK: @g +; CHECK-NOT: vorr d +; CHECK: vmov.16 d +; CHECK-NOT: vorr d +; CHECK: mov pc, lr +define <8 x i16> @g(<8 x i16> %in) { + %1 = insertelement <8 x i16> %in, i16 255, i32 7 + ret <8 x i16> %1 +} + +; CHECK: @h +; CHECK-NOT: vorr d +; CHECK: vmov.8 d +; CHECK-NOT: vorr d +; CHECK: mov pc, lr +define <16 x i8> @h(<16 x i8> %in) { + %1 = insertelement <16 x i8> %in, i8 255, i32 15 + ret <16 x i8> %1 +} diff --git a/test/CodeGen/ARM/ldr_post.ll b/test/CodeGen/ARM/ldr_post.ll index 8ddf025..a6ca434 100644 --- a/test/CodeGen/ARM/ldr_post.ll +++ b/test/CodeGen/ARM/ldr_post.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=arm | FileCheck %s +; RUN: llc < %s -march=arm -mcpu=swift | FileCheck %s ; CHECK: test1: ; CHECK: ldr {{.*, \[.*]}}, -r2 diff --git a/test/CodeGen/ARM/ldr_pre.ll b/test/CodeGen/ARM/ldr_pre.ll index e904e5f..6c40ad7 100644 --- a/test/CodeGen/ARM/ldr_pre.ll +++ b/test/CodeGen/ARM/ldr_pre.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=arm | FileCheck %s +; RUN: llc < %s -march=arm -mcpu=swift | FileCheck %s ; CHECK: test1: ; CHECK: ldr {{.*!}} diff --git a/test/CodeGen/ARM/longMAC.ll b/test/CodeGen/ARM/longMAC.ll new file mode 100644 index 0000000..e4a00e9 --- /dev/null +++ b/test/CodeGen/ARM/longMAC.ll @@ -0,0 +1,44 @@ +; RUN: llc < %s -march=arm | FileCheck %s +; Check generated signed and unsigned multiply accumulate long. 
+ +define i64 @MACLongTest1(i32 %a, i32 %b, i64 %c) { +;CHECK: MACLongTest1: +;CHECK: umlal + %conv = zext i32 %a to i64 + %conv1 = zext i32 %b to i64 + %mul = mul i64 %conv1, %conv + %add = add i64 %mul, %c + ret i64 %add +} + +define i64 @MACLongTest2(i32 %a, i32 %b, i64 %c) { +;CHECK: MACLongTest2: +;CHECK: smlal + %conv = sext i32 %a to i64 + %conv1 = sext i32 %b to i64 + %mul = mul nsw i64 %conv1, %conv + %add = add nsw i64 %mul, %c + ret i64 %add +} + +define i64 @MACLongTest3(i32 %a, i32 %b, i32 %c) { +;CHECK: MACLongTest3: +;CHECK: umlal + %conv = zext i32 %b to i64 + %conv1 = zext i32 %a to i64 + %mul = mul i64 %conv, %conv1 + %conv2 = zext i32 %c to i64 + %add = add i64 %mul, %conv2 + ret i64 %add +} + +define i64 @MACLongTest4(i32 %a, i32 %b, i32 %c) { +;CHECK: MACLongTest4: +;CHECK: smlal + %conv = sext i32 %b to i64 + %conv1 = sext i32 %a to i64 + %mul = mul nsw i64 %conv, %conv1 + %conv2 = sext i32 %c to i64 + %add = add nsw i64 %mul, %conv2 + ret i64 %add +} diff --git a/test/CodeGen/ARM/mls.ll b/test/CodeGen/ARM/mls.ll index a6cdba4..066bf98 100644 --- a/test/CodeGen/ARM/mls.ll +++ b/test/CodeGen/ARM/mls.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=arm -mattr=+v6t2 | FileCheck %s +; RUN: llc < %s -march=arm -mattr=+v6t2 -arm-use-mulops=false | FileCheck %s -check-prefix=NO_MULOPS define i32 @f1(i32 %a, i32 %b, i32 %c) { %tmp1 = mul i32 %a, %b @@ -13,4 +14,15 @@ define i32 @f2(i32 %a, i32 %b, i32 %c) { ret i32 %tmp2 } +; CHECK: f1: ; CHECK: mls r0, r0, r1, r2 +; NO_MULOPS: f1: +; NO_MULOPS: mul r0, r0, r1 +; NO_MULOPS-NEXT: sub r0, r2, r0 + +; CHECK: f2: +; CHECK: mul r0, r0, r1 +; CHECK-NEXT: sub r0, r0, r2 +; NO_MULOPS: f2: +; NO_MULOPS: mul r0, r0, r1 +; NO_MULOPS-NEXT: sub r0, r0, r2 diff --git a/test/CodeGen/ARM/neon-fma.ll b/test/CodeGen/ARM/neon-fma.ll new file mode 100644 index 0000000..d2cca50 --- /dev/null +++ b/test/CodeGen/ARM/neon-fma.ll @@ -0,0 +1,22 @@ +; RUN: llc < %s -mtriple=thumbv7-apple-darwin10 -mcpu=swift | FileCheck %s + +; CHECK: 
test_v2f32 +; CHECK: vfma.f32 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} + +define <2 x float> @test_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone ssp { +entry: + %call = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone + ret <2 x float> %call +} + +; CHECK: test_v4f32 +; CHECK: vfma.f32 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}} + +define <4 x float> @test_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone ssp { +entry: + %call = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone + ret <4 x float> %call +} + +declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone +declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/neon_ld2.ll b/test/CodeGen/ARM/neon_ld2.ll index 944bfe0..497619e 100644 --- a/test/CodeGen/ARM/neon_ld2.ll +++ b/test/CodeGen/ARM/neon_ld2.ll @@ -1,10 +1,16 @@ ; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s +; RUN: llc < %s -march=arm -mcpu=swift | FileCheck %s --check-prefix=SWIFT ; CHECK: t1 -; CHECK: vldmia -; CHECK: vldmia +; CHECK: vld1.64 +; CHECK: vld1.64 ; CHECK: vadd.i64 q -; CHECK: vstmia +; CHECK: vst1.64 +; SWIFT: t1 +; SWIFT: vld1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+, :128\]}} +; SWIFT: vld1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+, :128\]}} +; SWIFT: vadd.i64 q +; SWIFT: vst1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+, :128\]}} define void @t1(<4 x i32>* %r, <2 x i64>* %a, <2 x i64>* %b) nounwind { entry: %0 = load <2 x i64>* %a, align 16 ; <<2 x i64>> [#uses=1] @@ -16,11 +22,17 @@ entry: } ; CHECK: t2 -; CHECK: vldmia -; CHECK: vldmia +; CHECK: vld1.64 +; CHECK: vld1.64 ; CHECK: vsub.i64 q ; CHECK: vmov r0, r1, d ; CHECK: vmov r2, r3, d +; SWIFT: t2 +; SWIFT: vld1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+, :128\]}} +; SWIFT: vld1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+, :128\]}} +; SWIFT: vsub.i64 q +; 
SWIFT: vmov r0, r1, d +; SWIFT: vmov r2, r3, d define <4 x i32> @t2(<2 x i64>* %a, <2 x i64>* %b) nounwind readonly { entry: %0 = load <2 x i64>* %a, align 16 ; <<2 x i64>> [#uses=1] @@ -30,3 +42,18 @@ entry: ret <4 x i32> %3 } +; Limited alignment. +; SWIFT: t3 +; SWIFT: vld1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+}} +; SWIFT: vld1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+}} +; SWIFT: vadd.i64 q +; SWIFT: vst1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+}} +define void @t3(<4 x i32>* %r, <2 x i64>* %a, <2 x i64>* %b) nounwind { +entry: + %0 = load <2 x i64>* %a, align 8 + %1 = load <2 x i64>* %b, align 8 + %2 = add <2 x i64> %0, %1 + %3 = bitcast <2 x i64> %2 to <4 x i32> + store <4 x i32> %3, <4 x i32>* %r, align 8 + ret void +} diff --git a/test/CodeGen/ARM/opt-shuff-tstore.ll b/test/CodeGen/ARM/opt-shuff-tstore.ll index df98e23..74c9a21 100644 --- a/test/CodeGen/ARM/opt-shuff-tstore.ll +++ b/test/CodeGen/ARM/opt-shuff-tstore.ll @@ -2,7 +2,7 @@ ; CHECK: func_4_8 ; CHECK: vst1.32 -; CHECK-NEXT: bx lr +; CHECK: bx lr define void @func_4_8(<4 x i8> %param, <4 x i8>* %p) { %r = add <4 x i8> %param, <i8 1, i8 2, i8 3, i8 4> store <4 x i8> %r, <4 x i8>* %p @@ -11,7 +11,7 @@ define void @func_4_8(<4 x i8> %param, <4 x i8>* %p) { ; CHECK: func_2_16 ; CHECK: vst1.32 -; CHECK-NEXT: bx lr +; CHECK: bx lr define void @func_2_16(<2 x i16> %param, <2 x i16>* %p) { %r = add <2 x i16> %param, <i16 1, i16 2> store <2 x i16> %r, <2 x i16>* %p diff --git a/test/CodeGen/ARM/reg_sequence.ll b/test/CodeGen/ARM/reg_sequence.ll index 05794e4..6d6586e 100644 --- a/test/CodeGen/ARM/reg_sequence.ll +++ b/test/CodeGen/ARM/reg_sequence.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -; RUN: llc < %s -march=arm -mcpu=cortex-a8 -regalloc=basic | FileCheck %s +; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-a8 | FileCheck %s +; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-a8 -regalloc=basic | FileCheck %s ; Implementing vld / vst as REG_SEQUENCE eliminates the extra vmov's. 
%struct.int16x8_t = type { <8 x i16> } @@ -124,7 +124,6 @@ return1: return2: ; CHECK: %return2 ; CHECK: vadd.i32 -; CHECK: vorr {{q[0-9]+}}, {{q[0-9]+}} ; CHECK-NOT: vmov ; CHECK: vst2.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}} %tmp100 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0 ; <<4 x i32>> [#uses=1] @@ -137,7 +136,7 @@ return2: define <8 x i16> @t5(i16* %A, <8 x i16>* %B) nounwind { ; CHECK: t5: -; CHECK: vldmia +; CHECK: vld1.32 ; How can FileCheck match Q and D registers? We need a lisp interpreter. ; CHECK: vorr {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}} ; CHECK-NOT: vmov @@ -243,8 +242,8 @@ define arm_aapcs_vfpcc float @t9(%0* nocapture, %3* nocapture) nounwind { ; CHECK: vldr ; CHECK-NOT: vmov d{{.*}}, d16 ; CHECK: vmov.i32 d17 -; CHECK-NEXT: vstmia r0, {d16, d17} -; CHECK-NEXT: vstmia r0, {d16, d17} +; CHECK-NEXT: vst1.64 {d16, d17}, [r0, :128] +; CHECK-NEXT: vst1.64 {d16, d17}, [r0, :128] %3 = bitcast double 0.000000e+00 to <2 x float> ; <<2 x float>> [#uses=2] %4 = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; <<4 x float>> [#uses=1] store <4 x float> %4, <4 x float>* undef, align 16 diff --git a/test/CodeGen/ARM/select.ll b/test/CodeGen/ARM/select.ll index 5575566..62708ed 100644 --- a/test/CodeGen/ARM/select.ll +++ b/test/CodeGen/ARM/select.ll @@ -80,7 +80,7 @@ define double @f7(double %a, double %b) { ; CHECK-NEON: adr [[R2:r[0-9]+]], LCPI7_0 ; CHECK-NEON-NEXT: cmp r0, [[R3]] ; CHECK-NEON-NEXT: it eq -; CHECK-NEON-NEXT: addeq.w {{r.*}}, [[R2]] +; CHECK-NEON-NEXT: addeq{{.*}} [[R2]], #4 ; CHECK-NEON-NEXT: ldr ; CHECK-NEON: bx diff --git a/test/CodeGen/ARM/select_xform.ll b/test/CodeGen/ARM/select_xform.ll index 26f7cb6..7507808 100644 --- a/test/CodeGen/ARM/select_xform.ll +++ b/test/CodeGen/ARM/select_xform.ll @@ -9,7 +9,7 @@ define i32 @t1(i32 %a, i32 %b, i32 %c) nounwind { ; T2: t1: ; T2: mvn r0, #-2147483648 -; T2: addle.w r1, r1 +; T2: addle r1, r0 ; T2: mov r0, r1 %tmp1 = icmp sgt i32 %c, 
10 %tmp2 = select i1 %tmp1, i32 0, i32 2147483647 @@ -23,7 +23,7 @@ define i32 @t2(i32 %a, i32 %b, i32 %c, i32 %d) nounwind { ; ARM: mov r0, r1 ; T2: t2: -; T2: suble.w r1, r1, #10 +; T2: suble r1, #10 ; T2: mov r0, r1 %tmp1 = icmp sgt i32 %c, 10 %tmp2 = select i1 %tmp1, i32 0, i32 10 @@ -33,12 +33,12 @@ define i32 @t2(i32 %a, i32 %b, i32 %c, i32 %d) nounwind { define i32 @t3(i32 %a, i32 %b, i32 %x, i32 %y) nounwind { ; ARM: t3: -; ARM: mvnlt r2, #0 -; ARM: and r0, r2, r3 +; ARM: andge r3, r3, r2 +; ARM: mov r0, r3 ; T2: t3: -; T2: movlt.w r2, #-1 -; T2: and.w r0, r2, r3 +; T2: andge r3, r2 +; T2: mov r0, r3 %cond = icmp slt i32 %a, %b %z = select i1 %cond, i32 -1, i32 %x %s = and i32 %z, %y @@ -47,12 +47,12 @@ define i32 @t3(i32 %a, i32 %b, i32 %x, i32 %y) nounwind { define i32 @t4(i32 %a, i32 %b, i32 %x, i32 %y) nounwind { ; ARM: t4: -; ARM: movlt r2, #0 -; ARM: orr r0, r2, r3 +; ARM: orrge r3, r3, r2 +; ARM: mov r0, r3 ; T2: t4: -; T2: movlt r2, #0 -; T2: orr.w r0, r2, r3 +; T2: orrge r3, r2 +; T2: mov r0, r3 %cond = icmp slt i32 %a, %b %z = select i1 %cond, i32 0, i32 %x %s = or i32 %z, %y @@ -81,7 +81,7 @@ define i32 @t6(i32 %a, i32 %b, i32 %c, i32 %d) nounwind { ; T2: t6: ; T2-NOT: movge -; T2: eorlt.w r3, r3, r2 +; T2: eorlt r3, r2 %cond = icmp slt i32 %a, %b %tmp1 = select i1 %cond, i32 %c, i32 0 %tmp2 = xor i32 %tmp1, %d @@ -179,3 +179,46 @@ define i32 @t12(i32 %a, i32 %b) nounwind { %tmp1 = select i1 %cond, i32 %a, i32 %x ret i32 %tmp1 } + +; Handle frame index operands. 
+define void @pr13628() nounwind uwtable align 2 { + %x3 = alloca i8, i32 256, align 8 + %x4 = load i8* undef, align 1 + %x5 = icmp ne i8 %x4, 0 + %x6 = select i1 %x5, i8* %x3, i8* null + call void @bar(i8* %x6) nounwind + ret void +} +declare void @bar(i8*) + +; Fold zext i1 into predicated add +define i32 @t13(i32 %c, i32 %a) nounwind readnone ssp { +entry: +; ARM: t13 +; ARM: cmp r1, #10 +; ARM: addgt r0, r0, #1 + +; T2: t13 +; T2: cmp r1, #10 +; T2: addgt r0, #1 + %cmp = icmp sgt i32 %a, 10 + %conv = zext i1 %cmp to i32 + %add = add i32 %conv, %c + ret i32 %add +} + +; Fold sext i1 into predicated sub +define i32 @t14(i32 %c, i32 %a) nounwind readnone ssp { +entry: +; ARM: t14 +; ARM: cmp r1, #10 +; ARM: subgt r0, r0, #1 + +; T2: t14 +; T2: cmp r1, #10 +; T2: subgt r0, #1 + %cmp = icmp sgt i32 %a, 10 + %conv = sext i1 %cmp to i32 + %add = add i32 %conv, %c + ret i32 %add +} diff --git a/test/CodeGen/ARM/struct_byval.ll b/test/CodeGen/ARM/struct_byval.ll index 99ba475..e9541c2 100644 --- a/test/CodeGen/ARM/struct_byval.ll +++ b/test/CodeGen/ARM/struct_byval.ll @@ -44,3 +44,47 @@ entry: declare i32 @e1(%struct.SmallStruct* nocapture byval %in) nounwind declare i32 @e2(%struct.LargeStruct* nocapture byval %in) nounwind declare i32 @e3(%struct.LargeStruct* nocapture byval align 16 %in) nounwind + +; rdar://12442472 +; We can't do tail call since address of s is passed to the callee and part of +; s is in caller's local frame. 
+define void @f3(%struct.SmallStruct* nocapture byval %s) nounwind optsize { +; CHECK: f3 +; CHECK: bl _consumestruct +entry: + %0 = bitcast %struct.SmallStruct* %s to i8* + tail call void @consumestruct(i8* %0, i32 80) optsize + ret void +} + +define void @f4(%struct.SmallStruct* nocapture byval %s) nounwind optsize { +; CHECK: f4 +; CHECK: bl _consumestruct +entry: + %addr = getelementptr inbounds %struct.SmallStruct* %s, i32 0, i32 0 + %0 = bitcast i32* %addr to i8* + tail call void @consumestruct(i8* %0, i32 80) optsize + ret void +} + +; We can do tail call here since s is in the incoming argument area. +define void @f5(i32 %a, i32 %b, i32 %c, i32 %d, %struct.SmallStruct* nocapture byval %s) nounwind optsize { +; CHECK: f5 +; CHECK: b _consumestruct +entry: + %0 = bitcast %struct.SmallStruct* %s to i8* + tail call void @consumestruct(i8* %0, i32 80) optsize + ret void +} + +define void @f6(i32 %a, i32 %b, i32 %c, i32 %d, %struct.SmallStruct* nocapture byval %s) nounwind optsize { +; CHECK: f6 +; CHECK: b _consumestruct +entry: + %addr = getelementptr inbounds %struct.SmallStruct* %s, i32 0, i32 0 + %0 = bitcast i32* %addr to i8* + tail call void @consumestruct(i8* %0, i32 80) optsize + ret void +} + +declare void @consumestruct(i8* nocapture %structp, i32 %structsize) nounwind diff --git a/test/CodeGen/ARM/sub-cmp-peephole.ll b/test/CodeGen/ARM/sub-cmp-peephole.ll index 6fcbdee..2961b94 100644 --- a/test/CodeGen/ARM/sub-cmp-peephole.ll +++ b/test/CodeGen/ARM/sub-cmp-peephole.ll @@ -63,3 +63,24 @@ if.then: if.else: ret i32 %sub } + +; If the sub/rsb instruction is predicated, we can't use the flags. 
+; <rdar://problem/12263428> +; Test case from MultiSource/Benchmarks/Ptrdist/bc/number.s +; CHECK: bc_raise +; CHECK: rsbeq +; CHECK: cmp +define i32 @bc_raise() nounwind ssp { +entry: + %val.2.i = select i1 undef, i32 0, i32 undef + %sub.i = sub nsw i32 0, %val.2.i + %retval.0.i = select i1 undef, i32 %val.2.i, i32 %sub.i + %cmp1 = icmp eq i32 %retval.0.i, 0 + br i1 %cmp1, label %land.lhs.true, label %if.end11 + +land.lhs.true: ; preds = %num2long.exit + ret i32 17 + +if.end11: ; preds = %num2long.exit + ret i32 23 +} diff --git a/test/CodeGen/ARM/sub.ll b/test/CodeGen/ARM/sub.ll index 474043a..7f82ca7 100644 --- a/test/CodeGen/ARM/sub.ll +++ b/test/CodeGen/ARM/sub.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=arm < %s | FileCheck %s +; RUN: llc -march=arm -mcpu=cortex-a8 < %s | FileCheck %s ; 171 = 0x000000ab define i64 @f1(i64 %a) { diff --git a/test/CodeGen/ARM/subreg-remat.ll b/test/CodeGen/ARM/subreg-remat.ll index 03ae12c..455bfce 100644 --- a/test/CodeGen/ARM/subreg-remat.ll +++ b/test/CodeGen/ARM/subreg-remat.ll @@ -4,14 +4,14 @@ target triple = "thumbv7-apple-ios" ; ; The vector %v2 is built like this: ; -; %vreg6:ssub_1<def> = VMOVSR %vreg0<kill>, pred:14, pred:%noreg, %vreg6<imp-def>; DPR_VFP2:%vreg6 GPR:%vreg0 +; %vreg6:ssub_1<def> = ... ; %vreg6:ssub_0<def> = VLDRS <cp#0>, 0, pred:14, pred:%noreg; mem:LD4[ConstantPool] DPR_VFP2:%vreg6 ; ; When %vreg6 spills, the VLDRS constant pool load cannot be rematerialized ; since it implicitly reads the ssub_1 sub-register. 
; ; CHECK: f1 -; CHECK: vmov s1, r0 +; CHECK: vmov d0, r0, r0 ; CHECK: vldr s0, LCPI ; The vector must be spilled: ; CHECK: vstr d0, diff --git a/test/CodeGen/ARM/trap.ll b/test/CodeGen/ARM/trap.ll index 38842a9..21865f8 100644 --- a/test/CodeGen/ARM/trap.ll +++ b/test/CodeGen/ARM/trap.ll @@ -14,4 +14,16 @@ entry: unreachable } +define void @t2() nounwind { +entry: +; INSTR: t2: +; INSTR: trap + +; FUNC: t2: +; FUNC: bl __trap + call void @llvm.debugtrap() + unreachable +} + declare void @llvm.trap() nounwind +declare void @llvm.debugtrap() nounwind diff --git a/test/CodeGen/ARM/twoaddrinstr.ll b/test/CodeGen/ARM/twoaddrinstr.ll index 4e227dd..fc2aa1e 100644 --- a/test/CodeGen/ARM/twoaddrinstr.ll +++ b/test/CodeGen/ARM/twoaddrinstr.ll @@ -4,18 +4,18 @@ define void @PR13378() nounwind { ; This was orriginally a crasher trying to schedule the instructions. ; CHECK: PR13378: -; CHECK: vldmia +; CHECK: vld1.32 +; CHECK-NEXT: vst1.32 +; CHECK-NEXT: vst1.32 ; CHECK-NEXT: vmov.f32 -; CHECK-NEXT: vstmia -; CHECK-NEXT: vstmia ; CHECK-NEXT: vmov.f32 -; CHECK-NEXT: vstmia +; CHECK-NEXT: vst1.32 entry: - %0 = load <4 x float>* undef - store <4 x float> zeroinitializer, <4 x float>* undef - store <4 x float> %0, <4 x float>* undef + %0 = load <4 x float>* undef, align 4 + store <4 x float> zeroinitializer, <4 x float>* undef, align 4 + store <4 x float> %0, <4 x float>* undef, align 4 %1 = insertelement <4 x float> %0, float 1.000000e+00, i32 3 - store <4 x float> %1, <4 x float>* undef + store <4 x float> %1, <4 x float>* undef, align 4 unreachable } diff --git a/test/CodeGen/ARM/unaligned_load_store.ll b/test/CodeGen/ARM/unaligned_load_store.ll index 869b926..3064202 100644 --- a/test/CodeGen/ARM/unaligned_load_store.ll +++ b/test/CodeGen/ARM/unaligned_load_store.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -march=arm -pre-RA-sched=source | FileCheck %s -check-prefix=EXPANDED -; RUN: llc < %s -mtriple=armv6-apple-darwin -mcpu=cortex-a8 -arm-strict-align -pre-RA-sched=source | FileCheck 
%s -check-prefix=EXPANDED +; RUN: llc < %s -mtriple=armv6-apple-darwin -mcpu=cortex-a8 -mattr=-neon -arm-strict-align -pre-RA-sched=source | FileCheck %s -check-prefix=EXPANDED ; RUN: llc < %s -mtriple=armv6-apple-darwin -mcpu=cortex-a8 | FileCheck %s -check-prefix=UNALIGNED ; rdar://7113725 @@ -59,3 +59,19 @@ entry: store double %tmp, double* %b, align 1 ret void } + +define void @byte_word_ops(i32* %a, i32* %b) nounwind { +entry: +; EXPANDED: byte_word_ops: +; EXPANDED: ldrb +; EXPANDED: strb + +; UNALIGNED: byte_word_ops: +; UNALIGNED-NOT: ldrb +; UNALIGNED: ldr +; UNALIGNED-NOT: strb +; UNALIGNED: str + %tmp = load i32* %a, align 1 + store i32 %tmp, i32* %b, align 1 + ret void +} diff --git a/test/CodeGen/ARM/unaligned_load_store_vector.ll b/test/CodeGen/ARM/unaligned_load_store_vector.ll new file mode 100644 index 0000000..25ae651 --- /dev/null +++ b/test/CodeGen/ARM/unaligned_load_store_vector.ll @@ -0,0 +1,487 @@ +;RUN: llc < %s -march=arm -mattr=+v7 -mattr=+neon | FileCheck %s + +;ALIGN = 1 +;SIZE = 64 +;TYPE = <8 x i8> +define void @v64_v8i8_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind { +;CHECK: v64_v8i8_1: +entry: + %po = getelementptr i8* %out, i32 0 + %pi = getelementptr i8* %in, i32 0 + %vi = bitcast i8* %pi to <8 x i8>* + %vo = bitcast i8* %po to <8 x i8>* +;CHECK: vld1.8 + %v1 = load <8 x i8>* %vi, align 1 +;CHECK: vst1.8 + store <8 x i8> %v1, <8 x i8>* %vo, align 1 + ret void +} + + +;ALIGN = 1 +;SIZE = 64 +;TYPE = <4 x i16> +define void @v64_v4i16_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind { +;CHECK: v64_v4i16_1: +entry: + %po = getelementptr i8* %out, i32 0 + %pi = getelementptr i8* %in, i32 0 + %vi = bitcast i8* %pi to <4 x i16>* + %vo = bitcast i8* %po to <4 x i16>* +;CHECK: vld1.8 + %v1 = load <4 x i16>* %vi, align 1 +;CHECK: vst1.8 + store <4 x i16> %v1, <4 x i16>* %vo, align 1 + ret void +} + + +;ALIGN = 1 +;SIZE = 64 +;TYPE = <2 x i32> +define void @v64_v2i32_1(i8* noalias nocapture %out, i8* 
noalias nocapture %in) nounwind { +;CHECK: v64_v2i32_1: +entry: + %po = getelementptr i8* %out, i32 0 + %pi = getelementptr i8* %in, i32 0 + %vi = bitcast i8* %pi to <2 x i32>* + %vo = bitcast i8* %po to <2 x i32>* +;CHECK: vld1.8 + %v1 = load <2 x i32>* %vi, align 1 +;CHECK: vst1.8 + store <2 x i32> %v1, <2 x i32>* %vo, align 1 + ret void +} + + +;ALIGN = 1 +;SIZE = 64 +;TYPE = <2 x float> +define void @v64_v2f32_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind { +;CHECK: v64_v2f32_1: +entry: + %po = getelementptr i8* %out, i32 0 + %pi = getelementptr i8* %in, i32 0 + %vi = bitcast i8* %pi to <2 x float>* + %vo = bitcast i8* %po to <2 x float>* +;CHECK: vld1.8 + %v1 = load <2 x float>* %vi, align 1 +;CHECK: vst1.8 + store <2 x float> %v1, <2 x float>* %vo, align 1 + ret void +} + + +;ALIGN = 1 +;SIZE = 128 +;TYPE = <16 x i8> +define void @v128_v16i8_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind { +;CHECK: v128_v16i8_1: +entry: + %po = getelementptr i8* %out, i32 0 + %pi = getelementptr i8* %in, i32 0 + %vi = bitcast i8* %pi to <16 x i8>* + %vo = bitcast i8* %po to <16 x i8>* +;CHECK: vld1.8 + %v1 = load <16 x i8>* %vi, align 1 +;CHECK: vst1.8 + store <16 x i8> %v1, <16 x i8>* %vo, align 1 + ret void +} + + +;ALIGN = 1 +;SIZE = 128 +;TYPE = <8 x i16> +define void @v128_v8i16_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind { +;CHECK: v128_v8i16_1: +entry: + %po = getelementptr i8* %out, i32 0 + %pi = getelementptr i8* %in, i32 0 + %vi = bitcast i8* %pi to <8 x i16>* + %vo = bitcast i8* %po to <8 x i16>* +;CHECK: vld1.8 + %v1 = load <8 x i16>* %vi, align 1 +;CHECK: vst1.8 + store <8 x i16> %v1, <8 x i16>* %vo, align 1 + ret void +} + + +;ALIGN = 1 +;SIZE = 128 +;TYPE = <4 x i32> +define void @v128_v4i32_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind { +;CHECK: v128_v4i32_1: +entry: + %po = getelementptr i8* %out, i32 0 + %pi = getelementptr i8* %in, i32 0 + %vi = bitcast i8* %pi to <4 x i32>* + 
%vo = bitcast i8* %po to <4 x i32>* +;CHECK: vld1.8 + %v1 = load <4 x i32>* %vi, align 1 +;CHECK: vst1.8 + store <4 x i32> %v1, <4 x i32>* %vo, align 1 + ret void +} + + +;ALIGN = 1 +;SIZE = 128 +;TYPE = <2 x i64> +define void @v128_v2i64_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind { +;CHECK: v128_v2i64_1: +entry: + %po = getelementptr i8* %out, i32 0 + %pi = getelementptr i8* %in, i32 0 + %vi = bitcast i8* %pi to <2 x i64>* + %vo = bitcast i8* %po to <2 x i64>* +;CHECK: vld1.8 + %v1 = load <2 x i64>* %vi, align 1 +;CHECK: vst1.8 + store <2 x i64> %v1, <2 x i64>* %vo, align 1 + ret void +} + + +;ALIGN = 1 +;SIZE = 128 +;TYPE = <4 x float> +define void @v128_v4f32_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind { +;CHECK: v128_v4f32_1: +entry: + %po = getelementptr i8* %out, i32 0 + %pi = getelementptr i8* %in, i32 0 + %vi = bitcast i8* %pi to <4 x float>* + %vo = bitcast i8* %po to <4 x float>* +;CHECK: vld1.8 + %v1 = load <4 x float>* %vi, align 1 +;CHECK: vst1.8 + store <4 x float> %v1, <4 x float>* %vo, align 1 + ret void +} + + +;ALIGN = 2 +;SIZE = 64 +;TYPE = <8 x i8> +define void @v64_v8i8_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind { +;CHECK: v64_v8i8_2: +entry: + %po = getelementptr i8* %out, i32 0 + %pi = getelementptr i8* %in, i32 0 + %vi = bitcast i8* %pi to <8 x i8>* + %vo = bitcast i8* %po to <8 x i8>* +;CHECK: vld1.16 + %v1 = load <8 x i8>* %vi, align 2 +;CHECK: vst1.16 + store <8 x i8> %v1, <8 x i8>* %vo, align 2 + ret void +} + + +;ALIGN = 2 +;SIZE = 64 +;TYPE = <4 x i16> +define void @v64_v4i16_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind { +;CHECK: v64_v4i16_2: +entry: + %po = getelementptr i8* %out, i32 0 + %pi = getelementptr i8* %in, i32 0 + %vi = bitcast i8* %pi to <4 x i16>* + %vo = bitcast i8* %po to <4 x i16>* +;CHECK: vld1.16 + %v1 = load <4 x i16>* %vi, align 2 +;CHECK: vst1.16 + store <4 x i16> %v1, <4 x i16>* %vo, align 2 + ret void +} + + +;ALIGN = 2 
+;SIZE = 64 +;TYPE = <2 x i32> +define void @v64_v2i32_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind { +;CHECK: v64_v2i32_2: +entry: + %po = getelementptr i8* %out, i32 0 + %pi = getelementptr i8* %in, i32 0 + %vi = bitcast i8* %pi to <2 x i32>* + %vo = bitcast i8* %po to <2 x i32>* +;CHECK: vld1.16 + %v1 = load <2 x i32>* %vi, align 2 +;CHECK: vst1.16 + store <2 x i32> %v1, <2 x i32>* %vo, align 2 + ret void +} + + +;ALIGN = 2 +;SIZE = 64 +;TYPE = <2 x float> +define void @v64_v2f32_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind { +;CHECK: v64_v2f32_2: +entry: + %po = getelementptr i8* %out, i32 0 + %pi = getelementptr i8* %in, i32 0 + %vi = bitcast i8* %pi to <2 x float>* + %vo = bitcast i8* %po to <2 x float>* +;CHECK: vld1.16 + %v1 = load <2 x float>* %vi, align 2 +;CHECK: vst1.16 + store <2 x float> %v1, <2 x float>* %vo, align 2 + ret void +} + + +;ALIGN = 2 +;SIZE = 128 +;TYPE = <16 x i8> +define void @v128_v16i8_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind { +;CHECK: v128_v16i8_2: +entry: + %po = getelementptr i8* %out, i32 0 + %pi = getelementptr i8* %in, i32 0 + %vi = bitcast i8* %pi to <16 x i8>* + %vo = bitcast i8* %po to <16 x i8>* +;CHECK: vld1.16 + %v1 = load <16 x i8>* %vi, align 2 +;CHECK: vst1.16 + store <16 x i8> %v1, <16 x i8>* %vo, align 2 + ret void +} + + +;ALIGN = 2 +;SIZE = 128 +;TYPE = <8 x i16> +define void @v128_v8i16_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind { +;CHECK: v128_v8i16_2: +entry: + %po = getelementptr i8* %out, i32 0 + %pi = getelementptr i8* %in, i32 0 + %vi = bitcast i8* %pi to <8 x i16>* + %vo = bitcast i8* %po to <8 x i16>* +;CHECK: vld1.16 + %v1 = load <8 x i16>* %vi, align 2 +;CHECK: vst1.16 + store <8 x i16> %v1, <8 x i16>* %vo, align 2 + ret void +} + + +;ALIGN = 2 +;SIZE = 128 +;TYPE = <4 x i32> +define void @v128_v4i32_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind { +;CHECK: v128_v4i32_2: +entry: + %po = 
getelementptr i8* %out, i32 0 + %pi = getelementptr i8* %in, i32 0 + %vi = bitcast i8* %pi to <4 x i32>* + %vo = bitcast i8* %po to <4 x i32>* +;CHECK: vld1.16 + %v1 = load <4 x i32>* %vi, align 2 +;CHECK: vst1.16 + store <4 x i32> %v1, <4 x i32>* %vo, align 2 + ret void +} + + +;ALIGN = 2 +;SIZE = 128 +;TYPE = <2 x i64> +define void @v128_v2i64_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind { +;CHECK: v128_v2i64_2: +entry: + %po = getelementptr i8* %out, i32 0 + %pi = getelementptr i8* %in, i32 0 + %vi = bitcast i8* %pi to <2 x i64>* + %vo = bitcast i8* %po to <2 x i64>* +;CHECK: vld1.16 + %v1 = load <2 x i64>* %vi, align 2 +;CHECK: vst1.16 + store <2 x i64> %v1, <2 x i64>* %vo, align 2 + ret void +} + + +;ALIGN = 2 +;SIZE = 128 +;TYPE = <4 x float> +define void @v128_v4f32_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind { +;CHECK: v128_v4f32_2: +entry: + %po = getelementptr i8* %out, i32 0 + %pi = getelementptr i8* %in, i32 0 + %vi = bitcast i8* %pi to <4 x float>* + %vo = bitcast i8* %po to <4 x float>* +;CHECK: vld1.16 + %v1 = load <4 x float>* %vi, align 2 +;CHECK: vst1.16 + store <4 x float> %v1, <4 x float>* %vo, align 2 + ret void +} + + +;ALIGN = 4 +;SIZE = 64 +;TYPE = <8 x i8> +define void @v64_v8i8_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind { +;CHECK: v64_v8i8_4: +entry: + %po = getelementptr i8* %out, i32 0 + %pi = getelementptr i8* %in, i32 0 + %vi = bitcast i8* %pi to <8 x i8>* + %vo = bitcast i8* %po to <8 x i8>* +;CHECK: vldr + %v1 = load <8 x i8>* %vi, align 4 +;CHECK: vstr + store <8 x i8> %v1, <8 x i8>* %vo, align 4 + ret void +} + + +;ALIGN = 4 +;SIZE = 64 +;TYPE = <4 x i16> +define void @v64_v4i16_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind { +;CHECK: v64_v4i16_4: +entry: + %po = getelementptr i8* %out, i32 0 + %pi = getelementptr i8* %in, i32 0 + %vi = bitcast i8* %pi to <4 x i16>* + %vo = bitcast i8* %po to <4 x i16>* +;CHECK: vldr + %v1 = load <4 x i16>* %vi, 
align 4 +;CHECK: vstr + store <4 x i16> %v1, <4 x i16>* %vo, align 4 + ret void +} + + +;ALIGN = 4 +;SIZE = 64 +;TYPE = <2 x i32> +define void @v64_v2i32_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind { +;CHECK: v64_v2i32_4: +entry: + %po = getelementptr i8* %out, i32 0 + %pi = getelementptr i8* %in, i32 0 + %vi = bitcast i8* %pi to <2 x i32>* + %vo = bitcast i8* %po to <2 x i32>* +;CHECK: vldr + %v1 = load <2 x i32>* %vi, align 4 +;CHECK: vstr + store <2 x i32> %v1, <2 x i32>* %vo, align 4 + ret void +} + + +;ALIGN = 4 +;SIZE = 64 +;TYPE = <2 x float> +define void @v64_v2f32_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind { +;CHECK: v64_v2f32_4: +entry: + %po = getelementptr i8* %out, i32 0 + %pi = getelementptr i8* %in, i32 0 + %vi = bitcast i8* %pi to <2 x float>* + %vo = bitcast i8* %po to <2 x float>* +;CHECK: vldr + %v1 = load <2 x float>* %vi, align 4 +;CHECK: vstr + store <2 x float> %v1, <2 x float>* %vo, align 4 + ret void +} + + +;ALIGN = 4 +;SIZE = 128 +;TYPE = <16 x i8> +define void @v128_v16i8_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind { +;CHECK: v128_v16i8_4: +entry: + %po = getelementptr i8* %out, i32 0 + %pi = getelementptr i8* %in, i32 0 + %vi = bitcast i8* %pi to <16 x i8>* + %vo = bitcast i8* %po to <16 x i8>* +;CHECK: vld1.32 + %v1 = load <16 x i8>* %vi, align 4 +;CHECK: vst1.32 + store <16 x i8> %v1, <16 x i8>* %vo, align 4 + ret void +} + + +;ALIGN = 4 +;SIZE = 128 +;TYPE = <8 x i16> +define void @v128_v8i16_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind { +;CHECK: v128_v8i16_4: +entry: + %po = getelementptr i8* %out, i32 0 + %pi = getelementptr i8* %in, i32 0 + %vi = bitcast i8* %pi to <8 x i16>* + %vo = bitcast i8* %po to <8 x i16>* +;CHECK: vld1.32 + %v1 = load <8 x i16>* %vi, align 4 +;CHECK: vst1.32 + store <8 x i16> %v1, <8 x i16>* %vo, align 4 + ret void +} + + +;ALIGN = 4 +;SIZE = 128 +;TYPE = <4 x i32> +define void @v128_v4i32_4(i8* noalias nocapture %out, 
i8* noalias nocapture %in) nounwind { +;CHECK: v128_v4i32_4: +entry: + %po = getelementptr i8* %out, i32 0 + %pi = getelementptr i8* %in, i32 0 + %vi = bitcast i8* %pi to <4 x i32>* + %vo = bitcast i8* %po to <4 x i32>* +;CHECK: vld1.32 + %v1 = load <4 x i32>* %vi, align 4 +;CHECK: vst1.32 + store <4 x i32> %v1, <4 x i32>* %vo, align 4 + ret void +} + + +;ALIGN = 4 +;SIZE = 128 +;TYPE = <2 x i64> +define void @v128_v2i64_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind { +;CHECK: v128_v2i64_4: +entry: + %po = getelementptr i8* %out, i32 0 + %pi = getelementptr i8* %in, i32 0 + %vi = bitcast i8* %pi to <2 x i64>* + %vo = bitcast i8* %po to <2 x i64>* +;CHECK: vld1.32 + %v1 = load <2 x i64>* %vi, align 4 +;CHECK: vst1.32 + store <2 x i64> %v1, <2 x i64>* %vo, align 4 + ret void +} + + +;ALIGN = 4 +;SIZE = 128 +;TYPE = <4 x float> +define void @v128_v4f32_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind { +;CHECK: v128_v4f32_4: +entry: + %po = getelementptr i8* %out, i32 0 + %pi = getelementptr i8* %in, i32 0 + %vi = bitcast i8* %pi to <4 x float>* + %vo = bitcast i8* %po to <4 x float>* +;CHECK: vld1.32 + %v1 = load <4 x float>* %vi, align 4 +;CHECK: vst1.32 + store <4 x float> %v1, <4 x float>* %vo, align 4 + ret void +} + diff --git a/test/CodeGen/ARM/vbsl-constant.ll b/test/CodeGen/ARM/vbsl-constant.ll index f157dbd..ffda0a5 100644 --- a/test/CodeGen/ARM/vbsl-constant.ll +++ b/test/CodeGen/ARM/vbsl-constant.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s +; RUN: llc < %s -mtriple=arm-apple-ios -mattr=+neon | FileCheck %s define <8 x i8> @v_bsli8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { ;CHECK: v_bsli8: @@ -59,8 +59,8 @@ define <1 x i64> @v_bsli64(<1 x i64>* %A, <1 x i64>* %B, <1 x i64>* %C) nounwind define <16 x i8> @v_bslQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind { ;CHECK: v_bslQi8: -;CHECK: vldmia -;CHECK: vldmia +;CHECK: vld1.32 +;CHECK: vld1.32 ;CHECK: vbsl %tmp1 = load <16 
x i8>* %A %tmp2 = load <16 x i8>* %B @@ -73,8 +73,8 @@ define <16 x i8> @v_bslQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind define <8 x i16> @v_bslQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind { ;CHECK: v_bslQi16: -;CHECK: vldmia -;CHECK: vldmia +;CHECK: vld1.32 +;CHECK: vld1.32 ;CHECK: vbsl %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B @@ -87,8 +87,8 @@ define <8 x i16> @v_bslQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwin define <4 x i32> @v_bslQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind { ;CHECK: v_bslQi32: -;CHECK: vldmia -;CHECK: vldmia +;CHECK: vld1.32 +;CHECK: vld1.32 ;CHECK: vbsl %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B @@ -101,9 +101,9 @@ define <4 x i32> @v_bslQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwin define <2 x i64> @v_bslQi64(<2 x i64>* %A, <2 x i64>* %B, <2 x i64>* %C) nounwind { ;CHECK: v_bslQi64: -;CHECK: vldmia -;CHECK: vldmia -;CHECK: vldmia +;CHECK: vld1.32 +;CHECK: vld1.32 +;CHECK: vld1.64 ;CHECK: vbsl %tmp1 = load <2 x i64>* %A %tmp2 = load <2 x i64>* %B diff --git a/test/CodeGen/ARM/vbsl.ll b/test/CodeGen/ARM/vbsl.ll index 9f3bb4e..750fb0d 100644 --- a/test/CodeGen/ARM/vbsl.ll +++ b/test/CodeGen/ARM/vbsl.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s +; rdar://12471808 + define <8 x i8> @v_bsli8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { ;CHECK: v_bsli8: ;CHECK: vbsl @@ -103,3 +105,98 @@ define <2 x i64> @v_bslQi64(<2 x i64>* %A, <2 x i64>* %B, <2 x i64>* %C) nounwin %tmp7 = or <2 x i64> %tmp4, %tmp6 ret <2 x i64> %tmp7 } + +define <8 x i8> @f1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) nounwind readnone optsize ssp { +; CHECK: f1: +; CHECK: vbsl + %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) nounwind + ret <8 x i8> %vbsl.i +} + +define <4 x i16> @f2(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp { +; CHECK: f2: +; CHECK: vbsl + %vbsl3.i = tail call <4 x 
i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) nounwind + ret <4 x i16> %vbsl3.i +} + +define <2 x i32> @f3(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp { +; CHECK: f3: +; CHECK: vbsl + %vbsl3.i = tail call <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) nounwind + ret <2 x i32> %vbsl3.i +} + +define <2 x float> @f4(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone optsize ssp { +; CHECK: f4: +; CHECK: vbsl + %vbsl4.i = tail call <2 x float> @llvm.arm.neon.vbsl.v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind + ret <2 x float> %vbsl4.i +} + +define <16 x i8> @g1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) nounwind readnone optsize ssp { +; CHECK: g1: +; CHECK: vbsl + %vbsl.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) nounwind + ret <16 x i8> %vbsl.i +} + +define <8 x i16> @g2(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind readnone optsize ssp { +; CHECK: g2: +; CHECK: vbsl + %vbsl3.i = tail call <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind + ret <8 x i16> %vbsl3.i +} + +define <4 x i32> @g3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp { +; CHECK: g3: +; CHECK: vbsl + %vbsl3.i = tail call <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind + ret <4 x i32> %vbsl3.i +} + +define <4 x float> @g4(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone optsize ssp { +; CHECK: g4: +; CHECK: vbsl + %vbsl4.i = tail call <4 x float> @llvm.arm.neon.vbsl.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind + ret <4 x float> %vbsl4.i +} + +define <1 x i64> @test_vbsl_s64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwind readnone optsize ssp { +; CHECK: test_vbsl_s64: +; CHECK: vbsl d + %vbsl3.i = tail call <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwind + 
ret <1 x i64> %vbsl3.i +} + +define <1 x i64> @test_vbsl_u64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwind readnone optsize ssp { +; CHECK: test_vbsl_u64: +; CHECK: vbsl d + %vbsl3.i = tail call <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwind + ret <1 x i64> %vbsl3.i +} + +define <2 x i64> @test_vbslq_s64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp { +; CHECK: test_vbslq_s64: +; CHECK: vbsl q + %vbsl3.i = tail call <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounwind + ret <2 x i64> %vbsl3.i +} + +define <2 x i64> @test_vbslq_u64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp { +; CHECK: test_vbslq_u64: +; CHECK: vbsl q + %vbsl3.i = tail call <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounwind + ret <2 x i64> %vbsl3.i +} + +declare <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone +declare <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) nounwind readnone +declare <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone +declare <2 x float> @llvm.arm.neon.vbsl.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vbsl.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) nounwind readnone +declare <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64>, <1 x i64>, <1 x i64>) nounwind readnone diff --git a/test/CodeGen/ARM/vdup.ll b/test/CodeGen/ARM/vdup.ll index 05332e4..2cf94d6 100644 --- a/test/CodeGen/ARM/vdup.ll +++ 
b/test/CodeGen/ARM/vdup.ll @@ -261,3 +261,73 @@ define void @redundantVdup(<8 x i8>* %ptr) nounwind { store <8 x i8> %2, <8 x i8>* %ptr, align 8 ret void } + +define <4 x i32> @tdupi(i32 %x, i32 %y) { +;CHECK: tdupi +;CHECK: vdup.32 + %1 = insertelement <4 x i32> undef, i32 %x, i32 0 + %2 = insertelement <4 x i32> %1, i32 %x, i32 1 + %3 = insertelement <4 x i32> %2, i32 %x, i32 2 + %4 = insertelement <4 x i32> %3, i32 %y, i32 3 + ret <4 x i32> %4 +} + +define <4 x float> @tdupf(float %x, float %y) { +;CHECK: tdupf +;CHECK: vdup.32 + %1 = insertelement <4 x float> undef, float %x, i32 0 + %2 = insertelement <4 x float> %1, float %x, i32 1 + %3 = insertelement <4 x float> %2, float %x, i32 2 + %4 = insertelement <4 x float> %3, float %y, i32 3 + ret <4 x float> %4 +} + +; This test checks that when splatting an element from a vector into another, +; the value isn't moved out to GPRs first. +define <4 x i32> @tduplane(<4 x i32> %invec) { +;CHECK: tduplane +;CHECK-NOT: vmov {{.*}}, d16[1] +;CHECK: vdup.32 {{.*}}, d16[1] + %in = extractelement <4 x i32> %invec, i32 1 + %1 = insertelement <4 x i32> undef, i32 %in, i32 0 + %2 = insertelement <4 x i32> %1, i32 %in, i32 1 + %3 = insertelement <4 x i32> %2, i32 %in, i32 2 + %4 = insertelement <4 x i32> %3, i32 255, i32 3 + ret <4 x i32> %4 +} + +define <2 x float> @check_f32(<4 x float> %v) nounwind { +;CHECK: check_f32: +;CHECK: vdup.32 {{.*}}, d{{..}}[1] + %x = extractelement <4 x float> %v, i32 3 + %1 = insertelement <2 x float> undef, float %x, i32 0 + %2 = insertelement <2 x float> %1, float %x, i32 1 + ret <2 x float> %2 +} + +define <2 x i32> @check_i32(<4 x i32> %v) nounwind { +;CHECK: check_i32: +;CHECK: vdup.32 {{.*}}, d{{..}}[1] + %x = extractelement <4 x i32> %v, i32 3 + %1 = insertelement <2 x i32> undef, i32 %x, i32 0 + %2 = insertelement <2 x i32> %1, i32 %x, i32 1 + ret <2 x i32> %2 +} + +define <4 x i16> @check_i16(<8 x i16> %v) nounwind { +;CHECK: check_i16: +;CHECK: vdup.16 {{.*}}, d{{..}}[3] + %x = 
extractelement <8 x i16> %v, i32 3 + %1 = insertelement <4 x i16> undef, i16 %x, i32 0 + %2 = insertelement <4 x i16> %1, i16 %x, i32 1 + ret <4 x i16> %2 +} + +define <8 x i8> @check_i8(<16 x i8> %v) nounwind { +;CHECK: check_i8: +;CHECK: vdup.8 {{.*}}, d{{..}}[3] + %x = extractelement <16 x i8> %v, i32 3 + %1 = insertelement <8 x i8> undef, i8 %x, i32 0 + %2 = insertelement <8 x i8> %1, i8 %x, i32 1 + ret <8 x i8> %2 +} diff --git a/test/CodeGen/ARM/vector-extend-narrow.ll b/test/CodeGen/ARM/vector-extend-narrow.ll index 8fd3db2..22af797 100644 --- a/test/CodeGen/ARM/vector-extend-narrow.ll +++ b/test/CodeGen/ARM/vector-extend-narrow.ll @@ -62,3 +62,14 @@ define <4 x i8> @i(<4 x i8>* %x) { %2 = sdiv <4 x i8> zeroinitializer, %1 ret <4 x i8> %2 } +; CHECK: j: +define <4 x i32> @j(<4 x i8>* %in) nounwind { + ; CHECK: vld1 + ; CHECK: vmovl.u8 + ; CHECK: vmovl.u16 + ; CHECK-NOT: vand + %1 = load <4 x i8>* %in, align 4 + %2 = zext <4 x i8> %1 to <4 x i32> + ret <4 x i32> %2 +} + diff --git a/test/CodeGen/ARM/vext.ll b/test/CodeGen/ARM/vext.ll index e224bdf..f404eb8 100644 --- a/test/CodeGen/ARM/vext.ll +++ b/test/CodeGen/ARM/vext.ll @@ -74,6 +74,39 @@ define <16 x i8> @test_vextRq_undef(<16 x i8>* %A, <16 x i8>* %B) nounwind { ret <16 x i8> %tmp3 } +define <16 x i8> @test_vextq_undef_op2(<16 x i8> %a) nounwind { +;CHECK: test_vextq_undef_op2: +;CHECK: vext +entry: + %tmp1 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1> + ret <16 x i8> %tmp1 +} + +define <8 x i8> @test_vextd_undef_op2(<8 x i8> %a) nounwind { +;CHECK: test_vextd_undef_op2: +;CHECK: vext +entry: + %tmp1 = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1> + ret <8 x i8> %tmp1 +} + + +define <16 x i8> @test_vextq_undef_op2_undef(<16 x i8> %a) nounwind { +;CHECK: test_vextq_undef_op2_undef: +;CHECK: vext +entry: + %tmp1 = 
shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 4, i32 undef, i32 undef, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1> + ret <16 x i8> %tmp1 +} + +define <8 x i8> @test_vextd_undef_op2_undef(<8 x i8> %a) nounwind { +;CHECK: test_vextd_undef_op2_undef: +;CHECK: vext +entry: + %tmp1 = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 1> + ret <8 x i8> %tmp1 +} + ; Tests for ReconstructShuffle function. Indices have to be carefully ; chosen to reach lowering phase as a BUILD_VECTOR. diff --git a/test/CodeGen/ARM/vget_lane.ll b/test/CodeGen/ARM/vget_lane.ll index 1fc885d..c9ce3b7 100644 --- a/test/CodeGen/ARM/vget_lane.ll +++ b/test/CodeGen/ARM/vget_lane.ll @@ -200,7 +200,7 @@ define <8 x i16> @vsetQ_lane16(<8 x i16>* %A, i16 %B) nounwind { define <4 x i32> @vsetQ_lane32(<4 x i32>* %A, i32 %B) nounwind { ;CHECK: vsetQ_lane32: -;CHECK: vmov.32 +;CHECK: vmov.32 d{{.*}}[1], r1 %tmp1 = load <4 x i32>* %A %tmp2 = insertelement <4 x i32> %tmp1, i32 %B, i32 1 ret <4 x i32> %tmp2 diff --git a/test/CodeGen/ARM/vselect_imax.ll b/test/CodeGen/ARM/vselect_imax.ll new file mode 100644 index 0000000..f599404 --- /dev/null +++ b/test/CodeGen/ARM/vselect_imax.ll @@ -0,0 +1,12 @@ +; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s +; Make sure that ARM backend with NEON handles vselect. 
+ +define void @vmax_v4i32(<4 x i32>* %m, <4 x i32> %a, <4 x i32> %b) { +; CHECK: vcgt.s32 [[QR:q[0-9]+]], [[Q1:q[0-9]+]], [[Q2:q[0-9]+]] +; CHECK: vbsl [[QR]], [[Q1]], [[Q2]] + %cmpres = icmp sgt <4 x i32> %a, %b + %maxres = select <4 x i1> %cmpres, <4 x i32> %a, <4 x i32> %b + store <4 x i32> %maxres, <4 x i32>* %m + ret void +} + diff --git a/test/CodeGen/CellSPU/icmp16.ll b/test/CodeGen/CellSPU/icmp16.ll index 2f9b091..853ae1d 100644 --- a/test/CodeGen/CellSPU/icmp16.ll +++ b/test/CodeGen/CellSPU/icmp16.ll @@ -534,7 +534,7 @@ entry: define i16 @icmp_slt_immed04_i16(i16 %arg1, i16 %val1, i16 %val2) nounwind { ; CHECK: icmp_slt_immed04_i16: ; CHECK: lr -; CHECK-NETX: bi +; CHECK-NEXT: bi entry: %A = icmp slt i16 %arg1, 32768 @@ -559,7 +559,7 @@ define i1 @icmp_sle_setcc_i16(i16 %arg1, i16 %arg2, i16 %val1, i16 %val2) nounwi ; CHECK: ilhu ; CHECK: xorhi ; CHECK: iohl -; CHECK-NETX: bi +; CHECK: bi entry: %A = icmp sle i16 %arg1, %arg2 diff --git a/test/CodeGen/Generic/MachineBranchProb.ll b/test/CodeGen/Generic/MachineBranchProb.ll new file mode 100644 index 0000000..802ee2c --- /dev/null +++ b/test/CodeGen/Generic/MachineBranchProb.ll @@ -0,0 +1,32 @@ +; RUN: llc < %s -print-machineinstrs=expand-isel-pseudos -o /dev/null 2>&1 | FileCheck %s + +; Make sure we have the correct weight attached to each successor. 
+define i32 @test2(i32 %x) nounwind uwtable readnone ssp { +; CHECK: Machine code for function test2: +entry: + %conv = sext i32 %x to i64 + switch i64 %conv, label %return [ + i64 0, label %sw.bb + i64 1, label %sw.bb + i64 4, label %sw.bb + i64 5, label %sw.bb1 + ], !prof !0 +; CHECK: BB#0: derived from LLVM BB %entry +; CHECK: Successors according to CFG: BB#2(64) BB#4(14) +; CHECK: BB#4: derived from LLVM BB %entry +; CHECK: Successors according to CFG: BB#1(10) BB#5(4) +; CHECK: BB#5: derived from LLVM BB %entry +; CHECK: Successors according to CFG: BB#1(4) BB#3(7) + +sw.bb: + br label %return + +sw.bb1: + br label %return + +return: + %retval.0 = phi i32 [ 5, %sw.bb1 ], [ 1, %sw.bb ], [ 0, %entry ] + ret i32 %retval.0 +} + +!0 = metadata !{metadata !"branch_weights", i32 7, i32 6, i32 4, i32 4, i32 64} diff --git a/test/CodeGen/Hexagon/args.ll b/test/CodeGen/Hexagon/args.ll index e9ac8b6..8a6efb6 100644 --- a/test/CodeGen/Hexagon/args.ll +++ b/test/CodeGen/Hexagon/args.ll @@ -1,12 +1,12 @@ -; RUN: llc -march=hexagon -mcpu=hexagonv4 -disable-dfa-sched < %s | FileCheck %s +; RUN: llc -march=hexagon -mcpu=hexagonv4 -disable-hexagon-misched < %s | FileCheck %s ; CHECK: r[[T0:[0-9]+]] = #7 ; CHECK: memw(r29 + #0) = r[[T0]] +; CHECK: r5 = #6 ; CHECK: r0 = #1 ; CHECK: r1 = #2 ; CHECK: r2 = #3 ; CHECK: r3 = #4 ; CHECK: r4 = #5 -; CHECK: r5 = #6 define void @foo() nounwind { diff --git a/test/CodeGen/Hexagon/newvaluestore.ll b/test/CodeGen/Hexagon/newvaluestore.ll index ab69b22..186e393 100644 --- a/test/CodeGen/Hexagon/newvaluestore.ll +++ b/test/CodeGen/Hexagon/newvaluestore.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s +; RUN: llc -march=hexagon -mcpu=hexagonv4 -disable-hexagon-misched < %s | FileCheck %s ; Check that we generate new value store packet in V4 @i = global i32 0, align 4 diff --git a/test/CodeGen/Hexagon/remove_lsr.ll b/test/CodeGen/Hexagon/remove_lsr.ll new file mode 100644 index 0000000..79b5f4a --- /dev/null +++ 
b/test/CodeGen/Hexagon/remove_lsr.ll @@ -0,0 +1,80 @@ +; Test fix for PR-13709. +; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s +; CHECK: foo +; CHECK-NOT: lsr(r{{[0-9]+}}:{{[0-9]+}}, #32) +; CHECK-NOT: lsr(r{{[0-9]+}}:{{[0-9]+}}, #32) + +; Convert the sequence +; r17:16 = lsr(r11:10, #32) +; .. = r16 +; into +; r17:16 = lsr(r11:10, #32) +; .. = r11 +; This makes the lsr instruction dead and it gets removed subsequently +; by a dead code removal pass. + +%union.vect64 = type { i64 } +%union.vect32 = type { i32 } + +define void @foo(%union.vect64* nocapture %sss_extracted_bit_rx_data_ptr, + %union.vect32* nocapture %s_even, %union.vect32* nocapture %s_odd, + i8* nocapture %scr_s_even_code_ptr, i8* nocapture %scr_s_odd_code_ptr) + nounwind { +entry: + %scevgep = getelementptr %union.vect64* %sss_extracted_bit_rx_data_ptr, i32 1 + %scevgep28 = getelementptr %union.vect32* %s_odd, i32 1 + %scevgep32 = getelementptr %union.vect32* %s_even, i32 1 + %scevgep36 = getelementptr i8* %scr_s_odd_code_ptr, i32 1 + %scevgep39 = getelementptr i8* %scr_s_even_code_ptr, i32 1 + br label %for.body + +for.body: ; preds = %for.body, %entry + %lsr.iv42 = phi i32 [ %lsr.iv.next, %for.body ], [ 2, %entry ] + %lsr.iv40 = phi i8* [ %scevgep41, %for.body ], [ %scevgep39, %entry ] + %lsr.iv37 = phi i8* [ %scevgep38, %for.body ], [ %scevgep36, %entry ] + %lsr.iv33 = phi %union.vect32* [ %scevgep34, %for.body ], [ %scevgep32, %entry ] + %lsr.iv29 = phi %union.vect32* [ %scevgep30, %for.body ], [ %scevgep28, %entry ] + %lsr.iv = phi %union.vect64* [ %scevgep26, %for.body ], [ %scevgep, %entry ] + %predicate_1.023 = phi i8 [ undef, %entry ], [ %10, %for.body ] + %predicate.022 = phi i8 [ undef, %entry ], [ %9, %for.body ] + %val.021 = phi i64 [ undef, %entry ], [ %srcval, %for.body ] + %lsr.iv3335 = bitcast %union.vect32* %lsr.iv33 to i32* + %lsr.iv2931 = bitcast %union.vect32* %lsr.iv29 to i32* + %lsr.iv27 = bitcast %union.vect64* %lsr.iv to i64* + %0 = tail call i64 
@llvm.hexagon.A2.vsubhs(i64 0, i64 %val.021) + %conv3 = sext i8 %predicate.022 to i32 + %1 = trunc i64 %val.021 to i32 + %2 = trunc i64 %0 to i32 + %3 = tail call i32 @llvm.hexagon.C2.mux(i32 %conv3, i32 %1, i32 %2) + store i32 %3, i32* %lsr.iv3335, align 4, !tbaa !0 + %conv8 = sext i8 %predicate_1.023 to i32 + %4 = lshr i64 %val.021, 32 + %5 = trunc i64 %4 to i32 + %6 = lshr i64 %0, 32 + %7 = trunc i64 %6 to i32 + %8 = tail call i32 @llvm.hexagon.C2.mux(i32 %conv8, i32 %5, i32 %7) + store i32 %8, i32* %lsr.iv2931, align 4, !tbaa !0 + %srcval = load i64* %lsr.iv27, align 8 + %9 = load i8* %lsr.iv40, align 1, !tbaa !1 + %10 = load i8* %lsr.iv37, align 1, !tbaa !1 + %lftr.wideiv = trunc i32 %lsr.iv42 to i8 + %exitcond = icmp eq i8 %lftr.wideiv, 32 + %scevgep26 = getelementptr %union.vect64* %lsr.iv, i32 1 + %scevgep30 = getelementptr %union.vect32* %lsr.iv29, i32 1 + %scevgep34 = getelementptr %union.vect32* %lsr.iv33, i32 1 + %scevgep38 = getelementptr i8* %lsr.iv37, i32 1 + %scevgep41 = getelementptr i8* %lsr.iv40, i32 1 + %lsr.iv.next = add i32 %lsr.iv42, 1 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +declare i64 @llvm.hexagon.A2.vsubhs(i64, i64) nounwind readnone + +declare i32 @llvm.hexagon.C2.mux(i32, i32, i32) nounwind readnone + +!0 = metadata !{metadata !"long", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/test/CodeGen/Hexagon/static.ll b/test/CodeGen/Hexagon/static.ll index 2e4ab63..683a4c2 100644 --- a/test/CodeGen/Hexagon/static.ll +++ b/test/CodeGen/Hexagon/static.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -mcpu=hexagonv4 -disable-dfa-sched < %s | FileCheck %s +; RUN: llc -march=hexagon -mcpu=hexagonv4 -disable-dfa-sched -disable-hexagon-misched < %s | FileCheck %s @num = external global i32 @acc = external global i32 diff --git a/test/CodeGen/MSP430/fp.ll b/test/CodeGen/MSP430/fp.ll new file mode 100644 
index 0000000..c3273ef --- /dev/null +++ b/test/CodeGen/MSP430/fp.ll @@ -0,0 +1,17 @@ +; RUN: llc -O0 -disable-fp-elim < %s | FileCheck %s + +target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16" +target triple = "msp430---elf" + +define void @fp() nounwind { +entry: +; CHECK: fp: +; CHECK: push.w r4 +; CHECK: mov.w r1, r4 +; CHECK: sub.w #2, r1 + %i = alloca i16, align 2 +; CHECK: mov.w #0, -2(r4) + store i16 0, i16* %i, align 2 +; CHECK: pop.w r4 + ret void +} diff --git a/test/CodeGen/Mips/alloca16.ll b/test/CodeGen/Mips/alloca16.ll new file mode 100644 index 0000000..731edae --- /dev/null +++ b/test/CodeGen/Mips/alloca16.ll @@ -0,0 +1,75 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@iiii = global i32 25, align 4 +@jjjj = global i32 35, align 4 +@kkkk = global i32 100, align 4 +@t = global i32 25, align 4 +@riii = common global i32 0, align 4 +@rjjj = common global i32 0, align 4 +@rkkk = common global i32 0, align 4 + +define void @temp(i32 %foo) nounwind { +entry: + %foo.addr = alloca i32, align 4 + store i32 %foo, i32* %foo.addr, align 4 + %0 = load i32* %foo.addr, align 4 + store i32 %0, i32* @t, align 4 + ret void +} + +define void @test() nounwind { +entry: +; 16: .frame $16,24,$ra +; 16: save $ra, $s0, $s1, 24 +; 16: move $16, $sp +; 16: move ${{[0-9]+}}, $sp +; 16: subu $[[REGISTER:[0-9]+]], ${{[0-9]+}}, ${{[0-9]+}} +; 16: move $sp, $[[REGISTER]] + %sssi = alloca i32, align 4 + %ip = alloca i32*, align 4 + %sssj = alloca i32, align 4 + %0 = load i32* @iiii, align 4 + store i32 %0, i32* %sssi, align 4 + %1 = load i32* @kkkk, align 4 + %mul = mul nsw i32 %1, 100 + %2 = alloca i8, i32 %mul + %3 = bitcast i8* %2 to i32* + store i32* %3, i32** %ip, align 4 + %4 = load i32* @jjjj, align 4 + store i32 %4, i32* %sssj, align 4 + %5 = load i32* @jjjj, align 4 + %6 = load i32* @iiii, align 4 + %7 = load i32** %ip, align 4 + %arrayidx = getelementptr inbounds i32* %7, i32 %6 + store i32 %5, 
i32* %arrayidx, align 4 + %8 = load i32* @kkkk, align 4 + %9 = load i32* @jjjj, align 4 + %10 = load i32** %ip, align 4 + %arrayidx1 = getelementptr inbounds i32* %10, i32 %9 + store i32 %8, i32* %arrayidx1, align 4 + %11 = load i32* @iiii, align 4 + %12 = load i32* @kkkk, align 4 + %13 = load i32** %ip, align 4 + %arrayidx2 = getelementptr inbounds i32* %13, i32 %12 + store i32 %11, i32* %arrayidx2, align 4 + %14 = load i32** %ip, align 4 + %arrayidx3 = getelementptr inbounds i32* %14, i32 25 + %15 = load i32* %arrayidx3, align 4 + store i32 %15, i32* @riii, align 4 + %16 = load i32** %ip, align 4 + %arrayidx4 = getelementptr inbounds i32* %16, i32 35 + %17 = load i32* %arrayidx4, align 4 + store i32 %17, i32* @rjjj, align 4 + %18 = load i32** %ip, align 4 + %arrayidx5 = getelementptr inbounds i32* %18, i32 100 + %19 = load i32* %arrayidx5, align 4 + store i32 %19, i32* @rkkk, align 4 + %20 = load i32* @t, align 4 + %21 = load i32** %ip, align 4 + %arrayidx6 = getelementptr inbounds i32* %21, i32 %20 + %22 = load i32* %arrayidx6, align 4 +; 16: save 16 + call void @temp(i32 %22) +; 16: restore 16 + ret void +} diff --git a/test/CodeGen/Mips/atomic.ll b/test/CodeGen/Mips/atomic.ll index 050689d..819f258 100644 --- a/test/CodeGen/Mips/atomic.ll +++ b/test/CodeGen/Mips/atomic.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=mipsel < %s | FileCheck %s +; RUN: llc -march=mipsel --disable-machine-licm < %s | FileCheck %s @x = common global i32 0, align 4 @@ -181,8 +181,9 @@ entry: ; CHECK: $[[BB0:[A-Z_0-9]+]]: ; CHECK: ll $[[R10:[0-9]+]], 0($[[R2]]) +; CHECK: and $[[R18:[0-9]+]], $[[R9]], $[[R6]] ; CHECK: and $[[R13:[0-9]+]], $[[R10]], $[[R7]] -; CHECK: or $[[R14:[0-9]+]], $[[R13]], $[[R9]] +; CHECK: or $[[R14:[0-9]+]], $[[R13]], $[[R18]] ; CHECK: sc $[[R14]], 0($[[R2]]) ; CHECK: beq $[[R14]], $zero, $[[BB0]] diff --git a/test/CodeGen/Mips/atomicops.ll b/test/CodeGen/Mips/atomicops.ll new file mode 100644 index 0000000..b9c3804 --- /dev/null +++ b/test/CodeGen/Mips/atomicops.ll @@ 
-0,0 +1,40 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@.str = private unnamed_addr constant [8 x i8] c"%d, %d\0A\00", align 1 + +define i32 @foo(i32* %mem, i32 %val, i32 %c) nounwind { +entry: + %0 = atomicrmw add i32* %mem, i32 %val seq_cst + %add = add nsw i32 %0, %c + ret i32 %add +; 16: foo: +; 16: lw ${{[0-9]+}}, %call16(__sync_synchronize)(${{[0-9]+}}) +; 16: lw ${{[0-9]+}}, %call16(__sync_fetch_and_add_4)(${{[0-9]+}}) +} + +define i32 @main() nounwind { +entry: + %x = alloca i32, align 4 + store volatile i32 0, i32* %x, align 4 + %0 = atomicrmw add i32* %x, i32 1 seq_cst + %add.i = add nsw i32 %0, 2 + %1 = load volatile i32* %x, align 4 + %call1 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str, i32 0, i32 0), i32 %add.i, i32 %1) nounwind + %2 = cmpxchg i32* %x, i32 1, i32 2 seq_cst + %3 = load volatile i32* %x, align 4 + %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str, i32 0, i32 0), i32 %2, i32 %3) nounwind + %4 = atomicrmw xchg i32* %x, i32 1 seq_cst + %5 = load volatile i32* %x, align 4 + %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str, i32 0, i32 0), i32 %4, i32 %5) nounwind +; 16: main: +; 16: lw ${{[0-9]+}}, %call16(__sync_synchronize)(${{[0-9]+}}) +; 16: lw ${{[0-9]+}}, %call16(__sync_fetch_and_add_4)(${{[0-9]+}}) +; 16: lw ${{[0-9]+}}, %call16(__sync_val_compare_and_swap_4)(${{[0-9]+}}) +; 16: lw ${{[0-9]+}}, %call16(__sync_lock_test_and_set_4)(${{[0-9]+}}) + + ret i32 0 +} + +declare i32 @printf(i8* nocapture, ...) 
nounwind + + diff --git a/test/CodeGen/Mips/brconeq.ll b/test/CodeGen/Mips/brconeq.ll new file mode 100644 index 0000000..6133915 --- /dev/null +++ b/test/CodeGen/Mips/brconeq.ll @@ -0,0 +1,38 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@i = global i32 5, align 4 +@j = global i32 10, align 4 +@result = global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @i, align 4 + %1 = load i32* @j, align 4 + %cmp = icmp eq i32 %0, %1 +; 16: cmp ${{[0-9]+}}, ${{[0-9]+}} +; 16: bteqz $[[LABEL:[0-9A-Ba-b_]+]] +; 16: $[[LABEL]]: + br i1 %cmp, label %if.end, label %if.then + +if.then: ; preds = %entry + store i32 1, i32* @result, align 4 + br label %if.end + +if.end: ; preds = %entry, %if.then + ret void +} + + + + + + + + + + + + + + + diff --git a/test/CodeGen/Mips/brconeqk.ll b/test/CodeGen/Mips/brconeqk.ll new file mode 100644 index 0000000..2c0e72d --- /dev/null +++ b/test/CodeGen/Mips/brconeqk.ll @@ -0,0 +1,22 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@i = global i32 5, align 4 +@result = global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @i, align 4 + %cmp = icmp eq i32 %0, 10 + br i1 %cmp, label %if.end, label %if.then +; 16: cmpi ${{[0-9]+}}, {{[0-9]+}} +; 16: bteqz $[[LABEL:[0-9A-Ba-b_]+]] +; 16: $[[LABEL]]: +if.then: ; preds = %entry + store i32 1, i32* @result, align 4 + br label %if.end + +if.end: ; preds = %entry, %if.then + ret void +} + + diff --git a/test/CodeGen/Mips/brconeqz.ll b/test/CodeGen/Mips/brconeqz.ll new file mode 100644 index 0000000..5586e7b --- /dev/null +++ b/test/CodeGen/Mips/brconeqz.ll @@ -0,0 +1,20 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@i = global i32 5, align 4 +@result = global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @i, align 4 + %cmp = icmp eq i32 %0, 0 + 
br i1 %cmp, label %if.end, label %if.then +; 16: beqz ${{[0-9]+}}, $[[LABEL:[0-9A-Ba-b_]+]] +; 16: $[[LABEL]]: +if.then: ; preds = %entry + store i32 1, i32* @result, align 4 + br label %if.end + +if.end: ; preds = %entry, %if.then + ret void +} + diff --git a/test/CodeGen/Mips/brconge.ll b/test/CodeGen/Mips/brconge.ll new file mode 100644 index 0000000..02f0a63 --- /dev/null +++ b/test/CodeGen/Mips/brconge.ll @@ -0,0 +1,37 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@i = global i32 5, align 4 +@j = global i32 10, align 4 +@k = global i32 5, align 4 +@result1 = global i32 0, align 4 +@result2 = global i32 1, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @i, align 4 + %1 = load i32* @j, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %if.then, label %if.end + +; 16: slt ${{[0-9]+}}, ${{[0-9]+}} +; 16: bteqz $[[LABEL:[0-9A-Ba-b_]+]] +; 16: $[[LABEL]]: + +if.then: ; preds = %entry + store i32 1, i32* @result1, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + %2 = load i32* @k, align 4 + %cmp1 = icmp slt i32 %0, %2 + br i1 %cmp1, label %if.then2, label %if.end3 + +if.then2: ; preds = %if.end + store i32 1, i32* @result1, align 4 + br label %if.end3 + +if.end3: ; preds = %if.then2, %if.end + ret void +} + + diff --git a/test/CodeGen/Mips/brcongt.ll b/test/CodeGen/Mips/brcongt.ll new file mode 100644 index 0000000..767b51b --- /dev/null +++ b/test/CodeGen/Mips/brcongt.ll @@ -0,0 +1,25 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@i = global i32 5, align 4 +@j = global i32 10, align 4 +@k = global i32 5, align 4 +@result = global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @i, align 4 + %1 = load i32* @j, align 4 + %cmp = icmp sgt i32 %0, %1 + br i1 %cmp, label %if.end, label %if.then +; 16: slt ${{[0-9]+}}, ${{[0-9]+}} +; 16: btnez $[[LABEL:[0-9A-Ba-b_]+]] +; 16: 
$[[LABEL]]: +if.then: ; preds = %entry + store i32 1, i32* @result, align 4 + br label %if.end + +if.end: ; preds = %entry, %if.then + ret void +} + + diff --git a/test/CodeGen/Mips/brconle.ll b/test/CodeGen/Mips/brconle.ll new file mode 100644 index 0000000..854b248 --- /dev/null +++ b/test/CodeGen/Mips/brconle.ll @@ -0,0 +1,37 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@i = global i32 -5, align 4 +@j = global i32 10, align 4 +@k = global i32 -5, align 4 +@result1 = global i32 0, align 4 +@result2 = global i32 1, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @j, align 4 + %1 = load i32* @i, align 4 + %cmp = icmp sgt i32 %0, %1 + br i1 %cmp, label %if.then, label %if.end + +; 16: slt ${{[0-9]+}}, ${{[0-9]+}} +; 16: bteqz $[[LABEL:[0-9A-Ba-b_]+]] +; 16: $[[LABEL]]: + +if.then: ; preds = %entry + store i32 1, i32* @result1, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + %2 = load i32* @k, align 4 + %cmp1 = icmp sgt i32 %1, %2 + br i1 %cmp1, label %if.then2, label %if.end3 + +if.then2: ; preds = %if.end + store i32 0, i32* @result1, align 4 + br label %if.end3 + +if.end3: ; preds = %if.then2, %if.end + ret void +} + + diff --git a/test/CodeGen/Mips/brconlt.ll b/test/CodeGen/Mips/brconlt.ll new file mode 100644 index 0000000..931a3e8 --- /dev/null +++ b/test/CodeGen/Mips/brconlt.ll @@ -0,0 +1,27 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@i = global i32 5, align 4 +@j = global i32 10, align 4 +@k = global i32 5, align 4 +@result = global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @j, align 4 + %1 = load i32* @i, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %if.end, label %if.then + +; 16: slt ${{[0-9]+}}, ${{[0-9]+}} +; 16: btnez $[[LABEL:[0-9A-Ba-b_]+]] +; 16: $[[LABEL]]: + +if.then: ; preds = %entry + store i32 1, i32* @result, align 4 + br label %if.end + 
+if.end: ; preds = %entry, %if.then + ret void +} + + diff --git a/test/CodeGen/Mips/brconne.ll b/test/CodeGen/Mips/brconne.ll new file mode 100644 index 0000000..5d5bde3 --- /dev/null +++ b/test/CodeGen/Mips/brconne.ll @@ -0,0 +1,26 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@i = global i32 5, align 4 +@j = global i32 5, align 4 +@result = global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @j, align 4 + %1 = load i32* @i, align 4 + %cmp = icmp eq i32 %0, %1 + br i1 %cmp, label %if.then, label %if.end +; 16: cmp ${{[0-9]+}}, ${{[0-9]+}} +; 16: btnez $[[LABEL:[0-9A-Ba-b_]+]] +; 16: lw ${{[0-9]+}}, %got(result)(${{[0-9]+}}) +; 16: $[[LABEL]]: + +if.then: ; preds = %entry + store i32 1, i32* @result, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + + diff --git a/test/CodeGen/Mips/brconnek.ll b/test/CodeGen/Mips/brconnek.ll new file mode 100644 index 0000000..6208d7c --- /dev/null +++ b/test/CodeGen/Mips/brconnek.ll @@ -0,0 +1,25 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@j = global i32 5, align 4 +@result = global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @j, align 4 + %cmp = icmp eq i32 %0, 5 + br i1 %cmp, label %if.then, label %if.end + +; 16: cmpi ${{[0-9]+}}, {{[0-9]+}} +; 16: btnez $[[LABEL:[0-9A-Ba-b_]+]] +; 16: lw ${{[0-9]+}}, %got(result)(${{[0-9]+}}) +; 16: $[[LABEL]]: + +if.then: ; preds = %entry + store i32 1, i32* @result, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + + diff --git a/test/CodeGen/Mips/brconnez.ll b/test/CodeGen/Mips/brconnez.ll new file mode 100644 index 0000000..47db790 --- /dev/null +++ b/test/CodeGen/Mips/brconnez.ll @@ -0,0 +1,24 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@j = global i32 0, align 4 +@result = global 
i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @j, align 4 + %cmp = icmp eq i32 %0, 0 + br i1 %cmp, label %if.then, label %if.end + +; 16: bnez ${{[0-9]+}}, $[[LABEL:[0-9A-Ba-b_]+]] +; 16: lw ${{[0-9]+}}, %got(result)(${{[0-9]+}}) +; 16: $[[LABEL]]: + +if.then: ; preds = %entry + store i32 1, i32* @result, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + + diff --git a/test/CodeGen/Mips/brdelayslot.ll b/test/CodeGen/Mips/brdelayslot.ll index b266ce6..2fdb736 100644 --- a/test/CodeGen/Mips/brdelayslot.ll +++ b/test/CodeGen/Mips/brdelayslot.ll @@ -1,15 +1,37 @@ -; RUN: llc -march=mipsel -enable-mips-delay-filler < %s | FileCheck %s +; RUN: llc -march=mipsel -O0 < %s | FileCheck %s -check-prefix=None +; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=Default define void @foo1() nounwind { entry: -; CHECK: jalr -; CHECK-NOT: nop -; CHECK: jr -; CHECK-NOT: nop -; CHECK: .end +; Default: jalr +; Default-NOT: nop +; Default: jr +; Default-NOT: nop +; Default: .end +; None: jalr +; None: nop +; None: jr +; None: nop +; None: .end tail call void @foo2(i32 3) nounwind ret void } declare void @foo2(i32) + +; Check that cvt.d.w goes into jalr's delay slot. 
+; +define void @foo3(i32 %a) nounwind { +entry: +; Default: foo3: +; Default: jalr +; Default: cvt.d.w + + %conv = sitofp i32 %a to double + tail call void @foo4(double %conv) nounwind + ret void +} + +declare void @foo4(double) + diff --git a/test/CodeGen/Mips/brind.ll b/test/CodeGen/Mips/brind.ll new file mode 100644 index 0000000..4c591fa --- /dev/null +++ b/test/CodeGen/Mips/brind.ll @@ -0,0 +1,40 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@main.L = internal unnamed_addr constant [5 x i8*] [i8* blockaddress(@main, %L1), i8* blockaddress(@main, %L2), i8* blockaddress(@main, %L3), i8* blockaddress(@main, %L4), i8* null], align 4 +@str = private unnamed_addr constant [2 x i8] c"A\00" +@str5 = private unnamed_addr constant [2 x i8] c"B\00" +@str6 = private unnamed_addr constant [2 x i8] c"C\00" +@str7 = private unnamed_addr constant [2 x i8] c"D\00" +@str8 = private unnamed_addr constant [2 x i8] c"E\00" + +define i32 @main() nounwind { +entry: + %puts = tail call i32 @puts(i8* getelementptr inbounds ([2 x i8]* @str, i32 0, i32 0)) + br label %L1 + +L1: ; preds = %entry, %L3 + %i.0 = phi i32 [ 0, %entry ], [ %inc, %L3 ] + %puts5 = tail call i32 @puts(i8* getelementptr inbounds ([2 x i8]* @str5, i32 0, i32 0)) + br label %L2 + +L2: ; preds = %L1, %L3 + %i.1 = phi i32 [ %i.0, %L1 ], [ %inc, %L3 ] + %puts6 = tail call i32 @puts(i8* getelementptr inbounds ([2 x i8]* @str6, i32 0, i32 0)) + br label %L3 + +L3: ; preds = %L2, %L3 + %i.2 = phi i32 [ %i.1, %L2 ], [ %inc, %L3 ] + %puts7 = tail call i32 @puts(i8* getelementptr inbounds ([2 x i8]* @str7, i32 0, i32 0)) + %inc = add i32 %i.2, 1 + %arrayidx = getelementptr inbounds [5 x i8*]* @main.L, i32 0, i32 %i.2 + %0 = load i8** %arrayidx, align 4 + indirectbr i8* %0, [label %L1, label %L2, label %L3, label %L4] +; 16: jrc ${{[0-9]+}} +L4: ; preds = %L3 + %puts8 = tail call i32 @puts(i8* getelementptr inbounds ([2 x i8]* @str8, i32 0, i32 0)) + ret i32 0 +} 
+ +declare i32 @puts(i8* nocapture) nounwind + + diff --git a/test/CodeGen/Mips/check-noat.ll b/test/CodeGen/Mips/check-noat.ll new file mode 100644 index 0000000..bfeff67 --- /dev/null +++ b/test/CodeGen/Mips/check-noat.ll @@ -0,0 +1,11 @@ +; RUN: llc -march=mipsel < %s | FileCheck %s + +define void @f() nounwind readnone { +entry: +; CHECK: f: +; CHECK: .set noat +; CHECK: .set at + + ret void +} + diff --git a/test/CodeGen/Mips/div.ll b/test/CodeGen/Mips/div.ll new file mode 100644 index 0000000..00e2c19 --- /dev/null +++ b/test/CodeGen/Mips/div.ll @@ -0,0 +1,18 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@iiii = global i32 100, align 4 +@jjjj = global i32 -4, align 4 +@kkkk = common global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @iiii, align 4 + %1 = load i32* @jjjj, align 4 + %div = sdiv i32 %0, %1 +; 16: div $zero, ${{[0-9]+}}, ${{[0-9]+}} +; 16: mflo ${{[0-9]+}} + store i32 %div, i32* @kkkk, align 4 + ret void +} + + diff --git a/test/CodeGen/Mips/div_rem.ll b/test/CodeGen/Mips/div_rem.ll new file mode 100644 index 0000000..950192e --- /dev/null +++ b/test/CodeGen/Mips/div_rem.ll @@ -0,0 +1,21 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@iiii = global i32 103, align 4 +@jjjj = global i32 -4, align 4 +@kkkk = common global i32 0, align 4 +@llll = common global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @iiii, align 4 + %1 = load i32* @jjjj, align 4 + %div = sdiv i32 %0, %1 + store i32 %div, i32* @kkkk, align 4 + %rem = srem i32 %0, %1 +; 16: div $zero, ${{[0-9]+}}, ${{[0-9]+}} +; 16: mflo ${{[0-9]+}} +; 16: mfhi ${{[0-9]+}} + store i32 %rem, i32* @llll, align 4 + ret void +} + diff --git a/test/CodeGen/Mips/divu.ll b/test/CodeGen/Mips/divu.ll new file mode 100644 index 0000000..b96a439 --- /dev/null +++ b/test/CodeGen/Mips/divu.ll @@ -0,0 +1,18 @@ +; RUN: llc 
-march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@iiii = global i32 100, align 4 +@jjjj = global i32 4, align 4 +@kkkk = common global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @iiii, align 4 + %1 = load i32* @jjjj, align 4 + %div = udiv i32 %0, %1 +; 16: divu $zero, ${{[0-9]+}}, ${{[0-9]+}} +; 16: mflo ${{[0-9]+}} + store i32 %div, i32* @kkkk, align 4 + ret void +} + + diff --git a/test/CodeGen/Mips/divu_remu.ll b/test/CodeGen/Mips/divu_remu.ll new file mode 100644 index 0000000..a6c1563 --- /dev/null +++ b/test/CodeGen/Mips/divu_remu.ll @@ -0,0 +1,23 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@iiii = global i32 103, align 4 +@jjjj = global i32 4, align 4 +@kkkk = common global i32 0, align 4 +@llll = common global i32 0, align 4 + + +define void @test() nounwind { +entry: + %0 = load i32* @iiii, align 4 + %1 = load i32* @jjjj, align 4 + %div = udiv i32 %0, %1 + store i32 %div, i32* @kkkk, align 4 + %rem = urem i32 %0, %1 +; 16: divu $zero, ${{[0-9]+}}, ${{[0-9]+}} +; 16: mflo ${{[0-9]+}} +; 16: mfhi ${{[0-9]+}} + store i32 %rem, i32* @llll, align 4 + ret void +} + + diff --git a/test/CodeGen/Mips/dsp-r1.ll b/test/CodeGen/Mips/dsp-r1.ll new file mode 100644 index 0000000..c9dc8cf --- /dev/null +++ b/test/CodeGen/Mips/dsp-r1.ll @@ -0,0 +1,1241 @@ +; RUN: llc -march=mipsel -mattr=+dsp < %s | FileCheck %s + +define i32 @test__builtin_mips_extr_w1(i32 %i0, i32, i64 %a0) nounwind { +entry: +; CHECK: extr.w + + %1 = tail call i32 @llvm.mips.extr.w(i64 %a0, i32 15) + ret i32 %1 +} + +declare i32 @llvm.mips.extr.w(i64, i32) nounwind + +define i32 @test__builtin_mips_extr_w2(i32 %i0, i32, i64 %a0, i32 %a1) nounwind { +entry: +; CHECK: extrv.w + + %1 = tail call i32 @llvm.mips.extr.w(i64 %a0, i32 %a1) + ret i32 %1 +} + +define i32 @test__builtin_mips_extr_r_w1(i32 %i0, i32, i64 %a0) nounwind { +entry: +; CHECK: extr_r.w + + %1 = tail 
call i32 @llvm.mips.extr.r.w(i64 %a0, i32 15) + ret i32 %1 +} + +declare i32 @llvm.mips.extr.r.w(i64, i32) nounwind + +define i32 @test__builtin_mips_extr_s_h1(i32 %i0, i32, i64 %a0, i32 %a1) nounwind { +entry: +; CHECK: extrv_s.h + + %1 = tail call i32 @llvm.mips.extr.s.h(i64 %a0, i32 %a1) + ret i32 %1 +} + +declare i32 @llvm.mips.extr.s.h(i64, i32) nounwind + +define i32 @test__builtin_mips_extr_rs_w1(i32 %i0, i32, i64 %a0) nounwind { +entry: +; CHECK: extr_rs.w + + %1 = tail call i32 @llvm.mips.extr.rs.w(i64 %a0, i32 15) + ret i32 %1 +} + +declare i32 @llvm.mips.extr.rs.w(i64, i32) nounwind + +define i32 @test__builtin_mips_extr_rs_w2(i32 %i0, i32, i64 %a0, i32 %a1) nounwind { +entry: +; CHECK: extrv_rs.w + + %1 = tail call i32 @llvm.mips.extr.rs.w(i64 %a0, i32 %a1) + ret i32 %1 +} + +define i32 @test__builtin_mips_extr_s_h2(i32 %i0, i32, i64 %a0) nounwind { +entry: +; CHECK: extr_s.h + + %1 = tail call i32 @llvm.mips.extr.s.h(i64 %a0, i32 15) + ret i32 %1 +} + +define i32 @test__builtin_mips_extr_r_w2(i32 %i0, i32, i64 %a0, i32 %a1) nounwind { +entry: +; CHECK: extrv_r.w + + %1 = tail call i32 @llvm.mips.extr.r.w(i64 %a0, i32 %a1) + ret i32 %1 +} + +define i32 @test__builtin_mips_extp1(i32 %i0, i32, i64 %a0) nounwind { +entry: +; CHECK: extp ${{[0-9]+}} + + %1 = tail call i32 @llvm.mips.extp(i64 %a0, i32 15) + ret i32 %1 +} + +declare i32 @llvm.mips.extp(i64, i32) nounwind + +define i32 @test__builtin_mips_extp2(i32 %i0, i32, i64 %a0, i32 %a1) nounwind { +entry: +; CHECK: extpv + + %1 = tail call i32 @llvm.mips.extp(i64 %a0, i32 %a1) + ret i32 %1 +} + +define i32 @test__builtin_mips_extpdp1(i32 %i0, i32, i64 %a0) nounwind { +entry: +; CHECK: extpdp ${{[0-9]+}} + + %1 = tail call i32 @llvm.mips.extpdp(i64 %a0, i32 15) + ret i32 %1 +} + +declare i32 @llvm.mips.extpdp(i64, i32) nounwind + +define i32 @test__builtin_mips_extpdp2(i32 %i0, i32, i64 %a0, i32 %a1) nounwind { +entry: +; CHECK: extpdpv + + %1 = tail call i32 @llvm.mips.extpdp(i64 %a0, i32 %a1) + ret i32 
%1 +} + +define i64 @test__builtin_mips_dpau_h_qbl1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind readnone { +entry: +; CHECK: dpau.h.qbl + + %1 = bitcast i32 %a1.coerce to <4 x i8> + %2 = bitcast i32 %a2.coerce to <4 x i8> + %3 = tail call i64 @llvm.mips.dpau.h.qbl(i64 %a0, <4 x i8> %1, <4 x i8> %2) + ret i64 %3 +} + +declare i64 @llvm.mips.dpau.h.qbl(i64, <4 x i8>, <4 x i8>) nounwind readnone + +define i64 @test__builtin_mips_dpau_h_qbr1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind readnone { +entry: +; CHECK: dpau.h.qbr + + %1 = bitcast i32 %a1.coerce to <4 x i8> + %2 = bitcast i32 %a2.coerce to <4 x i8> + %3 = tail call i64 @llvm.mips.dpau.h.qbr(i64 %a0, <4 x i8> %1, <4 x i8> %2) + ret i64 %3 +} + +declare i64 @llvm.mips.dpau.h.qbr(i64, <4 x i8>, <4 x i8>) nounwind readnone + +define i64 @test__builtin_mips_dpsu_h_qbl1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind readnone { +entry: +; CHECK: dpsu.h.qbl + + %1 = bitcast i32 %a1.coerce to <4 x i8> + %2 = bitcast i32 %a2.coerce to <4 x i8> + %3 = tail call i64 @llvm.mips.dpsu.h.qbl(i64 %a0, <4 x i8> %1, <4 x i8> %2) + ret i64 %3 +} + +declare i64 @llvm.mips.dpsu.h.qbl(i64, <4 x i8>, <4 x i8>) nounwind readnone + +define i64 @test__builtin_mips_dpsu_h_qbr1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind readnone { +entry: +; CHECK: dpsu.h.qbr + + %1 = bitcast i32 %a1.coerce to <4 x i8> + %2 = bitcast i32 %a2.coerce to <4 x i8> + %3 = tail call i64 @llvm.mips.dpsu.h.qbr(i64 %a0, <4 x i8> %1, <4 x i8> %2) + ret i64 %3 +} + +declare i64 @llvm.mips.dpsu.h.qbr(i64, <4 x i8>, <4 x i8>) nounwind readnone + +define i64 @test__builtin_mips_dpaq_s_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind { +entry: +; CHECK: dpaq_s.w.ph + + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = bitcast i32 %a2.coerce to <2 x i16> + %3 = tail call i64 @llvm.mips.dpaq.s.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2) + ret i64 %3 +} + +declare i64 
@llvm.mips.dpaq.s.w.ph(i64, <2 x i16>, <2 x i16>) nounwind + +define i64 @test__builtin_mips_dpaq_sa_l_w1(i32 %i0, i32, i64 %a0, i32 %a1, i32 %a2) nounwind { +entry: +; CHECK: dpaq_sa.l.w + + %1 = tail call i64 @llvm.mips.dpaq.sa.l.w(i64 %a0, i32 %a1, i32 %a2) + ret i64 %1 +} + +declare i64 @llvm.mips.dpaq.sa.l.w(i64, i32, i32) nounwind + +define i64 @test__builtin_mips_dpsq_s_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind { +entry: +; CHECK: dpsq_s.w.ph + + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = bitcast i32 %a2.coerce to <2 x i16> + %3 = tail call i64 @llvm.mips.dpsq.s.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2) + ret i64 %3 +} + +declare i64 @llvm.mips.dpsq.s.w.ph(i64, <2 x i16>, <2 x i16>) nounwind + +define i64 @test__builtin_mips_dpsq_sa_l_w1(i32 %i0, i32, i64 %a0, i32 %a1, i32 %a2) nounwind { +entry: +; CHECK: dpsq_sa.l.w + + %1 = tail call i64 @llvm.mips.dpsq.sa.l.w(i64 %a0, i32 %a1, i32 %a2) + ret i64 %1 +} + +declare i64 @llvm.mips.dpsq.sa.l.w(i64, i32, i32) nounwind + +define i64 @test__builtin_mips_mulsaq_s_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind { +entry: +; CHECK: mulsaq_s.w.ph + + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = bitcast i32 %a2.coerce to <2 x i16> + %3 = tail call i64 @llvm.mips.mulsaq.s.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2) + ret i64 %3 +} + +declare i64 @llvm.mips.mulsaq.s.w.ph(i64, <2 x i16>, <2 x i16>) nounwind + +define i64 @test__builtin_mips_maq_s_w_phl1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind { +entry: +; CHECK: maq_s.w.phl + + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = bitcast i32 %a2.coerce to <2 x i16> + %3 = tail call i64 @llvm.mips.maq.s.w.phl(i64 %a0, <2 x i16> %1, <2 x i16> %2) + ret i64 %3 +} + +declare i64 @llvm.mips.maq.s.w.phl(i64, <2 x i16>, <2 x i16>) nounwind + +define i64 @test__builtin_mips_maq_s_w_phr1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind { +entry: +; CHECK: maq_s.w.phr + + %1 = bitcast i32 
%a1.coerce to <2 x i16> + %2 = bitcast i32 %a2.coerce to <2 x i16> + %3 = tail call i64 @llvm.mips.maq.s.w.phr(i64 %a0, <2 x i16> %1, <2 x i16> %2) + ret i64 %3 +} + +declare i64 @llvm.mips.maq.s.w.phr(i64, <2 x i16>, <2 x i16>) nounwind + +define i64 @test__builtin_mips_maq_sa_w_phl1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind { +entry: +; CHECK: maq_sa.w.phl + + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = bitcast i32 %a2.coerce to <2 x i16> + %3 = tail call i64 @llvm.mips.maq.sa.w.phl(i64 %a0, <2 x i16> %1, <2 x i16> %2) + ret i64 %3 +} + +declare i64 @llvm.mips.maq.sa.w.phl(i64, <2 x i16>, <2 x i16>) nounwind + +define i64 @test__builtin_mips_maq_sa_w_phr1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind { +entry: +; CHECK: maq_sa.w.phr + + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = bitcast i32 %a2.coerce to <2 x i16> + %3 = tail call i64 @llvm.mips.maq.sa.w.phr(i64 %a0, <2 x i16> %1, <2 x i16> %2) + ret i64 %3 +} + +declare i64 @llvm.mips.maq.sa.w.phr(i64, <2 x i16>, <2 x i16>) nounwind + +define i64 @test__builtin_mips_shilo1(i32 %i0, i32, i64 %a0) nounwind readnone { +entry: +; CHECK: shilo $ac{{[0-9]}} + + %1 = tail call i64 @llvm.mips.shilo(i64 %a0, i32 0) + ret i64 %1 +} + +declare i64 @llvm.mips.shilo(i64, i32) nounwind readnone + +define i64 @test__builtin_mips_shilo2(i32 %i0, i32, i64 %a0, i32 %a1) nounwind readnone { +entry: +; CHECK: shilov + + %1 = tail call i64 @llvm.mips.shilo(i64 %a0, i32 %a1) + ret i64 %1 +} + +define i64 @test__builtin_mips_mthlip1(i32 %i0, i32, i64 %a0, i32 %a1) nounwind { +entry: +; CHECK: mthlip ${{[0-9]+}} + + %1 = tail call i64 @llvm.mips.mthlip(i64 %a0, i32 %a1) + ret i64 %1 +} + +declare i64 @llvm.mips.mthlip(i64, i32) nounwind + +define i32 @test__builtin_mips_bposge321(i32 %i0) nounwind readonly { +entry: +; CHECK: bposge32 $BB{{[0-9]+}} + + %0 = tail call i32 @llvm.mips.bposge32() + ret i32 %0 +} + +declare i32 @llvm.mips.bposge32() nounwind readonly + +define i64 
@test__builtin_mips_madd1(i32 %i0, i32, i64 %a0, i32 %a1, i32 %a2) nounwind readnone { +entry: +; CHECK: madd $ac{{[0-9]}} + + %1 = tail call i64 @llvm.mips.madd(i64 %a0, i32 %a1, i32 %a2) + ret i64 %1 +} + +declare i64 @llvm.mips.madd(i64, i32, i32) nounwind readnone + +define i64 @test__builtin_mips_maddu1(i32 %i0, i32, i64 %a0, i32 %a1, i32 %a2) nounwind readnone { +entry: +; CHECK: maddu $ac{{[0-9]}} + + %1 = tail call i64 @llvm.mips.maddu(i64 %a0, i32 %a1, i32 %a2) + ret i64 %1 +} + +declare i64 @llvm.mips.maddu(i64, i32, i32) nounwind readnone + +define i64 @test__builtin_mips_msub1(i32 %i0, i32, i64 %a0, i32 %a1, i32 %a2) nounwind readnone { +entry: +; CHECK: msub $ac{{[0-9]}} + + %1 = tail call i64 @llvm.mips.msub(i64 %a0, i32 %a1, i32 %a2) + ret i64 %1 +} + +declare i64 @llvm.mips.msub(i64, i32, i32) nounwind readnone + +define i64 @test__builtin_mips_msubu1(i32 %i0, i32, i64 %a0, i32 %a1, i32 %a2) nounwind readnone { +entry: +; CHECK: msubu $ac{{[0-9]}} + + %1 = tail call i64 @llvm.mips.msubu(i64 %a0, i32 %a1, i32 %a2) + ret i64 %1 +} + +declare i64 @llvm.mips.msubu(i64, i32, i32) nounwind readnone + +define i64 @test__builtin_mips_mult1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone { +entry: +; CHECK: mult $ac{{[0-9]}} + + %0 = tail call i64 @llvm.mips.mult(i32 %a0, i32 %a1) + ret i64 %0 +} + +declare i64 @llvm.mips.mult(i32, i32) nounwind readnone + +define i64 @test__builtin_mips_multu1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone { +entry: +; CHECK: multu $ac{{[0-9]}} + + %0 = tail call i64 @llvm.mips.multu(i32 %a0, i32 %a1) + ret i64 %0 +} + +declare i64 @llvm.mips.multu(i32, i32) nounwind readnone + +define { i32 } @test__builtin_mips_addq_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind { +entry: +; CHECK: addq.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = tail call <2 x i16> @llvm.mips.addq.ph(<2 x i16> %0, <2 x i16> %1) + %3 = bitcast <2 x i16> %2 to i32 + %.fca.0.insert = insertvalue { i32 
} undef, i32 %3, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.addq.ph(<2 x i16>, <2 x i16>) nounwind + +define { i32 } @test__builtin_mips_addq_s_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind { +entry: +; CHECK: addq_s.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = tail call <2 x i16> @llvm.mips.addq.s.ph(<2 x i16> %0, <2 x i16> %1) + %3 = bitcast <2 x i16> %2 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.addq.s.ph(<2 x i16>, <2 x i16>) nounwind + +define i32 @test__builtin_mips_addq_s_w1(i32 %i0, i32 %a0, i32 %a1) nounwind { +entry: +; CHECK: addq_s.w + + %0 = tail call i32 @llvm.mips.addq.s.w(i32 %a0, i32 %a1) + ret i32 %0 +} + +declare i32 @llvm.mips.addq.s.w(i32, i32) nounwind + +define { i32 } @test__builtin_mips_addu_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind { +entry: +; CHECK: addu.qb + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = bitcast i32 %a1.coerce to <4 x i8> + %2 = tail call <4 x i8> @llvm.mips.addu.qb(<4 x i8> %0, <4 x i8> %1) + %3 = bitcast <4 x i8> %2 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 + ret { i32 } %.fca.0.insert +} + +declare <4 x i8> @llvm.mips.addu.qb(<4 x i8>, <4 x i8>) nounwind + +define { i32 } @test__builtin_mips_addu_s_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind { +entry: +; CHECK: addu_s.qb + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = bitcast i32 %a1.coerce to <4 x i8> + %2 = tail call <4 x i8> @llvm.mips.addu.s.qb(<4 x i8> %0, <4 x i8> %1) + %3 = bitcast <4 x i8> %2 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 + ret { i32 } %.fca.0.insert +} + +declare <4 x i8> @llvm.mips.addu.s.qb(<4 x i8>, <4 x i8>) nounwind + +define { i32 } @test__builtin_mips_subq_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind { +entry: +; CHECK: subq.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = bitcast i32 
%a1.coerce to <2 x i16> + %2 = tail call <2 x i16> @llvm.mips.subq.ph(<2 x i16> %0, <2 x i16> %1) + %3 = bitcast <2 x i16> %2 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.subq.ph(<2 x i16>, <2 x i16>) nounwind + +define { i32 } @test__builtin_mips_subq_s_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind { +entry: +; CHECK: subq_s.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = tail call <2 x i16> @llvm.mips.subq.s.ph(<2 x i16> %0, <2 x i16> %1) + %3 = bitcast <2 x i16> %2 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.subq.s.ph(<2 x i16>, <2 x i16>) nounwind + +define i32 @test__builtin_mips_subq_s_w1(i32 %i0, i32 %a0, i32 %a1) nounwind { +entry: +; CHECK: subq_s.w + + %0 = tail call i32 @llvm.mips.subq.s.w(i32 %a0, i32 %a1) + ret i32 %0 +} + +declare i32 @llvm.mips.subq.s.w(i32, i32) nounwind + +define { i32 } @test__builtin_mips_subu_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind { +entry: +; CHECK: subu.qb + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = bitcast i32 %a1.coerce to <4 x i8> + %2 = tail call <4 x i8> @llvm.mips.subu.qb(<4 x i8> %0, <4 x i8> %1) + %3 = bitcast <4 x i8> %2 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 + ret { i32 } %.fca.0.insert +} + +declare <4 x i8> @llvm.mips.subu.qb(<4 x i8>, <4 x i8>) nounwind + +define { i32 } @test__builtin_mips_subu_s_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind { +entry: +; CHECK: subu_s.qb + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = bitcast i32 %a1.coerce to <4 x i8> + %2 = tail call <4 x i8> @llvm.mips.subu.s.qb(<4 x i8> %0, <4 x i8> %1) + %3 = bitcast <4 x i8> %2 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 + ret { i32 } %.fca.0.insert +} + +declare <4 x i8> @llvm.mips.subu.s.qb(<4 x i8>, <4 x i8>) nounwind + +define i32 
@test__builtin_mips_addsc1(i32 %i0, i32 %a0, i32 %a1) nounwind { +entry: +; CHECK: addsc ${{[0-9]+}} + + %0 = tail call i32 @llvm.mips.addsc(i32 %a0, i32 %a1) + ret i32 %0 +} + +declare i32 @llvm.mips.addsc(i32, i32) nounwind + +define i32 @test__builtin_mips_addwc1(i32 %i0, i32 %a0, i32 %a1) nounwind { +entry: +; CHECK: addwc ${{[0-9]+}} + + %0 = tail call i32 @llvm.mips.addwc(i32 %a0, i32 %a1) + ret i32 %0 +} + +declare i32 @llvm.mips.addwc(i32, i32) nounwind + +define i32 @test__builtin_mips_modsub1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone { +entry: +; CHECK: modsub ${{[0-9]+}} + + %0 = tail call i32 @llvm.mips.modsub(i32 %a0, i32 %a1) + ret i32 %0 +} + +declare i32 @llvm.mips.modsub(i32, i32) nounwind readnone + +define i32 @test__builtin_mips_raddu_w_qb1(i32 %i0, i32 %a0.coerce) nounwind readnone { +entry: +; CHECK: raddu.w.qb + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = tail call i32 @llvm.mips.raddu.w.qb(<4 x i8> %0) + ret i32 %1 +} + +declare i32 @llvm.mips.raddu.w.qb(<4 x i8>) nounwind readnone + +define { i32 } @test__builtin_mips_muleu_s_ph_qbl1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind { +entry: +; CHECK: muleu_s.ph.qbl + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = tail call <2 x i16> @llvm.mips.muleu.s.ph.qbl(<4 x i8> %0, <2 x i16> %1) + %3 = bitcast <2 x i16> %2 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.muleu.s.ph.qbl(<4 x i8>, <2 x i16>) nounwind + +define { i32 } @test__builtin_mips_muleu_s_ph_qbr1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind { +entry: +; CHECK: muleu_s.ph.qbr + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = tail call <2 x i16> @llvm.mips.muleu.s.ph.qbr(<4 x i8> %0, <2 x i16> %1) + %3 = bitcast <2 x i16> %2 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> 
@llvm.mips.muleu.s.ph.qbr(<4 x i8>, <2 x i16>) nounwind + +define { i32 } @test__builtin_mips_mulq_rs_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind { +entry: +; CHECK: mulq_rs.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = tail call <2 x i16> @llvm.mips.mulq.rs.ph(<2 x i16> %0, <2 x i16> %1) + %3 = bitcast <2 x i16> %2 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.mulq.rs.ph(<2 x i16>, <2 x i16>) nounwind + +define i32 @test__builtin_mips_muleq_s_w_phl1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind { +entry: +; CHECK: muleq_s.w.phl + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = tail call i32 @llvm.mips.muleq.s.w.phl(<2 x i16> %0, <2 x i16> %1) + ret i32 %2 +} + +declare i32 @llvm.mips.muleq.s.w.phl(<2 x i16>, <2 x i16>) nounwind + +define i32 @test__builtin_mips_muleq_s_w_phr1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind { +entry: +; CHECK: muleq_s.w.phr + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = tail call i32 @llvm.mips.muleq.s.w.phr(<2 x i16> %0, <2 x i16> %1) + ret i32 %2 +} + +declare i32 @llvm.mips.muleq.s.w.phr(<2 x i16>, <2 x i16>) nounwind + +define { i32 } @test__builtin_mips_precrq_qb_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readnone { +entry: +; CHECK: precrq.qb.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = tail call <4 x i8> @llvm.mips.precrq.qb.ph(<2 x i16> %0, <2 x i16> %1) + %3 = bitcast <4 x i8> %2 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 + ret { i32 } %.fca.0.insert +} + +declare <4 x i8> @llvm.mips.precrq.qb.ph(<2 x i16>, <2 x i16>) nounwind readnone + +define { i32 } @test__builtin_mips_precrq_ph_w1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone { +entry: +; CHECK: precrq.ph.w + + %0 = tail call <2 x i16> 
@llvm.mips.precrq.ph.w(i32 %a0, i32 %a1) + %1 = bitcast <2 x i16> %0 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %1, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.precrq.ph.w(i32, i32) nounwind readnone + +define { i32 } @test__builtin_mips_precrq_rs_ph_w1(i32 %i0, i32 %a0, i32 %a1) nounwind { +entry: +; CHECK: precrq_rs.ph.w + + %0 = tail call <2 x i16> @llvm.mips.precrq.rs.ph.w(i32 %a0, i32 %a1) + %1 = bitcast <2 x i16> %0 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %1, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.precrq.rs.ph.w(i32, i32) nounwind + +define { i32 } @test__builtin_mips_precrqu_s_qb_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind { +entry: +; CHECK: precrqu_s.qb.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = tail call <4 x i8> @llvm.mips.precrqu.s.qb.ph(<2 x i16> %0, <2 x i16> %1) + %3 = bitcast <4 x i8> %2 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 + ret { i32 } %.fca.0.insert +} + +declare <4 x i8> @llvm.mips.precrqu.s.qb.ph(<2 x i16>, <2 x i16>) nounwind + + +define i32 @test__builtin_mips_cmpu_eq_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind { +entry: +; CHECK: cmpu.eq.qb + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = bitcast i32 %a1.coerce to <4 x i8> + tail call void @llvm.mips.cmpu.eq.qb(<4 x i8> %0, <4 x i8> %1) + %2 = tail call i32 @llvm.mips.rddsp(i32 31) + ret i32 %2 +} + +declare void @llvm.mips.cmpu.eq.qb(<4 x i8>, <4 x i8>) nounwind + +declare i32 @llvm.mips.rddsp(i32) nounwind readonly + +define i32 @test__builtin_mips_cmpu_lt_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind { +entry: +; CHECK: cmpu.lt.qb + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = bitcast i32 %a1.coerce to <4 x i8> + tail call void @llvm.mips.cmpu.lt.qb(<4 x i8> %0, <4 x i8> %1) + %2 = tail call i32 @llvm.mips.rddsp(i32 31) + ret i32 %2 +} + +declare void @llvm.mips.cmpu.lt.qb(<4 x i8>, <4 x i8>) 
nounwind + +define i32 @test__builtin_mips_cmpu_le_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind { +entry: +; CHECK: cmpu.le.qb + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = bitcast i32 %a1.coerce to <4 x i8> + tail call void @llvm.mips.cmpu.le.qb(<4 x i8> %0, <4 x i8> %1) + %2 = tail call i32 @llvm.mips.rddsp(i32 31) + ret i32 %2 +} + +declare void @llvm.mips.cmpu.le.qb(<4 x i8>, <4 x i8>) nounwind + +define i32 @test__builtin_mips_cmpgu_eq_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind { +entry: +; CHECK: cmpgu.eq.qb + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = bitcast i32 %a1.coerce to <4 x i8> + %2 = tail call i32 @llvm.mips.cmpgu.eq.qb(<4 x i8> %0, <4 x i8> %1) + ret i32 %2 +} + +declare i32 @llvm.mips.cmpgu.eq.qb(<4 x i8>, <4 x i8>) nounwind + +define i32 @test__builtin_mips_cmpgu_lt_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind { +entry: +; CHECK: cmpgu.lt.qb + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = bitcast i32 %a1.coerce to <4 x i8> + %2 = tail call i32 @llvm.mips.cmpgu.lt.qb(<4 x i8> %0, <4 x i8> %1) + ret i32 %2 +} + +declare i32 @llvm.mips.cmpgu.lt.qb(<4 x i8>, <4 x i8>) nounwind + +define i32 @test__builtin_mips_cmpgu_le_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind { +entry: +; CHECK: cmpgu.le.qb + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = bitcast i32 %a1.coerce to <4 x i8> + %2 = tail call i32 @llvm.mips.cmpgu.le.qb(<4 x i8> %0, <4 x i8> %1) + ret i32 %2 +} + +declare i32 @llvm.mips.cmpgu.le.qb(<4 x i8>, <4 x i8>) nounwind + +define i32 @test__builtin_mips_cmp_eq_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind { +entry: +; CHECK: cmp.eq.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = bitcast i32 %a1.coerce to <2 x i16> + tail call void @llvm.mips.cmp.eq.ph(<2 x i16> %0, <2 x i16> %1) + %2 = tail call i32 @llvm.mips.rddsp(i32 31) + ret i32 %2 +} + +declare void @llvm.mips.cmp.eq.ph(<2 x i16>, <2 x i16>) nounwind + +define i32 @test__builtin_mips_cmp_lt_ph1(i32 %i0, i32 %a0.coerce, 
i32 %a1.coerce) nounwind { +entry: +; CHECK: cmp.lt.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = bitcast i32 %a1.coerce to <2 x i16> + tail call void @llvm.mips.cmp.lt.ph(<2 x i16> %0, <2 x i16> %1) + %2 = tail call i32 @llvm.mips.rddsp(i32 31) + ret i32 %2 +} + +declare void @llvm.mips.cmp.lt.ph(<2 x i16>, <2 x i16>) nounwind + +define i32 @test__builtin_mips_cmp_le_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind { +entry: +; CHECK: cmp.le.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = bitcast i32 %a1.coerce to <2 x i16> + tail call void @llvm.mips.cmp.le.ph(<2 x i16> %0, <2 x i16> %1) + %2 = tail call i32 @llvm.mips.rddsp(i32 31) + ret i32 %2 +} + +declare void @llvm.mips.cmp.le.ph(<2 x i16>, <2 x i16>) nounwind + +define { i32 } @test__builtin_mips_pick_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readonly { +entry: +; CHECK: pick.qb + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = bitcast i32 %a1.coerce to <4 x i8> + %2 = tail call <4 x i8> @llvm.mips.pick.qb(<4 x i8> %0, <4 x i8> %1) + %3 = bitcast <4 x i8> %2 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 + ret { i32 } %.fca.0.insert +} + +declare <4 x i8> @llvm.mips.pick.qb(<4 x i8>, <4 x i8>) nounwind readonly + +define { i32 } @test__builtin_mips_pick_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readonly { +entry: +; CHECK: pick.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = tail call <2 x i16> @llvm.mips.pick.ph(<2 x i16> %0, <2 x i16> %1) + %3 = bitcast <2 x i16> %2 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.pick.ph(<2 x i16>, <2 x i16>) nounwind readonly + +define { i32 } @test__builtin_mips_packrl_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readnone { +entry: +; CHECK: packrl.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = tail call <2 x i16> 
@llvm.mips.packrl.ph(<2 x i16> %0, <2 x i16> %1) + %3 = bitcast <2 x i16> %2 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.packrl.ph(<2 x i16>, <2 x i16>) nounwind readnone + +define i32 @test__builtin_mips_rddsp1(i32 %i0) nounwind readonly { +entry: +; CHECK: rddsp ${{[0-9]+}} + + %0 = tail call i32 @llvm.mips.rddsp(i32 31) + ret i32 %0 +} + +define { i32 } @test__builtin_mips_shll_qb1(i32 %i0, i32 %a0.coerce) nounwind { +entry: +; CHECK: shll.qb + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = tail call <4 x i8> @llvm.mips.shll.qb(<4 x i8> %0, i32 3) + %2 = bitcast <4 x i8> %1 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0 + ret { i32 } %.fca.0.insert +} + +declare <4 x i8> @llvm.mips.shll.qb(<4 x i8>, i32) nounwind + +define { i32 } @test__builtin_mips_shll_qb2(i32 %i0, i32 %a0.coerce, i32 %a1) nounwind { +entry: +; CHECK: shllv.qb + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = tail call <4 x i8> @llvm.mips.shll.qb(<4 x i8> %0, i32 %a1) + %2 = bitcast <4 x i8> %1 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0 + ret { i32 } %.fca.0.insert +} + +define { i32 } @test__builtin_mips_shll_ph1(i32 %i0, i32 %a0.coerce) nounwind { +entry: +; CHECK: shll.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = tail call <2 x i16> @llvm.mips.shll.ph(<2 x i16> %0, i32 7) + %2 = bitcast <2 x i16> %1 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.shll.ph(<2 x i16>, i32) nounwind + +define { i32 } @test__builtin_mips_shll_ph2(i32 %i0, i32 %a0.coerce, i32 %a1) nounwind { +entry: +; CHECK: shllv.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = tail call <2 x i16> @llvm.mips.shll.ph(<2 x i16> %0, i32 %a1) + %2 = bitcast <2 x i16> %1 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0 + ret { i32 } %.fca.0.insert +} + +define { i32 } @test__builtin_mips_shll_s_ph1(i32 
%i0, i32 %a0.coerce) nounwind { +entry: +; CHECK: shll_s.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = tail call <2 x i16> @llvm.mips.shll.s.ph(<2 x i16> %0, i32 7) + %2 = bitcast <2 x i16> %1 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.shll.s.ph(<2 x i16>, i32) nounwind + +define { i32 } @test__builtin_mips_shll_s_ph2(i32 %i0, i32 %a0.coerce, i32 %a1) nounwind { +entry: +; CHECK: shllv_s.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = tail call <2 x i16> @llvm.mips.shll.s.ph(<2 x i16> %0, i32 %a1) + %2 = bitcast <2 x i16> %1 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0 + ret { i32 } %.fca.0.insert +} + +define i32 @test__builtin_mips_shll_s_w1(i32 %i0, i32 %a0) nounwind { +entry: +; CHECK: shll_s.w + + %0 = tail call i32 @llvm.mips.shll.s.w(i32 %a0, i32 15) + ret i32 %0 +} + +declare i32 @llvm.mips.shll.s.w(i32, i32) nounwind + +define i32 @test__builtin_mips_shll_s_w2(i32 %i0, i32 %a0, i32 %a1) nounwind { +entry: +; CHECK: shllv_s.w + + %0 = tail call i32 @llvm.mips.shll.s.w(i32 %a0, i32 %a1) + ret i32 %0 +} + +define { i32 } @test__builtin_mips_shrl_qb1(i32 %i0, i32 %a0.coerce) nounwind readnone { +entry: +; CHECK: shrl.qb + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = tail call <4 x i8> @llvm.mips.shrl.qb(<4 x i8> %0, i32 3) + %2 = bitcast <4 x i8> %1 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0 + ret { i32 } %.fca.0.insert +} + +declare <4 x i8> @llvm.mips.shrl.qb(<4 x i8>, i32) nounwind readnone + +define { i32 } @test__builtin_mips_shrl_qb2(i32 %i0, i32 %a0.coerce, i32 %a1) nounwind readnone { +entry: +; CHECK: shrlv.qb + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = tail call <4 x i8> @llvm.mips.shrl.qb(<4 x i8> %0, i32 %a1) + %2 = bitcast <4 x i8> %1 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0 + ret { i32 } %.fca.0.insert +} + +define { i32 } @test__builtin_mips_shra_ph1(i32 %i0, i32 %a0.coerce) 
nounwind readnone { +entry: +; CHECK: shra.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = tail call <2 x i16> @llvm.mips.shra.ph(<2 x i16> %0, i32 7) + %2 = bitcast <2 x i16> %1 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.shra.ph(<2 x i16>, i32) nounwind readnone + +define { i32 } @test__builtin_mips_shra_ph2(i32 %i0, i32 %a0.coerce, i32 %a1) nounwind readnone { +entry: +; CHECK: shrav.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = tail call <2 x i16> @llvm.mips.shra.ph(<2 x i16> %0, i32 %a1) + %2 = bitcast <2 x i16> %1 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0 + ret { i32 } %.fca.0.insert +} + +define { i32 } @test__builtin_mips_shra_r_ph1(i32 %i0, i32 %a0.coerce) nounwind readnone { +entry: +; CHECK: shra_r.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = tail call <2 x i16> @llvm.mips.shra.r.ph(<2 x i16> %0, i32 7) + %2 = bitcast <2 x i16> %1 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.shra.r.ph(<2 x i16>, i32) nounwind readnone + +define { i32 } @test__builtin_mips_shra_r_ph2(i32 %i0, i32 %a0.coerce, i32 %a1) nounwind readnone { +entry: +; CHECK: shrav_r.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = tail call <2 x i16> @llvm.mips.shra.r.ph(<2 x i16> %0, i32 %a1) + %2 = bitcast <2 x i16> %1 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0 + ret { i32 } %.fca.0.insert +} + +define i32 @test__builtin_mips_shra_r_w1(i32 %i0, i32 %a0) nounwind readnone { +entry: +; CHECK: shra_r.w + + %0 = tail call i32 @llvm.mips.shra.r.w(i32 %a0, i32 15) + ret i32 %0 +} + +declare i32 @llvm.mips.shra.r.w(i32, i32) nounwind readnone + +define i32 @test__builtin_mips_shra_r_w2(i32 %i0, i32 %a0, i32 %a1) nounwind readnone { +entry: +; CHECK: shrav_r.w + + %0 = tail call i32 @llvm.mips.shra.r.w(i32 %a0, i32 %a1) + ret i32 %0 +} + +define { i32 } 
@test__builtin_mips_absq_s_ph1(i32 %i0, i32 %a0.coerce) nounwind { +entry: +; CHECK: absq_s.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = tail call <2 x i16> @llvm.mips.absq.s.ph(<2 x i16> %0) + %2 = bitcast <2 x i16> %1 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.absq.s.ph(<2 x i16>) nounwind + +define i32 @test__builtin_mips_absq_s_w1(i32 %i0, i32 %a0) nounwind { +entry: +; CHECK: absq_s.w + + %0 = tail call i32 @llvm.mips.absq.s.w(i32 %a0) + ret i32 %0 +} + +declare i32 @llvm.mips.absq.s.w(i32) nounwind + +define i32 @test__builtin_mips_preceq_w_phl1(i32 %i0, i32 %a0.coerce) nounwind readnone { +entry: +; CHECK: preceq.w.phl + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = tail call i32 @llvm.mips.preceq.w.phl(<2 x i16> %0) + ret i32 %1 +} + +declare i32 @llvm.mips.preceq.w.phl(<2 x i16>) nounwind readnone + +define i32 @test__builtin_mips_preceq_w_phr1(i32 %i0, i32 %a0.coerce) nounwind readnone { +entry: +; CHECK: preceq.w.phr + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = tail call i32 @llvm.mips.preceq.w.phr(<2 x i16> %0) + ret i32 %1 +} + +declare i32 @llvm.mips.preceq.w.phr(<2 x i16>) nounwind readnone + +define { i32 } @test__builtin_mips_precequ_ph_qbl1(i32 %i0, i32 %a0.coerce) nounwind readnone { +entry: +; CHECK: precequ.ph.qbl + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = tail call <2 x i16> @llvm.mips.precequ.ph.qbl(<4 x i8> %0) + %2 = bitcast <2 x i16> %1 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.precequ.ph.qbl(<4 x i8>) nounwind readnone + +define { i32 } @test__builtin_mips_precequ_ph_qbr1(i32 %i0, i32 %a0.coerce) nounwind readnone { +entry: +; CHECK: precequ.ph.qbr + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = tail call <2 x i16> @llvm.mips.precequ.ph.qbr(<4 x i8> %0) + %2 = bitcast <2 x i16> %1 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 
0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.precequ.ph.qbr(<4 x i8>) nounwind readnone + +define { i32 } @test__builtin_mips_precequ_ph_qbla1(i32 %i0, i32 %a0.coerce) nounwind readnone { +entry: +; CHECK: precequ.ph.qbla + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = tail call <2 x i16> @llvm.mips.precequ.ph.qbla(<4 x i8> %0) + %2 = bitcast <2 x i16> %1 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.precequ.ph.qbla(<4 x i8>) nounwind readnone + +define { i32 } @test__builtin_mips_precequ_ph_qbra1(i32 %i0, i32 %a0.coerce) nounwind readnone { +entry: +; CHECK: precequ.ph.qbra + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = tail call <2 x i16> @llvm.mips.precequ.ph.qbra(<4 x i8> %0) + %2 = bitcast <2 x i16> %1 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.precequ.ph.qbra(<4 x i8>) nounwind readnone + +define { i32 } @test__builtin_mips_preceu_ph_qbl1(i32 %i0, i32 %a0.coerce) nounwind readnone { +entry: +; CHECK: preceu.ph.qbl + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = tail call <2 x i16> @llvm.mips.preceu.ph.qbl(<4 x i8> %0) + %2 = bitcast <2 x i16> %1 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.preceu.ph.qbl(<4 x i8>) nounwind readnone + +define { i32 } @test__builtin_mips_preceu_ph_qbr1(i32 %i0, i32 %a0.coerce) nounwind readnone { +entry: +; CHECK: preceu.ph.qbr + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = tail call <2 x i16> @llvm.mips.preceu.ph.qbr(<4 x i8> %0) + %2 = bitcast <2 x i16> %1 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.preceu.ph.qbr(<4 x i8>) nounwind readnone + +define { i32 } @test__builtin_mips_preceu_ph_qbla1(i32 %i0, i32 %a0.coerce) nounwind readnone { +entry: +; CHECK: 
preceu.ph.qbla + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = tail call <2 x i16> @llvm.mips.preceu.ph.qbla(<4 x i8> %0) + %2 = bitcast <2 x i16> %1 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.preceu.ph.qbla(<4 x i8>) nounwind readnone + +define { i32 } @test__builtin_mips_preceu_ph_qbra1(i32 %i0, i32 %a0.coerce) nounwind readnone { +entry: +; CHECK: preceu.ph.qbra + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = tail call <2 x i16> @llvm.mips.preceu.ph.qbra(<4 x i8> %0) + %2 = bitcast <2 x i16> %1 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.preceu.ph.qbra(<4 x i8>) nounwind readnone + +define { i32 } @test__builtin_mips_repl_qb1(i32 %i0) nounwind readnone { +entry: +; CHECK: repl.qb + + %0 = tail call <4 x i8> @llvm.mips.repl.qb(i32 127) + %1 = bitcast <4 x i8> %0 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %1, 0 + ret { i32 } %.fca.0.insert +} + +declare <4 x i8> @llvm.mips.repl.qb(i32) nounwind readnone + +define { i32 } @test__builtin_mips_repl_qb2(i32 %i0, i32 %a0) nounwind readnone { +entry: +; CHECK: replv.qb + + %0 = tail call <4 x i8> @llvm.mips.repl.qb(i32 %a0) + %1 = bitcast <4 x i8> %0 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %1, 0 + ret { i32 } %.fca.0.insert +} + +define { i32 } @test__builtin_mips_repl_ph1(i32 %i0) nounwind readnone { +entry: +; CHECK: repl.ph + + %0 = tail call <2 x i16> @llvm.mips.repl.ph(i32 0) + %1 = bitcast <2 x i16> %0 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %1, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.repl.ph(i32) nounwind readnone + +define { i32 } @test__builtin_mips_repl_ph2(i32 %i0, i32 %a0) nounwind readnone { +entry: +; CHECK: replv.ph + + %0 = tail call <2 x i16> @llvm.mips.repl.ph(i32 %a0) + %1 = bitcast <2 x i16> %0 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %1, 0 + 
ret { i32 } %.fca.0.insert +} + +define i32 @test__builtin_mips_bitrev1(i32 %i0, i32 %a0) nounwind readnone { +entry: +; CHECK: bitrev ${{[0-9]+}} + + %0 = tail call i32 @llvm.mips.bitrev(i32 %a0) + ret i32 %0 +} + +declare i32 @llvm.mips.bitrev(i32) nounwind readnone + +define i32 @test__builtin_mips_lbux1(i32 %i0, i8* %a0, i32 %a1) nounwind readonly { +entry: +; CHECK: lbux ${{[0-9]+}} + + %0 = tail call i32 @llvm.mips.lbux(i8* %a0, i32 %a1) + ret i32 %0 +} + +declare i32 @llvm.mips.lbux(i8*, i32) nounwind readonly + +define i32 @test__builtin_mips_lhx1(i32 %i0, i8* %a0, i32 %a1) nounwind readonly { +entry: +; CHECK: lhx ${{[0-9]+}} + + %0 = tail call i32 @llvm.mips.lhx(i8* %a0, i32 %a1) + ret i32 %0 +} + +declare i32 @llvm.mips.lhx(i8*, i32) nounwind readonly + +define i32 @test__builtin_mips_lwx1(i32 %i0, i8* %a0, i32 %a1) nounwind readonly { +entry: +; CHECK: lwx ${{[0-9]+}} + + %0 = tail call i32 @llvm.mips.lwx(i8* %a0, i32 %a1) + ret i32 %0 +} + +declare i32 @llvm.mips.lwx(i8*, i32) nounwind readonly + +define i32 @test__builtin_mips_wrdsp1(i32 %i0, i32 %a0) nounwind { +entry: +; CHECK: wrdsp ${{[0-9]+}} + + tail call void @llvm.mips.wrdsp(i32 %a0, i32 31) + %0 = tail call i32 @llvm.mips.rddsp(i32 31) + ret i32 %0 +} + +declare void @llvm.mips.wrdsp(i32, i32) nounwind diff --git a/test/CodeGen/Mips/dsp-r2.ll b/test/CodeGen/Mips/dsp-r2.ll new file mode 100644 index 0000000..631f9e4 --- /dev/null +++ b/test/CodeGen/Mips/dsp-r2.ll @@ -0,0 +1,568 @@ +; RUN: llc -march=mipsel -mattr=+dspr2 < %s | FileCheck %s + +define i64 @test__builtin_mips_dpa_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind readnone { +entry: +; CHECK: dpa.w.ph + + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = bitcast i32 %a2.coerce to <2 x i16> + %3 = tail call i64 @llvm.mips.dpa.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2) + ret i64 %3 +} + +declare i64 @llvm.mips.dpa.w.ph(i64, <2 x i16>, <2 x i16>) nounwind readnone + +define i64 @test__builtin_mips_dps_w_ph1(i32 %i0, 
i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind readnone { +entry: +; CHECK: dps.w.ph + + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = bitcast i32 %a2.coerce to <2 x i16> + %3 = tail call i64 @llvm.mips.dps.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2) + ret i64 %3 +} + +declare i64 @llvm.mips.dps.w.ph(i64, <2 x i16>, <2 x i16>) nounwind readnone + +define i64 @test__builtin_mips_mulsa_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind readnone { +entry: +; CHECK: mulsa.w.ph + + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = bitcast i32 %a2.coerce to <2 x i16> + %3 = tail call i64 @llvm.mips.mulsa.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2) + ret i64 %3 +} + +declare i64 @llvm.mips.mulsa.w.ph(i64, <2 x i16>, <2 x i16>) nounwind readnone + +define i64 @test__builtin_mips_dpax_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind readnone { +entry: +; CHECK: dpax.w.ph + + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = bitcast i32 %a2.coerce to <2 x i16> + %3 = tail call i64 @llvm.mips.dpax.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2) + ret i64 %3 +} + +declare i64 @llvm.mips.dpax.w.ph(i64, <2 x i16>, <2 x i16>) nounwind readnone + +define i64 @test__builtin_mips_dpsx_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind readnone { +entry: +; CHECK: dpsx.w.ph + + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = bitcast i32 %a2.coerce to <2 x i16> + %3 = tail call i64 @llvm.mips.dpsx.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2) + ret i64 %3 +} + +declare i64 @llvm.mips.dpsx.w.ph(i64, <2 x i16>, <2 x i16>) nounwind readnone + +define i64 @test__builtin_mips_dpaqx_s_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind { +entry: +; CHECK: dpaqx_s.w.ph + + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = bitcast i32 %a2.coerce to <2 x i16> + %3 = tail call i64 @llvm.mips.dpaqx.s.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2) + ret i64 %3 +} + +declare i64 @llvm.mips.dpaqx.s.w.ph(i64, <2 x i16>, <2 x i16>) 
nounwind + +define i64 @test__builtin_mips_dpaqx_sa_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind { +entry: +; CHECK: dpaqx_sa.w.ph + + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = bitcast i32 %a2.coerce to <2 x i16> + %3 = tail call i64 @llvm.mips.dpaqx.sa.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2) + ret i64 %3 +} + +declare i64 @llvm.mips.dpaqx.sa.w.ph(i64, <2 x i16>, <2 x i16>) nounwind + +define i64 @test__builtin_mips_dpsqx_s_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind { +entry: +; CHECK: dpsqx_s.w.ph + + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = bitcast i32 %a2.coerce to <2 x i16> + %3 = tail call i64 @llvm.mips.dpsqx.s.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2) + ret i64 %3 +} + +declare i64 @llvm.mips.dpsqx.s.w.ph(i64, <2 x i16>, <2 x i16>) nounwind + +define i64 @test__builtin_mips_dpsqx_sa_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind { +entry: +; CHECK: dpsqx_sa.w.ph + + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = bitcast i32 %a2.coerce to <2 x i16> + %3 = tail call i64 @llvm.mips.dpsqx.sa.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2) + ret i64 %3 +} + +declare i64 @llvm.mips.dpsqx.sa.w.ph(i64, <2 x i16>, <2 x i16>) nounwind + +define { i32 } @test__builtin_mips_addu_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind { +entry: +; CHECK: addu.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = tail call <2 x i16> @llvm.mips.addu.ph(<2 x i16> %0, <2 x i16> %1) + %3 = bitcast <2 x i16> %2 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.addu.ph(<2 x i16>, <2 x i16>) nounwind + +define { i32 } @test__builtin_mips_addu_s_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind { +entry: +; CHECK: addu_s.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = tail call <2 x i16> @llvm.mips.addu.s.ph(<2 x i16> %0, <2 x 
i16> %1) + %3 = bitcast <2 x i16> %2 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.addu.s.ph(<2 x i16>, <2 x i16>) nounwind + +define { i32 } @test__builtin_mips_mulq_s_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind { +entry: +; CHECK: mulq_s.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = tail call <2 x i16> @llvm.mips.mulq.s.ph(<2 x i16> %0, <2 x i16> %1) + %3 = bitcast <2 x i16> %2 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.mulq.s.ph(<2 x i16>, <2 x i16>) nounwind + +define { i32 } @test__builtin_mips_subu_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind { +entry: +; CHECK: subu.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = tail call <2 x i16> @llvm.mips.subu.ph(<2 x i16> %0, <2 x i16> %1) + %3 = bitcast <2 x i16> %2 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.subu.ph(<2 x i16>, <2 x i16>) nounwind + +define { i32 } @test__builtin_mips_subu_s_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind { +entry: +; CHECK: subu_s.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = tail call <2 x i16> @llvm.mips.subu.s.ph(<2 x i16> %0, <2 x i16> %1) + %3 = bitcast <2 x i16> %2 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.subu.s.ph(<2 x i16>, <2 x i16>) nounwind + +define i32 @test__builtin_mips_cmpgdu_eq_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind { +entry: +; CHECK: cmpgdu.eq.qb + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = bitcast i32 %a1.coerce to <4 x i8> + %2 = tail call i32 @llvm.mips.cmpgdu.eq.qb(<4 x i8> %0, <4 x i8> %1) + ret i32 %2 +} + +declare i32 
@llvm.mips.cmpgdu.eq.qb(<4 x i8>, <4 x i8>) nounwind + +define i32 @test__builtin_mips_cmpgdu_lt_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind { +entry: +; CHECK: cmpgdu.lt.qb + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = bitcast i32 %a1.coerce to <4 x i8> + %2 = tail call i32 @llvm.mips.cmpgdu.lt.qb(<4 x i8> %0, <4 x i8> %1) + ret i32 %2 +} + +declare i32 @llvm.mips.cmpgdu.lt.qb(<4 x i8>, <4 x i8>) nounwind + +define i32 @test__builtin_mips_cmpgdu_le_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind { +entry: +; CHECK: cmpgdu.le.qb + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = bitcast i32 %a1.coerce to <4 x i8> + %2 = tail call i32 @llvm.mips.cmpgdu.le.qb(<4 x i8> %0, <4 x i8> %1) + ret i32 %2 +} + +declare i32 @llvm.mips.cmpgdu.le.qb(<4 x i8>, <4 x i8>) nounwind + +define { i32 } @test__builtin_mips_precr_qb_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind { +entry: +; CHECK: precr.qb.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = tail call <4 x i8> @llvm.mips.precr.qb.ph(<2 x i16> %0, <2 x i16> %1) + %3 = bitcast <4 x i8> %2 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 + ret { i32 } %.fca.0.insert +} + +declare <4 x i8> @llvm.mips.precr.qb.ph(<2 x i16>, <2 x i16>) nounwind + +define { i32 } @test__builtin_mips_precr_sra_ph_w1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone { +entry: +; CHECK: precr_sra.ph.w + + %0 = tail call <2 x i16> @llvm.mips.precr.sra.ph.w(i32 %a0, i32 %a1, i32 15) + %1 = bitcast <2 x i16> %0 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %1, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.precr.sra.ph.w(i32, i32, i32) nounwind readnone + +define { i32 } @test__builtin_mips_precr_sra_r_ph_w1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone { +entry: +; CHECK: precr_sra_r.ph.w + + %0 = tail call <2 x i16> @llvm.mips.precr.sra.r.ph.w(i32 %a0, i32 %a1, i32 15) + %1 = bitcast <2 x i16> %0 to i32 + %.fca.0.insert = insertvalue { 
i32 } undef, i32 %1, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.precr.sra.r.ph.w(i32, i32, i32) nounwind readnone + +define { i32 } @test__builtin_mips_shra_qb1(i32 %i0, i32 %a0.coerce) nounwind readnone { +entry: +; CHECK: shra.qb + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = tail call <4 x i8> @llvm.mips.shra.qb(<4 x i8> %0, i32 3) + %2 = bitcast <4 x i8> %1 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0 + ret { i32 } %.fca.0.insert +} + +declare <4 x i8> @llvm.mips.shra.qb(<4 x i8>, i32) nounwind readnone + +define { i32 } @test__builtin_mips_shra_r_qb1(i32 %i0, i32 %a0.coerce) nounwind readnone { +entry: +; CHECK: shra_r.qb + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = tail call <4 x i8> @llvm.mips.shra.r.qb(<4 x i8> %0, i32 3) + %2 = bitcast <4 x i8> %1 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0 + ret { i32 } %.fca.0.insert +} + +declare <4 x i8> @llvm.mips.shra.r.qb(<4 x i8>, i32) nounwind readnone + +define { i32 } @test__builtin_mips_shra_qb2(i32 %i0, i32 %a0.coerce, i32 %a1) nounwind readnone { +entry: +; CHECK: shrav.qb + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = tail call <4 x i8> @llvm.mips.shra.qb(<4 x i8> %0, i32 %a1) + %2 = bitcast <4 x i8> %1 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0 + ret { i32 } %.fca.0.insert +} + +define { i32 } @test__builtin_mips_shra_r_qb2(i32 %i0, i32 %a0.coerce, i32 %a1) nounwind readnone { +entry: +; CHECK: shrav_r.qb + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = tail call <4 x i8> @llvm.mips.shra.r.qb(<4 x i8> %0, i32 %a1) + %2 = bitcast <4 x i8> %1 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0 + ret { i32 } %.fca.0.insert +} + +define { i32 } @test__builtin_mips_shrl_ph1(i32 %i0, i32 %a0.coerce) nounwind readnone { +entry: +; CHECK: shrl.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = tail call <2 x i16> @llvm.mips.shrl.ph(<2 x i16> %0, i32 7) + %2 = bitcast <2 x i16> %1 to i32 + %.fca.0.insert = 
insertvalue { i32 } undef, i32 %2, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.shrl.ph(<2 x i16>, i32) nounwind readnone + +define { i32 } @test__builtin_mips_shrl_ph2(i32 %i0, i32 %a0.coerce, i32 %a1) nounwind readnone { +entry: +; CHECK: shrlv.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = tail call <2 x i16> @llvm.mips.shrl.ph(<2 x i16> %0, i32 %a1) + %2 = bitcast <2 x i16> %1 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0 + ret { i32 } %.fca.0.insert +} + +define { i32 } @test__builtin_mips_absq_s_qb1(i32 %i0, i32 %a0.coerce) nounwind { +entry: +; CHECK: absq_s.qb + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = tail call <4 x i8> @llvm.mips.absq.s.qb(<4 x i8> %0) + %2 = bitcast <4 x i8> %1 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0 + ret { i32 } %.fca.0.insert +} + +declare <4 x i8> @llvm.mips.absq.s.qb(<4 x i8>) nounwind + +define { i32 } @test__builtin_mips_mul_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind { +entry: +; CHECK: mul.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = tail call <2 x i16> @llvm.mips.mul.ph(<2 x i16> %0, <2 x i16> %1) + %3 = bitcast <2 x i16> %2 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.mul.ph(<2 x i16>, <2 x i16>) nounwind + +define { i32 } @test__builtin_mips_mul_s_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind { +entry: +; CHECK: mul_s.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = tail call <2 x i16> @llvm.mips.mul.s.ph(<2 x i16> %0, <2 x i16> %1) + %3 = bitcast <2 x i16> %2 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.mul.s.ph(<2 x i16>, <2 x i16>) nounwind + +define i32 @test__builtin_mips_mulq_rs_w1(i32 %i0, i32 %a0, i32 %a1) nounwind { +entry: +; CHECK: mulq_rs.w + + %0 = tail call 
i32 @llvm.mips.mulq.rs.w(i32 %a0, i32 %a1) + ret i32 %0 +} + +declare i32 @llvm.mips.mulq.rs.w(i32, i32) nounwind + +define i32 @test__builtin_mips_mulq_s_w1(i32 %i0, i32 %a0, i32 %a1) nounwind { +entry: +; CHECK: mulq_s.w + + %0 = tail call i32 @llvm.mips.mulq.s.w(i32 %a0, i32 %a1) + ret i32 %0 +} + +declare i32 @llvm.mips.mulq.s.w(i32, i32) nounwind + +define { i32 } @test__builtin_mips_adduh_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readnone { +entry: +; CHECK: adduh.qb + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = bitcast i32 %a1.coerce to <4 x i8> + %2 = tail call <4 x i8> @llvm.mips.adduh.qb(<4 x i8> %0, <4 x i8> %1) + %3 = bitcast <4 x i8> %2 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 + ret { i32 } %.fca.0.insert +} + +declare <4 x i8> @llvm.mips.adduh.qb(<4 x i8>, <4 x i8>) nounwind readnone + +define { i32 } @test__builtin_mips_adduh_r_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readnone { +entry: +; CHECK: adduh_r.qb + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = bitcast i32 %a1.coerce to <4 x i8> + %2 = tail call <4 x i8> @llvm.mips.adduh.r.qb(<4 x i8> %0, <4 x i8> %1) + %3 = bitcast <4 x i8> %2 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 + ret { i32 } %.fca.0.insert +} + +declare <4 x i8> @llvm.mips.adduh.r.qb(<4 x i8>, <4 x i8>) nounwind readnone + +define { i32 } @test__builtin_mips_subuh_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readnone { +entry: +; CHECK: subuh.qb + + %0 = bitcast i32 %a0.coerce to <4 x i8> + %1 = bitcast i32 %a1.coerce to <4 x i8> + %2 = tail call <4 x i8> @llvm.mips.subuh.qb(<4 x i8> %0, <4 x i8> %1) + %3 = bitcast <4 x i8> %2 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 + ret { i32 } %.fca.0.insert +} + +declare <4 x i8> @llvm.mips.subuh.qb(<4 x i8>, <4 x i8>) nounwind readnone + +define { i32 } @test__builtin_mips_subuh_r_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readnone { +entry: +; CHECK: subuh_r.qb + + %0 = 
bitcast i32 %a0.coerce to <4 x i8> + %1 = bitcast i32 %a1.coerce to <4 x i8> + %2 = tail call <4 x i8> @llvm.mips.subuh.r.qb(<4 x i8> %0, <4 x i8> %1) + %3 = bitcast <4 x i8> %2 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 + ret { i32 } %.fca.0.insert +} + +declare <4 x i8> @llvm.mips.subuh.r.qb(<4 x i8>, <4 x i8>) nounwind readnone + +define { i32 } @test__builtin_mips_addqh_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readnone { +entry: +; CHECK: addqh.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = tail call <2 x i16> @llvm.mips.addqh.ph(<2 x i16> %0, <2 x i16> %1) + %3 = bitcast <2 x i16> %2 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.addqh.ph(<2 x i16>, <2 x i16>) nounwind readnone + +define { i32 } @test__builtin_mips_addqh_r_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readnone { +entry: +; CHECK: addqh_r.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = tail call <2 x i16> @llvm.mips.addqh.r.ph(<2 x i16> %0, <2 x i16> %1) + %3 = bitcast <2 x i16> %2 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.addqh.r.ph(<2 x i16>, <2 x i16>) nounwind readnone + +define i32 @test__builtin_mips_addqh_w1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone { +entry: +; CHECK: addqh.w + + %0 = tail call i32 @llvm.mips.addqh.w(i32 %a0, i32 %a1) + ret i32 %0 +} + +declare i32 @llvm.mips.addqh.w(i32, i32) nounwind readnone + +define i32 @test__builtin_mips_addqh_r_w1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone { +entry: +; CHECK: addqh_r.w + + %0 = tail call i32 @llvm.mips.addqh.r.w(i32 %a0, i32 %a1) + ret i32 %0 +} + +declare i32 @llvm.mips.addqh.r.w(i32, i32) nounwind readnone + +define { i32 } @test__builtin_mips_subqh_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readnone { +entry: 
+; CHECK: subqh.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = tail call <2 x i16> @llvm.mips.subqh.ph(<2 x i16> %0, <2 x i16> %1) + %3 = bitcast <2 x i16> %2 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.subqh.ph(<2 x i16>, <2 x i16>) nounwind readnone + +define { i32 } @test__builtin_mips_subqh_r_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readnone { +entry: +; CHECK: subqh_r.ph + + %0 = bitcast i32 %a0.coerce to <2 x i16> + %1 = bitcast i32 %a1.coerce to <2 x i16> + %2 = tail call <2 x i16> @llvm.mips.subqh.r.ph(<2 x i16> %0, <2 x i16> %1) + %3 = bitcast <2 x i16> %2 to i32 + %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0 + ret { i32 } %.fca.0.insert +} + +declare <2 x i16> @llvm.mips.subqh.r.ph(<2 x i16>, <2 x i16>) nounwind readnone + +define i32 @test__builtin_mips_subqh_w1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone { +entry: +; CHECK: subqh.w + + %0 = tail call i32 @llvm.mips.subqh.w(i32 %a0, i32 %a1) + ret i32 %0 +} + +declare i32 @llvm.mips.subqh.w(i32, i32) nounwind readnone + +define i32 @test__builtin_mips_subqh_r_w1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone { +entry: +; CHECK: subqh_r.w + + %0 = tail call i32 @llvm.mips.subqh.r.w(i32 %a0, i32 %a1) + ret i32 %0 +} + +declare i32 @llvm.mips.subqh.r.w(i32, i32) nounwind readnone + +define i32 @test__builtin_mips_append1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone { +entry: +; CHECK: append ${{[0-9]+}} + + %0 = tail call i32 @llvm.mips.append(i32 %a0, i32 %a1, i32 15) + ret i32 %0 +} + +declare i32 @llvm.mips.append(i32, i32, i32) nounwind readnone + +define i32 @test__builtin_mips_balign1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone { +entry: +; CHECK: balign ${{[0-9]+}} + + %0 = tail call i32 @llvm.mips.balign(i32 %a0, i32 %a1, i32 1) + ret i32 %0 +} + +declare i32 @llvm.mips.balign(i32, i32, i32) nounwind readnone + +define i32 
@test__builtin_mips_prepend1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone { +entry: +; CHECK: prepend ${{[0-9]+}} + + %0 = tail call i32 @llvm.mips.prepend(i32 %a0, i32 %a1, i32 15) + ret i32 %0 +} + +declare i32 @llvm.mips.prepend(i32, i32, i32) nounwind readnone diff --git a/test/CodeGen/Mips/eh-dwarf-cfa.ll b/test/CodeGen/Mips/eh-dwarf-cfa.ll new file mode 100644 index 0000000..3a21332 --- /dev/null +++ b/test/CodeGen/Mips/eh-dwarf-cfa.ll @@ -0,0 +1,63 @@ +; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck %s +; RUN: llc -march=mips64el -mcpu=mips64 < %s | \ +; RUN: FileCheck %s -check-prefix=CHECK-MIPS64 + +declare i8* @llvm.eh.dwarf.cfa(i32) nounwind +declare i8* @llvm.frameaddress(i32) nounwind readnone + +define i8* @f1() nounwind { +entry: + %x = alloca [32 x i8], align 1 + %0 = call i8* @llvm.eh.dwarf.cfa(i32 0) + ret i8* %0 + +; CHECK: addiu $sp, $sp, -32 +; CHECK: addiu $2, $sp, 32 +} + + +define i8* @f2() nounwind { +entry: + %x = alloca [65536 x i8], align 1 + %0 = call i8* @llvm.eh.dwarf.cfa(i32 0) + ret i8* %0 + +; check stack size (65536 + 8) +; CHECK: lui $[[R0:[a-z0-9]+]], 65535 +; CHECK: addiu $[[R0]], $[[R0]], -8 +; CHECK: addu $sp, $sp, $[[R0]] + +; check return value ($sp + stack size) +; CHECK: lui $[[R1:[a-z0-9]+]], 1 +; CHECK: addu $[[R1]], $sp, $[[R1]] +; CHECK: addiu $2, $[[R1]], 8 +} + + +define i32 @f3() nounwind { +entry: + %x = alloca [32 x i8], align 1 + %0 = call i8* @llvm.eh.dwarf.cfa(i32 0) + %1 = ptrtoint i8* %0 to i32 + %2 = call i8* @llvm.frameaddress(i32 0) + %3 = ptrtoint i8* %2 to i32 + %add = add i32 %1, %3 + ret i32 %add + +; CHECK: addiu $sp, $sp, -40 + +; check return value ($fp + stack size + $fp) +; CHECK: addiu $[[R0:[a-z0-9]+]], $fp, 40 +; CHECK: addu $2, $[[R0]], $fp +} + + +define i8* @f4() nounwind { +entry: + %x = alloca [32 x i8], align 1 + %0 = call i8* @llvm.eh.dwarf.cfa(i32 0) + ret i8* %0 + +; CHECK-MIPS64: daddiu $sp, $sp, -32 +; CHECK-MIPS64: daddiu $2, $sp, 32 +} diff --git 
a/test/CodeGen/Mips/helloworld.ll b/test/CodeGen/Mips/helloworld.ll index bee93ac..aee58b6 100644 --- a/test/CodeGen/Mips/helloworld.ll +++ b/test/CodeGen/Mips/helloworld.ll @@ -24,10 +24,10 @@ entry: ; C1: addiu ${{[0-9]+}}, %lo($.str) ; C2: move $25, ${{[0-9]+}} ; C1: move $gp, ${{[0-9]+}} -; C1: jalr ${{[0-9]+}} +; C1: jalrc ${{[0-9]+}} ; SR: restore $ra, [[FS]] ; PE: li $2, 0 -; PE: jr $ra +; PE: jrc $ra } diff --git a/test/CodeGen/Mips/i32k.ll b/test/CodeGen/Mips/i32k.ll new file mode 100644 index 0000000..c6da8b1 --- /dev/null +++ b/test/CodeGen/Mips/i32k.ll @@ -0,0 +1,17 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16a +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16b + +@.str = private unnamed_addr constant [4 x i8] c"%i\0A\00", align 1 + +define i32 @main() nounwind { +entry: + %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 1075344593) nounwind +; 16a: li ${{[0-9]+}}, 29905 +; 16b: li ${{[0-9]+}}, 16408 + %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 -1075344593) nounwind +; 16a: li ${{[0-9]+}}, 49127 +; 16b: li ${{[0-9]+}}, 35631 + ret i32 0 +} + +declare i32 @printf(i8* nocapture, ...) 
nounwind diff --git a/test/CodeGen/Mips/init-array.ll b/test/CodeGen/Mips/init-array.ll new file mode 100644 index 0000000..f96ce26 --- /dev/null +++ b/test/CodeGen/Mips/init-array.ll @@ -0,0 +1,14 @@ +; RUN: llc -mtriple mipsel-unknown-linux -use-init-array < %s | FileCheck %s + +target triple = "mipsel-unknown-linux" + +@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @test }] +; CHECK: .section +; CHECK: .init_array +; CHECK-NOT: .ctors +; CHECK: .4byte test + +define internal void @test() section ".text.startup" { +entry: + ret void +} diff --git a/test/CodeGen/Mips/largeimm1.ll b/test/CodeGen/Mips/largeimm1.ll index d65cc02..1c0f69c 100644 --- a/test/CodeGen/Mips/largeimm1.ll +++ b/test/CodeGen/Mips/largeimm1.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=mipsel < %s | FileCheck %s -; CHECK: lui $at, 49152 -; CHECK: lui $at, 16384 +; CHECK: lui ${{[0-9]+}}, 49152 +; CHECK: lui ${{[0-9]+}}, 16384 define void @f() nounwind { entry: %a1 = alloca [1073741824 x i8], align 1 diff --git a/test/CodeGen/Mips/largeimmprinting.ll b/test/CodeGen/Mips/largeimmprinting.ll index 2e54879..1e96346 100644 --- a/test/CodeGen/Mips/largeimmprinting.ll +++ b/test/CodeGen/Mips/largeimmprinting.ll @@ -1,4 +1,6 @@ -; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | FileCheck %s +; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=32 +; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64 < %s | \ +; RUN: FileCheck %s -check-prefix=64 %struct.S1 = type { [65536 x i8] } @@ -6,9 +8,21 @@ define void @f() nounwind { entry: -; CHECK: lui $at, 65535 -; CHECK: addiu $at, $at, -16 -; CHECK: addu $sp, $sp, $at +; 32: lui $[[R0:[0-9]+]], 65535 +; 32: addiu $[[R0]], $[[R0]], -24 +; 32: addu $sp, $sp, $[[R0]] +; 32: lui $[[R1:[0-9]+]], 1 +; 32: addu $[[R1]], $sp, $[[R1]] +; 32: sw $ra, 20($[[R1]]) +; 64: daddiu $[[R0:[0-9]+]], $zero, 1 +; 64: dsll $[[R0]], $[[R0]], 48 +; 64: daddiu $[[R0]], $[[R0]], -1 +; 64: dsll $[[R0]], $[[R0]], 16 +; 64: daddiu 
$[[R0]], $[[R0]], -48 +; 64: daddu $sp, $sp, $[[R0]] +; 64: lui $[[R1:[0-9]+]], 1 +; 64: daddu $[[R1]], $sp, $[[R1]] +; 64: sd $ra, 40($[[R1]]) %agg.tmp = alloca %struct.S1, align 1 %tmp = getelementptr inbounds %struct.S1* %agg.tmp, i32 0, i32 0, i32 0 diff --git a/test/CodeGen/Mips/llcarry.ll b/test/CodeGen/Mips/llcarry.ll new file mode 100644 index 0000000..7763dae --- /dev/null +++ b/test/CodeGen/Mips/llcarry.ll @@ -0,0 +1,51 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@i = global i64 4294967295, align 8 +@j = global i64 15, align 8 +@ii = global i64 4294967295, align 8 +@k = common global i64 0, align 8 +@l = common global i64 0, align 8 +@m = common global i64 0, align 8 + +define void @test1() nounwind { +entry: + %0 = load i64* @i, align 8 + %1 = load i64* @j, align 8 + %add = add nsw i64 %1, %0 + store i64 %add, i64* @k, align 8 +; 16: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} +; 16: sltu ${{[0-9]+}}, ${{[0-9]+}} +; 16: move ${{[0-9]+}}, $t8 +; 16: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} +; 16: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} + ret void +} + +define void @test2() nounwind { +entry: + %0 = load i64* @i, align 8 + %1 = load i64* @j, align 8 + %sub = sub nsw i64 %0, %1 +; 16: subu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} +; 16: sltu ${{[0-9]+}}, ${{[0-9]+}} +; 16: move ${{[0-9]+}}, $t8 +; 16: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} +; 16: subu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} + store i64 %sub, i64* @l, align 8 + ret void +} + +define void @test3() nounwind { +entry: + %0 = load i64* @ii, align 8 + %add = add nsw i64 %0, 15 +; 16: addiu ${{[0-9]+}}, 15 +; 16: sltu ${{[0-9]+}}, ${{[0-9]+}} +; 16: move ${{[0-9]+}}, $t8 +; 16: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} +; 16: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} + store i64 %add, i64* @m, align 8 + ret void +} + + diff --git a/test/CodeGen/Mips/longbranch.ll b/test/CodeGen/Mips/longbranch.ll index 0227b88..1a4f79c 100644 --- 
a/test/CodeGen/Mips/longbranch.ll +++ b/test/CodeGen/Mips/longbranch.ll @@ -6,9 +6,15 @@ define void @foo1(i32 %s) nounwind { entry: ; O32: bal +; O32: lui $1, 0 +; O32: addiu $1, $1, {{[0-9]+}} +; N64: lui $1, 0 +; N64: daddiu $1, $1, 0 +; N64: dsll $1, $1, 16 +; N64: daddiu $1, $1, 0 ; N64: bal -; N64: highest -; N64: higher +; N64: dsll $1, $1, 16 +; N64: daddiu $1, $1, {{[0-9]+}} %tobool = icmp eq i32 %s, 0 br i1 %tobool, label %if.end, label %if.then diff --git a/test/CodeGen/Mips/mips64-sret.ll b/test/CodeGen/Mips/mips64-sret.ll new file mode 100644 index 0000000..e26b022 --- /dev/null +++ b/test/CodeGen/Mips/mips64-sret.ll @@ -0,0 +1,16 @@ +; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 -O3 < %s | FileCheck %s + +%struct.S = type { [8 x i32] } + +@g = common global %struct.S zeroinitializer, align 4 + +define void @f(%struct.S* noalias sret %agg.result) nounwind { +entry: +; CHECK: daddu $2, $zero, $4 + + %0 = bitcast %struct.S* %agg.result to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.S* @g to i8*), i64 32, i32 4, i1 false) + ret void +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind diff --git a/test/CodeGen/Mips/misha.ll b/test/CodeGen/Mips/misha.ll new file mode 100644 index 0000000..80637ed --- /dev/null +++ b/test/CodeGen/Mips/misha.ll @@ -0,0 +1,69 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +define i32 @sumc(i8* nocapture %to, i8* nocapture %from, i32) nounwind { +entry: + %sext = shl i32 %0, 16 + %conv = ashr exact i32 %sext, 16 + %cmp8 = icmp eq i32 %conv, 0 + br i1 %cmp8, label %for.end, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %.pre = load i8* %to, align 1 + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %1 = phi i8 [ %.pre, %for.body.lr.ph ], [ %conv4, %for.body ] + %i.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %from.addr.09 = phi i8* [ 
%from, %for.body.lr.ph ], [ %incdec.ptr, %for.body ] + %incdec.ptr = getelementptr inbounds i8* %from.addr.09, i32 1 + %2 = load i8* %from.addr.09, align 1 + %conv27 = zext i8 %2 to i32 + %conv36 = zext i8 %1 to i32 + %add = add nsw i32 %conv36, %conv27 + %conv4 = trunc i32 %add to i8 + store i8 %conv4, i8* %to, align 1 + %inc = add nsw i32 %i.010, 1 + %cmp = icmp eq i32 %inc, %conv + br i1 %cmp, label %for.end, label %for.body +; 16: sumc: +; 16: lbu ${{[0-9]+}}, 0(${{[0-9]+}}) +; 16: lbu ${{[0-9]+}}, 0(${{[0-9]+}}) +; 16: sum: +; 16: lhu ${{[0-9]+}}, 0(${{[0-9]+}}) +; 16: lhu ${{[0-9]+}}, 0(${{[0-9]+}}) + +for.end: ; preds = %for.body, %entry + ret i32 undef +} + +define i32 @sum(i16* nocapture %to, i16* nocapture %from, i32) nounwind { +entry: + %sext = shl i32 %0, 16 + %conv = ashr exact i32 %sext, 16 + %cmp8 = icmp eq i32 %conv, 0 + br i1 %cmp8, label %for.end, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %.pre = load i16* %to, align 2 + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %1 = phi i16 [ %.pre, %for.body.lr.ph ], [ %conv4, %for.body ] + %i.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %from.addr.09 = phi i16* [ %from, %for.body.lr.ph ], [ %incdec.ptr, %for.body ] + %incdec.ptr = getelementptr inbounds i16* %from.addr.09, i32 1 + %2 = load i16* %from.addr.09, align 2 + %conv27 = zext i16 %2 to i32 + %conv36 = zext i16 %1 to i32 + %add = add nsw i32 %conv36, %conv27 + %conv4 = trunc i32 %add to i16 + store i16 %conv4, i16* %to, align 2 + %inc = add nsw i32 %i.010, 1 + %cmp = icmp eq i32 %inc, %conv + br i1 %cmp, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret i32 undef +} + + diff --git a/test/CodeGen/Mips/mul.ll b/test/CodeGen/Mips/mul.ll new file mode 100644 index 0000000..4ce801b --- /dev/null +++ b/test/CodeGen/Mips/mul.ll @@ -0,0 +1,17 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@iiii = global i32 5, 
align 4 +@jjjj = global i32 -6, align 4 +@kkkk = common global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @iiii, align 4 + %1 = load i32* @jjjj, align 4 + %mul = mul nsw i32 %1, %0 +; 16: mult ${{[0-9]+}}, ${{[0-9]+}} +; 16: mflo ${{[0-9]+}} + + store i32 %mul, i32* @kkkk, align 4 + ret void +} diff --git a/test/CodeGen/Mips/mulll.ll b/test/CodeGen/Mips/mulll.ll new file mode 100644 index 0000000..e37b919 --- /dev/null +++ b/test/CodeGen/Mips/mulll.ll @@ -0,0 +1,21 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@iiii = global i64 5, align 8 +@jjjj = global i64 -6, align 8 +@kkkk = common global i64 0, align 8 + +define void @test() nounwind { +entry: + %0 = load i64* @iiii, align 8 + %1 = load i64* @jjjj, align 8 + %mul = mul nsw i64 %1, %0 + store i64 %mul, i64* @kkkk, align 8 +; 16: multu ${{[0-9]+}}, ${{[0-9]+}} +; 16: mfhi ${{[0-9]+}} +; 16: mult ${{[0-9]+}}, ${{[0-9]+}} +; 16: mflo ${{[0-9]+}} +; 16: mult ${{[0-9]+}}, ${{[0-9]+}} +; 16: mflo ${{[0-9]+}} + + ret void +} diff --git a/test/CodeGen/Mips/mulull.ll b/test/CodeGen/Mips/mulull.ll new file mode 100644 index 0000000..4d23c69 --- /dev/null +++ b/test/CodeGen/Mips/mulull.ll @@ -0,0 +1,21 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@iiii = global i64 5, align 8 +@jjjj = global i64 6, align 8 +@kkkk = common global i64 0, align 8 +@.str = private unnamed_addr constant [20 x i8] c"%lld * %lld = %lld\0A\00", align 1 + +define void @test() nounwind { +entry: + %0 = load i64* @iiii, align 8 + %1 = load i64* @jjjj, align 8 + %mul = mul nsw i64 %1, %0 + store i64 %mul, i64* @kkkk, align 8 +; 16: multu ${{[0-9]+}}, ${{[0-9]+}} +; 16: mfhi ${{[0-9]+}} +; 16: mult ${{[0-9]+}}, ${{[0-9]+}} +; 16: mflo ${{[0-9]+}} +; 16: mult ${{[0-9]+}}, ${{[0-9]+}} +; 16: mflo ${{[0-9]+}} + ret void +} diff --git a/test/CodeGen/Mips/null.ll b/test/CodeGen/Mips/null.ll index 
7beae99..00c66a9 100644 --- a/test/CodeGen/Mips/null.ll +++ b/test/CodeGen/Mips/null.ll @@ -8,6 +8,6 @@ entry: ; 16: .set mips16 # @main -; 16: jr $ra +; 16: jrc $ra } diff --git a/test/CodeGen/Mips/o32_cc_byval.ll b/test/CodeGen/Mips/o32_cc_byval.ll index eac0d80..5558ba6 100644 --- a/test/CodeGen/Mips/o32_cc_byval.ll +++ b/test/CodeGen/Mips/o32_cc_byval.ll @@ -119,6 +119,16 @@ entry: ret void } +%struct.S4 = type { [4 x i32] } + +define void @f5(i64 %a0, %struct.S4* nocapture byval %a1) nounwind { +entry: + tail call void @f6(%struct.S4* byval %a1, i64 %a0) nounwind + ret void +} + +declare void @f6(%struct.S4* nocapture byval, i64) + !0 = metadata !{metadata !"int", metadata !1} !1 = metadata !{metadata !"omnipotent char", metadata !2} !2 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/test/CodeGen/Mips/rem.ll b/test/CodeGen/Mips/rem.ll new file mode 100644 index 0000000..b18f85d --- /dev/null +++ b/test/CodeGen/Mips/rem.ll @@ -0,0 +1,19 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@iiii = global i32 103, align 4 +@jjjj = global i32 -4, align 4 +@kkkk = common global i32 0, align 4 + + +define void @test() nounwind { +entry: + %0 = load i32* @iiii, align 4 + %1 = load i32* @jjjj, align 4 + %rem = srem i32 %0, %1 +; 16: div $zero, ${{[0-9]+}}, ${{[0-9]+}} +; 16: mfhi ${{[0-9]+}} + store i32 %rem, i32* @kkkk, align 4 + ret void +} + + diff --git a/test/CodeGen/Mips/remat-immed-load.ll b/test/CodeGen/Mips/remat-immed-load.ll new file mode 100644 index 0000000..d93964b --- /dev/null +++ b/test/CodeGen/Mips/remat-immed-load.ll @@ -0,0 +1,51 @@ +; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=32 +; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64 < %s | FileCheck %s -check-prefix=64 + +define void @f0() nounwind { +entry: +; 32: addiu $4, $zero, 1 +; 32: addiu $4, $zero, 1 + + tail call void @foo1(i32 1) nounwind + tail call void @foo1(i32 1) nounwind + ret void +} + +declare void 
@foo1(i32) + +define void @f3() nounwind { +entry: +; 64: daddiu $4, $zero, 1 +; 64: daddiu $4, $zero, 1 + + tail call void @foo2(i64 1) nounwind + tail call void @foo2(i64 1) nounwind + ret void +} + +declare void @foo2(i64) + +define void @f5() nounwind { +entry: +; 32: lui $4, 1 +; 32: lui $4, 1 + + tail call void @f6(i32 65536) nounwind + tail call void @f6(i32 65536) nounwind + ret void +} + +declare void @f6(i32) + +define void @f7() nounwind { +entry: +; 64: lui $4, 1 +; 64: lui $4, 1 + + tail call void @f8(i64 65536) nounwind + tail call void @f8(i64 65536) nounwind + ret void +} + +declare void @f8(i64) + diff --git a/test/CodeGen/Mips/remu.ll b/test/CodeGen/Mips/remu.ll new file mode 100644 index 0000000..472503c --- /dev/null +++ b/test/CodeGen/Mips/remu.ll @@ -0,0 +1,18 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@iiii = global i32 103, align 4 +@jjjj = global i32 4, align 4 +@kkkk = common global i32 0, align 4 +@.str = private unnamed_addr constant [15 x i8] c"%u = %u %% %u\0A\00", align 1 + +define void @test() nounwind { +entry: + %0 = load i32* @iiii, align 4 + %1 = load i32* @jjjj, align 4 + %rem = urem i32 %0, %1 +; 16: divu $zero, ${{[0-9]+}}, ${{[0-9]+}} +; 16: mfhi ${{[0-9]+}} + store i32 %rem, i32* @kkkk, align 4 + ret void +} + diff --git a/test/CodeGen/Mips/return-vector.ll b/test/CodeGen/Mips/return-vector.ll new file mode 100644 index 0000000..739c43c --- /dev/null +++ b/test/CodeGen/Mips/return-vector.ll @@ -0,0 +1,244 @@ +; RUN: llc -march=mipsel < %s | FileCheck %s + + +; Check that function accesses vector return value from stack in cases when +; vector can't be returned in registers. Also check that caller passes in +; register $4 stack address where the vector should be placed. + + +declare <8 x i32> @i8(...) +declare <4 x float> @f4(...) +declare <4 x double> @d4(...) 
+ +define i32 @call_i8() { +entry: + %call = call <8 x i32> (...)* @i8() + %v0 = extractelement <8 x i32> %call, i32 0 + %v1 = extractelement <8 x i32> %call, i32 1 + %v2 = extractelement <8 x i32> %call, i32 2 + %v3 = extractelement <8 x i32> %call, i32 3 + %v4 = extractelement <8 x i32> %call, i32 4 + %v5 = extractelement <8 x i32> %call, i32 5 + %v6 = extractelement <8 x i32> %call, i32 6 + %v7 = extractelement <8 x i32> %call, i32 7 + %add1 = add i32 %v0, %v1 + %add2 = add i32 %v2, %v3 + %add3 = add i32 %v4, %v5 + %add4 = add i32 %v6, %v7 + %add5 = add i32 %add1, %add2 + %add6 = add i32 %add3, %add4 + %add7 = add i32 %add5, %add6 + ret i32 %add7 + +; CHECK: call_i8: +; CHECK: call16(i8) +; CHECK: addiu $4, $sp, 32 +; CHECK: lw $[[R0:[a-z0-9]+]], 60($sp) +; CHECK: lw $[[R1:[a-z0-9]+]], 56($sp) +; CHECK: lw $[[R2:[a-z0-9]+]], 52($sp) +; CHECK: lw $[[R3:[a-z0-9]+]], 48($sp) +; CHECK: lw $[[R4:[a-z0-9]+]], 44($sp) +; CHECK: lw $[[R5:[a-z0-9]+]], 40($sp) +; CHECK: lw $[[R6:[a-z0-9]+]], 36($sp) +; CHECK: lw $[[R7:[a-z0-9]+]], 32($sp) +} + + +define float @call_f4() { +entry: + %call = call <4 x float> (...)* @f4() + %v0 = extractelement <4 x float> %call, i32 0 + %v1 = extractelement <4 x float> %call, i32 1 + %v2 = extractelement <4 x float> %call, i32 2 + %v3 = extractelement <4 x float> %call, i32 3 + %add1 = fadd float %v0, %v1 + %add2 = fadd float %v2, %v3 + %add3 = fadd float %add1, %add2 + ret float %add3 + +; CHECK: call_f4: +; CHECK: call16(f4) +; CHECK: addiu $4, $sp, 16 +; CHECK: lwc1 $[[R0:[a-z0-9]+]], 28($sp) +; CHECK: lwc1 $[[R1:[a-z0-9]+]], 24($sp) +; CHECK: lwc1 $[[R3:[a-z0-9]+]], 20($sp) +; CHECK: lwc1 $[[R4:[a-z0-9]+]], 16($sp) +} + + +define double @call_d4() { +entry: + %call = call <4 x double> (...)* @d4() + %v0 = extractelement <4 x double> %call, i32 0 + %v1 = extractelement <4 x double> %call, i32 1 + %v2 = extractelement <4 x double> %call, i32 2 + %v3 = extractelement <4 x double> %call, i32 3 + %add1 = fadd double %v0, %v1 + %add2 = fadd 
double %v2, %v3 + %add3 = fadd double %add1, %add2 + ret double %add3 + +; CHECK: call_d4: +; CHECK: call16(d4) +; CHECK: addiu $4, $sp, 32 +; CHECK: ldc1 $[[R0:[a-z0-9]+]], 56($sp) +; CHECK: ldc1 $[[R1:[a-z0-9]+]], 48($sp) +; CHECK: ldc1 $[[R3:[a-z0-9]+]], 40($sp) +; CHECK: ldc1 $[[R4:[a-z0-9]+]], 32($sp) +} + + + +; Check that function accesses vector return value from registers in cases when +; vector can be returned in registers + + +declare <4 x i32> @i4(...) +declare <2 x float> @f2(...) +declare <2 x double> @d2(...) + +define i32 @call_i4() { +entry: + %call = call <4 x i32> (...)* @i4() + %v0 = extractelement <4 x i32> %call, i32 0 + %v1 = extractelement <4 x i32> %call, i32 1 + %v2 = extractelement <4 x i32> %call, i32 2 + %v3 = extractelement <4 x i32> %call, i32 3 + %add1 = add i32 %v0, %v1 + %add2 = add i32 %v2, %v3 + %add3 = add i32 %add1, %add2 + ret i32 %add3 + +; CHECK: call_i4: +; CHECK: call16(i4) +; CHECK-NOT: lw +; CHECK: addu $[[R2:[a-z0-9]+]], $[[R0:[a-z0-9]+]], $[[R1:[a-z0-9]+]] +; CHECK: addu $[[R5:[a-z0-9]+]], $[[R3:[a-z0-9]+]], $[[R4:[a-z0-9]+]] +; CHECK: addu $[[R6:[a-z0-9]+]], $[[R5]], $[[R2]] +} + + +define float @call_f2() { +entry: + %call = call <2 x float> (...)* @f2() + %v0 = extractelement <2 x float> %call, i32 0 + %v1 = extractelement <2 x float> %call, i32 1 + %add1 = fadd float %v0, %v1 + ret float %add1 + +; CHECK: call_f2: +; CHECK: call16(f2) +; CHECK-NOT: lwc1 +; CHECK: add.s $[[R2:[a-z0-9]+]], $[[R0:[a-z0-9]+]], $[[R1:[a-z0-9]+]] +} + + +define double @call_d2() { +entry: + %call = call <2 x double> (...)* @d2() + %v0 = extractelement <2 x double> %call, i32 0 + %v1 = extractelement <2 x double> %call, i32 1 + %add1 = fadd double %v0, %v1 + ret double %add1 + +; CHECK: call_d2: +; CHECK: call16(d2) +; CHECK-NOT: ldc1 +; CHECK: add.d $[[R2:[a-z0-9]+]], $[[R0:[a-z0-9]+]], $[[R1:[a-z0-9]+]] +} + + + +; Check that function returns vector on stack in cases when vector can't be +; returned in registers. 
Also check that vector is placed on stack starting +; from the address in register $4. + + +define <8 x i32> @return_i8() { +entry: + ret <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + +; CHECK: return_i8: +; CHECK: sw $[[R0:[a-z0-9]+]], 28($4) +; CHECK: sw $[[R1:[a-z0-9]+]], 24($4) +; CHECK: sw $[[R2:[a-z0-9]+]], 20($4) +; CHECK: sw $[[R3:[a-z0-9]+]], 16($4) +; CHECK: sw $[[R4:[a-z0-9]+]], 12($4) +; CHECK: sw $[[R5:[a-z0-9]+]], 8($4) +; CHECK: sw $[[R6:[a-z0-9]+]], 4($4) +; CHECK: sw $[[R7:[a-z0-9]+]], 0($4) +} + + +define <4 x float> @return_f4(float %a, float %b, float %c, float %d) { +entry: + %vecins1 = insertelement <4 x float> undef, float %a, i32 0 + %vecins2 = insertelement <4 x float> %vecins1, float %b, i32 1 + %vecins3 = insertelement <4 x float> %vecins2, float %c, i32 2 + %vecins4 = insertelement <4 x float> %vecins3, float %d, i32 3 + ret <4 x float> %vecins4 + +; CHECK: return_f4: +; CHECK: lwc1 $[[R0:[a-z0-9]+]], 16($sp) +; CHECK: swc1 $[[R0]], 12($4) +; CHECK: sw $7, 8($4) +; CHECK: sw $6, 4($4) +; CHECK: sw $5, 0($4) +} + + +define <4 x double> @return_d4(double %a, double %b, double %c, double %d) { +entry: + %vecins1 = insertelement <4 x double> undef, double %a, i32 0 + %vecins2 = insertelement <4 x double> %vecins1, double %b, i32 1 + %vecins3 = insertelement <4 x double> %vecins2, double %c, i32 2 + %vecins4 = insertelement <4 x double> %vecins3, double %d, i32 3 + ret <4 x double> %vecins4 + +; CHECK: return_d4: +; CHECK: sdc1 $[[R0:[a-z0-9]+]], 24($4) +; CHECK: sdc1 $[[R1:[a-z0-9]+]], 16($4) +; CHECK: sdc1 $[[R2:[a-z0-9]+]], 8($4) +; CHECK: sdc1 $[[R3:[a-z0-9]+]], 0($4) +} + + + +; Check that function returns vector in registers in cases when vector can be +; returned in registers. 
+ + +define <4 x i32> @return_i4() { +entry: + ret <4 x i32> <i32 0, i32 1, i32 2, i32 3> + +; CHECK: return_i4: +; CHECK: addiu $2, $zero, 0 +; CHECK: addiu $3, $zero, 1 +; CHECK: addiu $4, $zero, 2 +; CHECK: addiu $5, $zero, 3 +} + + +define <2 x float> @return_f2(float %a, float %b) { +entry: + %vecins1 = insertelement <2 x float> undef, float %a, i32 0 + %vecins2 = insertelement <2 x float> %vecins1, float %b, i32 1 + ret <2 x float> %vecins2 + +; CHECK: return_f2: +; CHECK: mov.s $f0, $f12 +; CHECK: mov.s $f2, $f14 +} + + +define <2 x double> @return_d2(double %a, double %b) { +entry: + %vecins1 = insertelement <2 x double> undef, double %a, i32 0 + %vecins2 = insertelement <2 x double> %vecins1, double %b, i32 1 + ret <2 x double> %vecins2 + +; CHECK: return_d2: +; CHECK: mov.d $f0, $f12 +; CHECK: mov.d $f2, $f14 +} diff --git a/test/CodeGen/Mips/selpat.ll b/test/CodeGen/Mips/selpat.ll new file mode 100644 index 0000000..cda0c96 --- /dev/null +++ b/test/CodeGen/Mips/selpat.ll @@ -0,0 +1,350 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@t = global i32 10, align 4 +@f = global i32 199, align 4 +@a = global i32 1, align 4 +@b = global i32 10, align 4 +@c = global i32 1, align 4 +@z1 = common global i32 0, align 4 +@z2 = common global i32 0, align 4 +@z3 = common global i32 0, align 4 +@z4 = common global i32 0, align 4 + +define void @calc_seleq() nounwind { +entry: + %0 = load i32* @a, align 4 + %1 = load i32* @b, align 4 + %cmp = icmp eq i32 %0, %1 + %2 = load i32* @f, align 4 + %3 = load i32* @t, align 4 + %cond = select i1 %cmp, i32 %2, i32 %3 + store i32 %cond, i32* @z1, align 4 +; 16: cmp ${{[0-9]+}}, ${{[0-9]+}} +; 16: bteqz .+4 +; 16: move ${{[0-9]+}}, ${{[0-9]+}} + store i32 %cond, i32* @z2, align 4 + %4 = load i32* @c, align 4 + %cmp6 = icmp eq i32 %4, %0 + %cond10 = select i1 %cmp6, i32 %3, i32 %2 + store i32 %cond10, i32* @z3, align 4 + store i32 %cond10, i32* @z4, align 4 + ret void +} + 
+ +define void @calc_seleqk() nounwind { +entry: + %0 = load i32* @a, align 4 + %cmp = icmp eq i32 %0, 1 + %1 = load i32* @t, align 4 + %2 = load i32* @f, align 4 + %cond = select i1 %cmp, i32 %1, i32 %2 + store i32 %cond, i32* @z1, align 4 +; 16: cmpi ${{[0-9]+}}, 1 +; 16: bteqz .+4 +; 16: move ${{[0-9]+}}, ${{[0-9]+}} + %cmp1 = icmp eq i32 %0, 10 + %cond5 = select i1 %cmp1, i32 %2, i32 %1 + store i32 %cond5, i32* @z2, align 4 + %3 = load i32* @b, align 4 + %cmp6 = icmp eq i32 %3, 3 + %cond10 = select i1 %cmp6, i32 %2, i32 %1 + store i32 %cond10, i32* @z3, align 4 +; 16: cmpi ${{[0-9]+}}, 10 +; 16: bteqz .+4 +; 16: move ${{[0-9]+}}, ${{[0-9]+}} + %cmp11 = icmp eq i32 %3, 10 + %cond15 = select i1 %cmp11, i32 %1, i32 %2 + store i32 %cond15, i32* @z4, align 4 + ret void +} + +define void @calc_seleqz() nounwind { +entry: + %0 = load i32* @a, align 4 + %cmp = icmp eq i32 %0, 0 + %1 = load i32* @t, align 4 + %2 = load i32* @f, align 4 + %cond = select i1 %cmp, i32 %1, i32 %2 + store i32 %cond, i32* @z1, align 4 +; 16: beqz ${{[0-9]+}}, .+4 +; 16: move ${{[0-9]+}}, ${{[0-9]+}} + %3 = load i32* @b, align 4 + %cmp1 = icmp eq i32 %3, 0 + %cond5 = select i1 %cmp1, i32 %2, i32 %1 + store i32 %cond5, i32* @z2, align 4 + %4 = load i32* @c, align 4 + %cmp6 = icmp eq i32 %4, 0 + %cond10 = select i1 %cmp6, i32 %1, i32 %2 + store i32 %cond10, i32* @z3, align 4 + store i32 %cond, i32* @z4, align 4 + ret void +} + +define void @calc_selge() nounwind { +entry: + %0 = load i32* @a, align 4 + %1 = load i32* @b, align 4 + %cmp = icmp sge i32 %0, %1 + %2 = load i32* @f, align 4 + %3 = load i32* @t, align 4 + %cond = select i1 %cmp, i32 %2, i32 %3 + store i32 %cond, i32* @z1, align 4 +; 16: slt ${{[0-9]+}}, ${{[0-9]+}} +; 16: bteqz .+4 +; 16: move ${{[0-9]+}}, ${{[0-9]+}} + %cmp1 = icmp sge i32 %1, %0 + %cond5 = select i1 %cmp1, i32 %3, i32 %2 + store i32 %cond5, i32* @z2, align 4 + %4 = load i32* @c, align 4 + %cmp6 = icmp sge i32 %4, %0 + %cond10 = select i1 %cmp6, i32 %3, i32 %2 + 
store i32 %cond10, i32* @z3, align 4 + %cmp11 = icmp sge i32 %0, %4 + %cond15 = select i1 %cmp11, i32 %3, i32 %2 + store i32 %cond15, i32* @z4, align 4 + ret void +} + +define i32 @calc_selgt() nounwind { +entry: + %0 = load i32* @a, align 4 + %1 = load i32* @b, align 4 + %cmp = icmp sgt i32 %0, %1 +; 16: slt ${{[0-9]+}}, ${{[0-9]+}} +; 16: btnez .+4 +; 16: move ${{[0-9]+}}, ${{[0-9]+}} + %2 = load i32* @f, align 4 + %3 = load i32* @t, align 4 + %cond = select i1 %cmp, i32 %2, i32 %3 + store i32 %cond, i32* @z1, align 4 + %cmp1 = icmp sgt i32 %1, %0 + %cond5 = select i1 %cmp1, i32 %3, i32 %2 + store i32 %cond5, i32* @z2, align 4 + %4 = load i32* @c, align 4 + %cmp6 = icmp sgt i32 %4, %0 + %cond10 = select i1 %cmp6, i32 %2, i32 %3 + store i32 %cond10, i32* @z3, align 4 + %cmp11 = icmp sgt i32 %0, %4 + %cond15 = select i1 %cmp11, i32 %2, i32 %3 + store i32 %cond15, i32* @z4, align 4 + ret i32 undef +} + +define void @calc_selle() nounwind { +entry: + %0 = load i32* @a, align 4 + %1 = load i32* @b, align 4 + %cmp = icmp sle i32 %0, %1 + %2 = load i32* @t, align 4 + %3 = load i32* @f, align 4 + %cond = select i1 %cmp, i32 %2, i32 %3 + store i32 %cond, i32* @z1, align 4 +; 16: slt ${{[0-9]+}}, ${{[0-9]+}} +; 16: bteqz .+4 +; 16: move ${{[0-9]+}}, ${{[0-9]+}} + %cmp1 = icmp sle i32 %1, %0 + %cond5 = select i1 %cmp1, i32 %3, i32 %2 + store i32 %cond5, i32* @z2, align 4 + %4 = load i32* @c, align 4 + %cmp6 = icmp sle i32 %4, %0 + %cond10 = select i1 %cmp6, i32 %2, i32 %3 + store i32 %cond10, i32* @z3, align 4 + %cmp11 = icmp sle i32 %0, %4 + %cond15 = select i1 %cmp11, i32 %2, i32 %3 + store i32 %cond15, i32* @z4, align 4 + ret void +} + +define void @calc_selltk() nounwind { +entry: + %0 = load i32* @a, align 4 + %cmp = icmp slt i32 %0, 10 + %1 = load i32* @t, align 4 + %2 = load i32* @f, align 4 + %cond = select i1 %cmp, i32 %1, i32 %2 + store i32 %cond, i32* @z1, align 4 +; 16: slti ${{[0-9]+}}, {{[0-9]+}} +; 16: btnez .+4 +; 16: move ${{[0-9]+}}, ${{[0-9]+}} + %3 = 
load i32* @b, align 4 + %cmp1 = icmp slt i32 %3, 2 + %cond5 = select i1 %cmp1, i32 %2, i32 %1 + store i32 %cond5, i32* @z2, align 4 + %4 = load i32* @c, align 4 + %cmp6 = icmp sgt i32 %4, 2 + %cond10 = select i1 %cmp6, i32 %2, i32 %1 + store i32 %cond10, i32* @z3, align 4 + %cmp11 = icmp sgt i32 %0, 2 + %cond15 = select i1 %cmp11, i32 %2, i32 %1 + store i32 %cond15, i32* @z4, align 4 + ret void +} + + +define void @calc_selne() nounwind { +entry: + %0 = load i32* @a, align 4 + %1 = load i32* @b, align 4 + %cmp = icmp ne i32 %0, %1 + %2 = load i32* @t, align 4 + %3 = load i32* @f, align 4 + %cond = select i1 %cmp, i32 %2, i32 %3 + store i32 %cond, i32* @z1, align 4 +; 16: cmp ${{[0-9]+}}, ${{[0-9]+}} +; 16: btnez .+4 +; 16: move ${{[0-9]+}}, ${{[0-9]+}} + store i32 %cond, i32* @z2, align 4 + %4 = load i32* @c, align 4 + %cmp6 = icmp ne i32 %4, %0 + %cond10 = select i1 %cmp6, i32 %3, i32 %2 + store i32 %cond10, i32* @z3, align 4 + store i32 %cond10, i32* @z4, align 4 + ret void +} + +define void @calc_selnek() nounwind { +entry: + %0 = load i32* @a, align 4 + %cmp = icmp ne i32 %0, 1 + %1 = load i32* @f, align 4 + %2 = load i32* @t, align 4 + %cond = select i1 %cmp, i32 %1, i32 %2 + store i32 %cond, i32* @z1, align 4 +; 16: cmpi ${{[0-9]+}}, 1 +; 16: btnez .+4 +; 16: move ${{[0-9]+}}, ${{[0-9]+}} + %cmp1 = icmp ne i32 %0, 10 + %cond5 = select i1 %cmp1, i32 %2, i32 %1 + store i32 %cond5, i32* @z2, align 4 + %3 = load i32* @b, align 4 + %cmp6 = icmp ne i32 %3, 3 + %cond10 = select i1 %cmp6, i32 %2, i32 %1 + store i32 %cond10, i32* @z3, align 4 +; 16: cmpi ${{[0-9]+}}, 10 +; 16: btnez .+4 +; 16: move ${{[0-9]+}}, ${{[0-9]+}} + %cmp11 = icmp ne i32 %3, 10 + %cond15 = select i1 %cmp11, i32 %1, i32 %2 + store i32 %cond15, i32* @z4, align 4 + ret void +} + +define void @calc_selnez() nounwind { +entry: + %0 = load i32* @a, align 4 + %cmp = icmp ne i32 %0, 0 + %1 = load i32* @f, align 4 + %2 = load i32* @t, align 4 + %cond = select i1 %cmp, i32 %1, i32 %2 + store i32 %cond, 
i32* @z1, align 4 +; 16: bnez ${{[0-9]+}}, .+4 +; 16: move ${{[0-9]+}}, ${{[0-9]+}} + %3 = load i32* @b, align 4 + %cmp1 = icmp ne i32 %3, 0 + %cond5 = select i1 %cmp1, i32 %2, i32 %1 + store i32 %cond5, i32* @z2, align 4 + %4 = load i32* @c, align 4 + %cmp6 = icmp ne i32 %4, 0 + %cond10 = select i1 %cmp6, i32 %1, i32 %2 + store i32 %cond10, i32* @z3, align 4 + store i32 %cond, i32* @z4, align 4 + ret void +} + +define void @calc_selnez2() nounwind { +entry: + %0 = load i32* @a, align 4 + %tobool = icmp ne i32 %0, 0 + %1 = load i32* @f, align 4 + %2 = load i32* @t, align 4 + %cond = select i1 %tobool, i32 %1, i32 %2 + store i32 %cond, i32* @z1, align 4 +; 16: bnez ${{[0-9]+}}, .+4 +; 16: move ${{[0-9]+}}, ${{[0-9]+}} + %3 = load i32* @b, align 4 + %tobool1 = icmp ne i32 %3, 0 + %cond5 = select i1 %tobool1, i32 %2, i32 %1 + store i32 %cond5, i32* @z2, align 4 + %4 = load i32* @c, align 4 + %tobool6 = icmp ne i32 %4, 0 + %cond10 = select i1 %tobool6, i32 %1, i32 %2 + store i32 %cond10, i32* @z3, align 4 + store i32 %cond, i32* @z4, align 4 + ret void +} + +define void @calc_seluge() nounwind { +entry: + %0 = load i32* @a, align 4 + %1 = load i32* @b, align 4 + %cmp = icmp uge i32 %0, %1 + %2 = load i32* @f, align 4 + %3 = load i32* @t, align 4 + %cond = select i1 %cmp, i32 %2, i32 %3 + store i32 %cond, i32* @z1, align 4 +; 16: sltu ${{[0-9]+}}, ${{[0-9]+}} +; 16: bteqz .+4 +; 16: move ${{[0-9]+}}, ${{[0-9]+}} + %cmp1 = icmp uge i32 %1, %0 + %cond5 = select i1 %cmp1, i32 %3, i32 %2 + store i32 %cond5, i32* @z2, align 4 + %4 = load i32* @c, align 4 + %cmp6 = icmp uge i32 %4, %0 + %cond10 = select i1 %cmp6, i32 %3, i32 %2 + store i32 %cond10, i32* @z3, align 4 + %cmp11 = icmp uge i32 %0, %4 + %cond15 = select i1 %cmp11, i32 %3, i32 %2 + store i32 %cond15, i32* @z4, align 4 + ret void +} + +define void @calc_selugt() nounwind { +entry: + %0 = load i32* @a, align 4 + %1 = load i32* @b, align 4 + %cmp = icmp ugt i32 %0, %1 + %2 = load i32* @f, align 4 + %3 = load i32* @t, 
align 4 + %cond = select i1 %cmp, i32 %2, i32 %3 + store i32 %cond, i32* @z1, align 4 +; 16: sltu ${{[0-9]+}}, ${{[0-9]+}} +; 16: btnez .+4 +; 16: move ${{[0-9]+}}, ${{[0-9]+}} + %cmp1 = icmp ugt i32 %1, %0 + %cond5 = select i1 %cmp1, i32 %3, i32 %2 + store i32 %cond5, i32* @z2, align 4 + %4 = load i32* @c, align 4 + %cmp6 = icmp ugt i32 %4, %0 + %cond10 = select i1 %cmp6, i32 %2, i32 %3 + store i32 %cond10, i32* @z3, align 4 + %cmp11 = icmp ugt i32 %0, %4 + %cond15 = select i1 %cmp11, i32 %2, i32 %3 + store i32 %cond15, i32* @z4, align 4 + ret void +} + +define void @calc_selule() nounwind { +entry: + %0 = load i32* @a, align 4 + %1 = load i32* @b, align 4 + %cmp = icmp ule i32 %0, %1 + %2 = load i32* @t, align 4 + %3 = load i32* @f, align 4 + %cond = select i1 %cmp, i32 %2, i32 %3 + store i32 %cond, i32* @z1, align 4 +; 16: sltu ${{[0-9]+}}, ${{[0-9]+}} +; 16: bteqz .+4 +; 16: move ${{[0-9]+}}, ${{[0-9]+}} + %cmp1 = icmp ule i32 %1, %0 + %cond5 = select i1 %cmp1, i32 %3, i32 %2 + store i32 %cond5, i32* @z2, align 4 + %4 = load i32* @c, align 4 + %cmp6 = icmp ule i32 %4, %0 + %cond10 = select i1 %cmp6, i32 %2, i32 %3 + store i32 %cond10, i32* @z3, align 4 + %cmp11 = icmp ule i32 %0, %4 + %cond15 = select i1 %cmp11, i32 %2, i32 %3 + store i32 %cond15, i32* @z4, align 4 + ret void +} diff --git a/test/CodeGen/Mips/seteq.ll b/test/CodeGen/Mips/seteq.ll new file mode 100644 index 0000000..da840c8 --- /dev/null +++ b/test/CodeGen/Mips/seteq.ll @@ -0,0 +1,21 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@i = global i32 1, align 4 +@j = global i32 10, align 4 +@k = global i32 1, align 4 +@r1 = common global i32 0, align 4 +@r2 = common global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @i, align 4 + %1 = load i32* @k, align 4 + %cmp = icmp eq i32 %0, %1 + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @r1, align 4 +; 16: xor $[[REGISTER:[0-9A-Ba-b_]+]], ${{[0-9]+}} +; 16: 
sltiu $[[REGISTER:[0-9A-Ba-b_]+]], 1 +; 16: move ${{[0-9]+}}, $t8 + ret void +} + diff --git a/test/CodeGen/Mips/seteqz.ll b/test/CodeGen/Mips/seteqz.ll new file mode 100644 index 0000000..d445be6 --- /dev/null +++ b/test/CodeGen/Mips/seteqz.ll @@ -0,0 +1,24 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@i = global i32 0, align 4 +@j = global i32 99, align 4 +@r1 = common global i32 0, align 4 +@r2 = common global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @i, align 4 + %cmp = icmp eq i32 %0, 0 + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @r1, align 4 +; 16: sltiu ${{[0-9]+}}, 1 +; 16: move ${{[0-9]+}}, $t8 + %1 = load i32* @j, align 4 + %cmp1 = icmp eq i32 %1, 99 + %conv2 = zext i1 %cmp1 to i32 + store i32 %conv2, i32* @r2, align 4 +; 16: xor $[[REGISTER:[0-9A-Ba-b_]+]], ${{[0-9]+}} +; 16: sltiu $[[REGISTER:[0-9A-Ba-b_]+]], 1 +; 16: move ${{[0-9]+}}, $t8 + ret void +} diff --git a/test/CodeGen/Mips/setge.ll b/test/CodeGen/Mips/setge.ll new file mode 100644 index 0000000..94b499b --- /dev/null +++ b/test/CodeGen/Mips/setge.ll @@ -0,0 +1,27 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@j = global i32 -5, align 4 +@k = global i32 10, align 4 +@l = global i32 20, align 4 +@m = global i32 10, align 4 +@r1 = common global i32 0, align 4 +@r2 = common global i32 0, align 4 +@r3 = common global i32 0, align 4 +@.str = private unnamed_addr constant [22 x i8] c"1 = %i\0A1 = %i\0A0 = %i\0A\00", align 1 + +define void @test() nounwind { +entry: + %0 = load i32* @k, align 4 + %1 = load i32* @j, align 4 + %cmp = icmp sge i32 %0, %1 + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @r1, align 4 +; 16: slt ${{[0-9]+}}, ${{[0-9]+}} +; 16: move $[[REGISTER:[0-9]+]], $t8 +; 16: xor $[[REGISTER]], ${{[0-9]+}} + %2 = load i32* @m, align 4 + %cmp1 = icmp sge i32 %0, %2 + %conv2 = zext i1 %cmp1 to i32 + store i32 %conv2, 
i32* @r2, align 4 + ret void +} diff --git a/test/CodeGen/Mips/setgek.ll b/test/CodeGen/Mips/setgek.ll new file mode 100644 index 0000000..b6bae09 --- /dev/null +++ b/test/CodeGen/Mips/setgek.ll @@ -0,0 +1,18 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@k = global i32 10, align 4 +@r1 = common global i32 0, align 4 +@r2 = common global i32 0, align 4 +@r3 = common global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @k, align 4 + %cmp = icmp sgt i32 %0, -32769 + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @r1, align 4 +; 16: slti ${{[0-9]+}}, -32768 +; 16: move ${{[0-9]+}}, $t8 +; 16: xor ${{[0-9]+}}, ${{[0-9]+}} + ret void +} diff --git a/test/CodeGen/Mips/setle.ll b/test/CodeGen/Mips/setle.ll new file mode 100644 index 0000000..f36fb43 --- /dev/null +++ b/test/CodeGen/Mips/setle.ll @@ -0,0 +1,26 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@j = global i32 -5, align 4 +@k = global i32 10, align 4 +@l = global i32 20, align 4 +@m = global i32 10, align 4 +@r1 = common global i32 0, align 4 +@r2 = common global i32 0, align 4 +@r3 = common global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @j, align 4 + %1 = load i32* @k, align 4 + %cmp = icmp sle i32 %0, %1 + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @r1, align 4 +; 16: slt ${{[0-9]+}}, ${{[0-9]+}} +; 16: move $[[REGISTER:[0-9]+]], $t8 +; 16: xor $[[REGISTER]], ${{[0-9]+}} + %2 = load i32* @m, align 4 + %cmp1 = icmp sle i32 %2, %1 + %conv2 = zext i1 %cmp1 to i32 + store i32 %conv2, i32* @r2, align 4 + ret void +} diff --git a/test/CodeGen/Mips/setlt.ll b/test/CodeGen/Mips/setlt.ll new file mode 100644 index 0000000..435be8e --- /dev/null +++ b/test/CodeGen/Mips/setlt.ll @@ -0,0 +1,21 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@j = global i32 -5, align 
4 +@k = global i32 10, align 4 +@l = global i32 20, align 4 +@m = global i32 10, align 4 +@r1 = common global i32 0, align 4 +@r2 = common global i32 0, align 4 +@r3 = common global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @j, align 4 + %1 = load i32* @k, align 4 + %cmp = icmp slt i32 %0, %1 + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @r1, align 4 +; 16: slt ${{[0-9]+}}, ${{[0-9]+}} +; 16: move ${{[0-9]+}}, $t8 + ret void +} diff --git a/test/CodeGen/Mips/setltk.ll b/test/CodeGen/Mips/setltk.ll new file mode 100644 index 0000000..c0b610e --- /dev/null +++ b/test/CodeGen/Mips/setltk.ll @@ -0,0 +1,20 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@j = global i32 -5, align 4 +@k = global i32 10, align 4 +@l = global i32 20, align 4 +@m = global i32 10, align 4 +@r1 = common global i32 0, align 4 +@r2 = common global i32 0, align 4 +@r3 = common global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @j, align 4 + %cmp = icmp slt i32 %0, 10 + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @r1, align 4 +; 16: slti $[[REGISTER:[0-9]+]], 10 +; 16: move $[[REGISTER]], $t8 + ret void +} diff --git a/test/CodeGen/Mips/setne.ll b/test/CodeGen/Mips/setne.ll new file mode 100644 index 0000000..6460c83 --- /dev/null +++ b/test/CodeGen/Mips/setne.ll @@ -0,0 +1,20 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@i = global i32 1, align 4 +@j = global i32 10, align 4 +@k = global i32 1, align 4 +@r1 = common global i32 0, align 4 +@r2 = common global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @i, align 4 + %1 = load i32* @k, align 4 + %cmp = icmp ne i32 %0, %1 + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @r1, align 4 +; 16: xor $[[REGISTER:[0-9]+]], ${{[0-9]+}} +; 16: sltu ${{[0-9]+}}, $[[REGISTER]] +; 16: move ${{[0-9]+}}, $t8 + ret void +} diff --git 
a/test/CodeGen/Mips/setuge.ll b/test/CodeGen/Mips/setuge.ll new file mode 100644 index 0000000..ac72b66 --- /dev/null +++ b/test/CodeGen/Mips/setuge.ll @@ -0,0 +1,26 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@j = global i32 5, align 4 +@k = global i32 10, align 4 +@l = global i32 20, align 4 +@m = global i32 10, align 4 +@r1 = common global i32 0, align 4 +@r2 = common global i32 0, align 4 +@r3 = common global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @k, align 4 + %1 = load i32* @j, align 4 + %cmp = icmp uge i32 %0, %1 + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @r1, align 4 +; 16: sltu ${{[0-9]+}}, ${{[0-9]+}} +; 16: move $[[REGISTER:[0-9]+]], $t8 +; 16: xor $[[REGISTER]], ${{[0-9]+}} + %2 = load i32* @m, align 4 + %cmp1 = icmp uge i32 %0, %2 + %conv2 = zext i1 %cmp1 to i32 + store i32 %conv2, i32* @r2, align 4 + ret void +} diff --git a/test/CodeGen/Mips/setugt.ll b/test/CodeGen/Mips/setugt.ll new file mode 100644 index 0000000..328f0e3 --- /dev/null +++ b/test/CodeGen/Mips/setugt.ll @@ -0,0 +1,21 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@j = global i32 5, align 4 +@k = global i32 10, align 4 +@l = global i32 20, align 4 +@m = global i32 10, align 4 +@r1 = common global i32 0, align 4 +@r2 = common global i32 0, align 4 +@r3 = common global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @k, align 4 + %1 = load i32* @j, align 4 + %cmp = icmp ugt i32 %0, %1 + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @r1, align 4 +; 16: sltu ${{[0-9]+}}, ${{[0-9]+}} +; 16: move ${{[0-9]+}}, $t8 + ret void +} diff --git a/test/CodeGen/Mips/setule.ll b/test/CodeGen/Mips/setule.ll new file mode 100644 index 0000000..792f2ae --- /dev/null +++ b/test/CodeGen/Mips/setule.ll @@ -0,0 +1,26 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s 
-check-prefix=16 + +@j = global i32 5, align 4 +@k = global i32 10, align 4 +@l = global i32 20, align 4 +@m = global i32 10, align 4 +@r1 = common global i32 0, align 4 +@r2 = common global i32 0, align 4 +@r3 = common global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @j, align 4 + %1 = load i32* @k, align 4 + %cmp = icmp ule i32 %0, %1 + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @r1, align 4 +; 16: sltu ${{[0-9]+}}, ${{[0-9]+}} +; 16: move $[[REGISTER:[0-9]+]], $t8 +; 16: xor $[[REGISTER]], ${{[0-9]+}} + %2 = load i32* @m, align 4 + %cmp1 = icmp ule i32 %2, %1 + %conv2 = zext i1 %cmp1 to i32 + store i32 %conv2, i32* @r2, align 4 + ret void +} diff --git a/test/CodeGen/Mips/setult.ll b/test/CodeGen/Mips/setult.ll new file mode 100644 index 0000000..56d2e8d --- /dev/null +++ b/test/CodeGen/Mips/setult.ll @@ -0,0 +1,21 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@j = global i32 5, align 4 +@k = global i32 10, align 4 +@l = global i32 20, align 4 +@m = global i32 10, align 4 +@r1 = common global i32 0, align 4 +@r2 = common global i32 0, align 4 +@r3 = common global i32 0, align 4 + +define void @test() nounwind { +entry: + %0 = load i32* @j, align 4 + %1 = load i32* @k, align 4 + %cmp = icmp ult i32 %0, %1 + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @r1, align 4 +; 16: sltu ${{[0-9]+}}, ${{[0-9]+}} +; 16: move ${{[0-9]+}}, $t8 + ret void +} diff --git a/test/CodeGen/Mips/setultk.ll b/test/CodeGen/Mips/setultk.ll new file mode 100644 index 0000000..75b270e --- /dev/null +++ b/test/CodeGen/Mips/setultk.ll @@ -0,0 +1,20 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@j = global i32 5, align 4 +@k = global i32 10, align 4 +@l = global i32 20, align 4 +@m = global i32 10, align 4 +@r1 = common global i32 0, align 4 +@r2 = common global i32 0, align 4 +@r3 = common global i32 0, align 4 + +define 
void @test() nounwind { +entry: + %0 = load i32* @j, align 4 + %cmp = icmp ult i32 %0, 10 + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @r1, align 4 +; 16: sltiu $[[REGISTER:[0-9]+]], 10 +; 16: move $[[REGISTER]], $t8 + ret void +} diff --git a/test/CodeGen/Mips/small-section-reserve-gp.ll b/test/CodeGen/Mips/small-section-reserve-gp.ll new file mode 100644 index 0000000..03503fb --- /dev/null +++ b/test/CodeGen/Mips/small-section-reserve-gp.ll @@ -0,0 +1,12 @@ +; RUN: llc -mtriple=mipsel-sde-elf -march=mipsel -relocation-model=static < %s \ +; RUN: | FileCheck %s + +@i = internal unnamed_addr global i32 0, align 4 + +define i32 @geti() nounwind readonly { +entry: +; CHECK: lw ${{[0-9]+}}, %gp_rel(i)($gp) + %0 = load i32* @i, align 4 + ret i32 %0 +} + diff --git a/test/CodeGen/Mips/stchar.ll b/test/CodeGen/Mips/stchar.ll new file mode 100644 index 0000000..c00c9fd --- /dev/null +++ b/test/CodeGen/Mips/stchar.ll @@ -0,0 +1,90 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16_h +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16_b + +@.str = private unnamed_addr constant [9 x i8] c"%hd %c \0A\00", align 1 +@sp = common global i16* null, align 4 +@cp = common global i8* null, align 4 + +define void @p1(i16 signext %s, i8 signext %c) nounwind { +entry: + %conv = sext i16 %s to i32 + %conv1 = sext i8 %c to i32 + %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 0), i32 %conv, i32 %conv1) nounwind + ret void +} + +declare i32 @printf(i8* nocapture, ...) 
nounwind + +define void @p2() nounwind { +entry: + %0 = load i16** @sp, align 4 + %1 = load i16* %0, align 2 + %2 = load i8** @cp, align 4 + %3 = load i8* %2, align 1 + %conv.i = sext i16 %1 to i32 + %conv1.i = sext i8 %3 to i32 + %call.i = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 0), i32 %conv.i, i32 %conv1.i) nounwind + %4 = load i16** @sp, align 4 + store i16 32, i16* %4, align 2 + %5 = load i8** @cp, align 4 + store i8 97, i8* %5, align 1 + ret void +} + +define void @test() nounwind { +entry: + %s = alloca i16, align 4 + %c = alloca i8, align 4 + store i16 16, i16* %s, align 4 + store i8 99, i8* %c, align 4 + store i16* %s, i16** @sp, align 4 + store i8* %c, i8** @cp, align 4 + %call.i.i = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 0), i32 16, i32 99) nounwind + %0 = load i16** @sp, align 4 + store i16 32, i16* %0, align 2 + %1 = load i8** @cp, align 4 + store i8 97, i8* %1, align 1 + %2 = load i16* %s, align 4 + %3 = load i8* %c, align 4 + %conv.i = sext i16 %2 to i32 + %conv1.i = sext i8 %3 to i32 + %call.i = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 0), i32 %conv.i, i32 %conv1.i) nounwind + ret void +; 16_b: test: +; 16_h: test: +; 16_b: sb ${{[0-9]+}}, [[offset1:[0-9]+]](${{[0-9]+}}) +; 16_b: lb ${{[0-9]+}}, [[offset1]](${{[0-9]+}}) +; 16_h: sh ${{[0-9]+}}, [[offset2:[0-9]+]](${{[0-9]+}}) +; 16_h: lh ${{[0-9]+}}, [[offset2]](${{[0-9]+}}) +} + +define i32 @main() nounwind { +entry: + %s.i = alloca i16, align 4 + %c.i = alloca i8, align 4 + %0 = bitcast i16* %s.i to i8* + call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind + call void @llvm.lifetime.start(i64 -1, i8* %c.i) nounwind + store i16 16, i16* %s.i, align 4 + store i8 99, i8* %c.i, align 4 + store i16* %s.i, i16** @sp, align 4 + store i8* %c.i, i8** @cp, align 4 + %call.i.i.i = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 
0), i32 16, i32 99) nounwind + %1 = load i16** @sp, align 4 + store i16 32, i16* %1, align 2 + %2 = load i8** @cp, align 4 + store i8 97, i8* %2, align 1 + %3 = load i16* %s.i, align 4 + %4 = load i8* %c.i, align 4 + %conv.i.i = sext i16 %3 to i32 + %conv1.i.i = sext i8 %4 to i32 + %call.i.i = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 0), i32 %conv.i.i, i32 %conv1.i.i) nounwind + call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind + call void @llvm.lifetime.end(i64 -1, i8* %c.i) nounwind + ret i32 0 +} + +declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind + +declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind + diff --git a/test/CodeGen/Mips/stldst.ll b/test/CodeGen/Mips/stldst.ll new file mode 100644 index 0000000..4182b9e --- /dev/null +++ b/test/CodeGen/Mips/stldst.ll @@ -0,0 +1,41 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +@kkkk = global i32 67, align 4 +@llll = global i32 33, align 4 +@mmmm = global i32 44, align 4 +@nnnn = global i32 55, align 4 +@oooo = global i32 32, align 4 +@pppp = global i32 41, align 4 +@qqqq = global i32 59, align 4 +@rrrr = global i32 60, align 4 +@.str = private unnamed_addr constant [32 x i8] c"%i %i %i %i %i %i %i %i %i %i \0A\00", align 1 + +define i32 @main() nounwind { +entry: + %0 = load i32* @kkkk, align 4 + %1 = load i32* @llll, align 4 + %add = add nsw i32 %0, 10 + %add1 = add nsw i32 %1, 10 + %2 = load i32* @mmmm, align 4 + %sub = add nsw i32 %2, -3 + %3 = load i32* @nnnn, align 4 + %add2 = add nsw i32 %3, 10 + %4 = load i32* @oooo, align 4 + %add3 = add nsw i32 %4, 4 + %5 = load i32* @pppp, align 4 + %sub4 = add nsw i32 %5, -5 + %6 = load i32* @qqqq, align 4 + %sub5 = add nsw i32 %6, -10 + %7 = load i32* @rrrr, align 4 + %add6 = add nsw i32 %7, 6 + + %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([32 x i8]* @.str, i32 0, i32 0), i32 %sub5, i32 %add6, i32 %0, i32 %1, i32 
%2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7) nounwind + %call7 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([32 x i8]* @.str, i32 0, i32 0), i32 %0, i32 %1, i32 %add, i32 %add1, i32 %sub, i32 %add2, i32 %add3, i32 %sub4, i32 %sub5, i32 %add6) nounwind + ret i32 0 +} +; 16: sw ${{[0-9]+}}, {{[0-9]+}} ( $sp ); # 4-byte Folded Spill +; 16: lw ${{[0-9]+}}, {{[0-9]+}} ( $sp ); # 4-byte Folded Reload +; 16: sw ${{[0-9]+}}, {{[0-9]+}} ( $sp ); # 4-byte Folded Spill +; 16: lw ${{[0-9]+}}, {{[0-9]+}} ( $sp ); # 4-byte Folded Reload + +declare i32 @printf(i8* nocapture, ...) nounwind diff --git a/test/CodeGen/Mips/tailcall.ll b/test/CodeGen/Mips/tailcall.ll new file mode 100644 index 0000000..bcd33fc --- /dev/null +++ b/test/CodeGen/Mips/tailcall.ll @@ -0,0 +1,245 @@ +; RUN: llc -march=mipsel -relocation-model=pic -enable-mips-tail-calls < %s | \ +; RUN: FileCheck %s -check-prefix=PIC32 +; RUN: llc -march=mipsel -relocation-model=static \ +; RUN: -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=STATIC32 +; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=+n64 -enable-mips-tail-calls \ +; RUN: < %s | FileCheck %s -check-prefix=N64 +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic \ +; RUN: -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=PIC16 + +@g0 = common global i32 0, align 4 +@g1 = common global i32 0, align 4 +@g2 = common global i32 0, align 4 +@g3 = common global i32 0, align 4 +@g4 = common global i32 0, align 4 +@g5 = common global i32 0, align 4 +@g6 = common global i32 0, align 4 +@g7 = common global i32 0, align 4 +@g8 = common global i32 0, align 4 +@g9 = common global i32 0, align 4 + +define i32 @caller1(i32 %a0) nounwind { +entry: +; PIC32-NOT: jalr +; STATIC32-NOT: jal +; N64-NOT: jalr +; PIC16: jalrc + + %call = tail call i32 @callee1(i32 1, i32 1, i32 1, i32 %a0) nounwind + ret i32 %call +} + +declare i32 @callee1(i32, i32, i32, i32) + +define i32 @caller2(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind { +entry: 
+; PIC32: jalr +; STATIC32: jal +; N64-NOT: jalr +; PIC16: jalrc + + %call = tail call i32 @callee2(i32 1, i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind + ret i32 %call +} + +declare i32 @callee2(i32, i32, i32, i32, i32) + +define i32 @caller3(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4) nounwind { +entry: +; PIC32: jalr +; STATIC32: jal +; N64-NOT: jalr +; PIC16: jalrc + + %call = tail call i32 @callee3(i32 1, i32 1, i32 1, i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4) nounwind + ret i32 %call +} + +declare i32 @callee3(i32, i32, i32, i32, i32, i32, i32, i32) + +define i32 @caller4(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind { +entry: +; PIC32: jalr +; STATIC32: jal +; N64: jalr +; PIC16: jalrc + + %call = tail call i32 @callee4(i32 1, i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind + ret i32 %call +} + +declare i32 @callee4(i32, i32, i32, i32, i32, i32, i32, i32, i32) + +define i32 @caller5() nounwind readonly { +entry: +; PIC32: .ent caller5 +; PIC32-NOT: jalr +; PIC32: .end caller5 +; STATIC32: .ent caller5 +; STATIC32-NOT: jal +; STATIC32: .end caller5 +; N64: .ent caller5 +; N64-NOT: jalr +; N64: .end caller5 +; PIC16: .ent caller5 +; PIC16: jalrc +; PIC16: .end caller5 + + %0 = load i32* @g0, align 4 + %1 = load i32* @g1, align 4 + %2 = load i32* @g2, align 4 + %3 = load i32* @g3, align 4 + %4 = load i32* @g4, align 4 + %5 = load i32* @g5, align 4 + %6 = load i32* @g6, align 4 + %7 = load i32* @g7, align 4 + %8 = load i32* @g8, align 4 + %9 = load i32* @g9, align 4 + %call = tail call fastcc i32 @callee5(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9) + ret i32 %call +} + +define internal fastcc i32 @callee5(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9) nounwind readnone noinline { +entry: + %add = add nsw i32 %a1, %a0 + %add1 = add nsw i32 %add, %a2 + %add2 = add nsw i32 %add1, %a3 + %add3 = add nsw i32 %add2, %a4 + 
%add4 = add nsw i32 %add3, %a5 + %add5 = add nsw i32 %add4, %a6 + %add6 = add nsw i32 %add5, %a7 + %add7 = add nsw i32 %add6, %a8 + %add8 = add nsw i32 %add7, %a9 + ret i32 %add8 +} + +declare i32 @callee8(i32, ...) + +define i32 @caller8_0() nounwind { +entry: + %call = tail call fastcc i32 @caller8_1() + ret i32 %call +} + +define internal fastcc i32 @caller8_1() nounwind noinline { +entry: +; PIC32: .ent caller8_1 +; PIC32: jalr +; PIC32: .end caller8_1 +; STATIC32: .ent caller8_1 +; STATIC32: jal +; STATIC32: .end caller8_1 +; N64: .ent caller8_1 +; N64-NOT: jalr +; N64: .end caller8_1 +; PIC16: .ent caller8_1 +; PIC16: jalrc +; PIC16: .end caller8_1 + + %call = tail call i32 (i32, ...)* @callee8(i32 2, i32 1) nounwind + ret i32 %call +} + +%struct.S = type { [2 x i32] } + +@gs1 = external global %struct.S + +declare i32 @callee9(%struct.S* byval) + +define i32 @caller9_0() nounwind { +entry: + %call = tail call fastcc i32 @caller9_1() + ret i32 %call +} + +define internal fastcc i32 @caller9_1() nounwind noinline { +entry: +; PIC32: .ent caller9_1 +; PIC32: jalr +; PIC32: .end caller9_1 +; STATIC32: .ent caller9_1 +; STATIC32: jal +; STATIC32: .end caller9_1 +; N64: .ent caller9_1 +; N64: jalr +; N64: .end caller9_1 +; PIC16: .ent caller9_1 +; PIC16: jalrc +; PIC16: .end caller9_1 + + %call = tail call i32 @callee9(%struct.S* byval @gs1) nounwind + ret i32 %call +} + +declare i32 @callee10(i32, i32, i32, i32, i32, i32, i32, i32, i32) + +define i32 @caller10(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) nounwind { +entry: +; PIC32: .ent caller10 +; PIC32-NOT: jalr +; STATIC32: .ent caller10 +; STATIC32-NOT: jal +; N64: .ent caller10 +; N64-NOT: jalr +; PIC16: .ent caller10 +; PIC16: jalrc + + %call = tail call i32 @callee10(i32 %a8, i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind + ret i32 %call +} + +declare i32 @callee11(%struct.S* byval) + +define i32 @caller11() nounwind noinline { +entry: 
+; PIC32: .ent caller11 +; PIC32: jalr +; STATIC32: .ent caller11 +; STATIC32: jal +; N64: .ent caller11 +; N64: jalr +; PIC16: .ent caller11 +; PIC16: jalrc + + %call = tail call i32 @callee11(%struct.S* byval @gs1) nounwind + ret i32 %call +} + +declare i32 @callee12() + +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind + +define i32 @caller12(%struct.S* nocapture byval %a0) nounwind { +entry: +; PIC32: .ent caller12 +; PIC32: jalr +; STATIC32: .ent caller12 +; STATIC32: jal +; N64: .ent caller12 +; N64: jalr +; PIC16: .ent caller12 +; PIC16: jalrc + + %0 = bitcast %struct.S* %a0 to i8* + tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast (%struct.S* @gs1 to i8*), i8* %0, i32 8, i32 4, i1 false) + %call = tail call i32 @callee12() nounwind + ret i32 %call +} + +declare i32 @callee13(i32, ...) + +define i32 @caller13() nounwind { +entry: +; PIC32: .ent caller13 +; PIC32-NOT: jalr +; STATIC32: .ent caller13 +; STATIC32-NOT: jal +; N64: .ent caller13 +; N64-NOT: jalr +; PIC16: .ent caller13 +; PIC16: jalrc + + %call = tail call i32 (i32, ...)* @callee13(i32 1, i32 2) nounwind + ret i32 %call +} + diff --git a/test/CodeGen/Mips/tls-alias.ll b/test/CodeGen/Mips/tls-alias.ll index d681091..ce98cc8 100644 --- a/test/CodeGen/Mips/tls-alias.ll +++ b/test/CodeGen/Mips/tls-alias.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=mipsel -relocation-model=pic < %s | FileCheck %s +; RUN: llc -march=mipsel -relocation-model=pic -disable-mips-delay-filler < %s | FileCheck %s @foo = thread_local global i32 42 @bar = hidden alias i32* @foo diff --git a/test/CodeGen/Mips/tls.ll b/test/CodeGen/Mips/tls.ll index a7ddb96..72d30dc 100644 --- a/test/CodeGen/Mips/tls.ll +++ b/test/CodeGen/Mips/tls.ll @@ -1,8 +1,10 @@ -; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=PIC -; RUN: llc -march=mipsel -relocation-model=static < %s \ -; RUN: | FileCheck %s -check-prefix=STATIC -; RUN: llc -march=mipsel -relocation-model=static < %s \ -; RUN: 
-mips-fix-global-base-reg=false | FileCheck %s -check-prefix=STATICGP +; RUN: llc -march=mipsel -disable-mips-delay-filler < %s | \ +; RUN: FileCheck %s -check-prefix=PIC +; RUN: llc -march=mipsel -relocation-model=static -disable-mips-delay-filler < \ +; RUN: %s | FileCheck %s -check-prefix=STATIC +; RUN: llc -march=mipsel -relocation-model=static -disable-mips-delay-filler \ +; RUN: -mips-fix-global-base-reg=false < %s | \ +; RUN: FileCheck %s -check-prefix=STATICGP @t1 = thread_local global i32 0, align 4 diff --git a/test/CodeGen/Mips/tls16.ll b/test/CodeGen/Mips/tls16.ll new file mode 100644 index 0000000..861864b --- /dev/null +++ b/test/CodeGen/Mips/tls16.ll @@ -0,0 +1,13 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=PIC16 + +@a = thread_local global i32 4, align 4 + +define i32 @foo() nounwind readonly { +entry: + %0 = load i32* @a, align 4 +; PIC16: lw ${{[0-9]+}}, %call16(__tls_get_addr)(${{[0-9]+}}) +; PIC16: addiu ${{[0-9]+}}, %tlsgd(a) + ret i32 %0 +} + + diff --git a/test/CodeGen/Mips/tls16_2.ll b/test/CodeGen/Mips/tls16_2.ll new file mode 100644 index 0000000..b33e3c37 --- /dev/null +++ b/test/CodeGen/Mips/tls16_2.ll @@ -0,0 +1,15 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=PIC16 + +@f.i = internal thread_local unnamed_addr global i32 1, align 4 + +define i8* @f(i8* nocapture %a) nounwind { +entry: + %0 = load i32* @f.i, align 4 + %inc = add nsw i32 %0, 1 + store i32 %inc, i32* @f.i, align 4 + %1 = inttoptr i32 %inc to i8* +; PIC16: addiu ${{[0-9]+}}, %tlsldm(f.i) + ret i8* %1 +} + + diff --git a/test/CodeGen/Mips/uitofp.ll b/test/CodeGen/Mips/uitofp.ll new file mode 100644 index 0000000..aff70c2 --- /dev/null +++ b/test/CodeGen/Mips/uitofp.ll @@ -0,0 +1,12 @@ +; RUN: llc -march=mips -mattr=+single-float < %s + +define void @f0() nounwind { +entry: + %b = alloca i32, align 4 + %a = alloca float, align 4 + store volatile i32 1, i32* %b, 
align 4 + %0 = load volatile i32* %b, align 4 + %conv = uitofp i32 %0 to float + store float %conv, float* %a, align 4 + ret void +} diff --git a/test/CodeGen/Mips/ul1.ll b/test/CodeGen/Mips/ul1.ll new file mode 100644 index 0000000..7e64ff4 --- /dev/null +++ b/test/CodeGen/Mips/ul1.ll @@ -0,0 +1,15 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 +%struct.ua = type <{ i16, i32 }> + +@foo = common global %struct.ua zeroinitializer, align 1 + +define i32 @main() nounwind { +entry: + store i32 10, i32* getelementptr inbounds (%struct.ua* @foo, i32 0, i32 1), align 1 +; 16: sb ${{[0-9]+}}, {{[0-9]+}}(${{[0-9]+}}) +; 16: sb ${{[0-9]+}}, {{[0-9]+}}(${{[0-9]+}}) +; 16: sb ${{[0-9]+}}, {{[0-9]+}}(${{[0-9]+}}) +; 16: sb ${{[0-9]+}}, {{[0-9]+}}(${{[0-9]+}}) + ret i32 0 +} + diff --git a/test/CodeGen/Mips/vector-load-store.ll b/test/CodeGen/Mips/vector-load-store.ll new file mode 100644 index 0000000..d889963 --- /dev/null +++ b/test/CodeGen/Mips/vector-load-store.ll @@ -0,0 +1,27 @@ +; RUN: llc -march=mipsel -mattr=+dsp < %s | FileCheck %s + +@g1 = common global <2 x i16> zeroinitializer, align 4 +@g0 = common global <2 x i16> zeroinitializer, align 4 +@g3 = common global <4 x i8> zeroinitializer, align 4 +@g2 = common global <4 x i8> zeroinitializer, align 4 + +define void @func_v2i16() nounwind { +entry: +; CHECK: lw +; CHECK: sw + + %0 = load <2 x i16>* @g1, align 4 + store <2 x i16> %0, <2 x i16>* @g0, align 4 + ret void +} + +define void @func_v4i8() nounwind { +entry: +; CHECK: lw +; CHECK: sw + + %0 = load <4 x i8>* @g3, align 4 + store <4 x i8> %0, <4 x i8>* @g2, align 4 + ret void +} + diff --git a/test/CodeGen/NVPTX/global-ordering.ll b/test/CodeGen/NVPTX/global-ordering.ll new file mode 100644 index 0000000..43394a7 --- /dev/null +++ b/test/CodeGen/NVPTX/global-ordering.ll @@ -0,0 +1,20 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32 +; RUN: llc < %s -march=nvptx64 
-mcpu=sm_20 | FileCheck %s --check-prefix=PTX64 + +; Make sure we emit these globals in def-use order + + +; PTX32: .visible .global .align 1 .u8 a = 2; +; PTX32-NEXT: .visible .global .align 4 .u32 a2 = a; +; PTX64: .visible .global .align 1 .u8 a = 2; +; PTX64-NEXT: .visible .global .align 8 .u64 a2 = a; +@a2 = addrspace(1) global i8 addrspace(1)* @a +@a = addrspace(1) global i8 2 + + +; PTX32: .visible .global .align 1 .u8 b = 1; +; PTX32-NEXT: .visible .global .align 4 .u32 b2[2] = {b, b}; +; PTX64: .visible .global .align 1 .u8 b = 1; +; PTX64-NEXT: .visible .global .align 8 .u64 b2[2] = {b, b}; +@b2 = addrspace(1) global [2 x i8 addrspace(1)*] [i8 addrspace(1)* @b, i8 addrspace(1)* @b] +@b = addrspace(1) global i8 1 diff --git a/test/CodeGen/NVPTX/param-align.ll b/test/CodeGen/NVPTX/param-align.ll new file mode 100644 index 0000000..84ccb65 --- /dev/null +++ b/test/CodeGen/NVPTX/param-align.ll @@ -0,0 +1,25 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + +;;; Need 4-byte alignment on float* passed byval +define ptx_device void @t1(float* byval %x) { +; CHECK: .func t1 +; CHECK: .param .align 4 .b8 t1_param_0[4] + ret void +} + + +;;; Need 8-byte alignment on double* passed byval +define ptx_device void @t2(double* byval %x) { +; CHECK: .func t2 +; CHECK: .param .align 8 .b8 t2_param_0[8] + ret void +} + + +;;; Need 4-byte alignment on float2* passed byval +%struct.float2 = type { float, float } +define ptx_device void @t3(%struct.float2* byval %x) { +; CHECK: .func t3 +; CHECK: .param .align 4 .b8 t3_param_0[8] + ret void +} diff --git a/test/CodeGen/NVPTX/pr13291-i1-store.ll b/test/CodeGen/NVPTX/pr13291-i1-store.ll new file mode 100644 index 0000000..779f779 --- /dev/null +++ b/test/CodeGen/NVPTX/pr13291-i1-store.ll @@ -0,0 +1,26 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64 + +define ptx_kernel void @t1(i1* %a) { +; PTX32: mov.u16 
%rc{{[0-9]+}}, 0; +; PTX32-NEXT: st.u8 [%r{{[0-9]+}}], %rc{{[0-9]+}}; +; PTX64: mov.u16 %rc{{[0-9]+}}, 0; +; PTX64-NEXT: st.u8 [%rl{{[0-9]+}}], %rc{{[0-9]+}}; + store i1 false, i1* %a + ret void +} + + +define ptx_kernel void @t2(i1* %a, i8* %b) { +; PTX32: ld.u8 %rc{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: and.b16 temp, %rc{{[0-9]+}}, 1; +; PTX32: setp.b16.eq %p{{[0-9]+}}, temp, 1; +; PTX64: ld.u8 %rc{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: and.b16 temp, %rc{{[0-9]+}}, 1; +; PTX64: setp.b16.eq %p{{[0-9]+}}, temp, 1; + + %t1 = load i1* %a + %t2 = select i1 %t1, i8 1, i8 2 + store i8 %t2, i8* %b + ret void +} diff --git a/test/CodeGen/NVPTX/ptx-version-30.ll b/test/CodeGen/NVPTX/ptx-version-30.ll new file mode 100644 index 0000000..0422b01 --- /dev/null +++ b/test/CodeGen/NVPTX/ptx-version-30.ll @@ -0,0 +1,6 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -mattr=ptx30 | FileCheck %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=ptx30 | FileCheck %s + + +; CHECK: .version 3.0 + diff --git a/test/CodeGen/NVPTX/ptx-version-31.ll b/test/CodeGen/NVPTX/ptx-version-31.ll new file mode 100644 index 0000000..d6e5730 --- /dev/null +++ b/test/CodeGen/NVPTX/ptx-version-31.ll @@ -0,0 +1,6 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -mattr=ptx31 | FileCheck %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=ptx31 | FileCheck %s + + +; CHECK: .version 3.1 + diff --git a/test/CodeGen/NVPTX/sm-version-10.ll b/test/CodeGen/NVPTX/sm-version-10.ll new file mode 100644 index 0000000..9324a37 --- /dev/null +++ b/test/CodeGen/NVPTX/sm-version-10.ll @@ -0,0 +1,6 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s + + +; CHECK: .target sm_10 + diff --git a/test/CodeGen/NVPTX/sm-version-11.ll b/test/CodeGen/NVPTX/sm-version-11.ll new file mode 100644 index 0000000..9033a4e --- /dev/null +++ b/test/CodeGen/NVPTX/sm-version-11.ll @@ -0,0 +1,6 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_11 | FileCheck %s +; RUN: llc < %s 
-march=nvptx64 -mcpu=sm_11 | FileCheck %s + + +; CHECK: .target sm_11 + diff --git a/test/CodeGen/NVPTX/sm-version-12.ll b/test/CodeGen/NVPTX/sm-version-12.ll new file mode 100644 index 0000000..d8ee85c --- /dev/null +++ b/test/CodeGen/NVPTX/sm-version-12.ll @@ -0,0 +1,6 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_12 | FileCheck %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_12 | FileCheck %s + + +; CHECK: .target sm_12 + diff --git a/test/CodeGen/NVPTX/sm-version-13.ll b/test/CodeGen/NVPTX/sm-version-13.ll new file mode 100644 index 0000000..ad67d64 --- /dev/null +++ b/test/CodeGen/NVPTX/sm-version-13.ll @@ -0,0 +1,6 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_13 | FileCheck %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_13 | FileCheck %s + + +; CHECK: .target sm_13 + diff --git a/test/CodeGen/NVPTX/sm-version-20.ll b/test/CodeGen/NVPTX/sm-version-20.ll new file mode 100644 index 0000000..c21f49e --- /dev/null +++ b/test/CodeGen/NVPTX/sm-version-20.ll @@ -0,0 +1,6 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s + + +; CHECK: .target sm_20 + diff --git a/test/CodeGen/NVPTX/sm-version-21.ll b/test/CodeGen/NVPTX/sm-version-21.ll new file mode 100644 index 0000000..4fb6de3 --- /dev/null +++ b/test/CodeGen/NVPTX/sm-version-21.ll @@ -0,0 +1,6 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_21 | FileCheck %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_21 | FileCheck %s + + +; CHECK: .target sm_21 + diff --git a/test/CodeGen/NVPTX/sm-version-30.ll b/test/CodeGen/NVPTX/sm-version-30.ll new file mode 100644 index 0000000..692b49a0 --- /dev/null +++ b/test/CodeGen/NVPTX/sm-version-30.ll @@ -0,0 +1,6 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_30 | FileCheck %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s + + +; CHECK: .target sm_30 + diff --git a/test/CodeGen/NVPTX/sm-version-35.ll b/test/CodeGen/NVPTX/sm-version-35.ll new file mode 100644 index 0000000..25368a0 --- /dev/null +++ 
b/test/CodeGen/NVPTX/sm-version-35.ll @@ -0,0 +1,6 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_35 | FileCheck %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s + + +; CHECK: .target sm_35 + diff --git a/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll b/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll index 0003a17..b95ac68 100644 --- a/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll +++ b/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll @@ -9,9 +9,8 @@ target triple = "powerpc-apple-darwin11.0" define void @foo() nounwind ssp { entry: -; Better: mtctr r12 -; CHECK: mr r12, [[REG:r[0-9]+]] -; CHECK: mtctr [[REG]] +; CHECK: mtctr r12 +; CHECK: bctrl %0 = load void (...)** @p, align 4 ; <void (...)*> [#uses=1] call void (...)* %0() nounwind br label %return diff --git a/test/CodeGen/PowerPC/2012-09-16-TOC-entry-check.ll b/test/CodeGen/PowerPC/2012-09-16-TOC-entry-check.ll new file mode 100644 index 0000000..9d2e390 --- /dev/null +++ b/test/CodeGen/PowerPC/2012-09-16-TOC-entry-check.ll @@ -0,0 +1,27 @@ +; RUN: llc < %s | FileCheck %s +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +; This test check if the TOC entry symbol name won't clash with global .LC0 +; and .LC2 symbols defined in the module. + +@.LC0 = internal global [5 x i8] c".LC0\00" +@.LC2 = internal global [5 x i8] c".LC2\00" + +define i32 @foo(double %X, double %Y) nounwind readnone { + ; The 1.0 and 3.0 constants generate two TOC entries + %cmp = fcmp oeq double %X, 1.000000e+00 + %conv = zext i1 %cmp to i32 + %cmp1 = fcmp oeq double %Y, 3.000000e+00 + %conv2 = zext i1 %cmp1 to i32 + %add = add nsw i32 %conv2, %conv + ret i32 %add +} + +; Check the creation of 2 .tc entries for both double constants. 
They +; should be .LC1 and .LC3 to avoid name clash with global constants +; .LC0 and .LC2 +; CHECK: .LC{{[13]}}: +; CHECK-NEXT: .tc {{[\._a-zA-Z0-9]+}}[TC],{{[\._a-zA-Z0-9]+}} +; CHECK: .LC{{[13]}}: +; CHECK-NEXT: .tc {{[\._a-zA-Z0-9]+}}[TC],{{[\._a-zA-Z0-9]+}} diff --git a/test/CodeGen/PowerPC/2012-10-11-dynalloc.ll b/test/CodeGen/PowerPC/2012-10-11-dynalloc.ll new file mode 100644 index 0000000..41533a8 --- /dev/null +++ b/test/CodeGen/PowerPC/2012-10-11-dynalloc.ll @@ -0,0 +1,18 @@ +; RUN: llc < %s | FileCheck %s +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +define void @test(i64 %n) nounwind { +entry: + %0 = alloca i8, i64 %n, align 1 + %1 = alloca i8, i64 %n, align 1 + call void @use(i8* %0, i8* %1) nounwind + ret void +} + +declare void @use(i8*, i8*) + +; Check we actually have two instances of dynamic stack allocation, +; identified by the stdux used to update the back-chain link. 
+; CHECK: stdux +; CHECK: stdux diff --git a/test/CodeGen/PowerPC/2012-10-12-bitcast.ll b/test/CodeGen/PowerPC/2012-10-12-bitcast.ll new file mode 100644 index 0000000..f841c5f --- /dev/null +++ b/test/CodeGen/PowerPC/2012-10-12-bitcast.ll @@ -0,0 +1,20 @@ +; RUN: llc -mattr=+altivec < %s | FileCheck %s +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +define i32 @test(<16 x i8> %v) nounwind { +entry: + %0 = bitcast <16 x i8> %v to i128 + %1 = lshr i128 %0, 96 + %2 = trunc i128 %1 to i32 + ret i32 %2 +} + +; Verify that bitcast handles big-endian platforms correctly +; by checking we load the result from the correct offset + +; CHECK: addi [[REGISTER:[0-9]+]], 1, -16 +; CHECK: stvx 2, 0, [[REGISTER]] +; CHECK: lwz 3, -16(1) +; CHECK: blr + diff --git a/test/CodeGen/PowerPC/asm-Zy.ll b/test/CodeGen/PowerPC/asm-Zy.ll new file mode 100644 index 0000000..691165f --- /dev/null +++ b/test/CodeGen/PowerPC/asm-Zy.ll @@ -0,0 +1,14 @@ +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-bgq-linux" +; RUN: llc < %s -march=ppc64 -mcpu=a2 | FileCheck %s + +define i32 @zytest(i32 %a) nounwind { +entry: +; CHECK: @zytest + %r = call i32 asm "lwbrx $0, ${1:y}", "=r,Z"(i32 %a) nounwind, !srcloc !0 + ret i32 %r +; CHECK: lwbrx 3, 0, +} + +!0 = metadata !{i32 101688} + diff --git a/test/CodeGen/PowerPC/big-endian-formal-args.ll b/test/CodeGen/PowerPC/big-endian-formal-args.ll index 9a456b6..638059a 100644 --- a/test/CodeGen/PowerPC/big-endian-formal-args.ll +++ b/test/CodeGen/PowerPC/big-endian-formal-args.ll @@ -2,10 +2,10 @@ declare void @bar(i64 %x, i64 %y) -; CHECK: li {{[53]}}, 0 +; CHECK: li 3, 0 ; CHECK: li 4, 2 +; CHECK: li 5, 0 ; CHECK: li 6, 3 -; CHECK: mr {{[53]}}, {{[53]}} define void @foo() { call void @bar(i64 2, i64 3) diff --git 
a/test/CodeGen/PowerPC/bl8_elf_nop.ll b/test/CodeGen/PowerPC/bl8_elf_nop.ll deleted file mode 100644 index 386c59e..0000000 --- a/test/CodeGen/PowerPC/bl8_elf_nop.ll +++ /dev/null @@ -1,16 +0,0 @@ -; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s -target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" -target triple = "powerpc64-unknown-linux-gnu" - -declare i32 @clock() nounwind - -define i32 @func() { -entry: - %call = call i32 @clock() nounwind - %call2 = add i32 %call, 7 - ret i32 %call2 -} - -; CHECK: bl clock -; CHECK-NEXT: nop - diff --git a/test/CodeGen/PowerPC/coalesce-ext.ll b/test/CodeGen/PowerPC/coalesce-ext.ll index cc80f83..f19175c 100644 --- a/test/CodeGen/PowerPC/coalesce-ext.ll +++ b/test/CodeGen/PowerPC/coalesce-ext.ll @@ -13,5 +13,6 @@ define i32 @test1sext(i64 %A, i64 %B, i32* %P, i64 *%P2) nounwind { store volatile i32 %D, i32* %P ; Reuse low bits of extended register, don't extend live range of SUM. 
; CHECK: stw [[EXT]] - ret i32 %D + %R = add i32 %D, %D + ret i32 %R } diff --git a/test/CodeGen/PowerPC/cr1eq-no-extra-moves.ll b/test/CodeGen/PowerPC/cr1eq-no-extra-moves.ll new file mode 100644 index 0000000..afa1ea8 --- /dev/null +++ b/test/CodeGen/PowerPC/cr1eq-no-extra-moves.ll @@ -0,0 +1,26 @@ +; RUN: llc < %s | FileCheck %s +target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32" +target triple = "powerpc-unknown-linux" + +@.str = private unnamed_addr constant [3 x i8] c"%i\00", align 1 + +define void @test(i32 %count) nounwind { +entry: +; CHECK: crxor 6, 6, 6 + %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([3 x i8]* @.str, i32 0, i32 0), i32 1) nounwind + %cmp2 = icmp sgt i32 %count, 0 + br i1 %cmp2, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %i.03 = phi i32 [ %inc, %for.body ], [ 0, %entry ] +; CHECK: crxor 6, 6, 6 + %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([3 x i8]* @.str, i32 0, i32 0), i32 1) nounwind + %inc = add nsw i32 %i.03, 1 + %exitcond = icmp eq i32 %inc, %count + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +declare i32 @printf(i8* nocapture, ...) 
nounwind diff --git a/test/CodeGen/PowerPC/crsave.ll b/test/CodeGen/PowerPC/crsave.ll new file mode 100644 index 0000000..3e98dbd --- /dev/null +++ b/test/CodeGen/PowerPC/crsave.ll @@ -0,0 +1,49 @@ +; RUN: llc -O0 -disable-fp-elim -mtriple=powerpc-unknown-linux-gnu < %s | FileCheck %s -check-prefix=PPC32 +; RUN: llc -O0 -disable-fp-elim -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s -check-prefix=PPC64 + +declare void @foo() + +define i32 @test_cr2() nounwind { +entry: + %ret = alloca i32, align 4 + %0 = call i32 asm sideeffect "\0A\09mtcr $4\0A\09cmp 2,$2,$1\0A\09mfcr $0", "=r,r,r,r,r,~{cr2}"(i32 1, i32 2, i32 3, i32 0) nounwind + store i32 %0, i32* %ret, align 4 + call void @foo() + %1 = load i32* %ret, align 4 + ret i32 %1 +} + +; PPC32: mfcr 12 +; PPC32-NEXT: stw 12, {{[0-9]+}}(31) +; PPC32: lwz 12, {{[0-9]+}}(31) +; PPC32-NEXT: mtcrf 32, 12 + +; PPC64: mfcr 12 +; PPC64-NEXT: stw 12, 8(1) +; PPC64: lwz 12, 8(1) +; PPC64-NEXT: mtcrf 32, 12 + +define i32 @test_cr234() nounwind { +entry: + %ret = alloca i32, align 4 + %0 = call i32 asm sideeffect "\0A\09mtcr $4\0A\09cmp 2,$2,$1\0A\09cmp 3,$2,$2\0A\09cmp 4,$2,$3\0A\09mfcr $0", "=r,r,r,r,r,~{cr2},~{cr3},~{cr4}"(i32 1, i32 2, i32 3, i32 0) nounwind + store i32 %0, i32* %ret, align 4 + call void @foo() + %1 = load i32* %ret, align 4 + ret i32 %1 +} + +; PPC32: mfcr 12 +; PPC32-NEXT: stw 12, {{[0-9]+}}(31) +; PPC32: lwz 12, {{[0-9]+}}(31) +; PPC32-NEXT: mtcrf 32, 12 +; PPC32-NEXT: mtcrf 16, 12 +; PPC32-NEXT: mtcrf 8, 12 + +; PPC64: mfcr 12 +; PPC64-NEXT: stw 12, 8(1) +; PPC64: lwz 12, 8(1) +; PPC64-NEXT: mtcrf 32, 12 +; PPC64-NEXT: mtcrf 16, 12 +; PPC64-NEXT: mtcrf 8, 12 + diff --git a/test/CodeGen/PowerPC/emptystruct.ll b/test/CodeGen/PowerPC/emptystruct.ll new file mode 100644 index 0000000..36b4abd --- /dev/null +++ b/test/CodeGen/PowerPC/emptystruct.ll @@ -0,0 +1,51 @@ +; RUN: llc -mcpu=pwr7 -O0 < %s | FileCheck %s + +; This tests correct handling of empty aggregate parameters and return values. 
+; An empty parameter passed by value does not consume a protocol register or +; a parameter save area doubleword. An empty parameter passed by reference +; is treated as any other pointer parameter. An empty aggregate return value +; is treated as any other aggregate return value, passed via address as a +; hidden parameter in GPR3. In this example, GPR3 contains the return value +; address, GPR4 contains the address of e2, and e1 and e3 are not passed or +; received. + +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +%struct.empty = type {} + +define void @callee(%struct.empty* noalias sret %agg.result, %struct.empty* byval %a1, %struct.empty* %a2, %struct.empty* byval %a3) nounwind { +entry: + %a2.addr = alloca %struct.empty*, align 8 + store %struct.empty* %a2, %struct.empty** %a2.addr, align 8 + %0 = load %struct.empty** %a2.addr, align 8 + %1 = bitcast %struct.empty* %agg.result to i8* + %2 = bitcast %struct.empty* %0 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %2, i64 0, i32 1, i1 false) + ret void +} + +; CHECK: callee: +; CHECK: std 4, +; CHECK: std 3, +; CHECK-NOT: std 5, +; CHECK-NOT: std 6, +; CHECK: blr + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind + +define void @caller(%struct.empty* noalias sret %agg.result) nounwind { +entry: + %e1 = alloca %struct.empty, align 1 + %e2 = alloca %struct.empty, align 1 + %e3 = alloca %struct.empty, align 1 + call void @callee(%struct.empty* sret %agg.result, %struct.empty* byval %e1, %struct.empty* %e2, %struct.empty* byval %e3) + ret void +} + +; CHECK: caller: +; CHECK: addi 4, +; CHECK: std 3, +; CHECK-NOT: std 5, +; CHECK-NOT: std 6, +; CHECK: bl callee diff --git a/test/CodeGen/PowerPC/floatPSA.ll b/test/CodeGen/PowerPC/floatPSA.ll new file mode 100644 index 0000000..b5631a1 --- /dev/null +++ 
b/test/CodeGen/PowerPC/floatPSA.ll @@ -0,0 +1,97 @@ +; RUN: llc -O0 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s + +; This verifies that single-precision floating point values that can't +; be passed in registers are stored in the rightmost word of the parameter +; save area slot. There are 13 architected floating-point registers, so +; the 14th is passed in storage. The address of the 14th argument is +; 48 (fixed size of the linkage area) + 13 * 8 (first 13 args) + 4 +; (offset to second word) = 156. + +define float @bar(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i, float %j, float %k, float %l, float %m, float %n) nounwind { +entry: + %a.addr = alloca float, align 4 + %b.addr = alloca float, align 4 + %c.addr = alloca float, align 4 + %d.addr = alloca float, align 4 + %e.addr = alloca float, align 4 + %f.addr = alloca float, align 4 + %g.addr = alloca float, align 4 + %h.addr = alloca float, align 4 + %i.addr = alloca float, align 4 + %j.addr = alloca float, align 4 + %k.addr = alloca float, align 4 + %l.addr = alloca float, align 4 + %m.addr = alloca float, align 4 + %n.addr = alloca float, align 4 + store float %a, float* %a.addr, align 4 + store float %b, float* %b.addr, align 4 + store float %c, float* %c.addr, align 4 + store float %d, float* %d.addr, align 4 + store float %e, float* %e.addr, align 4 + store float %f, float* %f.addr, align 4 + store float %g, float* %g.addr, align 4 + store float %h, float* %h.addr, align 4 + store float %i, float* %i.addr, align 4 + store float %j, float* %j.addr, align 4 + store float %k, float* %k.addr, align 4 + store float %l, float* %l.addr, align 4 + store float %m, float* %m.addr, align 4 + store float %n, float* %n.addr, align 4 + %0 = load float* %n.addr, align 4 + ret float %0 +} + +; CHECK: lfs {{[0-9]+}}, 156(1) + +define float @foo() nounwind { +entry: + %a = alloca float, align 4 + %b = alloca float, align 4 + %c = alloca float, align 4 + %d = alloca 
float, align 4 + %e = alloca float, align 4 + %f = alloca float, align 4 + %g = alloca float, align 4 + %h = alloca float, align 4 + %i = alloca float, align 4 + %j = alloca float, align 4 + %k = alloca float, align 4 + %l = alloca float, align 4 + %m = alloca float, align 4 + %n = alloca float, align 4 + store float 1.000000e+00, float* %a, align 4 + store float 2.000000e+00, float* %b, align 4 + store float 3.000000e+00, float* %c, align 4 + store float 4.000000e+00, float* %d, align 4 + store float 5.000000e+00, float* %e, align 4 + store float 6.000000e+00, float* %f, align 4 + store float 7.000000e+00, float* %g, align 4 + store float 8.000000e+00, float* %h, align 4 + store float 9.000000e+00, float* %i, align 4 + store float 1.000000e+01, float* %j, align 4 + store float 1.100000e+01, float* %k, align 4 + store float 1.200000e+01, float* %l, align 4 + store float 1.300000e+01, float* %m, align 4 + store float 1.400000e+01, float* %n, align 4 + %0 = load float* %a, align 4 + %1 = load float* %b, align 4 + %2 = load float* %c, align 4 + %3 = load float* %d, align 4 + %4 = load float* %e, align 4 + %5 = load float* %f, align 4 + %6 = load float* %g, align 4 + %7 = load float* %h, align 4 + %8 = load float* %i, align 4 + %9 = load float* %j, align 4 + %10 = load float* %k, align 4 + %11 = load float* %l, align 4 + %12 = load float* %m, align 4 + %13 = load float* %n, align 4 + %call = call float @bar(float %0, float %1, float %2, float %3, float %4, float %5, float %6, float %7, float %8, float %9, float %10, float %11, float %12, float %13) + ret float %call +} + +; Note that stw is used instead of stfs because the value is a simple +; constant that can be created with a load-immediate in a GPR. 
+; CHECK: stw {{[0-9]+}}, 156(1) + diff --git a/test/CodeGen/PowerPC/fsl-e500mc.ll b/test/CodeGen/PowerPC/fsl-e500mc.ll new file mode 100644 index 0000000..09b7e41 --- /dev/null +++ b/test/CodeGen/PowerPC/fsl-e500mc.ll @@ -0,0 +1,22 @@ +; +; Test support for Freescale e500mc and its higher memcpy inlining thresholds. +; +; RUN: llc -mcpu=e500mc < %s 2>&1 | FileCheck %s +; CHECK-NOT: not a recognized processor for this target + +target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32" +target triple = "powerpc-fsl-linux" + +%struct.teststruct = type { [12 x i32], i32 } + +define void @copy(%struct.teststruct* noalias nocapture sret %agg.result, %struct.teststruct* nocapture %in) nounwind { +entry: +; CHECK: @copy +; CHECK-NOT: bl memcpy + %0 = bitcast %struct.teststruct* %agg.result to i8* + %1 = bitcast %struct.teststruct* %in to i8* + tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %1, i32 52, i32 4, i1 false) + ret void +} + +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind diff --git a/test/CodeGen/PowerPC/fsl-e5500.ll b/test/CodeGen/PowerPC/fsl-e5500.ll new file mode 100644 index 0000000..d47d8c8 --- /dev/null +++ b/test/CodeGen/PowerPC/fsl-e5500.ll @@ -0,0 +1,22 @@ +; +; Test support for Freescale e5500 and its higher memcpy inlining thresholds. 
+; +; RUN: llc -mcpu=e5500 < %s 2>&1 | FileCheck %s +; CHECK-NOT: not a recognized processor for this target + +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" +target triple = "powerpc64-fsl-linux" + +%struct.teststruct = type { [24 x i32], i32 } + +define void @copy(%struct.teststruct* noalias nocapture sret %agg.result, %struct.teststruct* nocapture %in) nounwind { +entry: +; CHECK: @copy +; CHECK-NOT: bl memcpy + %0 = bitcast %struct.teststruct* %agg.result to i8* + %1 = bitcast %struct.teststruct* %in to i8* + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 100, i32 4, i1 false) + ret void +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind diff --git a/test/CodeGen/PowerPC/i64_fp_round.ll b/test/CodeGen/PowerPC/i64_fp_round.ll new file mode 100644 index 0000000..5a0c072 --- /dev/null +++ b/test/CodeGen/PowerPC/i64_fp_round.ll @@ -0,0 +1,27 @@ +; RUN: llc -mcpu=pwr7 < %s | FileCheck %s +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +define float @test(i64 %x) nounwind readnone { +entry: + %conv = sitofp i64 %x to float + ret float %conv +} + +; Verify that we get the code sequence needed to avoid double-rounding. +; Note that only parts of the sequence are checked for here, to allow +; for minor code generation differences. + +; CHECK: sradi [[REGISTER:[0-9]+]], 3, 53 +; CHECK: addi [[REGISTER:[0-9]+]], [[REGISTER]], 1 +; CHECK: cmpldi 0, [[REGISTER]], 1 +; CHECK: isel [[REGISTER:[0-9]+]], {{[0-9]+}}, 3, 1 +; CHECK: std [[REGISTER]], -{{[0-9]+}}(1) + + +; Also check that with -enable-unsafe-fp-math we do not get that extra +; code sequence. Simply verify that there is no "isel" present. 
+ +; RUN: llc -mcpu=pwr7 -enable-unsafe-fp-math < %s | FileCheck %s -check-prefix=UNSAFE +; CHECK-UNSAFE-NOT: isel + diff --git a/test/CodeGen/PowerPC/inlineasm-copy.ll b/test/CodeGen/PowerPC/inlineasm-copy.ll index e1ff82d..59c3388 100644 --- a/test/CodeGen/PowerPC/inlineasm-copy.ll +++ b/test/CodeGen/PowerPC/inlineasm-copy.ll @@ -1,5 +1,6 @@ -; RUN: llc < %s -march=ppc32 | not grep mr +; RUN: llc < %s -march=ppc32 -verify-machineinstrs | FileCheck %s +; CHECK-NOT: mr define i32 @test(i32 %Y, i32 %X) { entry: %tmp = tail call i32 asm "foo $0", "=r"( ) ; <i32> [#uses=1] @@ -12,3 +13,9 @@ entry: ret i32 %tmp1 } +; CHECK: test3 +define i32 @test3(i32 %Y, i32 %X) { +entry: + %tmp1 = tail call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "foo $0, $1", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19"( i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y ) ; <i32> [#uses=1] + ret i32 1 +} diff --git a/test/CodeGen/PowerPC/int-fp-conv-1.ll b/test/CodeGen/PowerPC/int-fp-conv-1.ll index 6c82723..d2887b9 100644 --- a/test/CodeGen/PowerPC/int-fp-conv-1.ll +++ b/test/CodeGen/PowerPC/int-fp-conv-1.ll @@ -1,4 +1,5 @@ -; RUN: llc < %s -march=ppc64 | grep __floatditf +; RUN: llc < %s -march=ppc64 | FileCheck %s +; CHECK-NOT: __floatditf define i64 @__fixunstfdi(ppc_fp128 %a) nounwind { entry: diff --git a/test/CodeGen/PowerPC/jaggedstructs.ll b/test/CodeGen/PowerPC/jaggedstructs.ll new file mode 100644 index 0000000..62aa7cf --- /dev/null +++ b/test/CodeGen/PowerPC/jaggedstructs.ll @@ -0,0 +1,48 @@ +; RUN: llc -mcpu=pwr7 -O0 < %s | FileCheck %s + +; This tests receiving and re-passing parameters consisting of structures +; of size 3, 5, 6, and 7. They are to be found/placed right-adjusted in +; the parameter registers. 
+ +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +%struct.S3 = type { [3 x i8] } +%struct.S5 = type { [5 x i8] } +%struct.S6 = type { [6 x i8] } +%struct.S7 = type { [7 x i8] } + +define void @test(%struct.S3* byval %s3, %struct.S5* byval %s5, %struct.S6* byval %s6, %struct.S7* byval %s7) nounwind { +entry: + call void @check(%struct.S3* byval %s3, %struct.S5* byval %s5, %struct.S6* byval %s6, %struct.S7* byval %s7) + ret void +} + +; CHECK: std 6, 216(1) +; CHECK: std 5, 208(1) +; CHECK: std 4, 200(1) +; CHECK: std 3, 192(1) +; CHECK: lbz {{[0-9]+}}, 199(1) +; CHECK: stb {{[0-9]+}}, 55(1) +; CHECK: lhz {{[0-9]+}}, 197(1) +; CHECK: sth {{[0-9]+}}, 53(1) +; CHECK: lbz {{[0-9]+}}, 207(1) +; CHECK: stb {{[0-9]+}}, 63(1) +; CHECK: lwz {{[0-9]+}}, 203(1) +; CHECK: stw {{[0-9]+}}, 59(1) +; CHECK: lhz {{[0-9]+}}, 214(1) +; CHECK: sth {{[0-9]+}}, 70(1) +; CHECK: lwz {{[0-9]+}}, 210(1) +; CHECK: stw {{[0-9]+}}, 66(1) +; CHECK: lbz {{[0-9]+}}, 223(1) +; CHECK: stb {{[0-9]+}}, 79(1) +; CHECK: lhz {{[0-9]+}}, 221(1) +; CHECK: sth {{[0-9]+}}, 77(1) +; CHECK: lwz {{[0-9]+}}, 217(1) +; CHECK: stw {{[0-9]+}}, 73(1) +; CHECK: ld 6, 72(1) +; CHECK: ld 5, 64(1) +; CHECK: ld 4, 56(1) +; CHECK: ld 3, 48(1) + +declare void @check(%struct.S3* byval, %struct.S5* byval, %struct.S6* byval, %struct.S7* byval) diff --git a/test/CodeGen/PowerPC/misched.ll b/test/CodeGen/PowerPC/misched.ll new file mode 100644 index 0000000..d6fb3b3 --- /dev/null +++ b/test/CodeGen/PowerPC/misched.ll @@ -0,0 +1,45 @@ +; RUN: llc < %s -enable-misched -verify-machineinstrs +; PR14302 +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-bgq-linux" + +@b = external global [16000 x double], align 32 + +define void @pr14302() nounwind { +entry: + tail call void @putchar() nounwind + br label 
%for.body + +for.body: ; preds = %for.body, %entry + br i1 undef, label %for.body, label %for.body24.i + +for.body24.i: ; preds = %for.body24.i, %for.body + store double 1.000000e+00, double* undef, align 8 + br i1 undef, label %for.body24.i58, label %for.body24.i + +for.body24.i58: ; preds = %for.body24.i58, %for.body24.i + %arrayidx26.i55.1 = getelementptr inbounds [16000 x double]* @b, i64 0, i64 undef + store double 1.000000e+00, double* %arrayidx26.i55.1, align 8 + br i1 undef, label %for.body24.i64, label %for.body24.i58 + +for.body24.i64: ; preds = %for.body24.i64, %for.body24.i58 + %exitcond.2489 = icmp eq i32 0, 16000 + br i1 %exitcond.2489, label %for.body24.i70, label %for.body24.i64 + +for.body24.i70: ; preds = %for.body24.i70, %for.body24.i64 + br i1 undef, label %for.body24.i76, label %for.body24.i70 + +for.body24.i76: ; preds = %for.body24.i76, %for.body24.i70 + br i1 undef, label %set1d.exit77, label %for.body24.i76 + +set1d.exit77: ; preds = %for.body24.i76 + br label %for.body29 + +for.body29: ; preds = %for.body29, %set1d.exit77 + br i1 undef, label %for.end35, label %for.body29 + +for.end35: ; preds = %for.body29 + ret void +} + +declare void @putchar() diff --git a/test/CodeGen/PowerPC/novrsave.ll b/test/CodeGen/PowerPC/novrsave.ll new file mode 100644 index 0000000..a70576a --- /dev/null +++ b/test/CodeGen/PowerPC/novrsave.ll @@ -0,0 +1,15 @@ +; RUN: llc -O0 -mtriple=powerpc-unknown-linux-gnu < %s | FileCheck %s +; RUN: llc -O0 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s + +; This verifies that the code to update VRSAVE has been removed for SVR4. 
+ +define <4 x float> @bar(<4 x float> %v) nounwind { +entry: + %v.addr = alloca <4 x float>, align 16 + store <4 x float> %v, <4 x float>* %v.addr, align 16 + %0 = load <4 x float>* %v.addr, align 16 + ret <4 x float> %0 +} + +; CHECK-NOT: mfspr +; CHECK-NOT: mtspr diff --git a/test/CodeGen/PowerPC/ppc64-abi-extend.ll b/test/CodeGen/PowerPC/ppc64-abi-extend.ll new file mode 100644 index 0000000..8baf1c6 --- /dev/null +++ b/test/CodeGen/PowerPC/ppc64-abi-extend.ll @@ -0,0 +1,97 @@ +; Verify that i32 argument/return values are extended to i64 + +; RUN: llc < %s | FileCheck %s +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +@si = common global i32 0, align 4 +@ui = common global i32 0, align 4 + +declare void @arg_si(i32 signext) +declare void @arg_ui(i32 zeroext) + +declare signext i32 @ret_si() +declare zeroext i32 @ret_ui() + +define void @pass_arg_si() nounwind { +entry: + %0 = load i32* @si, align 4 + tail call void @arg_si(i32 signext %0) nounwind + ret void +} +; CHECK: @pass_arg_si +; CHECK: lwa 3, +; CHECK: bl arg_si + +define void @pass_arg_ui() nounwind { +entry: + %0 = load i32* @ui, align 4 + tail call void @arg_ui(i32 zeroext %0) nounwind + ret void +} +; CHECK: @pass_arg_ui +; CHECK: lwz 3, +; CHECK: bl arg_ui + +define i64 @use_arg_si(i32 signext %x) nounwind readnone { +entry: + %conv = sext i32 %x to i64 + ret i64 %conv +} +; CHECK: @use_arg_si +; CHECK: %entry +; CHECK-NEXT: blr + +define i64 @use_arg_ui(i32 zeroext %x) nounwind readnone { +entry: + %conv = zext i32 %x to i64 + ret i64 %conv +} +; CHECK: @use_arg_ui +; CHECK: %entry +; CHECK-NEXT: blr + +define signext i32 @pass_ret_si() nounwind readonly { +entry: + %0 = load i32* @si, align 4 + ret i32 %0 +} +; CHECK: @pass_ret_si +; CHECK: lwa 3, +; CHECK: blr + +define zeroext i32 @pass_ret_ui() nounwind readonly { +entry: + %0 = load i32* @ui, align 4 + ret i32 
%0 +} +; CHECK: @pass_ret_ui +; CHECK: lwz 3, +; CHECK: blr + +define i64 @use_ret_si() nounwind { +entry: + %call = tail call signext i32 @ret_si() nounwind + %conv = sext i32 %call to i64 + ret i64 %conv +} +; CHECK: @use_ret_si +; CHECK: bl ret_si +; This is to verify the return register (3) set up by the ret_si +; call is passed on unmodified as return value of use_ret_si. +; CHECK-NOT: 3 +; CHECK: blr + +define i64 @use_ret_ui() nounwind { +entry: + %call = tail call zeroext i32 @ret_ui() nounwind + %conv = zext i32 %call to i64 + ret i64 %conv +} +; CHECK: @use_ret_ui +; CHECK: bl ret_ui +; This is to verify the return register (3) set up by the ret_ui +; call is passed on unmodified as return value of use_ret_ui. +; CHECK-NOT: 3 +; CHECK: blr + diff --git a/test/CodeGen/PowerPC/ppc64-align-long-double.ll b/test/CodeGen/PowerPC/ppc64-align-long-double.ll new file mode 100644 index 0000000..10b70d0 --- /dev/null +++ b/test/CodeGen/PowerPC/ppc64-align-long-double.ll @@ -0,0 +1,26 @@ +; RUN: llc -mcpu=pwr7 -O0 < %s | FileCheck %s + +; Verify internal alignment of long double in a struct. The double +; argument comes in in GPR3; GPR4 is skipped; GPRs 5 and 6 contain +; the long double. Check that these are stored to proper locations +; in the parameter save area and loaded from there for return in FPR1/2. 
+ +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +%struct.S = type { double, ppc_fp128 } + +define ppc_fp128 @test(%struct.S* byval %x) nounwind { +entry: + %b = getelementptr inbounds %struct.S* %x, i32 0, i32 1 + %0 = load ppc_fp128* %b, align 16 + ret ppc_fp128 %0 +} + +; CHECK: std 6, 72(1) +; CHECK: std 5, 64(1) +; CHECK: std 4, 56(1) +; CHECK: std 3, 48(1) +; CHECK: lfd 1, 64(1) +; CHECK: lfd 2, 72(1) + diff --git a/test/CodeGen/PowerPC/ppc64-calls.ll b/test/CodeGen/PowerPC/ppc64-calls.ll new file mode 100644 index 0000000..c382edbb --- /dev/null +++ b/test/CodeGen/PowerPC/ppc64-calls.ll @@ -0,0 +1,63 @@ +; RUN: llc < %s -march=ppc64 | FileCheck %s +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +define void @foo() nounwind readnone noinline { + ret void +} + +define weak void @foo_weak() nounwind { + ret void +} + +; Calls to local function does not require the TOC restore 'nop' +define void @test_direct() nounwind readnone { +; CHECK: test_direct: + tail call void @foo() nounwind +; CHECK: bl foo +; CHECK-NOT: nop + ret void +} + +; Calls to weak function requires a TOC restore 'nop' because they +; may be overridden in a different module. 
+define void @test_weak() nounwind readnone { +; CHECK: test_weak: + tail call void @foo_weak() nounwind +; CHECK: bl foo +; CHECK-NEXT: nop + ret void +} + +; Indirect calls requires a full stub creation +define void @test_indirect(void ()* nocapture %fp) nounwind { +; CHECK: test_indirect: + tail call void %fp() nounwind +; CHECK: ld [[FP:[0-9]+]], 0(3) +; CHECK: ld 11, 16(3) +; CHECK: ld 2, 8(3) +; CHECK-NEXT: mtctr [[FP]] +; CHECK-NEXT: bctrl +; CHECK-NEXT: ld 2, 40(1) + ret void +} + +; Absolute vales should be have the TOC restore 'nop' +define void @test_abs() nounwind { +; CHECK: test_abs: + tail call void inttoptr (i64 1024 to void ()*)() nounwind +; CHECK: bla 1024 +; CHECK-NEXT: nop + ret void +} + +declare double @sin(double) nounwind + +; External functions call should also have a 'nop' +define double @test_external(double %x) nounwind { +; CHECK: test_external: + %call = tail call double @sin(double %x) nounwind +; CHECK: bl sin +; CHECK-NEXT: nop + ret double %call +} diff --git a/test/CodeGen/PowerPC/ppc64-ind-call.ll b/test/CodeGen/PowerPC/ppc64-ind-call.ll deleted file mode 100644 index d5c4d46..0000000 --- a/test/CodeGen/PowerPC/ppc64-ind-call.ll +++ /dev/null @@ -1,16 +0,0 @@ -; RUN: llc < %s -march=ppc64 | FileCheck %s -target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" -target triple = "powerpc64-unknown-linux-gnu" - -define void @test1() { -entry: - %call.i75 = call zeroext i8 undef(i8* undef, i8 zeroext 10) - unreachable -} - -; CHECK: @test1 -; CHECK: ld 11, 0(3) -; CHECK: ld 2, 8(3) -; CHECK: bctrl -; CHECK: ld 2, 40(1) - diff --git a/test/CodeGen/PowerPC/ppc64-linux-func-size.ll b/test/CodeGen/PowerPC/ppc64-linux-func-size.ll index e5aa1f1..e1d50ba 100644 --- a/test/CodeGen/PowerPC/ppc64-linux-func-size.ll +++ b/test/CodeGen/PowerPC/ppc64-linux-func-size.ll @@ -5,6 +5,7 @@ ; CHECK-NEXT: .align 3 ; CHECK-NEXT: .quad .L.test1 ; CHECK-NEXT: .quad .TOC.@tocbase +; CHECK-NEXT: 
.quad 0 ; CHECK-NEXT: .text ; CHECK-NEXT: .L.test1: diff --git a/test/CodeGen/PowerPC/ppc64-toc.ll b/test/CodeGen/PowerPC/ppc64-toc.ll new file mode 100644 index 0000000..a29bdcb --- /dev/null +++ b/test/CodeGen/PowerPC/ppc64-toc.ll @@ -0,0 +1,68 @@ +; RUN: llc < %s | FileCheck %s +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +@double_array = global [32 x double] zeroinitializer, align 8 +@number64 = global i64 10, align 8 +@internal_static_var.x = internal unnamed_addr global i64 0, align 8 + +define i64 @access_int64(i64 %a) nounwind readonly { +entry: +; CHECK: access_int64: +; CHECK-NEXT: .align 3 +; CHECK-NEXT: .quad .L.access_int64 +; CHECK-NEXT: .quad .TOC.@tocbase +; CHECK-NEXT: .quad 0 +; CHECK-NEXT: .text + %0 = load i64* @number64, align 8 +; CHECK: ld {{[0-9]+}}, .LC{{[0-9]+}}@toc(2) + %cmp = icmp eq i64 %0, %a + %conv1 = zext i1 %cmp to i64 + ret i64 %conv1 +} + +define i64 @internal_static_var(i64 %a) nounwind { +entry: +; CHECK: internal_static_var: +; CHECK: ld {{[0-9]+}}, .LC{{[0-9]+}}@toc(2) + %0 = load i64* @internal_static_var.x, align 8 + %cmp = icmp eq i64 %0, %a + %conv1 = zext i1 %cmp to i64 + ret i64 %conv1 +} + +define i32 @access_double(double %a) nounwind readnone { +entry: +; CHECK: access_double: +; CHECK: ld {{[0-9]+}}, .LC{{[0-9]+}}@toc(2) + %cmp = fcmp oeq double %a, 2.000000e+00 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + + +define i32 @access_double_array(double %a, i32 %i) nounwind readonly { +entry: +; CHECK: access_double_array: + %idxprom = sext i32 %i to i64 + %arrayidx = getelementptr inbounds [32 x double]* @double_array, i64 0, i64 %idxprom + %0 = load double* %arrayidx, align 8 +; CHECK: ld {{[0-9]+}}, .LC{{[0-9]+}}@toc(2) + %cmp = fcmp oeq double %0, %a + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +; Check the creation of 4 .tc entries: +; * int64_t global 'number64' +; * double constant 
2.0 +; * double array 'double_array' +; * static int64_t 'x' accessed within '@internal_static_var' +; CHECK: .LC{{[0-9]+}}: +; CHECK-NEXT: .tc {{[\._a-zA-Z0-9]+}}[TC],{{[\._a-zA-Z0-9]+}} +; CHECK-NEXT: .LC{{[0-9]+}}: +; CHECK-NEXT: .tc {{[\._a-zA-Z0-9]+}}[TC],{{[\._a-zA-Z0-9]+}} +; CHECK-NEXT: .LC{{[0-9]+}}: +; CHECK-NEXT: .tc {{[\._a-zA-Z0-9]+}}[TC],{{[\._a-zA-Z0-9]+}} +; CHECK-NEXT: .LC{{[0-9]+}}: +; CHECK-NEXT: .tc {{[\._a-zA-Z0-9]+}}[TC],{{[\._a-zA-Z0-9]+}} diff --git a/test/CodeGen/PowerPC/ppc64-zext.ll b/test/CodeGen/PowerPC/ppc64-zext.ll new file mode 100644 index 0000000..eb55445 --- /dev/null +++ b/test/CodeGen/PowerPC/ppc64-zext.ll @@ -0,0 +1,11 @@ +; RUN: llc < %s | FileCheck %s +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux" + +define i64 @fun(i32 %arg32) nounwind { +entry: +; CHECK: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 32 + %o = zext i32 %arg32 to i64 + ret i64 %o +} + diff --git a/test/CodeGen/PowerPC/pr12757.ll b/test/CodeGen/PowerPC/pr12757.ll new file mode 100644 index 0000000..c344656 --- /dev/null +++ b/test/CodeGen/PowerPC/pr12757.ll @@ -0,0 +1,14 @@ +; RUN: llc < %s | FileCheck %s +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +define i32 @__flt_rounds() nounwind { +entry: + %0 = tail call i64 asm sideeffect "mffs $0", "=f"() nounwind + %conv = trunc i64 %0 to i32 + ret i32 %conv +} + +; CHECK: @__flt_rounds +; CHECK: mffs + diff --git a/test/CodeGen/PowerPC/pr13641.ll b/test/CodeGen/PowerPC/pr13641.ll new file mode 100644 index 0000000..c4d3f3a --- /dev/null +++ b/test/CodeGen/PowerPC/pr13641.ll @@ -0,0 +1,11 @@ +; RUN: llc < %s | FileCheck %s +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + 
+define void @foo() nounwind { + ret void +} + +; CHECK: blr +; CHECK-NEXT: .long 0 +; CHECK-NEXT: .quad 0 diff --git a/test/CodeGen/PowerPC/pr13891.ll b/test/CodeGen/PowerPC/pr13891.ll new file mode 100644 index 0000000..3ae7385 --- /dev/null +++ b/test/CodeGen/PowerPC/pr13891.ll @@ -0,0 +1,27 @@ +; RUN: llc < %s | FileCheck %s +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +%struct.foo = type { i8, i8 } + +define void @_Z5check3foos(%struct.foo* nocapture byval %f, i16 signext %i) noinline { +; CHECK: _Z5check3foos: +; CHECK: sth 3, {{[0-9]+}}(1) +; CHECK: lha {{[0-9]+}}, {{[0-9]+}}(1) +entry: + %0 = bitcast %struct.foo* %f to i16* + %1 = load i16* %0, align 2 + %bf.val.sext = ashr i16 %1, 8 + %cmp = icmp eq i16 %bf.val.sext, %i + br i1 %cmp, label %if.end, label %if.then + +if.then: ; preds = %entry + %conv = sext i16 %bf.val.sext to i32 + tail call void @exit(i32 %conv) + br label %if.end + +if.end: ; preds = %entry, %if.then + ret void +} + +declare void @exit(i32) diff --git a/test/CodeGen/PowerPC/remat-imm.ll b/test/CodeGen/PowerPC/remat-imm.ll new file mode 100644 index 0000000..520921f --- /dev/null +++ b/test/CodeGen/PowerPC/remat-imm.ll @@ -0,0 +1,16 @@ +; RUN: llc < %s | FileCheck %s +; ModuleID = 'test.c' +target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32" +target triple = "powerpc-unknown-linux" + +@.str = private unnamed_addr constant [6 x i8] c"%d,%d\00", align 1 + +define i32 @main() nounwind { +entry: +; CHECK: li 4, 128 +; CHECK-NOT: mr 4, {{.*}} + %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([6 x i8]* @.str, i32 0, i32 0), i32 128, i32 128) nounwind + ret i32 0 +} + +declare i32 @printf(i8* nocapture, ...) 
nounwind diff --git a/test/CodeGen/PowerPC/structsinmem.ll b/test/CodeGen/PowerPC/structsinmem.ll new file mode 100644 index 0000000..884d3a8 --- /dev/null +++ b/test/CodeGen/PowerPC/structsinmem.ll @@ -0,0 +1,227 @@ +; RUN: llc -mcpu=pwr7 -O0 -disable-fp-elim < %s | FileCheck %s + +; FIXME: The code generation for packed structs is very poor because the +; PowerPC target wrongly rejects all unaligned loads. This test case will +; need to be revised when that is fixed. + +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +%struct.s1 = type { i8 } +%struct.s2 = type { i16 } +%struct.s4 = type { i32 } +%struct.t1 = type { i8 } +%struct.t3 = type <{ i16, i8 }> +%struct.t5 = type <{ i32, i8 }> +%struct.t6 = type <{ i32, i16 }> +%struct.t7 = type <{ i32, i16, i8 }> +%struct.s3 = type { i16, i8 } +%struct.s5 = type { i32, i8 } +%struct.s6 = type { i32, i16 } +%struct.s7 = type { i32, i16, i8 } +%struct.t2 = type <{ i16 }> +%struct.t4 = type <{ i32 }> + +@caller1.p1 = private unnamed_addr constant %struct.s1 { i8 1 }, align 1 +@caller1.p2 = private unnamed_addr constant %struct.s2 { i16 2 }, align 2 +@caller1.p3 = private unnamed_addr constant { i16, i8, i8 } { i16 4, i8 8, i8 undef }, align 2 +@caller1.p4 = private unnamed_addr constant %struct.s4 { i32 16 }, align 4 +@caller1.p5 = private unnamed_addr constant { i32, i8, [3 x i8] } { i32 32, i8 64, [3 x i8] undef }, align 4 +@caller1.p6 = private unnamed_addr constant { i32, i16, [2 x i8] } { i32 128, i16 256, [2 x i8] undef }, align 4 +@caller1.p7 = private unnamed_addr constant { i32, i16, i8, i8 } { i32 512, i16 1024, i8 -3, i8 undef }, align 4 +@caller2.p1 = private unnamed_addr constant %struct.t1 { i8 1 }, align 1 +@caller2.p2 = private unnamed_addr constant { i16 } { i16 2 }, align 1 +@caller2.p3 = private unnamed_addr constant %struct.t3 <{ i16 4, i8 8 }>, align 1 +@caller2.p4 = private 
unnamed_addr constant { i32 } { i32 16 }, align 1 +@caller2.p5 = private unnamed_addr constant %struct.t5 <{ i32 32, i8 64 }>, align 1 +@caller2.p6 = private unnamed_addr constant %struct.t6 <{ i32 128, i16 256 }>, align 1 +@caller2.p7 = private unnamed_addr constant %struct.t7 <{ i32 512, i16 1024, i8 -3 }>, align 1 + +define i32 @caller1() nounwind { +entry: + %p1 = alloca %struct.s1, align 1 + %p2 = alloca %struct.s2, align 2 + %p3 = alloca %struct.s3, align 2 + %p4 = alloca %struct.s4, align 4 + %p5 = alloca %struct.s5, align 4 + %p6 = alloca %struct.s6, align 4 + %p7 = alloca %struct.s7, align 4 + %0 = bitcast %struct.s1* %p1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* getelementptr inbounds (%struct.s1* @caller1.p1, i32 0, i32 0), i64 1, i32 1, i1 false) + %1 = bitcast %struct.s2* %p2 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s2* @caller1.p2 to i8*), i64 2, i32 2, i1 false) + %2 = bitcast %struct.s3* %p3 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast ({ i16, i8, i8 }* @caller1.p3 to i8*), i64 4, i32 2, i1 false) + %3 = bitcast %struct.s4* %p4 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %3, i8* bitcast (%struct.s4* @caller1.p4 to i8*), i64 4, i32 4, i1 false) + %4 = bitcast %struct.s5* %p5 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* bitcast ({ i32, i8, [3 x i8] }* @caller1.p5 to i8*), i64 8, i32 4, i1 false) + %5 = bitcast %struct.s6* %p6 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %5, i8* bitcast ({ i32, i16, [2 x i8] }* @caller1.p6 to i8*), i64 8, i32 4, i1 false) + %6 = bitcast %struct.s7* %p7 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %6, i8* bitcast ({ i32, i16, i8, i8 }* @caller1.p7 to i8*), i64 8, i32 4, i1 false) + %call = call i32 @callee1(i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, %struct.s1* byval %p1, %struct.s2* byval %p2, %struct.s3* byval %p3, %struct.s4* byval %p4, %struct.s5* byval %p5, %struct.s6* byval %p6, %struct.s7* byval %p7) + 
ret i32 %call + +; CHECK: stb {{[0-9]+}}, 119(1) +; CHECK: sth {{[0-9]+}}, 126(1) +; CHECK: stw {{[0-9]+}}, 132(1) +; CHECK: stw {{[0-9]+}}, 140(1) +; CHECK: std {{[0-9]+}}, 144(1) +; CHECK: std {{[0-9]+}}, 152(1) +; CHECK: std {{[0-9]+}}, 160(1) +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind + +define internal i32 @callee1(i32 %z1, i32 %z2, i32 %z3, i32 %z4, i32 %z5, i32 %z6, i32 %z7, i32 %z8, %struct.s1* byval %v1, %struct.s2* byval %v2, %struct.s3* byval %v3, %struct.s4* byval %v4, %struct.s5* byval %v5, %struct.s6* byval %v6, %struct.s7* byval %v7) nounwind { +entry: + %z1.addr = alloca i32, align 4 + %z2.addr = alloca i32, align 4 + %z3.addr = alloca i32, align 4 + %z4.addr = alloca i32, align 4 + %z5.addr = alloca i32, align 4 + %z6.addr = alloca i32, align 4 + %z7.addr = alloca i32, align 4 + %z8.addr = alloca i32, align 4 + store i32 %z1, i32* %z1.addr, align 4 + store i32 %z2, i32* %z2.addr, align 4 + store i32 %z3, i32* %z3.addr, align 4 + store i32 %z4, i32* %z4.addr, align 4 + store i32 %z5, i32* %z5.addr, align 4 + store i32 %z6, i32* %z6.addr, align 4 + store i32 %z7, i32* %z7.addr, align 4 + store i32 %z8, i32* %z8.addr, align 4 + %a = getelementptr inbounds %struct.s1* %v1, i32 0, i32 0 + %0 = load i8* %a, align 1 + %conv = zext i8 %0 to i32 + %a1 = getelementptr inbounds %struct.s2* %v2, i32 0, i32 0 + %1 = load i16* %a1, align 2 + %conv2 = sext i16 %1 to i32 + %add = add nsw i32 %conv, %conv2 + %a3 = getelementptr inbounds %struct.s3* %v3, i32 0, i32 0 + %2 = load i16* %a3, align 2 + %conv4 = sext i16 %2 to i32 + %add5 = add nsw i32 %add, %conv4 + %a6 = getelementptr inbounds %struct.s4* %v4, i32 0, i32 0 + %3 = load i32* %a6, align 4 + %add7 = add nsw i32 %add5, %3 + %a8 = getelementptr inbounds %struct.s5* %v5, i32 0, i32 0 + %4 = load i32* %a8, align 4 + %add9 = add nsw i32 %add7, %4 + %a10 = getelementptr inbounds %struct.s6* %v6, i32 0, i32 0 + %5 = load i32* %a10, align 4 + %add11 = add nsw 
i32 %add9, %5 + %a12 = getelementptr inbounds %struct.s7* %v7, i32 0, i32 0 + %6 = load i32* %a12, align 4 + %add13 = add nsw i32 %add11, %6 + ret i32 %add13 + +; CHECK: lha {{[0-9]+}}, 126(1) +; CHECK: lbz {{[0-9]+}}, 119(1) +; CHECK: lha {{[0-9]+}}, 132(1) +; CHECK: lwz {{[0-9]+}}, 140(1) +; CHECK: lwz {{[0-9]+}}, 144(1) +; CHECK: lwz {{[0-9]+}}, 152(1) +; CHECK: lwz {{[0-9]+}}, 160(1) +} + +define i32 @caller2() nounwind { +entry: + %p1 = alloca %struct.t1, align 1 + %p2 = alloca %struct.t2, align 1 + %p3 = alloca %struct.t3, align 1 + %p4 = alloca %struct.t4, align 1 + %p5 = alloca %struct.t5, align 1 + %p6 = alloca %struct.t6, align 1 + %p7 = alloca %struct.t7, align 1 + %0 = bitcast %struct.t1* %p1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* getelementptr inbounds (%struct.t1* @caller2.p1, i32 0, i32 0), i64 1, i32 1, i1 false) + %1 = bitcast %struct.t2* %p2 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast ({ i16 }* @caller2.p2 to i8*), i64 2, i32 1, i1 false) + %2 = bitcast %struct.t3* %p3 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast (%struct.t3* @caller2.p3 to i8*), i64 3, i32 1, i1 false) + %3 = bitcast %struct.t4* %p4 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %3, i8* bitcast ({ i32 }* @caller2.p4 to i8*), i64 4, i32 1, i1 false) + %4 = bitcast %struct.t5* %p5 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* bitcast (%struct.t5* @caller2.p5 to i8*), i64 5, i32 1, i1 false) + %5 = bitcast %struct.t6* %p6 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %5, i8* bitcast (%struct.t6* @caller2.p6 to i8*), i64 6, i32 1, i1 false) + %6 = bitcast %struct.t7* %p7 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %6, i8* bitcast (%struct.t7* @caller2.p7 to i8*), i64 7, i32 1, i1 false) + %call = call i32 @callee2(i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, %struct.t1* byval %p1, %struct.t2* byval %p2, %struct.t3* byval %p3, %struct.t4* byval %p4, %struct.t5* byval %p5, %struct.t6* 
byval %p6, %struct.t7* byval %p7) + ret i32 %call + +; CHECK: stb {{[0-9]+}}, 119(1) +; CHECK: sth {{[0-9]+}}, 126(1) +; CHECK: stb {{[0-9]+}}, 135(1) +; CHECK: sth {{[0-9]+}}, 133(1) +; CHECK: stw {{[0-9]+}}, 140(1) +; CHECK: stb {{[0-9]+}}, 151(1) +; CHECK: stw {{[0-9]+}}, 147(1) +; CHECK: sth {{[0-9]+}}, 158(1) +; CHECK: stw {{[0-9]+}}, 154(1) +; CHECK: stb {{[0-9]+}}, 167(1) +; CHECK: sth {{[0-9]+}}, 165(1) +; CHECK: stw {{[0-9]+}}, 161(1) +} + +define internal i32 @callee2(i32 %z1, i32 %z2, i32 %z3, i32 %z4, i32 %z5, i32 %z6, i32 %z7, i32 %z8, %struct.t1* byval %v1, %struct.t2* byval %v2, %struct.t3* byval %v3, %struct.t4* byval %v4, %struct.t5* byval %v5, %struct.t6* byval %v6, %struct.t7* byval %v7) nounwind { +entry: + %z1.addr = alloca i32, align 4 + %z2.addr = alloca i32, align 4 + %z3.addr = alloca i32, align 4 + %z4.addr = alloca i32, align 4 + %z5.addr = alloca i32, align 4 + %z6.addr = alloca i32, align 4 + %z7.addr = alloca i32, align 4 + %z8.addr = alloca i32, align 4 + store i32 %z1, i32* %z1.addr, align 4 + store i32 %z2, i32* %z2.addr, align 4 + store i32 %z3, i32* %z3.addr, align 4 + store i32 %z4, i32* %z4.addr, align 4 + store i32 %z5, i32* %z5.addr, align 4 + store i32 %z6, i32* %z6.addr, align 4 + store i32 %z7, i32* %z7.addr, align 4 + store i32 %z8, i32* %z8.addr, align 4 + %a = getelementptr inbounds %struct.t1* %v1, i32 0, i32 0 + %0 = load i8* %a, align 1 + %conv = zext i8 %0 to i32 + %a1 = getelementptr inbounds %struct.t2* %v2, i32 0, i32 0 + %1 = load i16* %a1, align 1 + %conv2 = sext i16 %1 to i32 + %add = add nsw i32 %conv, %conv2 + %a3 = getelementptr inbounds %struct.t3* %v3, i32 0, i32 0 + %2 = load i16* %a3, align 1 + %conv4 = sext i16 %2 to i32 + %add5 = add nsw i32 %add, %conv4 + %a6 = getelementptr inbounds %struct.t4* %v4, i32 0, i32 0 + %3 = load i32* %a6, align 1 + %add7 = add nsw i32 %add5, %3 + %a8 = getelementptr inbounds %struct.t5* %v5, i32 0, i32 0 + %4 = load i32* %a8, align 1 + %add9 = add nsw i32 %add7, %4 + %a10 
= getelementptr inbounds %struct.t6* %v6, i32 0, i32 0 + %5 = load i32* %a10, align 1 + %add11 = add nsw i32 %add9, %5 + %a12 = getelementptr inbounds %struct.t7* %v7, i32 0, i32 0 + %6 = load i32* %a12, align 1 + %add13 = add nsw i32 %add11, %6 + ret i32 %add13 + +; CHECK: lbz {{[0-9]+}}, 149(1) +; CHECK: lbz {{[0-9]+}}, 150(1) +; CHECK: lbz {{[0-9]+}}, 147(1) +; CHECK: lbz {{[0-9]+}}, 148(1) +; CHECK: lbz {{[0-9]+}}, 133(1) +; CHECK: lbz {{[0-9]+}}, 134(1) +; CHECK: lha {{[0-9]+}}, 126(1) +; CHECK: lbz {{[0-9]+}}, 119(1) +; CHECK: lwz {{[0-9]+}}, 140(1) +; CHECK: lhz {{[0-9]+}}, 154(1) +; CHECK: lhz {{[0-9]+}}, 156(1) +; CHECK: lbz {{[0-9]+}}, 163(1) +; CHECK: lbz {{[0-9]+}}, 164(1) +; CHECK: lbz {{[0-9]+}}, 161(1) +; CHECK: lbz {{[0-9]+}}, 162(1) +} diff --git a/test/CodeGen/PowerPC/structsinregs.ll b/test/CodeGen/PowerPC/structsinregs.ll new file mode 100644 index 0000000..ef706af --- /dev/null +++ b/test/CodeGen/PowerPC/structsinregs.ll @@ -0,0 +1,213 @@ +; RUN: llc -mcpu=pwr7 -O0 -disable-fp-elim < %s | FileCheck %s + +; FIXME: The code generation for packed structs is very poor because the +; PowerPC target wrongly rejects all unaligned loads. This test case will +; need to be revised when that is fixed. 
+ +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +%struct.s1 = type { i8 } +%struct.s2 = type { i16 } +%struct.s4 = type { i32 } +%struct.t1 = type { i8 } +%struct.t3 = type <{ i16, i8 }> +%struct.t5 = type <{ i32, i8 }> +%struct.t6 = type <{ i32, i16 }> +%struct.t7 = type <{ i32, i16, i8 }> +%struct.s3 = type { i16, i8 } +%struct.s5 = type { i32, i8 } +%struct.s6 = type { i32, i16 } +%struct.s7 = type { i32, i16, i8 } +%struct.t2 = type <{ i16 }> +%struct.t4 = type <{ i32 }> + +@caller1.p1 = private unnamed_addr constant %struct.s1 { i8 1 }, align 1 +@caller1.p2 = private unnamed_addr constant %struct.s2 { i16 2 }, align 2 +@caller1.p3 = private unnamed_addr constant { i16, i8, i8 } { i16 4, i8 8, i8 undef }, align 2 +@caller1.p4 = private unnamed_addr constant %struct.s4 { i32 16 }, align 4 +@caller1.p5 = private unnamed_addr constant { i32, i8, [3 x i8] } { i32 32, i8 64, [3 x i8] undef }, align 4 +@caller1.p6 = private unnamed_addr constant { i32, i16, [2 x i8] } { i32 128, i16 256, [2 x i8] undef }, align 4 +@caller1.p7 = private unnamed_addr constant { i32, i16, i8, i8 } { i32 512, i16 1024, i8 -3, i8 undef }, align 4 +@caller2.p1 = private unnamed_addr constant %struct.t1 { i8 1 }, align 1 +@caller2.p2 = private unnamed_addr constant { i16 } { i16 2 }, align 1 +@caller2.p3 = private unnamed_addr constant %struct.t3 <{ i16 4, i8 8 }>, align 1 +@caller2.p4 = private unnamed_addr constant { i32 } { i32 16 }, align 1 +@caller2.p5 = private unnamed_addr constant %struct.t5 <{ i32 32, i8 64 }>, align 1 +@caller2.p6 = private unnamed_addr constant %struct.t6 <{ i32 128, i16 256 }>, align 1 +@caller2.p7 = private unnamed_addr constant %struct.t7 <{ i32 512, i16 1024, i8 -3 }>, align 1 + +define i32 @caller1() nounwind { +entry: + %p1 = alloca %struct.s1, align 1 + %p2 = alloca %struct.s2, align 2 + %p3 = alloca %struct.s3, align 2 + %p4 = alloca 
%struct.s4, align 4 + %p5 = alloca %struct.s5, align 4 + %p6 = alloca %struct.s6, align 4 + %p7 = alloca %struct.s7, align 4 + %0 = bitcast %struct.s1* %p1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* getelementptr inbounds (%struct.s1* @caller1.p1, i32 0, i32 0), i64 1, i32 1, i1 false) + %1 = bitcast %struct.s2* %p2 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s2* @caller1.p2 to i8*), i64 2, i32 2, i1 false) + %2 = bitcast %struct.s3* %p3 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast ({ i16, i8, i8 }* @caller1.p3 to i8*), i64 4, i32 2, i1 false) + %3 = bitcast %struct.s4* %p4 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %3, i8* bitcast (%struct.s4* @caller1.p4 to i8*), i64 4, i32 4, i1 false) + %4 = bitcast %struct.s5* %p5 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* bitcast ({ i32, i8, [3 x i8] }* @caller1.p5 to i8*), i64 8, i32 4, i1 false) + %5 = bitcast %struct.s6* %p6 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %5, i8* bitcast ({ i32, i16, [2 x i8] }* @caller1.p6 to i8*), i64 8, i32 4, i1 false) + %6 = bitcast %struct.s7* %p7 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %6, i8* bitcast ({ i32, i16, i8, i8 }* @caller1.p7 to i8*), i64 8, i32 4, i1 false) + %call = call i32 @callee1(%struct.s1* byval %p1, %struct.s2* byval %p2, %struct.s3* byval %p3, %struct.s4* byval %p4, %struct.s5* byval %p5, %struct.s6* byval %p6, %struct.s7* byval %p7) + ret i32 %call + +; CHECK: ld 9, 128(31) +; CHECK: ld 8, 136(31) +; CHECK: ld 7, 144(31) +; CHECK: lwz 6, 152(31) +; CHECK: lwz 5, 160(31) +; CHECK: lhz 4, 168(31) +; CHECK: lbz 3, 176(31) +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind + +define internal i32 @callee1(%struct.s1* byval %v1, %struct.s2* byval %v2, %struct.s3* byval %v3, %struct.s4* byval %v4, %struct.s5* byval %v5, %struct.s6* byval %v6, %struct.s7* byval %v7) nounwind { +entry: + %a = getelementptr inbounds 
%struct.s1* %v1, i32 0, i32 0 + %0 = load i8* %a, align 1 + %conv = zext i8 %0 to i32 + %a1 = getelementptr inbounds %struct.s2* %v2, i32 0, i32 0 + %1 = load i16* %a1, align 2 + %conv2 = sext i16 %1 to i32 + %add = add nsw i32 %conv, %conv2 + %a3 = getelementptr inbounds %struct.s3* %v3, i32 0, i32 0 + %2 = load i16* %a3, align 2 + %conv4 = sext i16 %2 to i32 + %add5 = add nsw i32 %add, %conv4 + %a6 = getelementptr inbounds %struct.s4* %v4, i32 0, i32 0 + %3 = load i32* %a6, align 4 + %add7 = add nsw i32 %add5, %3 + %a8 = getelementptr inbounds %struct.s5* %v5, i32 0, i32 0 + %4 = load i32* %a8, align 4 + %add9 = add nsw i32 %add7, %4 + %a10 = getelementptr inbounds %struct.s6* %v6, i32 0, i32 0 + %5 = load i32* %a10, align 4 + %add11 = add nsw i32 %add9, %5 + %a12 = getelementptr inbounds %struct.s7* %v7, i32 0, i32 0 + %6 = load i32* %a12, align 4 + %add13 = add nsw i32 %add11, %6 + ret i32 %add13 + +; CHECK: std 9, 96(1) +; CHECK: std 8, 88(1) +; CHECK: std 7, 80(1) +; CHECK: stw 6, 76(1) +; CHECK: stw 5, 68(1) +; CHECK: sth 4, 62(1) +; CHECK: stb 3, 55(1) +; CHECK: lha {{[0-9]+}}, 62(1) +; CHECK: lbz {{[0-9]+}}, 55(1) +; CHECK: lha {{[0-9]+}}, 68(1) +; CHECK: lwz {{[0-9]+}}, 76(1) +; CHECK: lwz {{[0-9]+}}, 80(1) +; CHECK: lwz {{[0-9]+}}, 88(1) +; CHECK: lwz {{[0-9]+}}, 96(1) +} + +define i32 @caller2() nounwind { +entry: + %p1 = alloca %struct.t1, align 1 + %p2 = alloca %struct.t2, align 1 + %p3 = alloca %struct.t3, align 1 + %p4 = alloca %struct.t4, align 1 + %p5 = alloca %struct.t5, align 1 + %p6 = alloca %struct.t6, align 1 + %p7 = alloca %struct.t7, align 1 + %0 = bitcast %struct.t1* %p1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* getelementptr inbounds (%struct.t1* @caller2.p1, i32 0, i32 0), i64 1, i32 1, i1 false) + %1 = bitcast %struct.t2* %p2 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast ({ i16 }* @caller2.p2 to i8*), i64 2, i32 1, i1 false) + %2 = bitcast %struct.t3* %p3 to i8* + call void 
@llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast (%struct.t3* @caller2.p3 to i8*), i64 3, i32 1, i1 false) + %3 = bitcast %struct.t4* %p4 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %3, i8* bitcast ({ i32 }* @caller2.p4 to i8*), i64 4, i32 1, i1 false) + %4 = bitcast %struct.t5* %p5 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* bitcast (%struct.t5* @caller2.p5 to i8*), i64 5, i32 1, i1 false) + %5 = bitcast %struct.t6* %p6 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %5, i8* bitcast (%struct.t6* @caller2.p6 to i8*), i64 6, i32 1, i1 false) + %6 = bitcast %struct.t7* %p7 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %6, i8* bitcast (%struct.t7* @caller2.p7 to i8*), i64 7, i32 1, i1 false) + %call = call i32 @callee2(%struct.t1* byval %p1, %struct.t2* byval %p2, %struct.t3* byval %p3, %struct.t4* byval %p4, %struct.t5* byval %p5, %struct.t6* byval %p6, %struct.t7* byval %p7) + ret i32 %call + +; CHECK: stb {{[0-9]+}}, 71(1) +; CHECK: sth {{[0-9]+}}, 69(1) +; CHECK: stb {{[0-9]+}}, 87(1) +; CHECK: stw {{[0-9]+}}, 83(1) +; CHECK: sth {{[0-9]+}}, 94(1) +; CHECK: stw {{[0-9]+}}, 90(1) +; CHECK: stb {{[0-9]+}}, 103(1) +; CHECK: sth {{[0-9]+}}, 101(1) +; CHECK: stw {{[0-9]+}}, 97(1) +; CHECK: ld 9, 96(1) +; CHECK: ld 8, 88(1) +; CHECK: ld 7, 80(1) +; CHECK: lwz 6, 152(31) +; CHECK: ld 5, 64(1) +; CHECK: lhz 4, 168(31) +; CHECK: lbz 3, 176(31) +} + +define internal i32 @callee2(%struct.t1* byval %v1, %struct.t2* byval %v2, %struct.t3* byval %v3, %struct.t4* byval %v4, %struct.t5* byval %v5, %struct.t6* byval %v6, %struct.t7* byval %v7) nounwind { +entry: + %a = getelementptr inbounds %struct.t1* %v1, i32 0, i32 0 + %0 = load i8* %a, align 1 + %conv = zext i8 %0 to i32 + %a1 = getelementptr inbounds %struct.t2* %v2, i32 0, i32 0 + %1 = load i16* %a1, align 1 + %conv2 = sext i16 %1 to i32 + %add = add nsw i32 %conv, %conv2 + %a3 = getelementptr inbounds %struct.t3* %v3, i32 0, i32 0 + %2 = load i16* %a3, align 1 + %conv4 = sext i16 %2 to i32 + %add5 = 
add nsw i32 %add, %conv4 + %a6 = getelementptr inbounds %struct.t4* %v4, i32 0, i32 0 + %3 = load i32* %a6, align 1 + %add7 = add nsw i32 %add5, %3 + %a8 = getelementptr inbounds %struct.t5* %v5, i32 0, i32 0 + %4 = load i32* %a8, align 1 + %add9 = add nsw i32 %add7, %4 + %a10 = getelementptr inbounds %struct.t6* %v6, i32 0, i32 0 + %5 = load i32* %a10, align 1 + %add11 = add nsw i32 %add9, %5 + %a12 = getelementptr inbounds %struct.t7* %v7, i32 0, i32 0 + %6 = load i32* %a12, align 1 + %add13 = add nsw i32 %add11, %6 + ret i32 %add13 + +; CHECK: std 9, 96(1) +; CHECK: std 8, 88(1) +; CHECK: std 7, 80(1) +; CHECK: stw 6, 76(1) +; CHECK: std 5, 64(1) +; CHECK: sth 4, 62(1) +; CHECK: stb 3, 55(1) +; CHECK: lbz {{[0-9]+}}, 85(1) +; CHECK: lbz {{[0-9]+}}, 86(1) +; CHECK: lbz {{[0-9]+}}, 83(1) +; CHECK: lbz {{[0-9]+}}, 84(1) +; CHECK: lbz {{[0-9]+}}, 69(1) +; CHECK: lbz {{[0-9]+}}, 70(1) +; CHECK: lha {{[0-9]+}}, 62(1) +; CHECK: lbz {{[0-9]+}}, 55(1) +; CHECK: lwz {{[0-9]+}}, 76(1) +; CHECK: lhz {{[0-9]+}}, 90(1) +; CHECK: lhz {{[0-9]+}}, 92(1) +; CHECK: lbz {{[0-9]+}}, 99(1) +; CHECK: lbz {{[0-9]+}}, 100(1) +; CHECK: lbz {{[0-9]+}}, 97(1) +; CHECK: lbz {{[0-9]+}}, 98(1) +} diff --git a/test/CodeGen/PowerPC/varargs-struct-float.ll b/test/CodeGen/PowerPC/varargs-struct-float.ll new file mode 100644 index 0000000..fb1835f --- /dev/null +++ b/test/CodeGen/PowerPC/varargs-struct-float.ll @@ -0,0 +1,23 @@ +; RUN: llc -mcpu=pwr7 -O0 < %s | FileCheck %s + +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +%struct.Sf1 = type { float } + +define void @foo(float inreg %s.coerce) nounwind { +entry: + %s = alloca %struct.Sf1, align 4 + %coerce.dive = getelementptr %struct.Sf1* %s, i32 0, i32 0 + store float %s.coerce, float* %coerce.dive, align 1 + %coerce.dive1 = getelementptr %struct.Sf1* %s, i32 0, i32 0 + %0 = load float* %coerce.dive1, align 1 + 
call void (i32, ...)* @testvaSf1(i32 1, float inreg %0) + ret void +} + +; CHECK: stfs {{[0-9]+}}, 60(1) +; CHECK: ld 4, 56(1) +; CHECK: bl + +declare void @testvaSf1(i32, ...) diff --git a/test/CodeGen/PowerPC/vec_cmp.ll b/test/CodeGen/PowerPC/vec_cmp.ll new file mode 100644 index 0000000..3180f46 --- /dev/null +++ b/test/CodeGen/PowerPC/vec_cmp.ll @@ -0,0 +1,527 @@ +; RUN: llc -mcpu=pwr6 -mattr=+altivec < %s | FileCheck %s + +; Check vector comparisons using altivec. For non native types, just basic +; comparison instruction check is done. For altivec supported type (16i8, +; 8i16, 4i32, and 4f32) all the comparison operators (==, !=, >, >=, <, <=) +; are checked. + + +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +define <2 x i8> @v2si8_cmp(<2 x i8> %x, <2 x i8> %y) nounwind readnone { + %cmp = icmp eq <2 x i8> %x, %y + %sext = sext <2 x i1> %cmp to <2 x i8> + ret <2 x i8> %sext +} +; CHECK: v2si8_cmp: +; CHECK: vcmpequb {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} + + +define <4 x i8> @v4si8_cmp(<4 x i8> %x, <4 x i8> %y) nounwind readnone { + %cmp = icmp eq <4 x i8> %x, %y + %sext = sext <4 x i1> %cmp to <4 x i8> + ret <4 x i8> %sext +} +; CHECK: v4si8_cmp: +; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} + + +define <8 x i8> @v8si8_cmp(<8 x i8> %x, <8 x i8> %y) nounwind readnone { + %cmp = icmp eq <8 x i8> %x, %y + %sext = sext <8 x i1> %cmp to <8 x i8> + ret <8 x i8> %sext +} +; CHECK: v8si8_cmp: +; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} + + +; Additional tests for v16i8 since it is an altivec native type + +define <16 x i8> @v16si8_cmp_eq(<16 x i8> %x, <16 x i8> %y) nounwind readnone { + %cmp = icmp eq <16 x i8> %x, %y + %sext = sext <16 x i1> %cmp to <16 x i8> + ret <16 x i8> %sext +} +; CHECK: v16si8_cmp_eq: +; CHECK: vcmpequb 2, 2, 3 + +define <16 x i8> @v16si8_cmp_ne(<16 x i8> %x, <16 x i8> %y) nounwind readnone { +entry: + %cmp 
= icmp ne <16 x i8> %x, %y + %sext = sext <16 x i1> %cmp to <16 x i8> + ret <16 x i8> %sext +} +; CHECK: v16si8_cmp_ne: +; CHECK: vcmpequb [[RET:[0-9]+]], 2, 3 +; CHECK-NEXT: vnor 2, [[RET]], [[RET]] + +define <16 x i8> @v16si8_cmp_le(<16 x i8> %x, <16 x i8> %y) nounwind readnone { +entry: + %cmp = icmp sle <16 x i8> %x, %y + %sext = sext <16 x i1> %cmp to <16 x i8> + ret <16 x i8> %sext +} +; CHECK: v16si8_cmp_le: +; CHECK: vcmpequb [[RCMPEQ:[0-9]+]], 2, 3 +; CHECK-NEXT: vcmpgtsb [[RCMPLE:[0-9]+]], 3, 2 +; CHECK-NEXT: vor 2, [[RCMPLE]], [[RCMPEQ]] + +define <16 x i8> @v16ui8_cmp_le(<16 x i8> %x, <16 x i8> %y) nounwind readnone { +entry: + %cmp = icmp ule <16 x i8> %x, %y + %sext = sext <16 x i1> %cmp to <16 x i8> + ret <16 x i8> %sext +} +; CHECK: v16ui8_cmp_le: +; CHECK: vcmpequb [[RCMPEQ:[0-9]+]], 2, 3 +; CHECK-NEXT: vcmpgtub [[RCMPLE:[0-9]+]], 3, 2 +; CHECK-NEXT: vor 2, [[RCMPLE]], [[RCMPEQ]] + +define <16 x i8> @v16si8_cmp_lt(<16 x i8> %x, <16 x i8> %y) nounwind readnone { +entry: + %cmp = icmp slt <16 x i8> %x, %y + %sext = sext <16 x i1> %cmp to <16 x i8> + ret <16 x i8> %sext +} +; CHECK: v16si8_cmp_lt: +; CHECK: vcmpgtsb 2, 3, 2 + +define <16 x i8> @v16ui8_cmp_lt(<16 x i8> %x, <16 x i8> %y) nounwind readnone { +entry: + %cmp = icmp ult <16 x i8> %x, %y + %sext = sext <16 x i1> %cmp to <16 x i8> + ret <16 x i8> %sext +} +; CHECK: v16ui8_cmp_lt: +; CHECK: vcmpgtub 2, 3, 2 + +define <16 x i8> @v16si8_cmp_gt(<16 x i8> %x, <16 x i8> %y) nounwind readnone { +entry: + %cmp = icmp sgt <16 x i8> %x, %y + %sext = sext <16 x i1> %cmp to <16 x i8> + ret <16 x i8> %sext +} +; CHECK: v16si8_cmp_gt: +; CHECK: vcmpgtsb 2, 2, 3 + +define <16 x i8> @v16ui8_cmp_gt(<16 x i8> %x, <16 x i8> %y) nounwind readnone { +entry: + %cmp = icmp ugt <16 x i8> %x, %y + %sext = sext <16 x i1> %cmp to <16 x i8> + ret <16 x i8> %sext +} +; CHECK: v16ui8_cmp_gt: +; CHECK: vcmpgtub 2, 2, 3 + +define <16 x i8> @v16si8_cmp_ge(<16 x i8> %x, <16 x i8> %y) nounwind readnone { +entry: + %cmp = icmp 
sge <16 x i8> %x, %y + %sext = sext <16 x i1> %cmp to <16 x i8> + ret <16 x i8> %sext +} +; CHECK: v16si8_cmp_ge: +; CHECK: vcmpequb [[RCMPEQ:[0-9]+]], 2, 3 +; CHECK-NEXT: vcmpgtsb [[RCMPGT:[0-9]+]], 2, 3 +; CHECK-NEXT: vor 2, [[RCMPGT]], [[RCMPEQ]] + +define <16 x i8> @v16ui8_cmp_ge(<16 x i8> %x, <16 x i8> %y) nounwind readnone { +entry: + %cmp = icmp uge <16 x i8> %x, %y + %sext = sext <16 x i1> %cmp to <16 x i8> + ret <16 x i8> %sext +} +; CHECK: v16ui8_cmp_ge: +; CHECK: vcmpequb [[RCMPEQ:[0-9]+]], 2, 3 +; CHECK-NEXT: vcmpgtub [[RCMPGT:[0-9]+]], 2, 3 +; CHECK-NEXT: vor 2, [[RCMPGT]], [[RCMPEQ]] + + +define <32 x i8> @v32si8_cmp(<32 x i8> %x, <32 x i8> %y) nounwind readnone { + %cmp = icmp eq <32 x i8> %x, %y + %sext = sext <32 x i1> %cmp to <32 x i8> + ret <32 x i8> %sext +} +; CHECK: v32si8_cmp: +; CHECK: vcmpequb {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} +; CHECK: vcmpequb {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} + + +define <2 x i16> @v2si16_cmp(<2 x i16> %x, <2 x i16> %y) nounwind readnone { + %cmp = icmp eq <2 x i16> %x, %y + %sext = sext <2 x i1> %cmp to <2 x i16> + ret <2 x i16> %sext +} +; CHECK: v2si16_cmp: +; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} + + +define <4 x i16> @v4si16_cmp(<4 x i16> %x, <4 x i16> %y) nounwind readnone { + %cmp = icmp eq <4 x i16> %x, %y + %sext = sext <4 x i1> %cmp to <4 x i16> + ret <4 x i16> %sext +} +; CHECK: v4si16_cmp: +; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} + + +; Adicional tests for v8i16 since it is an altivec native type + +define <8 x i16> @v8si16_cmp_eq(<8 x i16> %x, <8 x i16> %y) nounwind readnone { +entry: + %cmp = icmp eq <8 x i16> %x, %y + %sext = sext <8 x i1> %cmp to <8 x i16> + ret <8 x i16> %sext +} +; CHECK: v8si16_cmp_eq: +; CHECK: vcmpequh 2, 2, 3 + +define <8 x i16> @v8si16_cmp_ne(<8 x i16> %x, <8 x i16> %y) nounwind readnone { +entry: + %cmp = icmp ne <8 x i16> %x, %y + %sext = sext <8 x i1> %cmp to <8 x i16> + ret <8 x i16> %sext +} +; CHECK: v8si16_cmp_ne: +; CHECK: vcmpequh 
[[RET:[0-9]+]], 2, 3 +; CHECK-NEXT: vnor 2, [[RET]], [[RET]] + +define <8 x i16> @v8si16_cmp_le(<8 x i16> %x, <8 x i16> %y) nounwind readnone { +entry: + %cmp = icmp sle <8 x i16> %x, %y + %sext = sext <8 x i1> %cmp to <8 x i16> + ret <8 x i16> %sext +} +; CHECK: v8si16_cmp_le: +; CHECK: vcmpequh [[RCMPEQ:[0-9]+]], 2, 3 +; CHECK-NEXT: vcmpgtsh [[RCMPLE:[0-9]+]], 3, 2 +; CHECK-NEXT: vor 2, [[RCMPLE]], [[RCMPEQ]] + +define <8 x i16> @v8ui16_cmp_le(<8 x i16> %x, <8 x i16> %y) nounwind readnone { +entry: + %cmp = icmp ule <8 x i16> %x, %y + %sext = sext <8 x i1> %cmp to <8 x i16> + ret <8 x i16> %sext +} +; CHECK: v8ui16_cmp_le: +; CHECK: vcmpequh [[RCMPEQ:[0-9]+]], 2, 3 +; CHECK-NEXT: vcmpgtuh [[RCMPLE:[0-9]+]], 3, 2 +; CHECK-NEXT: vor 2, [[RCMPLE]], [[RCMPEQ]] + +define <8 x i16> @v8si16_cmp_lt(<8 x i16> %x, <8 x i16> %y) nounwind readnone { +entry: + %cmp = icmp slt <8 x i16> %x, %y + %sext = sext <8 x i1> %cmp to <8 x i16> + ret <8 x i16> %sext +} +; CHECK: v8si16_cmp_lt: +; CHECK: vcmpgtsh 2, 3, 2 + +define <8 x i16> @v8ui16_cmp_lt(<8 x i16> %x, <8 x i16> %y) nounwind readnone { +entry: + %cmp = icmp ult <8 x i16> %x, %y + %sext = sext <8 x i1> %cmp to <8 x i16> + ret <8 x i16> %sext +} +; CHECK: v8ui16_cmp_lt: +; CHECK: vcmpgtuh 2, 3, 2 + +define <8 x i16> @v8si16_cmp_gt(<8 x i16> %x, <8 x i16> %y) nounwind readnone { +entry: + %cmp = icmp sgt <8 x i16> %x, %y + %sext = sext <8 x i1> %cmp to <8 x i16> + ret <8 x i16> %sext +} +; CHECK: v8si16_cmp_gt: +; CHECK: vcmpgtsh 2, 2, 3 + +define <8 x i16> @v8ui16_cmp_gt(<8 x i16> %x, <8 x i16> %y) nounwind readnone { +entry: + %cmp = icmp ugt <8 x i16> %x, %y + %sext = sext <8 x i1> %cmp to <8 x i16> + ret <8 x i16> %sext +} +; CHECK: v8ui16_cmp_gt: +; CHECK: vcmpgtuh 2, 2, 3 + +define <8 x i16> @v8si16_cmp_ge(<8 x i16> %x, <8 x i16> %y) nounwind readnone { +entry: + %cmp = icmp sge <8 x i16> %x, %y + %sext = sext <8 x i1> %cmp to <8 x i16> + ret <8 x i16> %sext +} +; CHECK: v8si16_cmp_ge: +; CHECK: vcmpequh 
[[RCMPEQ:[0-9]+]], 2, 3 +; CHECK-NEXT: vcmpgtsh [[RCMPGT:[0-9]+]], 2, 3 +; CHECK-NEXT: vor 2, [[RCMPGT]], [[RCMPEQ]] + +define <8 x i16> @v8ui16_cmp_ge(<8 x i16> %x, <8 x i16> %y) nounwind readnone { +entry: + %cmp = icmp uge <8 x i16> %x, %y + %sext = sext <8 x i1> %cmp to <8 x i16> + ret <8 x i16> %sext +} +; CHECK: v8ui16_cmp_ge: +; CHECK: vcmpequh [[RCMPEQ:[0-9]+]], 2, 3 +; CHECK-NEXT: vcmpgtuh [[RCMPGT:[0-9]+]], 2, 3 +; CHECK-NEXT: vor 2, [[RCMPGT]], [[RCMPEQ]] + + +define <16 x i16> @v16si16_cmp(<16 x i16> %x, <16 x i16> %y) nounwind readnone { + %cmp = icmp eq <16 x i16> %x, %y + %sext = sext <16 x i1> %cmp to <16 x i16> + ret <16 x i16> %sext +} +; CHECK: v16si16_cmp: +; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} +; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} + + +define <32 x i16> @v32si16_cmp(<32 x i16> %x, <32 x i16> %y) nounwind readnone { + %cmp = icmp eq <32 x i16> %x, %y + %sext = sext <32 x i1> %cmp to <32 x i16> + ret <32 x i16> %sext +} +; CHECK: v32si16_cmp: +; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} +; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} +; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} +; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} + + +define <2 x i32> @v2si32_cmp(<2 x i32> %x, <2 x i32> %y) nounwind readnone { + %cmp = icmp eq <2 x i32> %x, %y + %sext = sext <2 x i1> %cmp to <2 x i32> + ret <2 x i32> %sext +} +; CHECK: v2si32_cmp: +; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} + + +; Adicional tests for v4si32 since it is an altivec native type + +define <4 x i32> @v4si32_cmp_eq(<4 x i32> %x, <4 x i32> %y) nounwind readnone { +entry: + %cmp = icmp eq <4 x i32> %x, %y + %sext = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %sext +} +; CHECK: v4si32_cmp_eq: +; CHECK: vcmpequw 2, 2, 3 + +define <4 x i32> @v4si32_cmp_ne(<4 x i32> %x, <4 x i32> %y) nounwind readnone { +entry: + %cmp = icmp ne <4 x i32> %x, %y + %sext = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %sext +} +; CHECK: 
v4si32_cmp_ne: +; CHECK: vcmpequw [[RCMP:[0-9]+]], 2, 3 +; CHECK-NEXT: vnor 2, [[RCMP]], [[RCMP]] + +define <4 x i32> @v4si32_cmp_le(<4 x i32> %x, <4 x i32> %y) nounwind readnone { +entry: + %cmp = icmp sle <4 x i32> %x, %y + %sext = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %sext +} +; CHECK: v4si32_cmp_le: +; CHECK: vcmpequw [[RCMPEQ:[0-9]+]], 2, 3 +; CHECK-NEXT: vcmpgtsw [[RCMPLE:[0-9]+]], 3, 2 +; CHECK-NEXT: vor 2, [[RCMPLE]], [[RCMPEQ]] + +define <4 x i32> @v4ui32_cmp_le(<4 x i32> %x, <4 x i32> %y) nounwind readnone { +entry: + %cmp = icmp ule <4 x i32> %x, %y + %sext = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %sext +} +; CHECK: v4ui32_cmp_le: +; CHECK: vcmpequw [[RCMPEQ:[0-9]+]], 2, 3 +; CHECK-NEXT: vcmpgtuw [[RCMPLE:[0-9]+]], 3, 2 +; CHECK-NEXT: vor 2, [[RCMPLE]], [[RCMPEQ]] + +define <4 x i32> @v4si32_cmp_lt(<4 x i32> %x, <4 x i32> %y) nounwind readnone { +entry: + %cmp = icmp slt <4 x i32> %x, %y + %sext = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %sext +} +; CHECK: v4si32_cmp_lt: +; CHECK: vcmpgtsw 2, 3, 2 + +define <4 x i32> @v4ui32_cmp_lt(<4 x i32> %x, <4 x i32> %y) nounwind readnone { +entry: + %cmp = icmp ult <4 x i32> %x, %y + %sext = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %sext +} +; CHECK: v4ui32_cmp_lt: +; CHECK: vcmpgtuw 2, 3, 2 + +define <4 x i32> @v4si32_cmp_gt(<4 x i32> %x, <4 x i32> %y) nounwind readnone { +entry: + %cmp = icmp sgt <4 x i32> %x, %y + %sext = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %sext +} +; CHECK: v4si32_cmp_gt: +; CHECK: vcmpgtsw 2, 2, 3 + +define <4 x i32> @v4ui32_cmp_gt(<4 x i32> %x, <4 x i32> %y) nounwind readnone { +entry: + %cmp = icmp ugt <4 x i32> %x, %y + %sext = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %sext +} +; CHECK: v4ui32_cmp_gt: +; CHECK: vcmpgtuw 2, 2, 3 + +define <4 x i32> @v4si32_cmp_ge(<4 x i32> %x, <4 x i32> %y) nounwind readnone { +entry: + %cmp = icmp sge <4 x i32> %x, %y + %sext = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %sext +} +; CHECK: 
v4si32_cmp_ge: +; CHECK: vcmpequw [[RCMPEQ:[0-9]+]], 2, 3 +; CHECK-NEXT: vcmpgtsw [[RCMPGT:[0-9]+]], 2, 3 +; CHECK-NEXT: vor 2, [[RCMPGT]], [[RCMPEQ]] + +define <4 x i32> @v4ui32_cmp_ge(<4 x i32> %x, <4 x i32> %y) nounwind readnone { +entry: + %cmp = icmp uge <4 x i32> %x, %y + %sext = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %sext +} +; CHECK: v4ui32_cmp_ge: +; CHECK: vcmpequw [[RCMPEQ:[0-9]+]], 2, 3 +; CHECK-NEXT: vcmpgtuw [[RCMPGT:[0-9]+]], 2, 3 +; CHECK-NEXT: vor 2, [[RCMPGT]], [[RCMPEQ]] + + +define <8 x i32> @v8si32_cmp(<8 x i32> %x, <8 x i32> %y) nounwind readnone { + %cmp = icmp eq <8 x i32> %x, %y + %sext = sext <8 x i1> %cmp to <8 x i32> + ret <8 x i32> %sext +} +; CHECK: v8si32_cmp: +; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} +; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} + + +define <16 x i32> @v16si32_cmp(<16 x i32> %x, <16 x i32> %y) nounwind readnone { + %cmp = icmp eq <16 x i32> %x, %y + %sext = sext <16 x i1> %cmp to <16 x i32> + ret <16 x i32> %sext +} +; CHECK: v16si32_cmp: +; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} +; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} +; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} +; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} + + +define <32 x i32> @v32si32_cmp(<32 x i32> %x, <32 x i32> %y) nounwind readnone { + %cmp = icmp eq <32 x i32> %x, %y + %sext = sext <32 x i1> %cmp to <32 x i32> + ret <32 x i32> %sext +} +; CHECK: v32si32_cmp: +; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} +; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} +; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} +; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} +; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} +; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} +; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} +; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} + + +define <2 x float> @v2f32_cmp(<2 x float> %x, <2 x float> %y) nounwind readnone { +entry: + %cmp = 
fcmp oeq <2 x float> %x, %y + %sext = sext <2 x i1> %cmp to <2 x i32> + %0 = bitcast <2 x i32> %sext to <2 x float> + ret <2 x float> %0 +} +; CHECK: v2f32_cmp: +; CHECK: vcmpeqfp {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} + + +; Adicional tests for v4f32 since it is a altivec native type + +define <4 x float> @v4f32_cmp_eq(<4 x float> %x, <4 x float> %y) nounwind readnone { +entry: + %cmp = fcmp oeq <4 x float> %x, %y + %sext = sext <4 x i1> %cmp to <4 x i32> + %0 = bitcast <4 x i32> %sext to <4 x float> + ret <4 x float> %0 +} +; CHECK: v4f32_cmp_eq: +; CHECK: vcmpeqfp 2, 2, 3 + +define <4 x float> @v4f32_cmp_ne(<4 x float> %x, <4 x float> %y) nounwind readnone { +entry: + %cmp = fcmp une <4 x float> %x, %y + %sext = sext <4 x i1> %cmp to <4 x i32> + %0 = bitcast <4 x i32> %sext to <4 x float> + ret <4 x float> %0 +} +; CHECK: v4f32_cmp_ne: +; CHECK: vcmpeqfp [[RET:[0-9]+]], 2, 3 +; CHECK-NEXT: vnor 2, [[RET]], [[RET]] + +define <4 x float> @v4f32_cmp_le(<4 x float> %x, <4 x float> %y) nounwind readnone { +entry: + %cmp = fcmp ole <4 x float> %x, %y + %sext = sext <4 x i1> %cmp to <4 x i32> + %0 = bitcast <4 x i32> %sext to <4 x float> + ret <4 x float> %0 +} +; CHECK: v4f32_cmp_le: +; CHECK: vcmpeqfp [[RCMPEQ:[0-9]+]], 2, 3 +; CHECK-NEXT: vcmpgtfp [[RCMPLE:[0-9]+]], 3, 2 +; CHECK-NEXT: vor 2, [[RCMPLE]], [[RCMPEQ]] + +define <4 x float> @v4f32_cmp_lt(<4 x float> %x, <4 x float> %y) nounwind readnone { +entry: + %cmp = fcmp olt <4 x float> %x, %y + %sext = sext <4 x i1> %cmp to <4 x i32> + %0 = bitcast <4 x i32> %sext to <4 x float> + ret <4 x float> %0 +} +; CHECK: v4f32_cmp_lt: +; CHECK: vcmpgtfp 2, 3, 2 + +define <4 x float> @v4f32_cmp_ge(<4 x float> %x, <4 x float> %y) nounwind readnone { +entry: + %cmp = fcmp oge <4 x float> %x, %y + %sext = sext <4 x i1> %cmp to <4 x i32> + %0 = bitcast <4 x i32> %sext to <4 x float> + ret <4 x float> %0 +} +; CHECK: v4f32_cmp_ge: +; CHECK: vcmpgefp 2, 2, 3 + +define <4 x float> @v4f32_cmp_gt(<4 x float> %x, <4 x float> %y) 
nounwind readnone { +entry: + %cmp = fcmp ogt <4 x float> %x, %y + %sext = sext <4 x i1> %cmp to <4 x i32> + %0 = bitcast <4 x i32> %sext to <4 x float> + ret <4 x float> %0 +} +; CHECK: v4f32_cmp_gt: +; CHECK: vcmpgtfp 2, 2, 3 + + +define <8 x float> @v8f32_cmp(<8 x float> %x, <8 x float> %y) nounwind readnone { +entry: + %cmp = fcmp oeq <8 x float> %x, %y + %sext = sext <8 x i1> %cmp to <8 x i32> + %0 = bitcast <8 x i32> %sext to <8 x float> + ret <8 x float> %0 +} +; CHECK: v8f32_cmp: +; CHECK: vcmpeqfp {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} +; CHECK: vcmpeqfp {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}} diff --git a/test/CodeGen/PowerPC/vec_conv.ll b/test/CodeGen/PowerPC/vec_conv.ll new file mode 100644 index 0000000..a475e94 --- /dev/null +++ b/test/CodeGen/PowerPC/vec_conv.ll @@ -0,0 +1,57 @@ +; RUN: llc -mattr=+altivec < %s | FileCheck %s + +; Check vector float/int conversion using altivec. + +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +@cte_float = global <4 x float> <float 6.5e+00, float 6.5e+00, float 6.5e+00, float 6.5e+00>, align 16 +@cte_int = global <4 x i32> <i32 6, i32 6, i32 6, i32 6>, align 16 + + +define void @v4f32_to_v4i32(<4 x float> %x, <4 x i32>* nocapture %y) nounwind { +entry: + %0 = load <4 x float>* @cte_float, align 16 + %mul = fmul <4 x float> %0, %x + %1 = fptosi <4 x float> %mul to <4 x i32> + store <4 x i32> %1, <4 x i32>* %y, align 16 + ret void +} +;CHECK: v4f32_to_v4i32: +;CHECK: vctsxs {{[0-9]+}}, {{[0-9]+}}, 0 + + +define void @v4f32_to_v4u32(<4 x float> %x, <4 x i32>* nocapture %y) nounwind { +entry: + %0 = load <4 x float>* @cte_float, align 16 + %mul = fmul <4 x float> %0, %x + %1 = fptoui <4 x float> %mul to <4 x i32> + store <4 x i32> %1, <4 x i32>* %y, align 16 + ret void +} +;CHECK: v4f32_to_v4u32: +;CHECK: vctuxs {{[0-9]+}}, {{[0-9]+}}, 0 + + +define void @v4i32_to_v4f32(<4 x i32> %x, <4 x float>* nocapture 
%y) nounwind { +entry: + %0 = load <4 x i32>* @cte_int, align 16 + %mul = mul <4 x i32> %0, %x + %1 = sitofp <4 x i32> %mul to <4 x float> + store <4 x float> %1, <4 x float>* %y, align 16 + ret void +} +;CHECK: v4i32_to_v4f32: +;CHECK: vcfsx {{[0-9]+}}, {{[0-9]+}}, 0 + + +define void @v4u32_to_v4f32(<4 x i32> %x, <4 x float>* nocapture %y) nounwind { +entry: + %0 = load <4 x i32>* @cte_int, align 16 + %mul = mul <4 x i32> %0, %x + %1 = uitofp <4 x i32> %mul to <4 x float> + store <4 x float> %1, <4 x float>* %y, align 16 + ret void +} +;CHECK: v4u32_to_v4f32: +;CHECK: vcfux {{[0-9]+}}, {{[0-9]+}}, 0 diff --git a/test/CodeGen/PowerPC/vec_extload.ll b/test/CodeGen/PowerPC/vec_extload.ll new file mode 100644 index 0000000..201c15b --- /dev/null +++ b/test/CodeGen/PowerPC/vec_extload.ll @@ -0,0 +1,155 @@ +; RUN: llc -mcpu=pwr6 -mattr=+altivec < %s | FileCheck %s + +; Check vector extend load expansion with altivec enabled. + +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +; Altivec does not provides an sext intruction, so it expands +; a set of vector stores (stvx), bytes load/sign expand/store +; (lbz/stb), and a final vector load (lvx) to load the result +; extended vector. 
+define <16 x i8> @v16si8_sext_in_reg(<16 x i8> %a) { + %b = trunc <16 x i8> %a to <16 x i4> + %c = sext <16 x i4> %b to <16 x i8> + ret <16 x i8> %c +} +; CHECK: v16si8_sext_in_reg: +; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} +; CHECK: lbz +; CHECK: stb +; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} +; CHECK: lbz +; CHECK: stb +; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} +; CHECK: lbz +; CHECK: stb +; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} +; CHECK: lbz +; CHECK: stb +; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} +; CHECK: lbz +; CHECK: stb +; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} +; CHECK: lbz +; CHECK: stb +; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} +; CHECK: lbz +; CHECK: stb +; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} +; CHECK: lbz +; CHECK: stb +; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} +; CHECK: lbz +; CHECK: stb +; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} +; CHECK: lbz +; CHECK: stb +; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} +; CHECK: lbz +; CHECK: stb +; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} +; CHECK: lbz +; CHECK: stb +; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} +; CHECK: lbz +; CHECK: stb +; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} +; CHECK: lbz +; CHECK: stb +; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} +; CHECK: lbz +; CHECK: stb +; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} +; CHECK: lbz +; CHECK: stb +; CHECK: lvx 2, {{[0-9]+}}, {{[0-9]+}} + +; The zero extend uses a more clever logic: a vector splat +; and a logic and to set higher bits to 0. +define <16 x i8> @v16si8_zext_in_reg(<16 x i8> %a) { + %b = trunc <16 x i8> %a to <16 x i4> + %c = zext <16 x i4> %b to <16 x i8> + ret <16 x i8> %c +} +; CHECK: v16si8_zext_in_reg: +; CHECK: vspltisb [[VMASK:[0-9]+]], 15 +; CHECK-NEXT: vand 2, 2, [[VMASK]] + +; Same as v16si8_sext_in_reg, expands to load/store halfwords (lhz/sth). 
+define <8 x i16> @v8si16_sext_in_reg(<8 x i16> %a) { + %b = trunc <8 x i16> %a to <8 x i8> + %c = sext <8 x i8> %b to <8 x i16> + ret <8 x i16> %c +} +; CHECK: v8si16_sext_in_reg: +; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} +; CHECK: lhz +; CHECK: sth +; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} +; CHECK: lhz +; CHECK: sth +; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} +; CHECK: lhz +; CHECK: sth +; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} +; CHECK: lhz +; CHECK: sth +; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} +; CHECK: lhz +; CHECK: sth +; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} +; CHECK: lhz +; CHECK: sth +; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} +; CHECK: lhz +; CHECK: sth +; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} +; CHECK: lhz +; CHECK: sth +; CHECK: lvx 2, {{[0-9]+}}, {{[0-9]+}} + +; Same as v8si16_sext_in_reg, but instead of creating the mask +; with a splat, loads it from memory. +define <8 x i16> @v8si16_zext_in_reg(<8 x i16> %a) { + %b = trunc <8 x i16> %a to <8 x i8> + %c = zext <8 x i8> %b to <8 x i16> + ret <8 x i16> %c +} +; CHECK: v8si16_zext_in_reg: +; CHECK: ld [[RMASKTOC:[0-9]+]], .LC{{[0-9]+}}@toc(2) +; CHECK-NEXT: lvx [[VMASK:[0-9]+]], {{[0-9]+}}, [[RMASKTOC]] +; CHECK-NEXT: vand 2, 2, [[VMASK]] + +; Same as v16si8_sext_in_reg, expands to load halfword (lha) and +; store words (stw). +define <4 x i32> @v4si32_sext_in_reg(<4 x i32> %a) { + %b = trunc <4 x i32> %a to <4 x i16> + %c = sext <4 x i16> %b to <4 x i32> + ret <4 x i32> %c +} +; CHECK: v4si32_sext_in_reg: +; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} +; CHECK: lha +; CHECK: stw +; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} +; CHECK: lha +; CHECK: stw +; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} +; CHECK: lha +; CHECK: stw +; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} +; CHECK: lha +; CHECK: stw +; CHECK: lvx 2, {{[0-9]+}}, {{[0-9]+}} + +; Same as v8si16_sext_in_reg. 
+define <4 x i32> @v4si32_zext_in_reg(<4 x i32> %a) { + %b = trunc <4 x i32> %a to <4 x i16> + %c = zext <4 x i16> %b to <4 x i32> + ret <4 x i32> %c +} +; CHECK: v4si32_zext_in_reg: +; CHECK: vspltisw [[VMASK:[0-9]+]], -16 +; CHECK-NEXT: vsrw [[VMASK]], [[VMASK]], [[VMASK]] +; CHECK-NEXT: vand 2, 2, [[VMASK]] diff --git a/test/CodeGen/PowerPC/vec_sqrt.ll b/test/CodeGen/PowerPC/vec_sqrt.ll new file mode 100644 index 0000000..055da1a --- /dev/null +++ b/test/CodeGen/PowerPC/vec_sqrt.ll @@ -0,0 +1,71 @@ +; RUN: llc -mcpu=pwr6 -mattr=+altivec,+fsqrt < %s | FileCheck %s + +; Check for vector sqrt expansion using floating-point types, since altivec +; does not provide an fsqrt instruction for vector. + +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +declare <2 x float> @llvm.sqrt.v2f32(<2 x float> %val) +declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %val) +declare <8 x float> @llvm.sqrt.v8f32(<8 x float> %val) +declare <2 x double> @llvm.sqrt.v2f64(<2 x double> %val) +declare <4 x double> @llvm.sqrt.v4f64(<4 x double> %val) + +define <2 x float> @v2f32_sqrt(<2 x float> %x) nounwind readnone { +entry: + %sqrt = call <2 x float> @llvm.sqrt.v2f32 (<2 x float> %x) + ret <2 x float> %sqrt +} +; sqrt (<2 x float>) is promoted to sqrt (<4 x float>) +; CHECK: v2f32_sqrt: +; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}} +; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}} +; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}} +; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}} + +define <4 x float> @v4f32_sqrt(<4 x float> %x) nounwind readnone { +entry: + %sqrt = call <4 x float> @llvm.sqrt.v4f32 (<4 x float> %x) + ret <4 x float> %sqrt +} +; CHECK: v4f32_sqrt: +; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}} +; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}} +; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}} +; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}} + +define <8 x float> @v8f32_sqrt(<8 x float> %x) nounwind readnone { +entry: + %sqrt = 
call <8 x float> @llvm.sqrt.v8f32 (<8 x float> %x) + ret <8 x float> %sqrt +} +; CHECK: v8f32_sqrt: +; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}} +; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}} +; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}} +; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}} +; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}} +; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}} +; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}} +; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}} + +define <2 x double> @v2f64_sqrt(<2 x double> %x) nounwind readnone { +entry: + %sqrt = call <2 x double> @llvm.sqrt.v2f64 (<2 x double> %x) + ret <2 x double> %sqrt +} +; CHECK: v2f64_sqrt: +; CHECK: fsqrt {{[0-9]+}}, {{[0-9]+}} +; CHECK: fsqrt {{[0-9]+}}, {{[0-9]+}} + +define <4 x double> @v4f64_sqrt(<4 x double> %x) nounwind readnone { +entry: + %sqrt = call <4 x double> @llvm.sqrt.v4f64 (<4 x double> %x) + ret <4 x double> %sqrt +} +; CHECK: v4f64_sqrt: +; CHECK: fsqrt {{[0-9]+}}, {{[0-9]+}} +; CHECK: fsqrt {{[0-9]+}}, {{[0-9]+}} +; CHECK: fsqrt {{[0-9]+}}, {{[0-9]+}} +; CHECK: fsqrt {{[0-9]+}}, {{[0-9]+}} diff --git a/test/CodeGen/PowerPC/vrspill.ll b/test/CodeGen/PowerPC/vrspill.ll new file mode 100644 index 0000000..7641017 --- /dev/null +++ b/test/CodeGen/PowerPC/vrspill.ll @@ -0,0 +1,19 @@ +; RUN: llc -O0 -mtriple=powerpc-unknown-linux-gnu -mattr=+altivec -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -O0 -mtriple=powerpc64-unknown-linux-gnu -mattr=+altivec -verify-machineinstrs < %s | FileCheck %s + +; This verifies that we generate correct spill/reload code for vector regs. 
+ +define void @addrtaken(i32 %i, <4 x float> %w) nounwind { +entry: + %i.addr = alloca i32, align 4 + %w.addr = alloca <4 x float>, align 16 + store i32 %i, i32* %i.addr, align 4 + store <4 x float> %w, <4 x float>* %w.addr, align 16 + call void @foo(i32* %i.addr) + ret void +} + +; CHECK: stvx 2, 0, 0 +; CHECK: lvx 2, 0, 0 + +declare void @foo(i32*) diff --git a/test/CodeGen/SPARC/2011-01-11-CC.ll b/test/CodeGen/SPARC/2011-01-11-CC.ll index 3ceda95..f676fd8 100755 --- a/test/CodeGen/SPARC/2011-01-11-CC.ll +++ b/test/CodeGen/SPARC/2011-01-11-CC.ll @@ -54,7 +54,7 @@ entry: ; V8: {{be|bne}} ; V9: test_select_dfp_icc ; V9: subcc -; V9=NOT: {{be|bne}} +; V9-NOT: {{be|bne}} ; V9: fmovd{{e|ne}} %icc %0 = icmp eq i32 %a, 0 %1 = select i1 %0, double %f1, double %f2 diff --git a/test/CodeGen/Thumb2/buildvector-crash.ll b/test/CodeGen/Thumb2/buildvector-crash.ll index 01ef472..ce42f4b 100644 --- a/test/CodeGen/Thumb2/buildvector-crash.ll +++ b/test/CodeGen/Thumb2/buildvector-crash.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -O3 -mtriple=thumbv7-apple-darwin10 -mcpu=cortex-a8 | FileCheck %s +; RUN: llc < %s -O3 -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 | FileCheck %s ; Formerly crashed, 3573915. 
define void @RotateStarsFP_Vec() nounwind { @@ -13,5 +13,5 @@ bb8: ; preds = %bb8, %bb.nph372 store <4 x float> %3, <4 x float>* undef, align 4 br label %bb8 ; CHECK: RotateStarsFP_Vec: -; CHECK: vldmia +; CHECK: vld1.64 } diff --git a/test/CodeGen/Thumb2/carry.ll b/test/CodeGen/Thumb2/carry.ll index de6f6e2..85b4370 100644 --- a/test/CodeGen/Thumb2/carry.ll +++ b/test/CodeGen/Thumb2/carry.ll @@ -20,3 +20,16 @@ entry: %tmp2 = sub i64 %tmp1, %b ret i64 %tmp2 } + +; rdar://12559385 +define i64 @f3(i32 %vi) { +entry: +; CHECK: f3: +; CHECK: movw [[REG:r[0-9]+]], #36102 +; CHECK: sbcs r{{[0-9]+}}, [[REG]] + %v0 = zext i32 %vi to i64 + %v1 = xor i64 %v0, -155057456198619 + %v4 = add i64 %v1, 155057456198619 + %v5 = add i64 %v4, %v1 + ret i64 %v5 +} diff --git a/test/CodeGen/Thumb2/cortex-fp.ll b/test/CodeGen/Thumb2/cortex-fp.ll index d06f8a7..b7df2fb 100644 --- a/test/CodeGen/Thumb2/cortex-fp.ll +++ b/test/CodeGen/Thumb2/cortex-fp.ll @@ -7,8 +7,8 @@ define float @foo(float %a, float %b) { entry: ; CHECK: foo ; CORTEXM3: blx ___mulsf3 -; CORTEXM4: vmul.f32 s0, s1, s0 -; CORTEXA8: vmul.f32 d0, d1, d0 +; CORTEXM4: vmul.f32 s0, s2, s0 +; CORTEXA8: vmul.f32 d %0 = fmul float %a, %b ret float %0 } @@ -19,6 +19,6 @@ entry: %0 = fmul double %a, %b ; CORTEXM3: blx ___muldf3 ; CORTEXM4: blx ___muldf3 -; CORTEXA8: vmul.f64 d16, d17, d16 +; CORTEXA8: vmul.f64 d ret double %0 } diff --git a/test/CodeGen/Thumb2/div.ll b/test/CodeGen/Thumb2/div.ll index 2c00c70..f89746a 100644 --- a/test/CodeGen/Thumb2/div.ll +++ b/test/CodeGen/Thumb2/div.ll @@ -2,6 +2,8 @@ ; RUN: | FileCheck %s -check-prefix=CHECK-THUMB ; RUN: llc < %s -march=thumb -mcpu=cortex-m3 -mattr=+thumb2 \ ; RUN: | FileCheck %s -check-prefix=CHECK-THUMBV7M +; RUN: llc < %s -march=thumb -mcpu=swift \ +; RUN: | FileCheck %s -check-prefix=CHECK-SWIFT-T2 define i32 @f1(i32 %a, i32 %b) { entry: @@ -9,6 +11,8 @@ entry: ; CHECK-THUMB: __divsi3 ; CHECK-THUMBV7M: f1 ; CHECK-THUMBV7M: sdiv +; CHECK-SWIFT-T2: f1 +; CHECK-SWIFT-T2: sdiv 
%tmp1 = sdiv i32 %a, %b ; <i32> [#uses=1] ret i32 %tmp1 } @@ -19,6 +23,8 @@ entry: ; CHECK-THUMB: __udivsi3 ; CHECK-THUMBV7M: f2 ; CHECK-THUMBV7M: udiv +; CHECK-SWIFT-T2: f2 +; CHECK-SWIFT-T2: udiv %tmp1 = udiv i32 %a, %b ; <i32> [#uses=1] ret i32 %tmp1 } @@ -29,6 +35,8 @@ entry: ; CHECK-THUMB: __modsi3 ; CHECK-THUMBV7M: f3 ; CHECK-THUMBV7M: sdiv +; CHECK-SWIFT-T2: f3 +; CHECK-SWIFT-T2: sdiv %tmp1 = srem i32 %a, %b ; <i32> [#uses=1] ret i32 %tmp1 } @@ -39,6 +47,8 @@ entry: ; CHECK-THUMB: __umodsi3 ; CHECK-THUMBV7M: f4 ; CHECK-THUMBV7M: udiv +; CHECK-SWIFT-T2: f4 +; CHECK-SWIFT-T2: udiv %tmp1 = urem i32 %a, %b ; <i32> [#uses=1] ret i32 %tmp1 } diff --git a/test/CodeGen/Thumb2/longMACt.ll b/test/CodeGen/Thumb2/longMACt.ll new file mode 100644 index 0000000..beefd60 --- /dev/null +++ b/test/CodeGen/Thumb2/longMACt.ll @@ -0,0 +1,44 @@ +; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s +; Check generated signed and unsigned multiply accumulate long. + +define i64 @MACLongTest1(i32 %a, i32 %b, i64 %c) { +;CHECK: MACLongTest1: +;CHECK: umlal + %conv = zext i32 %a to i64 + %conv1 = zext i32 %b to i64 + %mul = mul i64 %conv1, %conv + %add = add i64 %mul, %c + ret i64 %add +} + +define i64 @MACLongTest2(i32 %a, i32 %b, i64 %c) { +;CHECK: MACLongTest2: +;CHECK: smlal + %conv = sext i32 %a to i64 + %conv1 = sext i32 %b to i64 + %mul = mul nsw i64 %conv1, %conv + %add = add nsw i64 %mul, %c + ret i64 %add +} + +define i64 @MACLongTest3(i32 %a, i32 %b, i32 %c) { +;CHECK: MACLongTest3: +;CHECK: umlal + %conv = zext i32 %b to i64 + %conv1 = zext i32 %a to i64 + %mul = mul i64 %conv, %conv1 + %conv2 = zext i32 %c to i64 + %add = add i64 %mul, %conv2 + ret i64 %add +} + +define i64 @MACLongTest4(i32 %a, i32 %b, i32 %c) { +;CHECK: MACLongTest4: +;CHECK: smlal + %conv = sext i32 %b to i64 + %conv1 = sext i32 %a to i64 + %mul = mul nsw i64 %conv, %conv1 + %conv2 = sext i32 %c to i64 + %add = add nsw i64 %mul, %conv2 + ret i64 %add +} diff --git 
a/test/CodeGen/Thumb2/thumb2-mla.ll b/test/CodeGen/Thumb2/thumb2-mla.ll index c4cc749..594d974 100644 --- a/test/CodeGen/Thumb2/thumb2-mla.ll +++ b/test/CodeGen/Thumb2/thumb2-mla.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s +; RUN: llc < %s -march=thumb -mattr=+thumb2 -arm-use-mulops=false | FileCheck %s -check-prefix=NO_MULOPS define i32 @f1(i32 %a, i32 %b, i32 %c) { %tmp1 = mul i32 %a, %b @@ -7,6 +8,9 @@ define i32 @f1(i32 %a, i32 %b, i32 %c) { } ; CHECK: f1: ; CHECK: mla r0, r0, r1, r2 +; NO_MULOPS: f1: +; NO_MULOPS: muls r0, r1, r0 +; NO_MULOPS-NEXT: add r0, r2 define i32 @f2(i32 %a, i32 %b, i32 %c) { %tmp1 = mul i32 %a, %b @@ -15,3 +19,6 @@ define i32 @f2(i32 %a, i32 %b, i32 %c) { } ; CHECK: f2: ; CHECK: mla r0, r0, r1, r2 +; NO_MULOPS: f2: +; NO_MULOPS: muls r0, r1, r0 +; NO_MULOPS-NEXT: add r0, r2 diff --git a/test/CodeGen/Thumb2/thumb2-select_xform.ll b/test/CodeGen/Thumb2/thumb2-select_xform.ll index ead198f..ed4d26d 100644 --- a/test/CodeGen/Thumb2/thumb2-select_xform.ll +++ b/test/CodeGen/Thumb2/thumb2-select_xform.ll @@ -5,7 +5,7 @@ define i32 @t1(i32 %a, i32 %b, i32 %c) nounwind { ; CHECK: mvn r0, #-2147483648 ; CHECK: cmp r2, #10 ; CHECK: it le -; CHECK: addle.w r1, r1, r0 +; CHECK: addle r1, r0 ; CHECK: mov r0, r1 %tmp1 = icmp sgt i32 %c, 10 %tmp2 = select i1 %tmp1, i32 0, i32 2147483647 @@ -30,7 +30,7 @@ define i32 @t3(i32 %a, i32 %b, i32 %c, i32 %d) nounwind { ; CHECK: t3 ; CHECK: cmp r2, #10 ; CHECK: it le -; CHECK: suble.w r1, r1, #10 +; CHECK: suble r1, #10 ; CHECK: mov r0, r1 %tmp1 = icmp sgt i32 %c, 10 %tmp2 = select i1 %tmp1, i32 0, i32 10 diff --git a/test/CodeGen/Thumb2/thumb2-smla.ll b/test/CodeGen/Thumb2/thumb2-smla.ll index c128ecc..aaaedfa 100644 --- a/test/CodeGen/Thumb2/thumb2-smla.ll +++ b/test/CodeGen/Thumb2/thumb2-smla.ll @@ -1,8 +1,12 @@ ; RUN: llc < %s -march=thumb -mattr=+thumb2,+t2xtpk,+t2dsp | FileCheck %s +; RUN: llc < %s -march=thumb -mattr=+thumb2,+t2xtpk,+t2dsp -arm-use-mulops=false | 
FileCheck %s -check-prefix=NO_MULOPS define i32 @f3(i32 %a, i16 %x, i32 %y) { ; CHECK: f3 ; CHECK: smlabt r0, r1, r2, r0 +; NO_MULOPS: f3 +; NO_MULOPS: smultb r1, r2, r1 +; NO_MULOPS-NEXT: add r0, r1 %tmp = sext i16 %x to i32 ; <i32> [#uses=1] %tmp2 = ashr i32 %y, 16 ; <i32> [#uses=1] %tmp3 = mul i32 %tmp2, %tmp ; <i32> [#uses=1] diff --git a/test/CodeGen/Thumb2/thumb2-uxtb.ll b/test/CodeGen/Thumb2/thumb2-uxtb.ll index 35914b1..2074f98 100644 --- a/test/CodeGen/Thumb2/thumb2-uxtb.ll +++ b/test/CodeGen/Thumb2/thumb2-uxtb.ll @@ -128,9 +128,9 @@ define i32 @test10(i32 %p0) { ; ARMv7M: test10 ; ARMv7M: mov.w r1, #16253176 -; ARMv7M: mov.w r2, #458759 ; ARMv7M: and.w r0, r1, r0, lsr #7 -; ARMv7M: and.w r1, r2, r0, lsr #5 +; ARMv7M: mov.w r1, #458759 +; ARMv7M: and.w r1, r1, r0, lsr #5 ; ARMv7M: orrs r0, r1 %tmp1 = lshr i32 %p0, 7 ; <i32> [#uses=1] %tmp2 = and i32 %tmp1, 16253176 ; <i32> [#uses=2] diff --git a/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll b/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll index 8b55bd7..3d058bc 100644 --- a/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll +++ b/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s +; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=corei7 | FileCheck %s ; rdar://r7512579 ; PHI defs in the atomic loop should be used by the add / adc @@ -7,17 +7,16 @@ define void @t(i64* nocapture %p) nounwind ssp { entry: ; CHECK: t: -; CHECK: movl $1 -; CHECK: movl (%ebp), %eax -; CHECK: movl 4(%ebp), %edx +; CHECK: movl ([[REG:%[a-z]+]]), %eax +; CHECK: movl 4([[REG]]), %edx ; CHECK: LBB0_1: -; CHECK-NOT: movl $1 -; CHECK-NOT: movl $0 -; CHECK: addl -; CHECK: adcl +; CHECK: movl %eax, %ebx +; CHECK: addl {{%[a-z]+}}, %ebx +; CHECK: movl %edx, %ecx +; CHECK: adcl {{%[a-z]+}}, %ecx ; CHECK: lock -; CHECK: cmpxchg8b -; CHECK: jne +; CHECK-NEXT: cmpxchg8b ([[REG]]) +; CHECK-NEXT: jne %0 = atomicrmw add i64* %p, i64 1 seq_cst ret void } diff --git 
a/test/CodeGen/X86/2012-01-18-vbitcast.ll b/test/CodeGen/X86/2012-01-18-vbitcast.ll index 8a3ccc8..3ce7db6 100644 --- a/test/CodeGen/X86/2012-01-18-vbitcast.ll +++ b/test/CodeGen/X86/2012-01-18-vbitcast.ll @@ -2,8 +2,8 @@ ;CHECK: vcast define <2 x i32> @vcast(<2 x float> %a, <2 x float> %b) { -;CHECK: pshufd -;CHECK: pshufd +;CHECK: pmovzxdq +;CHECK: pmovzxdq %af = bitcast <2 x float> %a to <2 x i32> %bf = bitcast <2 x float> %b to <2 x i32> %x = sub <2 x i32> %af, %bf diff --git a/test/CodeGen/X86/2012-03-15-build_vector_wl.ll b/test/CodeGen/X86/2012-03-15-build_vector_wl.ll index fec17e9..c4b307e 100644 --- a/test/CodeGen/X86/2012-03-15-build_vector_wl.ll +++ b/test/CodeGen/X86/2012-03-15-build_vector_wl.ll @@ -4,7 +4,7 @@ define <4 x i8> @build_vector_again(<16 x i8> %in) nounwind readnone { entry: %out = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> -; CHECK: shufb +; CHECK: pmovzxbd ret <4 x i8> %out ; CHECK: ret } diff --git a/test/CodeGen/X86/2012-04-26-sdglue.ll b/test/CodeGen/X86/2012-04-26-sdglue.ll index 9a66b67..0465952 100644 --- a/test/CodeGen/X86/2012-04-26-sdglue.ll +++ b/test/CodeGen/X86/2012-04-26-sdglue.ll @@ -5,7 +5,7 @@ ; It's hard to test for the ISEL condition because CodeGen optimizes ; away the bugpointed code. Just ensure the basics are still there. 
;CHECK: func: -;CHECK: vpxor +;CHECK: vxorps ;CHECK: vinsertf128 ;CHECK: vpshufd ;CHECK: vpshufd diff --git a/test/CodeGen/X86/2012-07-10-extload64.ll b/test/CodeGen/X86/2012-07-10-extload64.ll index 906b748..4abdded 100644 --- a/test/CodeGen/X86/2012-07-10-extload64.ll +++ b/test/CodeGen/X86/2012-07-10-extload64.ll @@ -3,7 +3,7 @@ ; CHECK: load_store define void @load_store(<4 x i16>* %in) { entry: -; CHECK: movsd +; CHECK: pmovzxwd %A27 = load <4 x i16>* %in, align 4 %A28 = add <4 x i16> %A27, %A27 ; CHECK: movlpd @@ -27,6 +27,6 @@ define <2 x i32> @load_64(<2 x i32>* %ptr) { BB: %t = load <2 x i32>* %ptr ret <2 x i32> %t -;CHECK: movsd +;CHECK: pmovzxdq ;CHECK: ret } diff --git a/test/CodeGen/X86/2012-08-16-setcc.ll b/test/CodeGen/X86/2012-08-16-setcc.ll new file mode 100644 index 0000000..ed51156 --- /dev/null +++ b/test/CodeGen/X86/2012-08-16-setcc.ll @@ -0,0 +1,45 @@ +; RUN: llc < %s -mtriple=x86_64-apple-macosx | FileCheck %s + +; rdar://12081007 + +; CHECK: and_1: +; CHECK: andb +; CHECK-NEXT: cmovnel +; CHECK: ret +define i32 @and_1(i8 zeroext %a, i8 zeroext %b, i32 %x) { + %1 = and i8 %b, %a + %2 = icmp ne i8 %1, 0 + %3 = select i1 %2, i32 %x, i32 0 + ret i32 %3 +} + +; CHECK: and_2: +; CHECK: andb +; CHECK-NEXT: setne +; CHECK: ret +define zeroext i1 @and_2(i8 zeroext %a, i8 zeroext %b) { + %1 = and i8 %b, %a + %2 = icmp ne i8 %1, 0 + ret i1 %2 +} + +; CHECK: xor_1: +; CHECK: xorb +; CHECK-NEXT: cmovnel +; CHECK: ret +define i32 @xor_1(i8 zeroext %a, i8 zeroext %b, i32 %x) { + %1 = xor i8 %b, %a + %2 = icmp ne i8 %1, 0 + %3 = select i1 %2, i32 %x, i32 0 + ret i32 %3 +} + +; CHECK: xor_2: +; CHECK: xorb +; CHECK-NEXT: setne +; CHECK: ret +define zeroext i1 @xor_2(i8 zeroext %a, i8 zeroext %b) { + %1 = xor i8 %b, %a + %2 = icmp ne i8 %1, 0 + ret i1 %2 +} diff --git a/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll b/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll new file mode 100644 index 0000000..6ebbb2e --- /dev/null +++ 
b/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll @@ -0,0 +1,20 @@ +; RUN: llc < %s -enable-unsafe-fp-math +; <rdar://problem/12180135> +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128" +target triple = "i386-apple-macosx10.8.0" + +define i32 @foo(float %mean) nounwind readnone ssp align 2 { +entry: + %cmp = fcmp olt float %mean, -3.000000e+00 + %f.0 = select i1 %cmp, float -3.000000e+00, float %mean + %cmp2 = fcmp ult float %f.0, 3.000000e+00 + %f.1 = select i1 %cmp2, float %f.0, float 0x4007EB8520000000 + %add = fadd float %f.1, 3.000000e+00 + %div = fdiv float %add, 2.343750e-02 + %0 = fpext float %div to double + %conv = select i1 undef, double 2.550000e+02, double %0 + %add8 = fadd double %conv, 5.000000e-01 + %conv9 = fptosi double %add8 to i32 + %.conv9 = select i1 undef, i32 255, i32 %conv9 + ret i32 %.conv9 +} diff --git a/test/CodeGen/X86/2012-09-13-dagco-fneg.ll b/test/CodeGen/X86/2012-09-13-dagco-fneg.ll new file mode 100644 index 0000000..7b9bab9 --- /dev/null +++ b/test/CodeGen/X86/2012-09-13-dagco-fneg.ll @@ -0,0 +1,21 @@ +; RUN: llc -march=x86-64 -mcpu=corei7 < %s | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; CHECK: foo +; Make sure we are not trying to use scalar xor on the high bits of the vector. 
+; CHECK-NOT: xorq +; CHECK: xorl +; CHECK-NEXT: ret + +define i32 @foo() { +bb: + %tmp44.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, <float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00> + %0 = bitcast <4 x float> %tmp44.i to i128 + %1 = zext i128 %0 to i512 + %2 = shl nuw nsw i512 %1, 256 + %ins = or i512 %2, 3325764857622480139933400731976840738652108318779753826115024029985671937147149347761402413803120180680770390816681124225944317364750115981129923635970048 + store i512 %ins, i512* undef, align 64 + ret i32 0 +} diff --git a/test/CodeGen/X86/2012-09-28-CGPBug.ll b/test/CodeGen/X86/2012-09-28-CGPBug.ll new file mode 100644 index 0000000..32d7d01 --- /dev/null +++ b/test/CodeGen/X86/2012-09-28-CGPBug.ll @@ -0,0 +1,53 @@ +; RUN: llc -mtriple=i386-apple-macosx < %s | FileCheck %s +; rdar://12396696 + +@JT = global [4 x i32] [i32 sub (i32 ptrtoint (i8* blockaddress(@h, %18) to i32), i32 ptrtoint (i8* blockaddress(@h, %11) to i32)), i32 sub (i32 ptrtoint (i8* blockaddress(@h, %17) to i32), i32 ptrtoint (i8* blockaddress(@h, %11) to i32)), i32 sub (i32 ptrtoint (i8* blockaddress(@h, %22) to i32), i32 ptrtoint (i8* blockaddress(@h, %18) to i32)), i32 sub (i32 ptrtoint (i8* blockaddress(@h, %22) to i32), i32 ptrtoint (i8* blockaddress(@h, %17) to i32))] +@gGlobalLock = external global i8* +@.str40 = external global [35 x i8] + +; CHECK: _JT: +; CHECK-NOT: .long Ltmp{{[0-9]+}}-1 +; CHECK-NOT: .long 1-Ltmp{{[0-9]+}} +; CHECK: .long Ltmp{{[0-9]+}}-Ltmp{{[0-9]+}} +; CHECK: .long Ltmp{{[0-9]+}}-Ltmp{{[0-9]+}} +; CHECK: .long Ltmp{{[0-9]+}}-Ltmp{{[0-9]+}} +; CHECK: .long Ltmp{{[0-9]+}}-Ltmp{{[0-9]+}} + +define void @h(i8*) nounwind ssp { + %2 = alloca i8* + store i8* %0, i8** %2 + %3 = load i8** %2 + %4 = bitcast i8* %3 to { i32, i32 }* + %5 = getelementptr { i32, i32 }* %4, i32 0, i32 0 + %6 = load i32* %5 + %7 = srem i32 %6, 2 + %8 = icmp slt i32 %6, 2 + %9 = select i1 %8, i32 %6, 
i32 %7 + %10 = icmp eq i32 %9, 0 + br label %11 + +; <label>:11 ; preds = %1 + %12 = zext i1 %10 to i32 + %13 = getelementptr [4 x i32]* @JT, i32 0, i32 %12 + %14 = load i32* %13 + %15 = add i32 %14, ptrtoint (i8* blockaddress(@h, %11) to i32) + %16 = inttoptr i32 %15 to i8* + indirectbr i8* %16, [label %17, label %18] + +; <label>:17 ; preds = %11 + tail call void (i8*, ...)* @g(i8* getelementptr inbounds ([35 x i8]* @.str40, i32 0, i32 0)) + br label %22 + +; <label>:18 ; preds = %11 + %19 = call i32 @f(i32 -1037694186) nounwind + %20 = inttoptr i32 %19 to i32 (i8**)* + %21 = tail call i32 %20(i8** @gGlobalLock) + br label %22 + +; <label>:22 ; preds = %18, %17 + ret void +} + +declare i32 @f(i32) + +declare void @g(i8*, ...) diff --git a/test/CodeGen/X86/2012-10-02-DAGCycle.ll b/test/CodeGen/X86/2012-10-02-DAGCycle.ll new file mode 100644 index 0000000..8d914db --- /dev/null +++ b/test/CodeGen/X86/2012-10-02-DAGCycle.ll @@ -0,0 +1,52 @@ +; RUN: llc -mtriple=i386-apple-macosx -relocation-model=pic < %s +; RUN: llc -mtriple=x86_64-apple-macosx -relocation-model=pic < %s + +; rdar://12393897 + +%TRp = type { i32, %TRH*, i32, i32 } +%TRH = type { i8*, i8*, i8*, i8*, {}* } + +define i32 @t(%TRp* inreg %rp) nounwind optsize ssp { +entry: + %handler = getelementptr inbounds %TRp* %rp, i32 0, i32 1 + %0 = load %TRH** %handler, align 4 + %sync = getelementptr inbounds %TRH* %0, i32 0, i32 4 + %sync12 = load {}** %sync, align 4 + %1 = bitcast {}* %sync12 to i32 (%TRp*)* + %call = tail call i32 %1(%TRp* inreg %rp) nounwind optsize + ret i32 %call +} + +%btConeShape = type { %btConvexInternalShape, float, float, float, [3 x i32] } +%btConvexInternalShape = type { %btConvexShape, %btVector, %btVector, float, float } +%btConvexShape = type { %btCollisionShape } +%btCollisionShape = type { i32 (...)**, i32, i8* } +%btVector = type { [4 x float] } + +define { <2 x float>, <2 x float> } @t2(%btConeShape* %this) unnamed_addr uwtable ssp align 2 { +entry: + %0 = getelementptr 
inbounds %btConeShape* %this, i64 0, i32 0 + br i1 undef, label %if.then, label %if.end17 + +if.then: ; preds = %entry + %vecnorm.sroa.2.8.copyload = load float* undef, align 4 + %cmp4 = fcmp olt float undef, 0x3D10000000000000 + %vecnorm.sroa.2.8.copyload36 = select i1 %cmp4, float -1.000000e+00, float %vecnorm.sroa.2.8.copyload + %call.i.i.i = tail call float @sqrtf(float 0.000000e+00) nounwind readnone + %div.i.i = fdiv float 1.000000e+00, %call.i.i.i + %mul7.i.i.i = fmul float %div.i.i, %vecnorm.sroa.2.8.copyload36 + %1 = load float (%btConvexInternalShape*)** undef, align 8 + %call12 = tail call float %1(%btConvexInternalShape* %0) + %mul7.i.i = fmul float %call12, %mul7.i.i.i + %retval.sroa.0.4.insert = insertelement <2 x float> zeroinitializer, float undef, i32 1 + %add13.i = fadd float undef, %mul7.i.i + %retval.sroa.1.8.insert = insertelement <2 x float> undef, float %add13.i, i32 0 + br label %if.end17 + +if.end17: ; preds = %if.then, %entry + %retval.sroa.1.8.load3338 = phi <2 x float> [ %retval.sroa.1.8.insert, %if.then ], [ undef, %entry ] + %retval.sroa.0.0.load3137 = phi <2 x float> [ %retval.sroa.0.4.insert, %if.then ], [ undef, %entry ] + ret { <2 x float>, <2 x float> } undef +} + +declare float @sqrtf(float) nounwind readnone diff --git a/test/CodeGen/X86/2012-10-03-DAGCycle.ll b/test/CodeGen/X86/2012-10-03-DAGCycle.ll new file mode 100644 index 0000000..72083c7 --- /dev/null +++ b/test/CodeGen/X86/2012-10-03-DAGCycle.ll @@ -0,0 +1,31 @@ +; RUN: llc -mtriple=x86_64-apple-macosx -mcpu=corei7 < %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +%struct.pluto.0 = type { %struct.bar.1, %struct.hoge.368* } +%struct.bar.1 = type { %i8* } +%i8 = type { i8 } +%struct.hoge.368 = type { i32, i32 } +%struct.widget.375 = type { i32, i32, %i8*, %struct.hoge.368* } + +define fastcc void 
@bar(%struct.pluto.0* %arg) nounwind uwtable ssp align 2 { +bb: + %tmp1 = alloca %struct.widget.375, align 8 + %tmp2 = getelementptr inbounds %struct.pluto.0* %arg, i64 0, i32 1 + %tmp3 = load %struct.hoge.368** %tmp2, align 8 + store %struct.pluto.0* %arg, %struct.pluto.0** undef, align 8 + %tmp = getelementptr inbounds %struct.widget.375* %tmp1, i64 0, i32 2 + %tmp4 = getelementptr %struct.pluto.0* %arg, i64 0, i32 0, i32 0 + %tmp5 = load %i8** %tmp4, align 8 + store %i8* %tmp5, %i8** %tmp, align 8 + %tmp6 = getelementptr inbounds %struct.widget.375* %tmp1, i64 0, i32 3 + store %struct.hoge.368* %tmp3, %struct.hoge.368** %tmp6, align 8 + br i1 undef, label %bb8, label %bb7 + +bb7: ; preds = %bb + unreachable + +bb8: ; preds = %bb + unreachable +} diff --git a/test/CodeGen/X86/2012-10-18-crash-dagco.ll b/test/CodeGen/X86/2012-10-18-crash-dagco.ll new file mode 100644 index 0000000..5b98624 --- /dev/null +++ b/test/CodeGen/X86/2012-10-18-crash-dagco.ll @@ -0,0 +1,61 @@ +; RUN: llc -march=x86-64 -mcpu=corei7 -disable-cgp-select2branch < %s + +; We should not crash on this test. 
+ +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32" +target triple = "i386-apple-darwin9.0.0" + +@global = external constant [411 x i8], align 1 + +define void @snork() nounwind { +bb: + br i1 undef, label %bb26, label %bb27 + +bb26: ; preds = %bb48, %bb26, %bb + switch i32 undef, label %bb26 [ + i32 142771596, label %bb28 + ] + +bb27: ; preds = %bb48, %bb + switch i32 undef, label %bb49 [ + i32 142771596, label %bb28 + ] + +bb28: ; preds = %bb27, %bb26 + %tmp = load i32* null + %tmp29 = trunc i32 %tmp to i8 + store i8* undef, i8** undef + %tmp30 = load i32* null + %tmp31 = icmp eq i32 %tmp30, 0 + %tmp32 = getelementptr inbounds [411 x i8]* @global, i32 0, i32 undef + %tmp33 = load i8* %tmp32, align 1 + %tmp34 = getelementptr inbounds [411 x i8]* @global, i32 0, i32 0 + %tmp35 = load i8* %tmp34, align 1 + %tmp36 = select i1 %tmp31, i8 %tmp35, i8 %tmp33 + %tmp37 = select i1 undef, i8 %tmp29, i8 %tmp36 + %tmp38 = zext i8 %tmp37 to i32 + %tmp39 = select i1 undef, i32 0, i32 %tmp38 + %tmp40 = getelementptr inbounds i32* null, i32 %tmp39 + %tmp41 = load i32* %tmp40, align 4 + %tmp42 = load i32* undef, align 4 + %tmp43 = load i32* undef + %tmp44 = xor i32 %tmp42, %tmp43 + %tmp45 = lshr i32 %tmp44, 8 + %tmp46 = lshr i32 %tmp44, 7 + call void @spam() + unreachable + +bb47: ; No predecessors! + ret void + +bb48: ; No predecessors! 
+ br i1 undef, label %bb27, label %bb26 + +bb49: ; preds = %bb49, %bb27 + br label %bb49 + +bb50: ; preds = %bb50 + br label %bb50 +} + +declare void @spam() noreturn nounwind diff --git a/test/CodeGen/X86/MergeConsecutiveStores.ll b/test/CodeGen/X86/MergeConsecutiveStores.ll new file mode 100644 index 0000000..64825ba --- /dev/null +++ b/test/CodeGen/X86/MergeConsecutiveStores.ll @@ -0,0 +1,305 @@ +; RUN: llc -march=x86-64 -mcpu=corei7 -mattr=+avx < %s | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +%struct.A = type { i8, i8, i8, i8, i8, i8, i8, i8 } +%struct.B = type { i32, i32, i32, i32, i32, i32, i32, i32 } + +; CHECK: merge_const_store +; save 1,2,3 ... as one big integer. +; CHECK: movabsq $578437695752307201 +; CHECK: ret +define void @merge_const_store(i32 %count, %struct.A* nocapture %p) nounwind uwtable noinline ssp { + %1 = icmp sgt i32 %count, 0 + br i1 %1, label %.lr.ph, label %._crit_edge +.lr.ph: + %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ] + %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ] + %2 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0 + store i8 1, i8* %2, align 1 + %3 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1 + store i8 2, i8* %3, align 1 + %4 = getelementptr inbounds %struct.A* %.01, i64 0, i32 2 + store i8 3, i8* %4, align 1 + %5 = getelementptr inbounds %struct.A* %.01, i64 0, i32 3 + store i8 4, i8* %5, align 1 + %6 = getelementptr inbounds %struct.A* %.01, i64 0, i32 4 + store i8 5, i8* %6, align 1 + %7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 5 + store i8 6, i8* %7, align 1 + %8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 6 + store i8 7, i8* %8, align 1 + %9 = getelementptr inbounds %struct.A* %.01, i64 0, i32 7 + store i8 8, i8* %9, align 1 + %10 = add nsw i32 %i.02, 1 + %11 = getelementptr inbounds 
%struct.A* %.01, i64 1 + %exitcond = icmp eq i32 %10, %count + br i1 %exitcond, label %._crit_edge, label %.lr.ph +._crit_edge: + ret void +} + +; Move the constants using a single vector store. +; CHECK: merge_const_store_vec +; CHECK: vmovups %ymm0, (%rsi) +; CHECK: ret +define void @merge_const_store_vec(i32 %count, %struct.B* nocapture %p) nounwind uwtable noinline ssp { + %1 = icmp sgt i32 %count, 0 + br i1 %1, label %.lr.ph, label %._crit_edge +.lr.ph: + %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ] + %.01 = phi %struct.B* [ %11, %.lr.ph ], [ %p, %0 ] + %2 = getelementptr inbounds %struct.B* %.01, i64 0, i32 0 + store i32 0, i32* %2, align 4 + %3 = getelementptr inbounds %struct.B* %.01, i64 0, i32 1 + store i32 0, i32* %3, align 4 + %4 = getelementptr inbounds %struct.B* %.01, i64 0, i32 2 + store i32 0, i32* %4, align 4 + %5 = getelementptr inbounds %struct.B* %.01, i64 0, i32 3 + store i32 0, i32* %5, align 4 + %6 = getelementptr inbounds %struct.B* %.01, i64 0, i32 4 + store i32 0, i32* %6, align 4 + %7 = getelementptr inbounds %struct.B* %.01, i64 0, i32 5 + store i32 0, i32* %7, align 4 + %8 = getelementptr inbounds %struct.B* %.01, i64 0, i32 6 + store i32 0, i32* %8, align 4 + %9 = getelementptr inbounds %struct.B* %.01, i64 0, i32 7 + store i32 0, i32* %9, align 4 + %10 = add nsw i32 %i.02, 1 + %11 = getelementptr inbounds %struct.B* %.01, i64 1 + %exitcond = icmp eq i32 %10, %count + br i1 %exitcond, label %._crit_edge, label %.lr.ph +._crit_edge: + ret void +} + +; Move the first 4 constants as a single vector. Move the rest as scalars. 
+; CHECK: merge_nonconst_store +; CHECK: movl $67305985 +; CHECK: movb +; CHECK: movb +; CHECK: movb +; CHECK: movb +; CHECK: ret +define void @merge_nonconst_store(i32 %count, i8 %zz, %struct.A* nocapture %p) nounwind uwtable noinline ssp { + %1 = icmp sgt i32 %count, 0 + br i1 %1, label %.lr.ph, label %._crit_edge +.lr.ph: + %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ] + %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ] + %2 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0 + store i8 1, i8* %2, align 1 + %3 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1 + store i8 2, i8* %3, align 1 + %4 = getelementptr inbounds %struct.A* %.01, i64 0, i32 2 + store i8 3, i8* %4, align 1 + %5 = getelementptr inbounds %struct.A* %.01, i64 0, i32 3 + store i8 4, i8* %5, align 1 + %6 = getelementptr inbounds %struct.A* %.01, i64 0, i32 4 + store i8 %zz, i8* %6, align 1 ; <----------- Not a const; + %7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 5 + store i8 6, i8* %7, align 1 + %8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 6 + store i8 7, i8* %8, align 1 + %9 = getelementptr inbounds %struct.A* %.01, i64 0, i32 7 + store i8 8, i8* %9, align 1 + %10 = add nsw i32 %i.02, 1 + %11 = getelementptr inbounds %struct.A* %.01, i64 1 + %exitcond = icmp eq i32 %10, %count + br i1 %exitcond, label %._crit_edge, label %.lr.ph +._crit_edge: + ret void +} + + +;CHECK: merge_loads_i16 +; load: +;CHECK: movw +; store: +;CHECK: movw +;CHECK: ret +define void @merge_loads_i16(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp { + %1 = icmp sgt i32 %count, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0 + %2 = getelementptr inbounds %struct.A* %q, i64 0, i32 0 + %3 = getelementptr inbounds %struct.A* %q, i64 0, i32 1 + br label %4 + +; <label>:4 ; preds = %4, %.lr.ph + %i.02 = phi i32 [ 0, %.lr.ph ], [ %9, %4 ] + %.01 = phi %struct.A* [ %p, %.lr.ph ], [ %10, %4 ] + %5 = load i8* %2, align 1 
+ %6 = load i8* %3, align 1 + %7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0 + store i8 %5, i8* %7, align 1 + %8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1 + store i8 %6, i8* %8, align 1 + %9 = add nsw i32 %i.02, 1 + %10 = getelementptr inbounds %struct.A* %.01, i64 1 + %exitcond = icmp eq i32 %9, %count + br i1 %exitcond, label %._crit_edge, label %4 + +._crit_edge: ; preds = %4, %0 + ret void +} + +; The loads and the stores are interleved. Can't merge them. +;CHECK: no_merge_loads +;CHECK: movb +;CHECK: movb +;CHECK: movb +;CHECK: movb +;CHECK: ret +define void @no_merge_loads(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp { + %1 = icmp sgt i32 %count, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0 + %2 = getelementptr inbounds %struct.A* %q, i64 0, i32 0 + %3 = getelementptr inbounds %struct.A* %q, i64 0, i32 1 + br label %a4 + +a4: ; preds = %4, %.lr.ph + %i.02 = phi i32 [ 0, %.lr.ph ], [ %a9, %a4 ] + %.01 = phi %struct.A* [ %p, %.lr.ph ], [ %a10, %a4 ] + %a5 = load i8* %2, align 1 + %a7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0 + store i8 %a5, i8* %a7, align 1 + %a8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1 + %a6 = load i8* %3, align 1 + store i8 %a6, i8* %a8, align 1 + %a9 = add nsw i32 %i.02, 1 + %a10 = getelementptr inbounds %struct.A* %.01, i64 1 + %exitcond = icmp eq i32 %a9, %count + br i1 %exitcond, label %._crit_edge, label %a4 + +._crit_edge: ; preds = %4, %0 + ret void +} + + +;CHECK: merge_loads_integer +; load: +;CHECK: movq +; store: +;CHECK: movq +;CHECK: ret +define void @merge_loads_integer(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp { + %1 = icmp sgt i32 %count, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0 + %2 = getelementptr inbounds %struct.B* %q, i64 0, i32 0 + %3 = getelementptr inbounds %struct.B* %q, i64 0, i32 1 
+ br label %4 + +; <label>:4 ; preds = %4, %.lr.ph + %i.02 = phi i32 [ 0, %.lr.ph ], [ %9, %4 ] + %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %10, %4 ] + %5 = load i32* %2 + %6 = load i32* %3 + %7 = getelementptr inbounds %struct.B* %.01, i64 0, i32 0 + store i32 %5, i32* %7 + %8 = getelementptr inbounds %struct.B* %.01, i64 0, i32 1 + store i32 %6, i32* %8 + %9 = add nsw i32 %i.02, 1 + %10 = getelementptr inbounds %struct.B* %.01, i64 1 + %exitcond = icmp eq i32 %9, %count + br i1 %exitcond, label %._crit_edge, label %4 + +._crit_edge: ; preds = %4, %0 + ret void +} + + +;CHECK: merge_loads_vector +; load: +;CHECK: movups +; store: +;CHECK: movups +;CHECK: ret +define void @merge_loads_vector(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp { + %a1 = icmp sgt i32 %count, 0 + br i1 %a1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0 + %a2 = getelementptr inbounds %struct.B* %q, i64 0, i32 0 + %a3 = getelementptr inbounds %struct.B* %q, i64 0, i32 1 + %a4 = getelementptr inbounds %struct.B* %q, i64 0, i32 2 + %a5 = getelementptr inbounds %struct.B* %q, i64 0, i32 3 + br label %block4 + +block4: ; preds = %4, %.lr.ph + %i.02 = phi i32 [ 0, %.lr.ph ], [ %c9, %block4 ] + %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %c10, %block4 ] + %a7 = getelementptr inbounds %struct.B* %.01, i64 0, i32 0 + %a8 = getelementptr inbounds %struct.B* %.01, i64 0, i32 1 + %a9 = getelementptr inbounds %struct.B* %.01, i64 0, i32 2 + %a10 = getelementptr inbounds %struct.B* %.01, i64 0, i32 3 + %b1 = load i32* %a2 + %b2 = load i32* %a3 + %b3 = load i32* %a4 + %b4 = load i32* %a5 + store i32 %b1, i32* %a7 + store i32 %b2, i32* %a8 + store i32 %b3, i32* %a9 + store i32 %b4, i32* %a10 + %c9 = add nsw i32 %i.02, 1 + %c10 = getelementptr inbounds %struct.B* %.01, i64 1 + %exitcond = icmp eq i32 %c9, %count + br i1 %exitcond, label %._crit_edge, label %block4 + +._crit_edge: ; preds = %4, %0 + ret void +} + +;CHECK: 
merge_loads_no_align +; load: +;CHECK: movl +;CHECK: movl +;CHECK: movl +;CHECK: movl +; store: +;CHECK: movl +;CHECK: movl +;CHECK: movl +;CHECK: movl +;CHECK: ret +define void @merge_loads_no_align(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp { + %a1 = icmp sgt i32 %count, 0 + br i1 %a1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0 + %a2 = getelementptr inbounds %struct.B* %q, i64 0, i32 0 + %a3 = getelementptr inbounds %struct.B* %q, i64 0, i32 1 + %a4 = getelementptr inbounds %struct.B* %q, i64 0, i32 2 + %a5 = getelementptr inbounds %struct.B* %q, i64 0, i32 3 + br label %block4 + +block4: ; preds = %4, %.lr.ph + %i.02 = phi i32 [ 0, %.lr.ph ], [ %c9, %block4 ] + %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %c10, %block4 ] + %a7 = getelementptr inbounds %struct.B* %.01, i64 0, i32 0 + %a8 = getelementptr inbounds %struct.B* %.01, i64 0, i32 1 + %a9 = getelementptr inbounds %struct.B* %.01, i64 0, i32 2 + %a10 = getelementptr inbounds %struct.B* %.01, i64 0, i32 3 + %b1 = load i32* %a2, align 1 + %b2 = load i32* %a3, align 1 + %b3 = load i32* %a4, align 1 + %b4 = load i32* %a5, align 1 + store i32 %b1, i32* %a7, align 1 + store i32 %b2, i32* %a8, align 1 + store i32 %b3, i32* %a9, align 1 + store i32 %b4, i32* %a10, align 1 + %c9 = add nsw i32 %i.02, 1 + %c10 = getelementptr inbounds %struct.B* %.01, i64 1 + %exitcond = icmp eq i32 %c9, %count + br i1 %exitcond, label %._crit_edge, label %block4 + +._crit_edge: ; preds = %4, %0 + ret void +} + diff --git a/test/CodeGen/X86/StackColoring-dbg.ll b/test/CodeGen/X86/StackColoring-dbg.ll new file mode 100644 index 0000000..5982544 --- /dev/null +++ b/test/CodeGen/X86/StackColoring-dbg.ll @@ -0,0 +1,30 @@ +; RUN: llc -mcpu=corei7 -no-stack-coloring=false < %s + +; Make sure that we don't crash when dbg values are used. 
+ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone + +define void @foo() nounwind uwtable ssp { +entry: + %x.i = alloca i8, align 1 + %y.i = alloca [256 x i8], align 16 + %0 = getelementptr inbounds [256 x i8]* %y.i, i64 0, i64 0 + br label %for.body + +for.body: + call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind + call void @llvm.lifetime.start(i64 -1, i8* %x.i) nounwind + call void @llvm.dbg.declare(metadata !{i8* %x.i}, metadata !22) nounwind + br label %for.body +} + +declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind + +declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind + +!16 = metadata !{i32 786468, null, metadata !"char", null, i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} +!2 = metadata !{i32 0} +!22 = metadata !{i32 786688, metadata !2, metadata !"x", metadata !2, i32 16, metadata !16, i32 0, i32 0} diff --git a/test/CodeGen/X86/StackColoring.ll b/test/CodeGen/X86/StackColoring.ll new file mode 100644 index 0000000..f8ae74f --- /dev/null +++ b/test/CodeGen/X86/StackColoring.ll @@ -0,0 +1,410 @@ +; RUN: llc -mcpu=corei7 -no-stack-coloring=false < %s | FileCheck %s --check-prefix=YESCOLOR +; RUN: llc -mcpu=corei7 -no-stack-coloring=true < %s | FileCheck %s --check-prefix=NOCOLOR + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +;YESCOLOR: subq $136, %rsp +;NOCOLOR: subq $264, %rsp + +define i32 @myCall_w2(i32 %in) { +entry: + %a = alloca [17 x i8*], align 8 + %a2 = alloca [16 x i8*], align 8 + %b = bitcast [17 x i8*]* %a to i8* + %b2 = bitcast [16 x i8*]* %a2 to i8* + call void @llvm.lifetime.start(i64 -1, i8* %b) + %t1 = call i32 
@foo(i32 %in, i8* %b) + %t2 = call i32 @foo(i32 %in, i8* %b) + call void @llvm.lifetime.end(i64 -1, i8* %b) + call void @llvm.lifetime.start(i64 -1, i8* %b2) + %t3 = call i32 @foo(i32 %in, i8* %b2) + %t4 = call i32 @foo(i32 %in, i8* %b2) + call void @llvm.lifetime.end(i64 -1, i8* %b2) + %t5 = add i32 %t1, %t2 + %t6 = add i32 %t3, %t4 + %t7 = add i32 %t5, %t6 + ret i32 %t7 +} + + +;YESCOLOR: subq $272, %rsp +;NOCOLOR: subq $272, %rsp + +define i32 @myCall2_no_merge(i32 %in, i1 %d) { +entry: + %a = alloca [17 x i8*], align 8 + %a2 = alloca [16 x i8*], align 8 + %b = bitcast [17 x i8*]* %a to i8* + %b2 = bitcast [16 x i8*]* %a2 to i8* + call void @llvm.lifetime.start(i64 -1, i8* %b) + %t1 = call i32 @foo(i32 %in, i8* %b) + %t2 = call i32 @foo(i32 %in, i8* %b) + br i1 %d, label %bb2, label %bb3 +bb2: + call void @llvm.lifetime.start(i64 -1, i8* %b2) + %t3 = call i32 @foo(i32 %in, i8* %b2) + %t4 = call i32 @foo(i32 %in, i8* %b2) + call void @llvm.lifetime.end(i64 -1, i8* %b2) + %t5 = add i32 %t1, %t2 + %t6 = add i32 %t3, %t4 + %t7 = add i32 %t5, %t6 + call void @llvm.lifetime.end(i64 -1, i8* %b) + ret i32 %t7 +bb3: + call void @llvm.lifetime.end(i64 -1, i8* %b) + ret i32 0 +} + +;YESCOLOR: subq $144, %rsp +;NOCOLOR: subq $272, %rsp + +define i32 @myCall2_w2(i32 %in, i1 %d) { +entry: + %a = alloca [17 x i8*], align 8 + %a2 = alloca [16 x i8*], align 8 + %b = bitcast [17 x i8*]* %a to i8* + %b2 = bitcast [16 x i8*]* %a2 to i8* + call void @llvm.lifetime.start(i64 -1, i8* %b) + %t1 = call i32 @foo(i32 %in, i8* %b) + %t2 = call i32 @foo(i32 %in, i8* %b) + call void @llvm.lifetime.end(i64 -1, i8* %b) + br i1 %d, label %bb2, label %bb3 +bb2: + call void @llvm.lifetime.start(i64 -1, i8* %b2) + %t3 = call i32 @foo(i32 %in, i8* %b2) + %t4 = call i32 @foo(i32 %in, i8* %b2) + call void @llvm.lifetime.end(i64 -1, i8* %b2) + %t5 = add i32 %t1, %t2 + %t6 = add i32 %t3, %t4 + %t7 = add i32 %t5, %t6 + ret i32 %t7 +bb3: + ret i32 0 +} +;YESCOLOR: subq $208, %rsp +;NOCOLOR: subq $400, 
%rsp + + + + +define i32 @myCall_w4(i32 %in) { +entry: + %a1 = alloca [14 x i8*], align 8 + %a2 = alloca [13 x i8*], align 8 + %a3 = alloca [12 x i8*], align 8 + %a4 = alloca [11 x i8*], align 8 + %b1 = bitcast [14 x i8*]* %a1 to i8* + %b2 = bitcast [13 x i8*]* %a2 to i8* + %b3 = bitcast [12 x i8*]* %a3 to i8* + %b4 = bitcast [11 x i8*]* %a4 to i8* + call void @llvm.lifetime.start(i64 -1, i8* %b4) + call void @llvm.lifetime.start(i64 -1, i8* %b1) + %t1 = call i32 @foo(i32 %in, i8* %b1) + %t2 = call i32 @foo(i32 %in, i8* %b1) + call void @llvm.lifetime.end(i64 -1, i8* %b1) + call void @llvm.lifetime.start(i64 -1, i8* %b2) + %t9 = call i32 @foo(i32 %in, i8* %b2) + %t8 = call i32 @foo(i32 %in, i8* %b2) + call void @llvm.lifetime.end(i64 -1, i8* %b2) + call void @llvm.lifetime.start(i64 -1, i8* %b3) + %t3 = call i32 @foo(i32 %in, i8* %b3) + %t4 = call i32 @foo(i32 %in, i8* %b3) + call void @llvm.lifetime.end(i64 -1, i8* %b3) + %t11 = call i32 @foo(i32 %in, i8* %b4) + call void @llvm.lifetime.end(i64 -1, i8* %b4) + %t5 = add i32 %t1, %t2 + %t6 = add i32 %t3, %t4 + %t7 = add i32 %t5, %t6 + ret i32 %t7 +} + +;YESCOLOR: subq $112, %rsp +;NOCOLOR: subq $400, %rsp + +define i32 @myCall2_w4(i32 %in) { +entry: + %a1 = alloca [14 x i8*], align 8 + %a2 = alloca [13 x i8*], align 8 + %a3 = alloca [12 x i8*], align 8 + %a4 = alloca [11 x i8*], align 8 + %b1 = bitcast [14 x i8*]* %a1 to i8* + %b2 = bitcast [13 x i8*]* %a2 to i8* + %b3 = bitcast [12 x i8*]* %a3 to i8* + %b4 = bitcast [11 x i8*]* %a4 to i8* + call void @llvm.lifetime.start(i64 -1, i8* %b1) + %t1 = call i32 @foo(i32 %in, i8* %b1) + %t2 = call i32 @foo(i32 %in, i8* %b1) + call void @llvm.lifetime.end(i64 -1, i8* %b1) + call void @llvm.lifetime.start(i64 -1, i8* %b2) + %t9 = call i32 @foo(i32 %in, i8* %b2) + %t8 = call i32 @foo(i32 %in, i8* %b2) + call void @llvm.lifetime.end(i64 -1, i8* %b2) + call void @llvm.lifetime.start(i64 -1, i8* %b3) + %t3 = call i32 @foo(i32 %in, i8* %b3) + %t4 = call i32 @foo(i32 %in, i8* %b3) 
+ call void @llvm.lifetime.end(i64 -1, i8* %b3) + br i1 undef, label %bb2, label %bb3 +bb2: + call void @llvm.lifetime.start(i64 -1, i8* %b4) + %t11 = call i32 @foo(i32 %in, i8* %b4) + call void @llvm.lifetime.end(i64 -1, i8* %b4) + %t5 = add i32 %t1, %t2 + %t6 = add i32 %t3, %t4 + %t7 = add i32 %t5, %t6 + ret i32 %t7 +bb3: + ret i32 0 +} + + +;YESCOLOR: subq $144, %rsp +;NOCOLOR: subq $272, %rsp + + +define i32 @myCall2_noend(i32 %in, i1 %d) { +entry: + %a = alloca [17 x i8*], align 8 + %a2 = alloca [16 x i8*], align 8 + %b = bitcast [17 x i8*]* %a to i8* + %b2 = bitcast [16 x i8*]* %a2 to i8* + call void @llvm.lifetime.start(i64 -1, i8* %b) + %t1 = call i32 @foo(i32 %in, i8* %b) + %t2 = call i32 @foo(i32 %in, i8* %b) + call void @llvm.lifetime.end(i64 -1, i8* %b) + br i1 %d, label %bb2, label %bb3 +bb2: + call void @llvm.lifetime.start(i64 -1, i8* %b2) + %t3 = call i32 @foo(i32 %in, i8* %b2) + %t4 = call i32 @foo(i32 %in, i8* %b2) + %t5 = add i32 %t1, %t2 + %t6 = add i32 %t3, %t4 + %t7 = add i32 %t5, %t6 + ret i32 %t7 +bb3: + ret i32 0 +} + +;YESCOLOR: subq $144, %rsp +;NOCOLOR: subq $272, %rsp +define i32 @myCall2_noend2(i32 %in, i1 %d) { +entry: + %a = alloca [17 x i8*], align 8 + %a2 = alloca [16 x i8*], align 8 + %b = bitcast [17 x i8*]* %a to i8* + %b2 = bitcast [16 x i8*]* %a2 to i8* + call void @llvm.lifetime.start(i64 -1, i8* %b) + %t1 = call i32 @foo(i32 %in, i8* %b) + %t2 = call i32 @foo(i32 %in, i8* %b) + br i1 %d, label %bb2, label %bb3 +bb2: + call void @llvm.lifetime.end(i64 -1, i8* %b) + call void @llvm.lifetime.start(i64 -1, i8* %b2) + %t3 = call i32 @foo(i32 %in, i8* %b2) + %t4 = call i32 @foo(i32 %in, i8* %b2) + %t5 = add i32 %t1, %t2 + %t6 = add i32 %t3, %t4 + %t7 = add i32 %t5, %t6 + ret i32 %t7 +bb3: + ret i32 0 +} + + +;YESCOLOR: subq $144, %rsp +;NOCOLOR: subq $272, %rsp +define i32 @myCall2_nostart(i32 %in, i1 %d) { +entry: + %a = alloca [17 x i8*], align 8 + %a2 = alloca [16 x i8*], align 8 + %b = bitcast [17 x i8*]* %a to i8* + %b2 = 
bitcast [16 x i8*]* %a2 to i8* + %t1 = call i32 @foo(i32 %in, i8* %b) + %t2 = call i32 @foo(i32 %in, i8* %b) + call void @llvm.lifetime.end(i64 -1, i8* %b) + br i1 %d, label %bb2, label %bb3 +bb2: + call void @llvm.lifetime.start(i64 -1, i8* %b2) + %t3 = call i32 @foo(i32 %in, i8* %b2) + %t4 = call i32 @foo(i32 %in, i8* %b2) + %t5 = add i32 %t1, %t2 + %t6 = add i32 %t3, %t4 + %t7 = add i32 %t5, %t6 + ret i32 %t7 +bb3: + ret i32 0 +} + +; Adopt the test from Transforms/Inline/array_merge.ll' +;YESCOLOR: subq $816, %rsp +;NOCOLOR: subq $1616, %rsp +define void @array_merge() nounwind ssp { +entry: + %A.i1 = alloca [100 x i32], align 4 + %B.i2 = alloca [100 x i32], align 4 + %A.i = alloca [100 x i32], align 4 + %B.i = alloca [100 x i32], align 4 + %0 = bitcast [100 x i32]* %A.i to i8* + call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind + %1 = bitcast [100 x i32]* %B.i to i8* + call void @llvm.lifetime.start(i64 -1, i8* %1) nounwind + call void @bar([100 x i32]* %A.i, [100 x i32]* %B.i) nounwind + call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind + call void @llvm.lifetime.end(i64 -1, i8* %1) nounwind + %2 = bitcast [100 x i32]* %A.i1 to i8* + call void @llvm.lifetime.start(i64 -1, i8* %2) nounwind + %3 = bitcast [100 x i32]* %B.i2 to i8* + call void @llvm.lifetime.start(i64 -1, i8* %3) nounwind + call void @bar([100 x i32]* %A.i1, [100 x i32]* %B.i2) nounwind + call void @llvm.lifetime.end(i64 -1, i8* %2) nounwind + call void @llvm.lifetime.end(i64 -1, i8* %3) nounwind + ret void +} + +;YESCOLOR: subq $272, %rsp +;NOCOLOR: subq $272, %rsp +define i32 @func_phi_lifetime(i32 %in, i1 %d) { +entry: + %a = alloca [17 x i8*], align 8 + %a2 = alloca [16 x i8*], align 8 + %b = bitcast [17 x i8*]* %a to i8* + %b2 = bitcast [16 x i8*]* %a2 to i8* + %t1 = call i32 @foo(i32 %in, i8* %b) + %t2 = call i32 @foo(i32 %in, i8* %b) + call void @llvm.lifetime.end(i64 -1, i8* %b) + br i1 %d, label %bb0, label %bb1 + +bb0: + %I1 = bitcast [17 x i8*]* %a to i8* + br label %bb2 + 
+bb1: + %I2 = bitcast [16 x i8*]* %a2 to i8* + br label %bb2 + +bb2: + %split = phi i8* [ %I1, %bb0 ], [ %I2, %bb1 ] + call void @llvm.lifetime.start(i64 -1, i8* %split) + %t3 = call i32 @foo(i32 %in, i8* %b2) + %t4 = call i32 @foo(i32 %in, i8* %b2) + %t5 = add i32 %t1, %t2 + %t6 = add i32 %t3, %t4 + %t7 = add i32 %t5, %t6 + call void @llvm.lifetime.end(i64 -1, i8* %split) + ret i32 %t7 +bb3: + ret i32 0 +} + + +;YESCOLOR: multi_region_bb +;NOCOLOR: multi_region_bb +define void @multi_region_bb() nounwind ssp { +entry: + %A.i1 = alloca [100 x i32], align 4 + %B.i2 = alloca [100 x i32], align 4 + %A.i = alloca [100 x i32], align 4 + %B.i = alloca [100 x i32], align 4 + %0 = bitcast [100 x i32]* %A.i to i8* + call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind ; <---- start #1 + %1 = bitcast [100 x i32]* %B.i to i8* + call void @llvm.lifetime.start(i64 -1, i8* %1) nounwind + call void @bar([100 x i32]* %A.i, [100 x i32]* %B.i) nounwind + call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind + call void @llvm.lifetime.end(i64 -1, i8* %1) nounwind + %2 = bitcast [100 x i32]* %A.i1 to i8* + call void @llvm.lifetime.start(i64 -1, i8* %2) nounwind + %3 = bitcast [100 x i32]* %B.i2 to i8* + call void @llvm.lifetime.start(i64 -1, i8* %3) nounwind + call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind ; <---- start #2 + call void @bar([100 x i32]* %A.i1, [100 x i32]* %B.i2) nounwind + call void @llvm.lifetime.end(i64 -1, i8* %2) nounwind + call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind + call void @llvm.lifetime.end(i64 -1, i8* %3) nounwind + ret void +} + + +;YESCOLOR: subq $272, %rsp +;NOCOLOR: subq $272, %rsp +define i32 @myCall_end_before_begin(i32 %in, i1 %d) { +entry: + %a = alloca [17 x i8*], align 8 + %a2 = alloca [16 x i8*], align 8 + %b = bitcast [17 x i8*]* %a to i8* + %b2 = bitcast [16 x i8*]* %a2 to i8* + %t1 = call i32 @foo(i32 %in, i8* %b) + %t2 = call i32 @foo(i32 %in, i8* %b) + call void @llvm.lifetime.end(i64 -1, i8* %b) + call void 
@llvm.lifetime.start(i64 -1, i8* %b) + br i1 %d, label %bb2, label %bb3 +bb2: + call void @llvm.lifetime.start(i64 -1, i8* %b2) + %t3 = call i32 @foo(i32 %in, i8* %b2) + %t4 = call i32 @foo(i32 %in, i8* %b2) + %t5 = add i32 %t1, %t2 + %t6 = add i32 %t3, %t4 + %t7 = add i32 %t5, %t6 + ret i32 %t7 +bb3: + ret i32 0 +} + +; Check that we don't assert and crash even when there are allocas +; outside the declared lifetime regions. +;YESCOLOR: bad_range +;NOCOLOR: bad_range +define void @bad_range() nounwind ssp { +entry: + %A.i1 = alloca [100 x i32], align 4 + %B.i2 = alloca [100 x i32], align 4 + %A.i = alloca [100 x i32], align 4 + %B.i = alloca [100 x i32], align 4 + %0 = bitcast [100 x i32]* %A.i to i8* + call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind + %1 = bitcast [100 x i32]* %B.i to i8* + call void @llvm.lifetime.start(i64 -1, i8* %1) nounwind + call void @bar([100 x i32]* %A.i, [100 x i32]* %B.i) nounwind + call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind + call void @llvm.lifetime.end(i64 -1, i8* %1) nounwind + br label %block2 + +block2: + ; I am used outside the marked lifetime. + call void @bar([100 x i32]* %A.i, [100 x i32]* %B.i) nounwind + ret void +} + + +; Check that we don't assert and crash even when there are usages +; of allocas which do not read or write outside the declared lifetime regions. 
+;YESCOLOR: shady_range +;NOCOLOR: shady_range + +%struct.Klass = type { i32, i32 } + +define i32 @shady_range(i32 %argc, i8** nocapture %argv) uwtable { + %a.i = alloca [4 x %struct.Klass], align 16 + %b.i = alloca [4 x %struct.Klass], align 16 + %a8 = bitcast [4 x %struct.Klass]* %a.i to i8* + %b8 = bitcast [4 x %struct.Klass]* %b.i to i8* + ; I am used outside the lifetime zone below: + %z2 = getelementptr inbounds [4 x %struct.Klass]* %a.i, i64 0, i64 0, i32 0 + call void @llvm.lifetime.start(i64 -1, i8* %a8) + call void @llvm.lifetime.start(i64 -1, i8* %b8) + %z3 = load i32* %z2, align 16 + %r = call i32 @foo(i32 %z3, i8* %a8) + %r2 = call i32 @foo(i32 %z3, i8* %b8) + call void @llvm.lifetime.end(i64 -1, i8* %a8) + call void @llvm.lifetime.end(i64 -1, i8* %b8) + ret i32 9 +} + +declare void @bar([100 x i32]* , [100 x i32]*) nounwind + +declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind + +declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind + +declare i32 @foo(i32, i8*) + diff --git a/test/CodeGen/X86/add-of-carry.ll b/test/CodeGen/X86/add-of-carry.ll index a4abccb..4e30f2b 100644 --- a/test/CodeGen/X86/add-of-carry.ll +++ b/test/CodeGen/X86/add-of-carry.ll @@ -30,4 +30,17 @@ entry: ret i32 %z.0 } +; <rdar://problem/12579915> +define i32 @test3(i32 %x, i32 %y, i32 %res) nounwind uwtable readnone ssp { +entry: + %cmp = icmp ugt i32 %x, %y + %dec = sext i1 %cmp to i32 + %dec.res = add nsw i32 %dec, %res + ret i32 %dec.res +; CHECK: test3: +; CHECK: cmpl +; CHECK: sbbl +; CHECK: ret +} + declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone diff --git a/test/CodeGen/X86/atom-bypass-slow-division.ll b/test/CodeGen/X86/atom-bypass-slow-division.ll new file mode 100644 index 0000000..e7c9605 --- /dev/null +++ b/test/CodeGen/X86/atom-bypass-slow-division.ll @@ -0,0 +1,112 @@ +; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s + +define i32 @test_get_quotient(i32 %a, i32 %b) nounwind { +; CHECK: test_get_quotient 
+; CHECK: orl %ecx, %edx +; CHECK-NEXT: testl $-256, %edx +; CHECK-NEXT: je +; CHECK: idivl +; CHECK: ret +; CHECK: divb +; CHECK: ret + %result = sdiv i32 %a, %b + ret i32 %result +} + +define i32 @test_get_remainder(i32 %a, i32 %b) nounwind { +; CHECK: test_get_remainder +; CHECK: orl %ecx, %edx +; CHECK-NEXT: testl $-256, %edx +; CHECK-NEXT: je +; CHECK: idivl +; CHECK: ret +; CHECK: divb +; CHECK: ret + %result = srem i32 %a, %b + ret i32 %result +} + +define i32 @test_get_quotient_and_remainder(i32 %a, i32 %b) nounwind { +; CHECK: test_get_quotient_and_remainder +; CHECK: orl %ecx, %edx +; CHECK-NEXT: testl $-256, %edx +; CHECK-NEXT: je +; CHECK: idivl +; CHECK: divb +; CHECK: addl +; CHECK: ret +; CEECK-NOT: idivl +; CHECK-NOT: divb + %resultdiv = sdiv i32 %a, %b + %resultrem = srem i32 %a, %b + %result = add i32 %resultdiv, %resultrem + ret i32 %result +} + +define i32 @test_use_div_and_idiv(i32 %a, i32 %b) nounwind { +; CHECK: test_use_div_and_idiv +; CHECK: idivl +; CHECK: divb +; CHECK: divl +; CHECK: divb +; CHECK: addl +; CHECK: ret + %resultidiv = sdiv i32 %a, %b + %resultdiv = udiv i32 %a, %b + %result = add i32 %resultidiv, %resultdiv + ret i32 %result +} + +define i32 @test_use_div_imm_imm() nounwind { +; CHECK: test_use_div_imm_imm +; CHECK: movl $64 + %resultdiv = sdiv i32 256, 4 + ret i32 %resultdiv +} + +define i32 @test_use_div_reg_imm(i32 %a) nounwind { +; CHECK: test_use_div_reg_imm +; CEHCK-NOT: test +; CHECK-NOT: idiv +; CHECK-NOT: divb + %resultdiv = sdiv i32 %a, 33 + ret i32 %resultdiv +} + +define i32 @test_use_rem_reg_imm(i32 %a) nounwind { +; CHECK: test_use_rem_reg_imm +; CEHCK-NOT: test +; CHECK-NOT: idiv +; CHECK-NOT: divb + %resultrem = srem i32 %a, 33 + ret i32 %resultrem +} + +define i32 @test_use_divrem_reg_imm(i32 %a) nounwind { +; CHECK: test_use_divrem_reg_imm +; CEHCK-NOT: test +; CHECK-NOT: idiv +; CHECK-NOT: divb + %resultdiv = sdiv i32 %a, 33 + %resultrem = srem i32 %a, 33 + %result = add i32 %resultdiv, %resultrem + ret 
i32 %result +} + +define i32 @test_use_div_imm_reg(i32 %a) nounwind { +; CHECK: test_use_div_imm_reg +; CHECK: test +; CHECK: idiv +; CHECK: divb + %resultdiv = sdiv i32 4, %a + ret i32 %resultdiv +} + +define i32 @test_use_rem_imm_reg(i32 %a) nounwind { +; CHECK: test_use_rem_imm_reg +; CHECK: test +; CHECK: idiv +; CHECK: divb + %resultdiv = sdiv i32 4, %a + ret i32 %resultdiv +} diff --git a/test/CodeGen/X86/atom-shuf.ll b/test/CodeGen/X86/atom-shuf.ll new file mode 100644 index 0000000..4c3f2f6 --- /dev/null +++ b/test/CodeGen/X86/atom-shuf.ll @@ -0,0 +1,9 @@ +; RUN: llc < %s -mtriple=x86_64-linux-pc -mcpu=atom | FileCheck %s + +define <16 x i8> @foo(<16 x i8> %in) { + %r = shufflevector <16 x i8> %in, <16 x i8> undef, <16 x i32> < i32 7, i32 3, i32 2, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + ret <16 x i8> %r +; CHECK: foo +; CHECK: pshufb +; CHECK-NEXT: ret +} diff --git a/test/CodeGen/X86/atomic-minmax-i6432.ll b/test/CodeGen/X86/atomic-minmax-i6432.ll new file mode 100644 index 0000000..e3ef605 --- /dev/null +++ b/test/CodeGen/X86/atomic-minmax-i6432.ll @@ -0,0 +1,67 @@ +; RUN: llc -march=x86 -mattr=+cmov -mtriple=i386-pc-linux < %s | FileCheck %s -check-prefix=LINUX +; RUN: llc -march=x86 -mtriple=i386-macosx -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC + +@sc64 = external global i64 + +define void @atomic_maxmin_i6432() { +; LINUX: atomic_maxmin_i6432 + %1 = atomicrmw max i64* @sc64, i64 5 acquire +; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]] +; LINUX: cmpl +; LINUX: setl +; LINUX: cmpl +; LINUX: setl +; LINUX: cmovne +; LINUX: cmovne +; LINUX: lock +; LINUX-NEXT: cmpxchg8b +; LINUX: jne [[LABEL]] + %2 = atomicrmw min i64* @sc64, i64 6 acquire +; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]] +; LINUX: cmpl +; LINUX: setg +; LINUX: cmpl +; LINUX: setg +; LINUX: cmovne +; LINUX: cmovne +; LINUX: lock +; LINUX-NEXT: cmpxchg8b +; LINUX: jne [[LABEL]] + %3 = 
atomicrmw umax i64* @sc64, i64 7 acquire +; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]] +; LINUX: cmpl +; LINUX: setb +; LINUX: cmpl +; LINUX: setb +; LINUX: cmovne +; LINUX: cmovne +; LINUX: lock +; LINUX-NEXT: cmpxchg8b +; LINUX: jne [[LABEL]] + %4 = atomicrmw umin i64* @sc64, i64 8 acquire +; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]] +; LINUX: cmpl +; LINUX: seta +; LINUX: cmpl +; LINUX: seta +; LINUX: cmovne +; LINUX: cmovne +; LINUX: lock +; LINUX-NEXT: cmpxchg8b +; LINUX: jne [[LABEL]] + ret void +} + +; rdar://12453106 +@id = internal global i64 0, align 8 + +define void @tf_bug(i8* %ptr) nounwind { +; PIC: tf_bug: +; PIC: movl _id-L1$pb( +; PIC: movl (_id-L1$pb)+4( + %tmp1 = atomicrmw add i64* @id, i64 1 seq_cst + %tmp2 = add i64 %tmp1, 1 + %tmp3 = bitcast i8* %ptr to i64* + store i64 %tmp2, i64* %tmp3, align 4 + ret void +} diff --git a/test/CodeGen/X86/atomic-pointer.ll b/test/CodeGen/X86/atomic-pointer.ll new file mode 100644 index 0000000..a455277 --- /dev/null +++ b/test/CodeGen/X86/atomic-pointer.ll @@ -0,0 +1,22 @@ +; RUN: llc < %s -mtriple=i686-none-linux | FileCheck %s + +define i32* @test_atomic_ptr_load(i32** %a0) { +; CHECK: test_atomic_ptr_load +; CHECK: movl +; CHECK: movl +; CHECK: ret +0: + %0 = load atomic i32** %a0 seq_cst, align 4 + ret i32* %0 +} + +define void @test_atomic_ptr_store(i32* %a0, i32** %a1) { +; CHECK: test_atomic_ptr_store +; CHECK: movl +; CHECK: movl +; CHECK: xchgl +; CHECK: ret +0: + store atomic i32* %a0, i32** %a1 seq_cst, align 4 + ret void +} diff --git a/test/CodeGen/X86/atomic16.ll b/test/CodeGen/X86/atomic16.ll new file mode 100644 index 0000000..824995d --- /dev/null +++ b/test/CodeGen/X86/atomic16.ll @@ -0,0 +1,250 @@ +; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mcpu=corei7 -show-mc-encoding | FileCheck %s --check-prefix X64 +; RUN: llc < %s -O0 -mtriple=i386-unknown-unknown -mcpu=corei7 | FileCheck %s --check-prefix X32 + +@sc16 = external global i16 + +define void @atomic_fetch_add16() nounwind { +; X64: 
atomic_fetch_add16 +; X32: atomic_fetch_add16 +entry: +; 32-bit + %t1 = atomicrmw add i16* @sc16, i16 1 acquire +; X64: lock +; X64: incw +; X32: lock +; X32: incw + %t2 = atomicrmw add i16* @sc16, i16 3 acquire +; X64: lock +; X64: addw $3, {{.*}} # encoding: [0xf0,0x66 +; X32: lock +; X32: addw $3 + %t3 = atomicrmw add i16* @sc16, i16 5 acquire +; X64: lock +; X64: xaddw {{.*}} # encoding: [0xf0,0x66 +; X32: lock +; X32: xaddw + %t4 = atomicrmw add i16* @sc16, i16 %t3 acquire +; X64: lock +; X64: addw {{.*}} # encoding: [0xf0,0x66 +; X32: lock +; X32: addw + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_sub16() nounwind { +; X64: atomic_fetch_sub16 +; X32: atomic_fetch_sub16 + %t1 = atomicrmw sub i16* @sc16, i16 1 acquire +; X64: lock +; X64: decw +; X32: lock +; X32: decw + %t2 = atomicrmw sub i16* @sc16, i16 3 acquire +; X64: lock +; X64: subw $3, {{.*}} # encoding: [0xf0,0x66 +; X32: lock +; X32: subw $3 + %t3 = atomicrmw sub i16* @sc16, i16 5 acquire +; X64: lock +; X64: xaddw {{.*}} # encoding: [0xf0,0x66 +; X32: lock +; X32: xaddw + %t4 = atomicrmw sub i16* @sc16, i16 %t3 acquire +; X64: lock +; X64: subw {{.*}} # encoding: [0xf0,0x66 +; X32: lock +; X32: subw + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_and16() nounwind { +; X64: atomic_fetch_and16 +; X32: atomic_fetch_and16 + %t1 = atomicrmw and i16* @sc16, i16 3 acquire +; X64: lock +; X64: andw $3, {{.*}} # encoding: [0xf0,0x66 +; X32: lock +; X32: andw $3 + %t2 = atomicrmw and i16* @sc16, i16 5 acquire +; X64: andw +; X64: lock +; X64: cmpxchgw +; X32: andw +; X32: lock +; X32: cmpxchgw + %t3 = atomicrmw and i16* @sc16, i16 %t2 acquire +; X64: lock +; X64: andw {{.*}} # encoding: [0xf0,0x66 +; X32: lock +; X32: andw + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_or16() nounwind { +; X64: atomic_fetch_or16 +; X32: atomic_fetch_or16 + %t1 = atomicrmw or i16* @sc16, i16 3 acquire +; X64: lock +; X64: orw $3, {{.*}} # encoding: [0xf0,0x66 +; X32: 
lock +; X32: orw $3 + %t2 = atomicrmw or i16* @sc16, i16 5 acquire +; X64: orw +; X64: lock +; X64: cmpxchgw +; X32: orw +; X32: lock +; X32: cmpxchgw + %t3 = atomicrmw or i16* @sc16, i16 %t2 acquire +; X64: lock +; X64: orw {{.*}} # encoding: [0xf0,0x66 +; X32: lock +; X32: orw + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_xor16() nounwind { +; X64: atomic_fetch_xor16 +; X32: atomic_fetch_xor16 + %t1 = atomicrmw xor i16* @sc16, i16 3 acquire +; X64: lock +; X64: xorw $3, {{.*}} # encoding: [0xf0,0x66 +; X32: lock +; X32: xorw $3 + %t2 = atomicrmw xor i16* @sc16, i16 5 acquire +; X64: xorw +; X64: lock +; X64: cmpxchgw +; X32: xorw +; X32: lock +; X32: cmpxchgw + %t3 = atomicrmw xor i16* @sc16, i16 %t2 acquire +; X64: lock +; X64: xorw {{.*}} # encoding: [0xf0,0x66 +; X32: lock +; X32: xorw + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_nand16(i16 %x) nounwind { +; X64: atomic_fetch_nand16 +; X32: atomic_fetch_nand16 + %t1 = atomicrmw nand i16* @sc16, i16 %x acquire +; X64: andw +; X64: notw +; X64: lock +; X64: cmpxchgw +; X32: andw +; X32: notw +; X32: lock +; X32: cmpxchgw + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_max16(i16 %x) nounwind { + %t1 = atomicrmw max i16* @sc16, i16 %x acquire +; X64: cmpw +; X64: cmov +; X64: lock +; X64: cmpxchgw + +; X32: cmpw +; X32: cmov +; X32: lock +; X32: cmpxchgw + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_min16(i16 %x) nounwind { + %t1 = atomicrmw min i16* @sc16, i16 %x acquire +; X64: cmpw +; X64: cmov +; X64: lock +; X64: cmpxchgw + +; X32: cmpw +; X32: cmov +; X32: lock +; X32: cmpxchgw + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_umax16(i16 %x) nounwind { + %t1 = atomicrmw umax i16* @sc16, i16 %x acquire +; X64: cmpw +; X64: cmov +; X64: lock +; X64: cmpxchgw + +; X32: cmpw +; X32: cmov +; X32: lock +; X32: cmpxchgw + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_umin16(i16 %x) nounwind { + %t1 
= atomicrmw umin i16* @sc16, i16 %x acquire +; X64: cmpw +; X64: cmov +; X64: lock +; X64: cmpxchgw +; X32: cmpw +; X32: cmov +; X32: lock +; X32: cmpxchgw + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_cmpxchg16() nounwind { + %t1 = cmpxchg i16* @sc16, i16 0, i16 1 acquire +; X64: lock +; X64: cmpxchgw +; X32: lock +; X32: cmpxchgw + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_store16(i16 %x) nounwind { + store atomic i16 %x, i16* @sc16 release, align 4 +; X64-NOT: lock +; X64: movw +; X32-NOT: lock +; X32: movw + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_swap16(i16 %x) nounwind { + %t1 = atomicrmw xchg i16* @sc16, i16 %x acquire +; X64-NOT: lock +; X64: xchgw +; X32-NOT: lock +; X32: xchgw + ret void +; X64: ret +; X32: ret +} diff --git a/test/CodeGen/X86/atomic32.ll b/test/CodeGen/X86/atomic32.ll new file mode 100644 index 0000000..dc927d8 --- /dev/null +++ b/test/CodeGen/X86/atomic32.ll @@ -0,0 +1,250 @@ +; RUN: llc < %s -O0 -march=x86-64 -mcpu=corei7 | FileCheck %s --check-prefix X64 +; RUN: llc < %s -O0 -march=x86 -mcpu=corei7 | FileCheck %s --check-prefix X32 + +@sc32 = external global i32 + +define void @atomic_fetch_add32() nounwind { +; X64: atomic_fetch_add32 +; X32: atomic_fetch_add32 +entry: +; 32-bit + %t1 = atomicrmw add i32* @sc32, i32 1 acquire +; X64: lock +; X64: incl +; X32: lock +; X32: incl + %t2 = atomicrmw add i32* @sc32, i32 3 acquire +; X64: lock +; X64: addl $3 +; X32: lock +; X32: addl $3 + %t3 = atomicrmw add i32* @sc32, i32 5 acquire +; X64: lock +; X64: xaddl +; X32: lock +; X32: xaddl + %t4 = atomicrmw add i32* @sc32, i32 %t3 acquire +; X64: lock +; X64: addl +; X32: lock +; X32: addl + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_sub32() nounwind { +; X64: atomic_fetch_sub32 +; X32: atomic_fetch_sub32 + %t1 = atomicrmw sub i32* @sc32, i32 1 acquire +; X64: lock +; X64: decl +; X32: lock +; X32: decl + %t2 = atomicrmw sub i32* @sc32, i32 3 acquire +; 
X64: lock +; X64: subl $3 +; X32: lock +; X32: subl $3 + %t3 = atomicrmw sub i32* @sc32, i32 5 acquire +; X64: lock +; X64: xaddl +; X32: lock +; X32: xaddl + %t4 = atomicrmw sub i32* @sc32, i32 %t3 acquire +; X64: lock +; X64: subl +; X32: lock +; X32: subl + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_and32() nounwind { +; X64: atomic_fetch_and32 +; X32: atomic_fetch_and32 + %t1 = atomicrmw and i32* @sc32, i32 3 acquire +; X64: lock +; X64: andl $3 +; X32: lock +; X32: andl $3 + %t2 = atomicrmw and i32* @sc32, i32 5 acquire +; X64: andl +; X64: lock +; X64: cmpxchgl +; X32: andl +; X32: lock +; X32: cmpxchgl + %t3 = atomicrmw and i32* @sc32, i32 %t2 acquire +; X64: lock +; X64: andl +; X32: lock +; X32: andl + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_or32() nounwind { +; X64: atomic_fetch_or32 +; X32: atomic_fetch_or32 + %t1 = atomicrmw or i32* @sc32, i32 3 acquire +; X64: lock +; X64: orl $3 +; X32: lock +; X32: orl $3 + %t2 = atomicrmw or i32* @sc32, i32 5 acquire +; X64: orl +; X64: lock +; X64: cmpxchgl +; X32: orl +; X32: lock +; X32: cmpxchgl + %t3 = atomicrmw or i32* @sc32, i32 %t2 acquire +; X64: lock +; X64: orl +; X32: lock +; X32: orl + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_xor32() nounwind { +; X64: atomic_fetch_xor32 +; X32: atomic_fetch_xor32 + %t1 = atomicrmw xor i32* @sc32, i32 3 acquire +; X64: lock +; X64: xorl $3 +; X32: lock +; X32: xorl $3 + %t2 = atomicrmw xor i32* @sc32, i32 5 acquire +; X64: xorl +; X64: lock +; X64: cmpxchgl +; X32: xorl +; X32: lock +; X32: cmpxchgl + %t3 = atomicrmw xor i32* @sc32, i32 %t2 acquire +; X64: lock +; X64: xorl +; X32: lock +; X32: xorl + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_nand32(i32 %x) nounwind { +; X64: atomic_fetch_nand32 +; X32: atomic_fetch_nand32 + %t1 = atomicrmw nand i32* @sc32, i32 %x acquire +; X64: andl +; X64: notl +; X64: lock +; X64: cmpxchgl +; X32: andl +; X32: notl +; X32: lock +; X32: 
cmpxchgl + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_max32(i32 %x) nounwind { + %t1 = atomicrmw max i32* @sc32, i32 %x acquire +; X64: cmpl +; X64: cmov +; X64: lock +; X64: cmpxchgl + +; X32: cmpl +; X32: cmov +; X32: lock +; X32: cmpxchgl + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_min32(i32 %x) nounwind { + %t1 = atomicrmw min i32* @sc32, i32 %x acquire +; X64: cmpl +; X64: cmov +; X64: lock +; X64: cmpxchgl + +; X32: cmpl +; X32: cmov +; X32: lock +; X32: cmpxchgl + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_umax32(i32 %x) nounwind { + %t1 = atomicrmw umax i32* @sc32, i32 %x acquire +; X64: cmpl +; X64: cmov +; X64: lock +; X64: cmpxchgl + +; X32: cmpl +; X32: cmov +; X32: lock +; X32: cmpxchgl + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_umin32(i32 %x) nounwind { + %t1 = atomicrmw umin i32* @sc32, i32 %x acquire +; X64: cmpl +; X64: cmov +; X64: lock +; X64: cmpxchgl +; X32: cmpl +; X32: cmov +; X32: lock +; X32: cmpxchgl + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_cmpxchg32() nounwind { + %t1 = cmpxchg i32* @sc32, i32 0, i32 1 acquire +; X64: lock +; X64: cmpxchgl +; X32: lock +; X32: cmpxchgl + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_store32(i32 %x) nounwind { + store atomic i32 %x, i32* @sc32 release, align 4 +; X64-NOT: lock +; X64: movl +; X32-NOT: lock +; X32: movl + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_swap32(i32 %x) nounwind { + %t1 = atomicrmw xchg i32* @sc32, i32 %x acquire +; X64-NOT: lock +; X64: xchgl +; X32-NOT: lock +; X32: xchgl + ret void +; X64: ret +; X32: ret +} diff --git a/test/CodeGen/X86/atomic64.ll b/test/CodeGen/X86/atomic64.ll new file mode 100644 index 0000000..45785cc --- /dev/null +++ b/test/CodeGen/X86/atomic64.ll @@ -0,0 +1,216 @@ +; RUN: llc < %s -O0 -march=x86-64 -mcpu=corei7 | FileCheck %s --check-prefix X64 + +@sc64 = external global i64 + +define void 
@atomic_fetch_add64() nounwind { +; X64: atomic_fetch_add64 +entry: + %t1 = atomicrmw add i64* @sc64, i64 1 acquire +; X64: lock +; X64: incq + %t2 = atomicrmw add i64* @sc64, i64 3 acquire +; X64: lock +; X64: addq $3 + %t3 = atomicrmw add i64* @sc64, i64 5 acquire +; X64: lock +; X64: xaddq + %t4 = atomicrmw add i64* @sc64, i64 %t3 acquire +; X64: lock +; X64: addq + ret void +; X64: ret +} + +define void @atomic_fetch_sub64() nounwind { +; X64: atomic_fetch_sub64 + %t1 = atomicrmw sub i64* @sc64, i64 1 acquire +; X64: lock +; X64: decq + %t2 = atomicrmw sub i64* @sc64, i64 3 acquire +; X64: lock +; X64: subq $3 + %t3 = atomicrmw sub i64* @sc64, i64 5 acquire +; X64: lock +; X64: xaddq + %t4 = atomicrmw sub i64* @sc64, i64 %t3 acquire +; X64: lock +; X64: subq + ret void +; X64: ret +} + +define void @atomic_fetch_and64() nounwind { +; X64: atomic_fetch_and64 + %t1 = atomicrmw and i64* @sc64, i64 3 acquire +; X64: lock +; X64: andq $3 + %t2 = atomicrmw and i64* @sc64, i64 5 acquire +; X64: andq +; X64: lock +; X64: cmpxchgq + %t3 = atomicrmw and i64* @sc64, i64 %t2 acquire +; X64: lock +; X64: andq + ret void +; X64: ret +} + +define void @atomic_fetch_or64() nounwind { +; X64: atomic_fetch_or64 + %t1 = atomicrmw or i64* @sc64, i64 3 acquire +; X64: lock +; X64: orq $3 + %t2 = atomicrmw or i64* @sc64, i64 5 acquire +; X64: orq +; X64: lock +; X64: cmpxchgq + %t3 = atomicrmw or i64* @sc64, i64 %t2 acquire +; X64: lock +; X64: orq + ret void +; X64: ret +} + +define void @atomic_fetch_xor64() nounwind { +; X64: atomic_fetch_xor64 + %t1 = atomicrmw xor i64* @sc64, i64 3 acquire +; X64: lock +; X64: xorq $3 + %t2 = atomicrmw xor i64* @sc64, i64 5 acquire +; X64: xorq +; X64: lock +; X64: cmpxchgq + %t3 = atomicrmw xor i64* @sc64, i64 %t2 acquire +; X64: lock +; X64: xorq + ret void +; X64: ret +} + +define void @atomic_fetch_nand64(i64 %x) nounwind { +; X64: atomic_fetch_nand64 +; X32: atomic_fetch_nand64 + %t1 = atomicrmw nand i64* @sc64, i64 %x acquire +; X64: andq 
+; X64: notq +; X64: lock +; X64: cmpxchgq +; X32: andl +; X32: andl +; X32: notl +; X32: notl +; X32: lock +; X32: cmpxchg8b + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_max64(i64 %x) nounwind { + %t1 = atomicrmw max i64* @sc64, i64 %x acquire +; X64: cmpq +; X64: cmov +; X64: lock +; X64: cmpxchgq + +; X32: cmpl +; X32: cmpl +; X32: cmov +; X32: cmov +; X32: cmov +; X32: lock +; X32: cmpxchg8b + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_min64(i64 %x) nounwind { + %t1 = atomicrmw min i64* @sc64, i64 %x acquire +; X64: cmpq +; X64: cmov +; X64: lock +; X64: cmpxchgq + +; X32: cmpl +; X32: cmpl +; X32: cmov +; X32: cmov +; X32: cmov +; X32: lock +; X32: cmpxchg8b + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_umax64(i64 %x) nounwind { + %t1 = atomicrmw umax i64* @sc64, i64 %x acquire +; X64: cmpq +; X64: cmov +; X64: lock +; X64: cmpxchgq + +; X32: cmpl +; X32: cmpl +; X32: cmov +; X32: cmov +; X32: cmov +; X32: lock +; X32: cmpxchg8b + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_umin64(i64 %x) nounwind { + %t1 = atomicrmw umin i64* @sc64, i64 %x acquire +; X64: cmpq +; X64: cmov +; X64: lock +; X64: cmpxchgq + +; X32: cmpl +; X32: cmpl +; X32: cmov +; X32: cmov +; X32: cmov +; X32: lock +; X32: cmpxchg8b + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_cmpxchg64() nounwind { + %t1 = cmpxchg i64* @sc64, i64 0, i64 1 acquire +; X64: lock +; X64: cmpxchgq +; X32: lock +; X32: cmpxchg8b + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_store64(i64 %x) nounwind { + store atomic i64 %x, i64* @sc64 release, align 8 +; X64-NOT: lock +; X64: movq +; X32: lock +; X32: cmpxchg8b + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_swap64(i64 %x) nounwind { + %t1 = atomicrmw xchg i64* @sc64, i64 %x acquire +; X64-NOT: lock +; X64: xchgq +; X32: lock +; X32: xchg8b + ret void +; X64: ret +; X32: ret +} diff --git a/test/CodeGen/X86/atomic6432.ll 
b/test/CodeGen/X86/atomic6432.ll new file mode 100644 index 0000000..f9b21c5 --- /dev/null +++ b/test/CodeGen/X86/atomic6432.ll @@ -0,0 +1,208 @@ +; RUN: llc < %s -O0 -march=x86 -mcpu=corei7 | FileCheck %s --check-prefix X32 + +@sc64 = external global i64 + +define void @atomic_fetch_add64() nounwind { +; X32: atomic_fetch_add64 +entry: + %t1 = atomicrmw add i64* @sc64, i64 1 acquire +; X32: addl +; X32: adcl +; X32: lock +; X32: cmpxchg8b + %t2 = atomicrmw add i64* @sc64, i64 3 acquire +; X32: addl +; X32: adcl +; X32: lock +; X32: cmpxchg8b + %t3 = atomicrmw add i64* @sc64, i64 5 acquire +; X32: addl +; X32: adcl +; X32: lock +; X32: cmpxchg8b + %t4 = atomicrmw add i64* @sc64, i64 %t3 acquire +; X32: addl +; X32: adcl +; X32: lock +; X32: cmpxchg8b + ret void +; X32: ret +} + +define void @atomic_fetch_sub64() nounwind { +; X32: atomic_fetch_sub64 + %t1 = atomicrmw sub i64* @sc64, i64 1 acquire +; X32: subl +; X32: sbbl +; X32: lock +; X32: cmpxchg8b + %t2 = atomicrmw sub i64* @sc64, i64 3 acquire +; X32: subl +; X32: sbbl +; X32: lock +; X32: cmpxchg8b + %t3 = atomicrmw sub i64* @sc64, i64 5 acquire +; X32: subl +; X32: sbbl +; X32: lock +; X32: cmpxchg8b + %t4 = atomicrmw sub i64* @sc64, i64 %t3 acquire +; X32: subl +; X32: sbbl +; X32: lock +; X32: cmpxchg8b + ret void +; X32: ret +} + +define void @atomic_fetch_and64() nounwind { +; X32: atomic_fetch_and64 + %t1 = atomicrmw and i64* @sc64, i64 3 acquire +; X32: andl +; X32: andl +; X32: lock +; X32: cmpxchg8b + %t2 = atomicrmw and i64* @sc64, i64 5 acquire +; X32: andl +; X32: andl +; X32: lock +; X32: cmpxchg8b + %t3 = atomicrmw and i64* @sc64, i64 %t2 acquire +; X32: andl +; X32: andl +; X32: lock +; X32: cmpxchg8b + ret void +; X32: ret +} + +define void @atomic_fetch_or64() nounwind { +; X32: atomic_fetch_or64 + %t1 = atomicrmw or i64* @sc64, i64 3 acquire +; X32: orl +; X32: orl +; X32: lock +; X32: cmpxchg8b + %t2 = atomicrmw or i64* @sc64, i64 5 acquire +; X32: orl +; X32: orl +; X32: lock +; X32: 
cmpxchg8b + %t3 = atomicrmw or i64* @sc64, i64 %t2 acquire +; X32: orl +; X32: orl +; X32: lock +; X32: cmpxchg8b + ret void +; X32: ret +} + +define void @atomic_fetch_xor64() nounwind { +; X32: atomic_fetch_xor64 + %t1 = atomicrmw xor i64* @sc64, i64 3 acquire +; X32: xorl +; X32: xorl +; X32: lock +; X32: cmpxchg8b + %t2 = atomicrmw xor i64* @sc64, i64 5 acquire +; X32: xorl +; X32: xorl +; X32: lock +; X32: cmpxchg8b + %t3 = atomicrmw xor i64* @sc64, i64 %t2 acquire +; X32: xorl +; X32: xorl +; X32: lock +; X32: cmpxchg8b + ret void +; X32: ret +} + +define void @atomic_fetch_nand64(i64 %x) nounwind { +; X32: atomic_fetch_nand64 + %t1 = atomicrmw nand i64* @sc64, i64 %x acquire +; X32: andl +; X32: andl +; X32: notl +; X32: notl +; X32: lock +; X32: cmpxchg8b + ret void +; X32: ret +} + +define void @atomic_fetch_max64(i64 %x) nounwind { + %t1 = atomicrmw max i64* @sc64, i64 %x acquire +; X32: cmpl +; X32: cmpl +; X32: cmov +; X32: cmov +; X32: cmov +; X32: lock +; X32: cmpxchg8b + ret void +; X32: ret +} + +define void @atomic_fetch_min64(i64 %x) nounwind { + %t1 = atomicrmw min i64* @sc64, i64 %x acquire +; X32: cmpl +; X32: cmpl +; X32: cmov +; X32: cmov +; X32: cmov +; X32: lock +; X32: cmpxchg8b + ret void +; X32: ret +} + +define void @atomic_fetch_umax64(i64 %x) nounwind { + %t1 = atomicrmw umax i64* @sc64, i64 %x acquire +; X32: cmpl +; X32: cmpl +; X32: cmov +; X32: cmov +; X32: cmov +; X32: lock +; X32: cmpxchg8b + ret void +; X32: ret +} + +define void @atomic_fetch_umin64(i64 %x) nounwind { + %t1 = atomicrmw umin i64* @sc64, i64 %x acquire +; X32: cmpl +; X32: cmpl +; X32: cmov +; X32: cmov +; X32: cmov +; X32: lock +; X32: cmpxchg8b + ret void +; X32: ret +} + +define void @atomic_fetch_cmpxchg64() nounwind { + %t1 = cmpxchg i64* @sc64, i64 0, i64 1 acquire +; X32: lock +; X32: cmpxchg8b + ret void +; X32: ret +} + +define void @atomic_fetch_store64(i64 %x) nounwind { + store atomic i64 %x, i64* @sc64 release, align 8 +; X32: lock +; X32: cmpxchg8b 
+ ret void +; X32: ret +} + +define void @atomic_fetch_swap64(i64 %x) nounwind { + %t1 = atomicrmw xchg i64* @sc64, i64 %x acquire +; X32: lock +; X32: xchg8b + ret void +; X32: ret +} diff --git a/test/CodeGen/X86/atomic8.ll b/test/CodeGen/X86/atomic8.ll new file mode 100644 index 0000000..4124284 --- /dev/null +++ b/test/CodeGen/X86/atomic8.ll @@ -0,0 +1,250 @@ +; RUN: llc < %s -O0 -march=x86-64 -mcpu=corei7 | FileCheck %s --check-prefix X64 +; RUN: llc < %s -O0 -march=x86 -mcpu=corei7 | FileCheck %s --check-prefix X32 + +@sc8 = external global i8 + +define void @atomic_fetch_add8() nounwind { +; X64: atomic_fetch_add8 +; X32: atomic_fetch_add8 +entry: +; 32-bit + %t1 = atomicrmw add i8* @sc8, i8 1 acquire +; X64: lock +; X64: incb +; X32: lock +; X32: incb + %t2 = atomicrmw add i8* @sc8, i8 3 acquire +; X64: lock +; X64: addb $3 +; X32: lock +; X32: addb $3 + %t3 = atomicrmw add i8* @sc8, i8 5 acquire +; X64: lock +; X64: xaddb +; X32: lock +; X32: xaddb + %t4 = atomicrmw add i8* @sc8, i8 %t3 acquire +; X64: lock +; X64: addb +; X32: lock +; X32: addb + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_sub8() nounwind { +; X64: atomic_fetch_sub8 +; X32: atomic_fetch_sub8 + %t1 = atomicrmw sub i8* @sc8, i8 1 acquire +; X64: lock +; X64: decb +; X32: lock +; X32: decb + %t2 = atomicrmw sub i8* @sc8, i8 3 acquire +; X64: lock +; X64: subb $3 +; X32: lock +; X32: subb $3 + %t3 = atomicrmw sub i8* @sc8, i8 5 acquire +; X64: lock +; X64: xaddb +; X32: lock +; X32: xaddb + %t4 = atomicrmw sub i8* @sc8, i8 %t3 acquire +; X64: lock +; X64: subb +; X32: lock +; X32: subb + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_and8() nounwind { +; X64: atomic_fetch_and8 +; X32: atomic_fetch_and8 + %t1 = atomicrmw and i8* @sc8, i8 3 acquire +; X64: lock +; X64: andb $3 +; X32: lock +; X32: andb $3 + %t2 = atomicrmw and i8* @sc8, i8 5 acquire +; X64: andb +; X64: lock +; X64: cmpxchgb +; X32: andb +; X32: lock +; X32: cmpxchgb + %t3 = atomicrmw and 
i8* @sc8, i8 %t2 acquire +; X64: lock +; X64: andb +; X32: lock +; X32: andb + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_or8() nounwind { +; X64: atomic_fetch_or8 +; X32: atomic_fetch_or8 + %t1 = atomicrmw or i8* @sc8, i8 3 acquire +; X64: lock +; X64: orb $3 +; X32: lock +; X32: orb $3 + %t2 = atomicrmw or i8* @sc8, i8 5 acquire +; X64: orb +; X64: lock +; X64: cmpxchgb +; X32: orb +; X32: lock +; X32: cmpxchgb + %t3 = atomicrmw or i8* @sc8, i8 %t2 acquire +; X64: lock +; X64: orb +; X32: lock +; X32: orb + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_xor8() nounwind { +; X64: atomic_fetch_xor8 +; X32: atomic_fetch_xor8 + %t1 = atomicrmw xor i8* @sc8, i8 3 acquire +; X64: lock +; X64: xorb $3 +; X32: lock +; X32: xorb $3 + %t2 = atomicrmw xor i8* @sc8, i8 5 acquire +; X64: xorb +; X64: lock +; X64: cmpxchgb +; X32: xorb +; X32: lock +; X32: cmpxchgb + %t3 = atomicrmw xor i8* @sc8, i8 %t2 acquire +; X64: lock +; X64: xorb +; X32: lock +; X32: xorb + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_nand8(i8 %x) nounwind { +; X64: atomic_fetch_nand8 +; X32: atomic_fetch_nand8 + %t1 = atomicrmw nand i8* @sc8, i8 %x acquire +; X64: andb +; X64: notb +; X64: lock +; X64: cmpxchgb +; X32: andb +; X32: notb +; X32: lock +; X32: cmpxchgb + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_max8(i8 %x) nounwind { + %t1 = atomicrmw max i8* @sc8, i8 %x acquire +; X64: cmpb +; X64: cmov +; X64: lock +; X64: cmpxchgb + +; X32: cmpb +; X32: cmov +; X32: lock +; X32: cmpxchgb + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_min8(i8 %x) nounwind { + %t1 = atomicrmw min i8* @sc8, i8 %x acquire +; X64: cmpb +; X64: cmov +; X64: lock +; X64: cmpxchgb + +; X32: cmpb +; X32: cmov +; X32: lock +; X32: cmpxchgb + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_umax8(i8 %x) nounwind { + %t1 = atomicrmw umax i8* @sc8, i8 %x acquire +; X64: cmpb +; X64: cmov +; X64: lock +; X64: 
cmpxchgb + +; X32: cmpb +; X32: cmov +; X32: lock +; X32: cmpxchgb + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_umin8(i8 %x) nounwind { + %t1 = atomicrmw umin i8* @sc8, i8 %x acquire +; X64: cmpb +; X64: cmov +; X64: lock +; X64: cmpxchgb +; X32: cmpb +; X32: cmov +; X32: lock +; X32: cmpxchgb + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_cmpxchg8() nounwind { + %t1 = cmpxchg i8* @sc8, i8 0, i8 1 acquire +; X64: lock +; X64: cmpxchgb +; X32: lock +; X32: cmpxchgb + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_store8(i8 %x) nounwind { + store atomic i8 %x, i8* @sc8 release, align 4 +; X64-NOT: lock +; X64: movb +; X32-NOT: lock +; X32: movb + ret void +; X64: ret +; X32: ret +} + +define void @atomic_fetch_swap8(i8 %x) nounwind { + %t1 = atomicrmw xchg i8* @sc8, i8 %x acquire +; X64-NOT: lock +; X64: xchgb +; X32-NOT: lock +; X32: xchgb + ret void +; X64: ret +; X32: ret +} diff --git a/test/CodeGen/X86/atomic_add.ll b/test/CodeGen/X86/atomic_add.ll index 1fce256..d944998 100644 --- a/test/CodeGen/X86/atomic_add.ll +++ b/test/CodeGen/X86/atomic_add.ll @@ -178,7 +178,8 @@ entry: define void @sub2(i16* nocapture %p, i32 %v) nounwind ssp { entry: ; CHECK: sub2: -; CHECK: negl +; CHECK-NOT: negl +; CHECK: subw %0 = trunc i32 %v to i16 ; <i16> [#uses=1] %1 = atomicrmw sub i16* %p, i16 %0 monotonic ret void diff --git a/test/CodeGen/X86/atomic_op.ll b/test/CodeGen/X86/atomic_op.ll index 152bece..c5fa07d 100644 --- a/test/CodeGen/X86/atomic_op.ll +++ b/test/CodeGen/X86/atomic_op.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mcpu=generic -march=x86 | FileCheck %s +; RUN: llc < %s -mcpu=generic -march=x86 -mattr=+cmov | FileCheck %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" @@ -107,13 +107,12 @@ entry: ; CHECK: cmpxchgl %17 = cmpxchg i32* %val2, i32 1976, i32 1 monotonic store i32 %17, i32* %old + ; CHECK: movl 
[[R17atomic:.*]], %eax ; CHECK: movl $1401, %[[R17mask:[a-z]*]] - ; CHECK: movl [[R17atomic:.*]], %eax - ; CHECK: movl %eax, %[[R17newval:[a-z]*]] - ; CHECK: andl %[[R17mask]], %[[R17newval]] - ; CHECK: notl %[[R17newval]] + ; CHECK: andl %eax, %[[R17mask]] + ; CHECK: notl %[[R17mask]] ; CHECK: lock - ; CHECK: cmpxchgl %[[R17newval]], [[R17atomic]] + ; CHECK: cmpxchgl %[[R17mask]], [[R17atomic]] ; CHECK: jne ; CHECK: movl %eax, %18 = atomicrmw nand i32* %val2, i32 1401 monotonic diff --git a/test/CodeGen/X86/avx-basic.ll b/test/CodeGen/X86/avx-basic.ll index 8ad0fa8..95854c7 100644 --- a/test/CodeGen/X86/avx-basic.ll +++ b/test/CodeGen/X86/avx-basic.ll @@ -109,8 +109,8 @@ allocas: ; rdar://10566486 ; CHECK: fneg ; CHECK: vxorps -define <16 x float> @fneg(<16 x float> addrspace(1)* nocapture %out) nounwind { - %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00> +define <16 x float> @fneg(<16 x float> %a) nounwind { + %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a ret <16 x float> %1 } diff --git a/test/CodeGen/X86/avx-intel-ocl.ll 
b/test/CodeGen/X86/avx-intel-ocl.ll new file mode 100644 index 0000000..1446b36 --- /dev/null +++ b/test/CodeGen/X86/avx-intel-ocl.ll @@ -0,0 +1,107 @@ +; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=WIN32 %s +; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=WIN64 %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=NOT_WIN %s + +declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *) +declare <16 x float> @func_float16(<16 x float>, <16 x float>) +; WIN64: testf16_inp +; WIN64: vaddps {{.*}}, {{%ymm[0-1]}} +; WIN64: vaddps {{.*}}, {{%ymm[0-1]}} +; WIN64: leaq {{.*}}(%rsp), %rcx +; WIN64: call +; WIN64: ret + +; WIN32: testf16_inp +; WIN32: movl %eax, (%esp) +; WIN32: vaddps {{.*}}, {{%ymm[0-1]}} +; WIN32: vaddps {{.*}}, {{%ymm[0-1]}} +; WIN32: call +; WIN32: ret + +; NOT_WIN: testf16_inp +; NOT_WIN: vaddps {{.*}}, {{%ymm[0-1]}} +; NOT_WIN: vaddps {{.*}}, {{%ymm[0-1]}} +; NOT_WIN: leaq {{.*}}(%rsp), %rdi +; NOT_WIN: call +; NOT_WIN: ret + +;test calling conventions - input parameters +define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind { + %y = alloca <16 x float>, align 16 + %x = fadd <16 x float> %a, %b + %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y) + %2 = load <16 x float>* %y, align 16 + %3 = fadd <16 x float> %2, %1 + ret <16 x float> %3 +} + +;test calling conventions - preserved registers + +; preserved ymm6-ymm15 +; WIN64: testf16_regs +; WIN64: call +; WIN64: vaddps {{%ymm[6-7]}}, %ymm0, %ymm0 +; WIN64: vaddps {{%ymm[6-7]}}, %ymm1, %ymm1 +; WIN64: ret + +; preserved ymm8-ymm15 +; NOT_WIN: testf16_regs +; NOT_WIN: call +; NOT_WIN: vaddps {{%ymm[8-9]}}, %ymm0, %ymm0 +; NOT_WIN: vaddps {{%ymm[8-9]}}, %ymm1, %ymm1 +; NOT_WIN: ret + +define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind { + %y = alloca <16 x float>, 
align 16 + %x = fadd <16 x float> %a, %b + %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y) + %2 = load <16 x float>* %y, align 16 + %3 = fadd <16 x float> %1, %b + %4 = fadd <16 x float> %2, %3 + ret <16 x float> %4 +} + +; test calling conventions - prolog and epilog +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill +; WIN64: call +; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload + +; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill +; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill +; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, 
{{.*}}(%rbp) ## 32-byte Spill +; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill +; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill +; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill +; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill +; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill +; NOT_WIN: call +; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind { + %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b) + ret <16 x float> %c +} diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll index c44beb4..88ecd5a 100644 --- a/test/CodeGen/X86/avx-intrinsics-x86.ll +++ b/test/CodeGen/X86/avx-intrinsics-x86.ll @@ -1140,9 +1140,9 @@ declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) noun define i32 @test_x86_sse42_pcmpestri128(<16 x i8> %a0, <16 x i8> %a2) { - ; CHECK: movl - ; CHECK: movl - ; CHECK: vpcmpestri + ; CHECK: movl $7 + ; CHECK: movl $7 + ; CHECK: vpcmpestri $7 ; CHECK: movl %res = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1] ret i32 %res @@ -1150,6 +1150,18 @@ define i32 @test_x86_sse42_pcmpestri128(<16 x i8> %a0, <16 x i8> %a2) { declare i32 
@llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone +define i32 @test_x86_sse42_pcmpestri128_load(<16 x i8>* %a0, <16 x i8>* %a2) { + ; CHECK: movl $7 + ; CHECK: movl $7 + ; CHECK: vpcmpestri $7, ( + ; CHECK: movl + %1 = load <16 x i8>* %a0 + %2 = load <16 x i8>* %a2 + %res = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %1, i32 7, <16 x i8> %2, i32 7, i8 7) ; <i32> [#uses=1] + ret i32 %res +} + + define i32 @test_x86_sse42_pcmpestria128(<16 x i8> %a0, <16 x i8> %a2) { ; CHECK: movl ; CHECK: movl @@ -1216,8 +1228,19 @@ define <16 x i8> @test_x86_sse42_pcmpestrm128(<16 x i8> %a0, <16 x i8> %a2) { declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone +define <16 x i8> @test_x86_sse42_pcmpestrm128_load(<16 x i8> %a0, <16 x i8>* %a2) { + ; CHECK: movl $7 + ; CHECK: movl $7 + ; CHECK: vpcmpestrm $7, + ; CHECK-NOT: vmov + %1 = load <16 x i8>* %a2 + %res = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %1, i32 7, i8 7) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} + + define i32 @test_x86_sse42_pcmpistri128(<16 x i8> %a0, <16 x i8> %a1) { - ; CHECK: vpcmpistri + ; CHECK: vpcmpistri $7 ; CHECK: movl %res = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1] ret i32 %res @@ -1225,6 +1248,16 @@ define i32 @test_x86_sse42_pcmpistri128(<16 x i8> %a0, <16 x i8> %a1) { declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind readnone +define i32 @test_x86_sse42_pcmpistri128_load(<16 x i8>* %a0, <16 x i8>* %a1) { + ; CHECK: vpcmpistri $7, ( + ; CHECK: movl + %1 = load <16 x i8>* %a0 + %2 = load <16 x i8>* %a1 + %res = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %1, <16 x i8> %2, i8 7) ; <i32> [#uses=1] + ret i32 %res +} + + define i32 @test_x86_sse42_pcmpistria128(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK: vpcmpistri ; CHECK: seta @@ -1271,7 +1304,7 @@ declare i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8>, <16 x 
i8>, i8) nounwind rea define <16 x i8> @test_x86_sse42_pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1) { - ; CHECK: vpcmpistrm + ; CHECK: vpcmpistrm $7 ; CHECK-NOT: vmov %res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<16 x i8>> [#uses=1] ret <16 x i8> %res @@ -1279,6 +1312,15 @@ define <16 x i8> @test_x86_sse42_pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1) { declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwind readnone +define <16 x i8> @test_x86_sse42_pcmpistrm128_load(<16 x i8> %a0, <16 x i8>* %a1) { + ; CHECK: vpcmpistrm $7, ( + ; CHECK-NOT: vmov + %1 = load <16 x i8>* %a1 + %res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %1, i8 7) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} + + define <4 x float> @test_x86_sse_add_ss(<4 x float> %a0, <4 x float> %a1) { ; CHECK: vaddss %res = call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1] diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll index 9b41709..ec11654 100644 --- a/test/CodeGen/X86/avx-shuffle.ll +++ b/test/CodeGen/X86/avx-shuffle.ll @@ -229,9 +229,8 @@ define <8 x float> @test17(<4 x float> %y) { } ; CHECK: test18 -; CHECK: vshufps -; CHECK: vshufps -; CHECK: vunpcklps +; CHECK: vmovshdup +; CHECK: vblendps ; CHECK: ret define <8 x float> @test18(<8 x float> %A, <8 x float>%B) nounwind { %S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> @@ -239,9 +238,8 @@ define <8 x float> @test18(<8 x float> %A, <8 x float>%B) nounwind { } ; CHECK: test19 -; CHECK: vshufps -; CHECK: vshufps -; CHECK: vunpcklps +; CHECK: vmovsldup +; CHECK: vblendps ; CHECK: ret define <8 x float> @test19(<8 x float> %A, <8 x float>%B) nounwind { %S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> diff --git 
a/test/CodeGen/X86/avx-vextractf128.ll b/test/CodeGen/X86/avx-vextractf128.ll index fe0f6ca..ff56a45 100644 --- a/test/CodeGen/X86/avx-vextractf128.ll +++ b/test/CodeGen/X86/avx-vextractf128.ll @@ -19,12 +19,12 @@ entry: } ; CHECK: @t0 -; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0 +; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0 ; CHECK-NOT: vmovaps %xmm0, (%rdi) -; CHECK: vextractf128 $0, %ymm0, (%rdi) +; CHECK: vextractf128 $1, %ymm0, (%rdi) define void @t0(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp { entry: - %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 0) + %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 1) %1 = bitcast float* %addr to <4 x float>* store <4 x float> %0, <4 x float>* %1, align 16 ret void @@ -32,27 +32,13 @@ entry: declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone -; CHECK: @t1 -; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0 -; CHECK-NOT: vmovups %xmm0, (%rdi) -; CHECK: vextractf128 $0, %ymm0, (%rdi) -define void @t1(float* %addr, <8 x float> %a) nounwind uwtable ssp { -entry: - %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 0) - %1 = bitcast float* %addr to i8* - tail call void @llvm.x86.sse.storeu.ps(i8* %1, <4 x float> %0) - ret void -} - -declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind - ; CHECK: @t2 -; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0 +; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0 ; CHECK-NOT: vmovaps %xmm0, (%rdi) -; CHECK: vextractf128 $0, %ymm0, (%rdi) +; CHECK: vextractf128 $1, %ymm0, (%rdi) define void @t2(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp { entry: - %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 0) + %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 1) %1 = bitcast double* %addr to <2 x double>* store <2 x double> %0, <2 x double>* %1, align 16 ret void @@ -60,28 +46,14 @@ 
entry: declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone -; CHECK: @t3 -; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0 -; CHECK-NOT: vmovups %xmm0, (%rdi) -; CHECK: vextractf128 $0, %ymm0, (%rdi) -define void @t3(double* %addr, <4 x double> %a) nounwind uwtable ssp { -entry: - %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 0) - %1 = bitcast double* %addr to i8* - tail call void @llvm.x86.sse2.storeu.pd(i8* %1, <2 x double> %0) - ret void -} - -declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind - ; CHECK: @t4 -; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0 +; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0 ; CHECK-NOT: vmovaps %xmm0, (%rdi) -; CHECK: vextractf128 $0, %ymm0, (%rdi) +; CHECK: vextractf128 $1, %ymm0, (%rdi) define void @t4(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp { entry: %0 = bitcast <4 x i64> %a to <8 x i32> - %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0) + %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 1) %2 = bitcast <4 x i32> %1 to <2 x i64> store <2 x i64> %2, <2 x i64>* %addr, align 16 ret void @@ -90,17 +62,43 @@ entry: declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone ; CHECK: @t5 -; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0 -; CHECK-NOT: vmovdqu %xmm0, (%rdi) -; CHECK: vextractf128 $0, %ymm0, (%rdi) -define void @t5(<2 x i64>* %addr, <4 x i64> %a) nounwind uwtable ssp { +; CHECK: vmovaps %xmm0, (%rdi) +define void @t5(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp { +entry: + %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 0) + %1 = bitcast float* %addr to <4 x float>* + store <4 x float> %0, <4 x float>* %1, align 16 + ret void +} + +; CHECK: @t6 +; CHECK: vmovaps %xmm0, (%rdi) +define void @t6(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp { +entry: + %0 = tail call <2 x double> 
@llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 0) + %1 = bitcast double* %addr to <2 x double>* + store <2 x double> %0, <2 x double>* %1, align 16 + ret void +} + +; CHECK: @t7 +; CHECK: vmovaps %xmm0, (%rdi) +define void @t7(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp { entry: %0 = bitcast <4 x i64> %a to <8 x i32> %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0) - %2 = bitcast <2 x i64>* %addr to i8* - %3 = bitcast <4 x i32> %1 to <16 x i8> - tail call void @llvm.x86.sse2.storeu.dq(i8* %2, <16 x i8> %3) + %2 = bitcast <4 x i32> %1 to <2 x i64> + store <2 x i64> %2, <2 x i64>* %addr, align 16 ret void } -declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind +; CHECK: @t8 +; CHECK: vmovups %xmm0, (%rdi) +define void @t8(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp { +entry: + %0 = bitcast <4 x i64> %a to <8 x i32> + %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0) + %2 = bitcast <4 x i32> %1 to <2 x i64> + store <2 x i64> %2, <2 x i64>* %addr, align 1 + ret void +} diff --git a/test/CodeGen/X86/avx2-shuffle.ll b/test/CodeGen/X86/avx2-shuffle.ll index c5899fa..a414e68 100644 --- a/test/CodeGen/X86/avx2-shuffle.ll +++ b/test/CodeGen/X86/avx2-shuffle.ll @@ -26,3 +26,37 @@ entry: %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 3, i32 undef, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15> ret <16 x i16> %shuffle.i } + +; CHECK: vpshufb_test +; CHECK: vpshufb {{.*\(%r.*}}, %ymm +; CHECK: ret +define <32 x i8> @vpshufb_test(<32 x i8> %a) nounwind { + %S = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15, + i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15, + i32 18, i32 19, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25, + i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18> + ret <32 x 
i8>%S +} + +; CHECK: vpshufb1_test +; CHECK: vpshufb {{.*\(%r.*}}, %ymm +; CHECK: ret +define <32 x i8> @vpshufb1_test(<32 x i8> %a) nounwind { + %S = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15, + i32 1, i32 9, i32 36, i32 11, i32 5, i32 13, i32 7, i32 15, + i32 18, i32 49, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25, + i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18> + ret <32 x i8>%S +} + + +; CHECK: vpshufb2_test +; CHECK: vpshufb {{.*\(%r.*}}, %ymm +; CHECK: ret +define <32 x i8> @vpshufb2_test(<32 x i8> %a) nounwind { + %S = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15, + i32 1, i32 9, i32 36, i32 11, i32 5, i32 13, i32 7, i32 15, + i32 18, i32 49, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25, + i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18> + ret <32 x i8>%S +} diff --git a/test/CodeGen/X86/bitcast-i256.ll b/test/CodeGen/X86/bitcast-i256.ll new file mode 100644 index 0000000..85ac2fe --- /dev/null +++ b/test/CodeGen/X86/bitcast-i256.ll @@ -0,0 +1,11 @@ +; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=core-avx-i < %s | FileCheck %s --check-prefix CHECK + +define i256 @foo(<8 x i32> %a) { + %r = bitcast <8 x i32> %a to i256 + ret i256 %r +; CHECK: foo +; CHECK: vextractf128 +; CHECK: vpextrq +; CHECK: vpextrq +; CHECK: ret +} diff --git a/test/CodeGen/X86/bool-simplify.ll b/test/CodeGen/X86/bool-simplify.ll index 0cb9fd9..09eb5d1 100644 --- a/test/CodeGen/X86/bool-simplify.ll +++ b/test/CodeGen/X86/bool-simplify.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=x86-64 -mattr=+sse41,-avx | FileCheck %s +; RUN: llc < %s -march=x86-64 -mattr=+sse41,-avx,+rdrand | FileCheck %s define i32 @foo(<2 x i64> %c, i32 %a, i32 %b) { %t1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> %c) @@ -39,4 +39,20 @@ define i32 @bax(<2 x i64> %c) { ; CHECK: ret } +define i32 
@rnd(i32 %arg) nounwind uwtable { + %1 = tail call { i32, i32 } @llvm.x86.rdrand.32() nounwind + %2 = extractvalue { i32, i32 } %1, 0 + %3 = extractvalue { i32, i32 } %1, 1 + %4 = icmp eq i32 %3, 0 + %5 = select i1 %4, i32 0, i32 %arg + %6 = add i32 %5, %2 + ret i32 %6 +; CHECK: rnd +; CHECK: rdrand +; CHECK: cmov +; CHECK-NOT: cmov +; CHECK: ret +} + declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone +declare { i32, i32 } @llvm.x86.rdrand.32() nounwind diff --git a/test/CodeGen/X86/buildvec-insertvec.ll b/test/CodeGen/X86/buildvec-insertvec.ll new file mode 100644 index 0000000..3fb69a4 --- /dev/null +++ b/test/CodeGen/X86/buildvec-insertvec.ll @@ -0,0 +1,15 @@ +; RUN: llc < %s -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu | FileCheck %s + +define void @foo(<3 x float> %in, <4 x i8>* nocapture %out) nounwind { + %t0 = fptoui <3 x float> %in to <3 x i8> + %t1 = shufflevector <3 x i8> %t0, <3 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef> + %t2 = insertelement <4 x i8> %t1, i8 -1, i32 3 + store <4 x i8> %t2, <4 x i8>* %out, align 4 + ret void +; CHECK: foo +; CHECK: cvttps2dq +; CHECK-NOT: pextrd +; CHECK: pinsrd +; CHECK-NEXT: pshufb +; CHECK: ret +} diff --git a/test/CodeGen/X86/cmov-fp.ll b/test/CodeGen/X86/cmov-fp.ll new file mode 100644 index 0000000..ca91f9e --- /dev/null +++ b/test/CodeGen/X86/cmov-fp.ll @@ -0,0 +1,451 @@ +; RUN: llc -march x86 -mcpu pentium4 < %s | FileCheck %s -check-prefix=SSE +; RUN: llc -march x86 -mcpu pentium3 < %s | FileCheck %s -check-prefix=NOSSE2 +; RUN: llc -march x86 -mcpu pentium2 < %s | FileCheck %s -check-prefix=NOSSE1 +; RUN: llc -march x86 -mcpu pentium < %s | FileCheck %s -check-prefix=NOCMOV +; PR14035 + +define double @test1(i32 %a, i32 %b, double %x) nounwind { + %cmp = icmp ugt i32 %a, %b + %sel = select i1 %cmp, double 99.0, double %x + ret double %sel + +; SSE: test1: +; SSE: movsd + +; NOSSE2: test1: +; NOSSE2: fcmovnbe + +; NOSSE1: test1: +; NOSSE1: fcmovnbe + +; NOCMOV: test1: +; 
NOCMOV: fstp + +} + +define double @test2(i32 %a, i32 %b, double %x) nounwind { + %cmp = icmp uge i32 %a, %b + %sel = select i1 %cmp, double 99.0, double %x + ret double %sel + +; SSE: test2: +; SSE: movsd + +; NOSSE2: test2: +; NOSSE2: fcmovnb + +; NOSSE1: test2: +; NOSSE1: fcmovnb + +; NOCMOV: test2: +; NOCMOV: fstp +} + +define double @test3(i32 %a, i32 %b, double %x) nounwind { + %cmp = icmp ult i32 %a, %b + %sel = select i1 %cmp, double 99.0, double %x + ret double %sel + +; SSE: test3: +; SSE: movsd + +; NOSSE2: test3: +; NOSSE2: fcmovb + +; NOSSE1: test3: +; NOSSE1: fcmovb + +; NOCMOV: test3: +; NOCMOV: fstp +} + +define double @test4(i32 %a, i32 %b, double %x) nounwind { + %cmp = icmp ule i32 %a, %b + %sel = select i1 %cmp, double 99.0, double %x + ret double %sel + +; SSE: test4: +; SSE: movsd + +; NOSSE2: test4: +; NOSSE2: fcmovbe + +; NOSSE1: test4: +; NOSSE1: fcmovbe + +; NOCMOV: test4: +; NOCMOV: fstp +} + +define double @test5(i32 %a, i32 %b, double %x) nounwind { + %cmp = icmp sgt i32 %a, %b + %sel = select i1 %cmp, double 99.0, double %x + ret double %sel + +; SSE: test5: +; SSE: movsd + +; NOSSE2: test5: +; NOSSE2: fstp + +; NOSSE1: test5: +; NOSSE1: fstp + +; NOCMOV: test5: +; NOCMOV: fstp +} + +define double @test6(i32 %a, i32 %b, double %x) nounwind { + %cmp = icmp sge i32 %a, %b + %sel = select i1 %cmp, double 99.0, double %x + ret double %sel + +; SSE: test6: +; SSE: movsd + +; NOSSE2: test6: +; NOSSE2: fstp + +; NOSSE1: test6: +; NOSSE1: fstp + +; NOCMOV: test6: +; NOCMOV: fstp +} + +define double @test7(i32 %a, i32 %b, double %x) nounwind { + %cmp = icmp slt i32 %a, %b + %sel = select i1 %cmp, double 99.0, double %x + ret double %sel + +; SSE: test7: +; SSE: movsd + +; NOSSE2: test7: +; NOSSE2: fstp + +; NOSSE1: test7: +; NOSSE1: fstp + +; NOCMOV: test7: +; NOCMOV: fstp +} + +define double @test8(i32 %a, i32 %b, double %x) nounwind { + %cmp = icmp sle i32 %a, %b + %sel = select i1 %cmp, double 99.0, double %x + ret double %sel + +; SSE: 
test8: +; SSE: movsd + +; NOSSE2: test8: +; NOSSE2: fstp + +; NOSSE1: test8: +; NOSSE1: fstp + +; NOCMOV: test8: +; NOCMOV: fstp +} + +define float @test9(i32 %a, i32 %b, float %x) nounwind { + %cmp = icmp ugt i32 %a, %b + %sel = select i1 %cmp, float 99.0, float %x + ret float %sel + +; SSE: test9: +; SSE: movss + +; NOSSE2: test9: +; NOSSE2: movss + +; NOSSE1: test9: +; NOSSE1: fcmovnbe + +; NOCMOV: test9: +; NOCMOV: fstp +} + +define float @test10(i32 %a, i32 %b, float %x) nounwind { + %cmp = icmp uge i32 %a, %b + %sel = select i1 %cmp, float 99.0, float %x + ret float %sel + +; SSE: test10: +; SSE: movss + +; NOSSE2: test10: +; NOSSE2: movss + +; NOSSE1: test10: +; NOSSE1: fcmovnb + +; NOCMOV: test10: +; NOCMOV: fstp +} + +define float @test11(i32 %a, i32 %b, float %x) nounwind { + %cmp = icmp ult i32 %a, %b + %sel = select i1 %cmp, float 99.0, float %x + ret float %sel + +; SSE: test11: +; SSE: movss + +; NOSSE2: test11: +; NOSSE2: movss + +; NOSSE1: test11: +; NOSSE1: fcmovb + +; NOCMOV: test11: +; NOCMOV: fstp +} + +define float @test12(i32 %a, i32 %b, float %x) nounwind { + %cmp = icmp ule i32 %a, %b + %sel = select i1 %cmp, float 99.0, float %x + ret float %sel + +; SSE: test12: +; SSE: movss + +; NOSSE2: test12: +; NOSSE2: movss + +; NOSSE1: test12: +; NOSSE1: fcmovbe + +; NOCMOV: test12: +; NOCMOV: fstp +} + +define float @test13(i32 %a, i32 %b, float %x) nounwind { + %cmp = icmp sgt i32 %a, %b + %sel = select i1 %cmp, float 99.0, float %x + ret float %sel + +; SSE: test13: +; SSE: movss + +; NOSSE2: test13: +; NOSSE2: movss + +; NOSSE1: test13: +; NOSSE1: fstp + +; NOCMOV: test13: +; NOCMOV: fstp +} + +define float @test14(i32 %a, i32 %b, float %x) nounwind { + %cmp = icmp sge i32 %a, %b + %sel = select i1 %cmp, float 99.0, float %x + ret float %sel + +; SSE: test14: +; SSE: movss + +; NOSSE2: test14: +; NOSSE2: movss + +; NOSSE1: test14: +; NOSSE1: fstp + +; NOCMOV: test14: +; NOCMOV: fstp +} + +define float @test15(i32 %a, i32 %b, float %x) nounwind { 
+ %cmp = icmp slt i32 %a, %b + %sel = select i1 %cmp, float 99.0, float %x + ret float %sel + +; SSE: test15: +; SSE: movss + +; NOSSE2: test15: +; NOSSE2: movss + +; NOSSE1: test15: +; NOSSE1: fstp + +; NOCMOV: test15: +; NOCMOV: fstp +} + +define float @test16(i32 %a, i32 %b, float %x) nounwind { + %cmp = icmp sle i32 %a, %b + %sel = select i1 %cmp, float 99.0, float %x + ret float %sel + +; SSE: test16: +; SSE: movss + +; NOSSE2: test16: +; NOSSE2: movss + +; NOSSE1: test16: +; NOSSE1: fstp + +; NOCMOV: test16: +; NOCMOV: fstp +} + +define x86_fp80 @test17(i32 %a, i32 %b, x86_fp80 %x) nounwind { + %cmp = icmp ugt i32 %a, %b + %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x + ret x86_fp80 %sel + +; SSE: test17: +; SSE: fcmovnbe + +; NOSSE2: test17: +; NOSSE2: fcmovnbe + +; NOSSE1: test17: +; NOSSE1: fcmovnbe + +; NOCMOV: test17: +; NOCMOV: fstp +} + +define x86_fp80 @test18(i32 %a, i32 %b, x86_fp80 %x) nounwind { + %cmp = icmp uge i32 %a, %b + %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x + ret x86_fp80 %sel + +; SSE: test18: +; SSE: fcmovnb + +; NOSSE2: test18: +; NOSSE2: fcmovnb + +; NOSSE1: test18: +; NOSSE1: fcmovnb + +; NOCMOV: test18: +; NOCMOV: fstp +} + +define x86_fp80 @test19(i32 %a, i32 %b, x86_fp80 %x) nounwind { + %cmp = icmp ult i32 %a, %b + %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x + ret x86_fp80 %sel + +; SSE: test19: +; SSE: fcmovb + +; NOSSE2: test19: +; NOSSE2: fcmovb + +; NOSSE1: test19: +; NOSSE1: fcmovb + +; NOCMOV: test19: +; NOCMOV: fstp +} + +define x86_fp80 @test20(i32 %a, i32 %b, x86_fp80 %x) nounwind { + %cmp = icmp ule i32 %a, %b + %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x + ret x86_fp80 %sel + +; SSE: test20: +; SSE: fcmovbe + +; NOSSE2: test20: +; NOSSE2: fcmovbe + +; NOSSE1: test20: +; NOSSE1: fcmovbe + +; NOCMOV: test20: +; NOCMOV: fstp +} + +define x86_fp80 @test21(i32 %a, i32 %b, x86_fp80 %x) nounwind { + %cmp = icmp sgt i32 %a, %b 
+ %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x + ret x86_fp80 %sel + +; We don't emit a branch for fp80, why? +; SSE: test21: +; SSE: testb +; SSE: fcmovne + +; NOSSE2: test21: +; NOSSE2: testb +; NOSSE2: fcmovne + +; NOSSE1: test21: +; NOSSE1: testb +; NOSSE1: fcmovne + +; NOCMOV: test21: +; NOCMOV: fstp +} + +define x86_fp80 @test22(i32 %a, i32 %b, x86_fp80 %x) nounwind { + %cmp = icmp sge i32 %a, %b + %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x + ret x86_fp80 %sel + +; SSE: test22: +; SSE: testb +; SSE: fcmovne + +; NOSSE2: test22: +; NOSSE2: testb +; NOSSE2: fcmovne + +; NOSSE1: test22: +; NOSSE1: testb +; NOSSE1: fcmovne + +; NOCMOV: test22: +; NOCMOV: fstp +} + +define x86_fp80 @test23(i32 %a, i32 %b, x86_fp80 %x) nounwind { + %cmp = icmp slt i32 %a, %b + %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x + ret x86_fp80 %sel + +; SSE: test23: +; SSE: testb +; SSE: fcmovne + +; NOSSE2: test23: +; NOSSE2: testb +; NOSSE2: fcmovne + +; NOSSE1: test23: +; NOSSE1: testb +; NOSSE1: fcmovne + +; NOCMOV: test23: +; NOCMOV: fstp +} + +define x86_fp80 @test24(i32 %a, i32 %b, x86_fp80 %x) nounwind { + %cmp = icmp sle i32 %a, %b + %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x + ret x86_fp80 %sel + +; SSE: test24: +; SSE: testb +; SSE: fcmovne + +; NOSSE2: test24: +; NOSSE2: testb +; NOSSE2: fcmovne + +; NOSSE1: test24: +; NOSSE1: testb +; NOSSE1: fcmovne + +; NOCMOV: test24: +; NOCMOV: fstp +} diff --git a/test/CodeGen/X86/crash.ll b/test/CodeGen/X86/crash.ll index 9badfc8..276d0db 100644 --- a/test/CodeGen/X86/crash.ll +++ b/test/CodeGen/X86/crash.ll @@ -442,3 +442,150 @@ entry: ret void } declare void @_Z6PrintFz(...) 
+ +@a = external global i32, align 4 +@fn1.g = private unnamed_addr constant [9 x i32*] [i32* null, i32* @a, i32* null, i32* null, i32* null, i32* null, i32* null, i32* null, i32* null], align 16 +@e = external global i32, align 4 + +define void @pr13943() nounwind uwtable ssp { +entry: + %srcval = load i576* bitcast ([9 x i32*]* @fn1.g to i576*), align 16 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %g.0 = phi i576 [ %srcval, %entry ], [ %ins, %for.inc ] + %0 = load i32* @e, align 4 + %1 = lshr i576 %g.0, 64 + %2 = trunc i576 %1 to i64 + %3 = inttoptr i64 %2 to i32* + %cmp = icmp eq i32* undef, %3 + %conv2 = zext i1 %cmp to i32 + %and = and i32 %conv2, %0 + tail call void (...)* @fn3(i32 %and) nounwind + %tobool = icmp eq i32 undef, 0 + br i1 %tobool, label %for.inc, label %if.then + +if.then: ; preds = %for.cond + ret void + +for.inc: ; preds = %for.cond + %4 = shl i576 %1, 384 + %mask = and i576 %g.0, -726838724295606890509921801691610055141362320587174446476410459910173841445449629921945328942266354949348255351381262292727973638307841 + %5 = and i576 %4, 726838724295606890509921801691610055141362320587174446476410459910173841445449629921945328942266354949348255351381262292727973638307840 + %ins = or i576 %5, %mask + br label %for.cond +} + +declare void @fn3(...) + +; Check coalescing of IMPLICIT_DEF instructions: +; +; %vreg1 = IMPLICIT_DEF +; %vreg2 = MOV32r0 +; +; When coalescing %vreg1 and %vreg2, the IMPLICIT_DEF instruction should be +; erased along with its value number. 
+; +define void @rdar12474033() nounwind ssp { +bb: + br i1 undef, label %bb21, label %bb1 + +bb1: ; preds = %bb + switch i32 undef, label %bb10 [ + i32 4, label %bb2 + i32 1, label %bb9 + i32 5, label %bb3 + i32 6, label %bb3 + i32 2, label %bb9 + ] + +bb2: ; preds = %bb1 + unreachable + +bb3: ; preds = %bb1, %bb1 + br i1 undef, label %bb4, label %bb5 + +bb4: ; preds = %bb3 + unreachable + +bb5: ; preds = %bb3 + %tmp = load <4 x float>* undef, align 1 + %tmp6 = bitcast <4 x float> %tmp to i128 + %tmp7 = load <4 x float>* undef, align 1 + %tmp8 = bitcast <4 x float> %tmp7 to i128 + br label %bb10 + +bb9: ; preds = %bb1, %bb1 + unreachable + +bb10: ; preds = %bb5, %bb1 + %tmp11 = phi i128 [ undef, %bb1 ], [ %tmp6, %bb5 ] + %tmp12 = phi i128 [ 0, %bb1 ], [ %tmp8, %bb5 ] + switch i32 undef, label %bb21 [ + i32 2, label %bb18 + i32 3, label %bb13 + i32 5, label %bb16 + i32 6, label %bb17 + i32 1, label %bb18 + ] + +bb13: ; preds = %bb10 + br i1 undef, label %bb15, label %bb14 + +bb14: ; preds = %bb13 + br label %bb21 + +bb15: ; preds = %bb13 + unreachable + +bb16: ; preds = %bb10 + unreachable + +bb17: ; preds = %bb10 + unreachable + +bb18: ; preds = %bb10, %bb10 + %tmp19 = bitcast i128 %tmp11 to <4 x float> + %tmp20 = bitcast i128 %tmp12 to <4 x float> + br label %bb21 + +bb21: ; preds = %bb18, %bb14, %bb10, %bb + %tmp22 = phi <4 x float> [ undef, %bb ], [ undef, %bb10 ], [ undef, %bb14 ], [ %tmp20, %bb18 ] + %tmp23 = phi <4 x float> [ undef, %bb ], [ undef, %bb10 ], [ undef, %bb14 ], [ %tmp19, %bb18 ] + store <4 x float> %tmp23, <4 x float>* undef, align 16 + store <4 x float> %tmp22, <4 x float>* undef, align 16 + switch i32 undef, label %bb29 [ + i32 5, label %bb27 + i32 1, label %bb24 + i32 2, label %bb25 + i32 14, label %bb28 + i32 4, label %bb26 + ] + +bb24: ; preds = %bb21 + unreachable + +bb25: ; preds = %bb21 + br label %bb29 + +bb26: ; preds = %bb21 + br label %bb29 + +bb27: ; preds = %bb21 + unreachable + +bb28: ; preds = %bb21 + br label %bb29 + +bb29: ; 
preds = %bb28, %bb26, %bb25, %bb21 + unreachable +} + +define void @pr14194() nounwind uwtable { + %tmp = load i64* undef, align 16 + %tmp1 = trunc i64 %tmp to i32 + %tmp2 = lshr i64 %tmp, 32 + %tmp3 = trunc i64 %tmp2 to i32 + %tmp4 = call { i32, i32 } asm sideeffect "", "=&r,=&r,r,r,0,1,~{dirflag},~{fpsr},~{flags}"(i32 %tmp3, i32 undef, i32 %tmp3, i32 %tmp1) nounwind + ret void +} diff --git a/test/CodeGen/X86/cvtv2f32.ll b/test/CodeGen/X86/cvtv2f32.ll new file mode 100644 index 0000000..466b096 --- /dev/null +++ b/test/CodeGen/X86/cvtv2f32.ll @@ -0,0 +1,25 @@ +; RUN: llc < %s -mtriple=i686-linux-pc -mcpu=corei7 | FileCheck %s + +define <2 x float> @foo(i32 %x, i32 %y, <2 x float> %v) { + %t1 = uitofp i32 %x to float + %t2 = insertelement <2 x float> undef, float %t1, i32 0 + %t3 = uitofp i32 %y to float + %t4 = insertelement <2 x float> %t2, float %t3, i32 1 + %t5 = fmul <2 x float> %v, %t4 + ret <2 x float> %t5 +; CHECK: foo +; CHECK: or +; CHECK: subpd +; CHECK: cvtpd2ps +; CHECK: ret +} + +define <2 x float> @bar(<2 x i32> %in) { + %r = uitofp <2 x i32> %in to <2 x float> + ret <2 x float> %r +; CHECK: bar +; CHECK: or +; CHECK: subpd +; CHECK: cvtpd2ps +; CHECK: ret +} diff --git a/test/CodeGen/X86/early-ifcvt-crash.ll b/test/CodeGen/X86/early-ifcvt-crash.ll new file mode 100644 index 0000000..c828026 --- /dev/null +++ b/test/CodeGen/X86/early-ifcvt-crash.ll @@ -0,0 +1,32 @@ +; RUN: llc < %s -x86-early-ifcvt -verify-machineinstrs +; RUN: llc < %s -x86-early-ifcvt -stress-early-ifcvt -verify-machineinstrs +; +; Run these tests with and without -stress-early-ifcvt to exercise heuristics. +; +target triple = "x86_64-apple-macosx10.8.0" + +; MachineTraceMetrics::Ensemble::addLiveIns crashes because the first operand +; on an inline asm instruction is not a vreg def. 
+; <rdar://problem/12472811> +define void @f1() nounwind { +entry: + br i1 undef, label %if.then6.i, label %if.end.i + +if.then6.i: + br label %if.end.i + +if.end.i: + br i1 undef, label %if.end25.i, label %if.else17.i + +if.else17.i: + %shl24.i = shl i32 undef, undef + br label %if.end25.i + +if.end25.i: + %storemerge31.i = phi i32 [ %shl24.i, %if.else17.i ], [ 0, %if.end.i ] + store i32 %storemerge31.i, i32* undef, align 4 + %0 = tail call i32 asm sideeffect "", "=r,r,i,i"(i32 undef, i32 15, i32 1) nounwind + %conv = trunc i32 %0 to i8 + store i8 %conv, i8* undef, align 1 + unreachable +} diff --git a/test/CodeGen/X86/early-ifcvt.ll b/test/CodeGen/X86/early-ifcvt.ll index 7883ffa..2e1852d 100644 --- a/test/CodeGen/X86/early-ifcvt.ll +++ b/test/CodeGen/X86/early-ifcvt.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -enable-early-ifcvt -stress-early-ifcvt | FileCheck %s +; RUN: llc < %s -x86-early-ifcvt -stress-early-ifcvt | FileCheck %s target triple = "x86_64-apple-macosx10.8.0" ; CHECK: mm2 @@ -67,3 +67,78 @@ if.end41: } declare void @fprintf(...) nounwind + +; CHECK: BZ2_decompress +; This test case contains irreducible control flow, so MachineLoopInfo doesn't +; recognize the cycle in the CFG. This would confuse MachineTraceMetrics. 
+define void @BZ2_decompress(i8* %s) nounwind ssp { +entry: + switch i32 undef, label %sw.default [ + i32 39, label %if.end.sw.bb2050_crit_edge + i32 36, label %sw.bb1788 + i32 37, label %if.end.sw.bb1855_crit_edge + i32 40, label %sw.bb2409 + i32 38, label %sw.bb1983 + i32 44, label %if.end.sw.bb3058_crit_edge + ] + +if.end.sw.bb3058_crit_edge: ; preds = %entry + br label %save_state_and_return + +if.end.sw.bb1855_crit_edge: ; preds = %entry + br label %save_state_and_return + +if.end.sw.bb2050_crit_edge: ; preds = %entry + br label %sw.bb2050 + +sw.bb1788: ; preds = %entry + br label %save_state_and_return + +sw.bb1983: ; preds = %entry + br i1 undef, label %save_state_and_return, label %if.then1990 + +if.then1990: ; preds = %sw.bb1983 + br label %while.body2038 + +while.body2038: ; preds = %sw.bb2050, %if.then1990 + %groupPos.8 = phi i32 [ 0, %if.then1990 ], [ %groupPos.9, %sw.bb2050 ] + br i1 undef, label %save_state_and_return, label %if.end2042 + +if.end2042: ; preds = %while.body2038 + br i1 undef, label %if.end2048, label %while.end2104 + +if.end2048: ; preds = %if.end2042 + %bsLive2054.pre = getelementptr inbounds i8* %s, i32 8 + br label %sw.bb2050 + +sw.bb2050: ; preds = %if.end2048, %if.end.sw.bb2050_crit_edge + %groupPos.9 = phi i32 [ 0, %if.end.sw.bb2050_crit_edge ], [ %groupPos.8, %if.end2048 ] + %and2064 = and i32 undef, 1 + br label %while.body2038 + +while.end2104: ; preds = %if.end2042 + br i1 undef, label %save_state_and_return, label %if.end2117 + +if.end2117: ; preds = %while.end2104 + br i1 undef, label %while.body2161.lr.ph, label %while.body2145.lr.ph + +while.body2145.lr.ph: ; preds = %if.end2117 + br label %save_state_and_return + +while.body2161.lr.ph: ; preds = %if.end2117 + br label %save_state_and_return + +sw.bb2409: ; preds = %entry + br label %save_state_and_return + +sw.default: ; preds = %entry + call void @BZ2_bz__AssertH__fail() nounwind + br label %save_state_and_return + +save_state_and_return: + %groupPos.14 = phi i32 [ 0, 
%sw.default ], [ %groupPos.8, %while.body2038 ], [ %groupPos.8, %while.end2104 ], [ 0, %if.end.sw.bb3058_crit_edge ], [ 0, %if.end.sw.bb1855_crit_edge ], [ %groupPos.8, %while.body2161.lr.ph ], [ %groupPos.8, %while.body2145.lr.ph ], [ 0, %sw.bb2409 ], [ 0, %sw.bb1788 ], [ 0, %sw.bb1983 ] + store i32 %groupPos.14, i32* undef, align 4 + ret void +} + +declare void @BZ2_bz__AssertH__fail() diff --git a/test/CodeGen/X86/extract-concat.ll b/test/CodeGen/X86/extract-concat.ll new file mode 100644 index 0000000..704309e --- /dev/null +++ b/test/CodeGen/X86/extract-concat.ll @@ -0,0 +1,17 @@ +; RUN: llc < %s -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu | FileCheck %s + +define void @foo(<4 x float> %in, <4 x i8>* %out) { + %t0 = fptosi <4 x float> %in to <4 x i32> + %t1 = trunc <4 x i32> %t0 to <4 x i16> + %t2 = shufflevector <4 x i16> %t1, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %t3 = trunc <8 x i16> %t2 to <8 x i8> + %t4 = shufflevector <8 x i8> %t3, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %t5 = insertelement <4 x i8> %t4, i8 -1, i32 3 + store <4 x i8> %t5, <4 x i8>* %out + ret void +; CHECK: foo +; CHECK: cvttps2dq +; CHECK-NOT: pextrd +; CHECK: pshufb +; CHECK: ret +} diff --git a/test/CodeGen/X86/fast-cc-callee-pops.ll b/test/CodeGen/X86/fast-cc-callee-pops.ll index ea10897..2c5b80a 100644 --- a/test/CodeGen/X86/fast-cc-callee-pops.ll +++ b/test/CodeGen/X86/fast-cc-callee-pops.ll @@ -2,12 +2,12 @@ ; Check that a fastcc function pops its stack variables before returning. 
-define x86_fastcallcc void @func(i64 %X, i64 %Y, float %G, double %Z) nounwind { +define x86_fastcallcc void @func(i64 inreg %X, i64 %Y, float %G, double %Z) nounwind { ret void ; CHECK: ret{{.*}}20 } -define x86_thiscallcc void @func2(i32 %X, i64 %Y, float %G, double %Z) nounwind { +define x86_thiscallcc void @func2(i32 inreg %X, i64 %Y, float %G, double %Z) nounwind { ret void ; CHECK: ret{{.*}}20 } diff --git a/test/CodeGen/X86/fast-cc-merge-stack-adj.ll b/test/CodeGen/X86/fast-cc-merge-stack-adj.ll index 14cb136..d591f94 100644 --- a/test/CodeGen/X86/fast-cc-merge-stack-adj.ll +++ b/test/CodeGen/X86/fast-cc-merge-stack-adj.ll @@ -3,7 +3,7 @@ target triple = "i686-pc-linux-gnu" -declare x86_fastcallcc void @func(i32*, i64) +declare x86_fastcallcc void @func(i32*, i64 inreg) define x86_fastcallcc void @caller(i32, i64) { %X = alloca i32 ; <i32*> [#uses=1] diff --git a/test/CodeGen/X86/fast-cc-pass-in-regs.ll b/test/CodeGen/X86/fast-cc-pass-in-regs.ll index a96e504..b60b68b 100644 --- a/test/CodeGen/X86/fast-cc-pass-in-regs.ll +++ b/test/CodeGen/X86/fast-cc-pass-in-regs.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | FileCheck %s ; check that fastcc is passing stuff in regs. 
-declare x86_fastcallcc i64 @callee(i64) +declare x86_fastcallcc i64 @callee(i64 inreg) define i64 @caller() { %X = call x86_fastcallcc i64 @callee( i64 4294967299 ) ; <i64> [#uses=1] @@ -9,7 +9,7 @@ define i64 @caller() { ret i64 %X } -define x86_fastcallcc i64 @caller2(i64 %X) { +define x86_fastcallcc i64 @caller2(i64 inreg %X) { ret i64 %X ; CHECK: mov{{.*}}EAX, ECX } diff --git a/test/CodeGen/X86/fast-isel-x86-64.ll b/test/CodeGen/X86/fast-isel-x86-64.ll index d8f4663..cdfaf7f 100644 --- a/test/CodeGen/X86/fast-isel-x86-64.ll +++ b/test/CodeGen/X86/fast-isel-x86-64.ll @@ -1,4 +1,5 @@ -; RUN: llc < %s -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s +; RUN: llc < %s -mattr=-avx -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s +; RUN: llc < %s -mattr=+avx -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s --check-prefix=AVX target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" target triple = "x86_64-apple-darwin10.0.0" @@ -197,6 +198,11 @@ block2: ; CHECK: cvtsi2sdq {{.*}} %xmm0 ; CHECK: movb $1, %al ; CHECK: callq _test16callee + +; AVX: movabsq $1 +; AVX: vmovsd LCP{{.*}}_{{.*}}(%rip), %xmm0 +; AVX: movb $1, %al +; AVX: callq _test16callee call void (...)* @test16callee(double 1.000000e+00) ret void } @@ -285,3 +291,16 @@ entry: } declare void @foo22(i32) + +; PR13563 +define void @test23(i8* noalias sret %result) { + %a = alloca i8 + %b = call i8* @foo23() + ret void +; CHECK: test23: +; CHECK: call +; CHECK: movq %rdi, %rax +; CHECK: ret +} + +declare i8* @foo23() diff --git a/test/CodeGen/X86/fma.ll b/test/CodeGen/X86/fma.ll index b0c1d0a..bd3514c 100644 --- a/test/CodeGen/X86/fma.ll +++ b/test/CodeGen/X86/fma.ll @@ -1,11 +1,13 @@ -; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+fma | FileCheck %s --check-prefix=CHECK-FMA-INST -; RUN: llc < %s 
-mtriple=i386-apple-darwin10 | FileCheck %s --check-prefix=CHECK-FMA-CALL -; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+fma | FileCheck %s --check-prefix=CHECK-FMA-INST -; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefix=CHECK-FMA-CALL +; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-INST +; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-CALL +; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-INST +; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-CALL +; RUN: llc < %s -march=x86 -mcpu=bdver2 -mattr=-fma4 | FileCheck %s --check-prefix=CHECK-FMA-INST +; RUN: llc < %s -march=x86 -mcpu=bdver2 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-CALL ; CHECK: test_f32 ; CHECK-FMA-INST: vfmadd213ss -; CHECK-FMA-CALL: _fmaf +; CHECK-FMA-CALL: fmaf define float @test_f32(float %a, float %b, float %c) nounwind readnone ssp { entry: @@ -15,7 +17,7 @@ entry: ; CHECK: test_f64 ; CHECK-FMA-INST: vfmadd213sd -; CHECK-FMA-CALL: _fma +; CHECK-FMA-CALL: fma define double @test_f64(double %a, double %b, double %c) nounwind readnone ssp { entry: @@ -24,7 +26,7 @@ entry: } ; CHECK: test_f80 -; CHECK: _fmal +; CHECK: fmal define x86_fp80 @test_f80(x86_fp80 %a, x86_fp80 %b, x86_fp80 %c) nounwind readnone ssp { entry: diff --git a/test/CodeGen/X86/fma3-intrinsics.ll b/test/CodeGen/X86/fma3-intrinsics.ll index 90529e0..e3910a6 100755 --- a/test/CodeGen/X86/fma3-intrinsics.ll +++ b/test/CodeGen/X86/fma3-intrinsics.ll @@ -1,4 +1,6 @@ -; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 -mattr=avx2,+fma | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-pc-win32 -mattr=+fma,+fma4 | FileCheck %s +; RUN: llc < %s -mcpu=bdver2 -mtriple=x86_64-pc-win32 -mattr=-fma4 
| FileCheck %s define <4 x float> @test_x86_fmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { ; CHECK: fmadd213ss %xmm diff --git a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll index fd414b3..2fe1ecd 100644 --- a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll +++ b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mattr=+avx,+fma4 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s ; VFMADD define < 4 x float > @test_x86_fma_vfmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll index 5d97a87..6d98d59 100644 --- a/test/CodeGen/X86/fma_patterns.ll +++ b/test/CodeGen/X86/fma_patterns.ll @@ -1,8 +1,13 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=avx2,+fma -fp-contract=fast | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=-fma4 -fp-contract=fast | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver1 -fp-contract=fast | FileCheck %s --check-prefix=CHECK_FMA4 ; CHECK: test_x86_fmadd_ps -; CHECK: vfmadd213ps %xmm2, %xmm0, %xmm1 +; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 ; CHECK: ret +; CHECK_FMA4: test_x86_fmadd_ps +; CHECK_FMA4: vfmaddps %xmm2, %xmm1, %xmm0, %xmm0 +; CHECK_FMA4: ret define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { %x = fmul <4 x float> %a0, %a1 %res = fadd <4 x float> %x, %a2 @@ -10,8 +15,11 @@ define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x flo } ; CHECK: test_x86_fmsub_ps -; CHECK: fmsub213ps %xmm2, %xmm0, %xmm1 +; CHECK: fmsub213ps %xmm2, %xmm1, %xmm0 ; CHECK: ret +; CHECK_FMA4: test_x86_fmsub_ps +; CHECK_FMA4: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0 +; CHECK_FMA4: ret define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> 
%a1, <4 x float> %a2) { %x = fmul <4 x float> %a0, %a1 %res = fsub <4 x float> %x, %a2 @@ -19,8 +27,11 @@ define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x flo } ; CHECK: test_x86_fnmadd_ps -; CHECK: fnmadd213ps %xmm2, %xmm0, %xmm1 +; CHECK: fnmadd213ps %xmm2, %xmm1, %xmm0 ; CHECK: ret +; CHECK_FMA4: test_x86_fnmadd_ps +; CHECK_FMA4: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0 +; CHECK_FMA4: ret define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { %x = fmul <4 x float> %a0, %a1 %res = fsub <4 x float> %a2, %x @@ -28,8 +39,11 @@ define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x fl } ; CHECK: test_x86_fnmsub_ps -; CHECK: fnmsub213ps %xmm2, %xmm0, %xmm1 +; CHECK: fnmsub213ps %xmm2, %xmm1, %xmm0 ; CHECK: ret +; CHECK_FMA4: test_x86_fnmsub_ps +; CHECK_FMA4: fnmsubps %xmm2, %xmm1, %xmm0, %xmm0 +; CHECK_FMA4: ret define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { %x = fmul <4 x float> %a0, %a1 %y = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x @@ -38,8 +52,11 @@ define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x fl } ; CHECK: test_x86_fmadd_ps_y -; CHECK: vfmadd213ps %ymm2, %ymm0, %ymm1 +; CHECK: vfmadd213ps %ymm2, %ymm1, %ymm0 ; CHECK: ret +; CHECK_FMA4: test_x86_fmadd_ps_y +; CHECK_FMA4: vfmaddps %ymm2, %ymm1, %ymm0, %ymm0 +; CHECK_FMA4: ret define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) { %x = fmul <8 x float> %a0, %a1 %res = fadd <8 x float> %x, %a2 @@ -47,8 +64,11 @@ define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x f } ; CHECK: test_x86_fmsub_ps_y -; CHECK: vfmsub213ps %ymm2, %ymm0, %ymm1 +; CHECK: vfmsub213ps %ymm2, %ymm1, %ymm0 ; CHECK: ret +; CHECK_FMA4: test_x86_fmsub_ps_y +; CHECK_FMA4: vfmsubps %ymm2, %ymm1, %ymm0, %ymm0 +; CHECK_FMA4: ret define <8 x float> 
@test_x86_fmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) { %x = fmul <8 x float> %a0, %a1 %res = fsub <8 x float> %x, %a2 @@ -56,8 +76,11 @@ define <8 x float> @test_x86_fmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x f } ; CHECK: test_x86_fnmadd_ps_y -; CHECK: vfnmadd213ps %ymm2, %ymm0, %ymm1 +; CHECK: vfnmadd213ps %ymm2, %ymm1, %ymm0 ; CHECK: ret +; CHECK_FMA4: test_x86_fnmadd_ps_y +; CHECK_FMA4: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0 +; CHECK_FMA4: ret define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) { %x = fmul <8 x float> %a0, %a1 %res = fsub <8 x float> %a2, %x @@ -65,7 +88,7 @@ define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x } ; CHECK: test_x86_fnmsub_ps_y -; CHECK: vfnmsub213ps %ymm2, %ymm0, %ymm1 +; CHECK: vfnmsub213ps %ymm2, %ymm1, %ymm0 ; CHECK: ret define <8 x float> @test_x86_fnmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) { %x = fmul <8 x float> %a0, %a1 @@ -75,8 +98,11 @@ define <8 x float> @test_x86_fnmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x } ; CHECK: test_x86_fmadd_pd_y -; CHECK: vfmadd213pd %ymm2, %ymm0, %ymm1 +; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 ; CHECK: ret +; CHECK_FMA4: test_x86_fmadd_pd_y +; CHECK_FMA4: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0 +; CHECK_FMA4: ret define <4 x double> @test_x86_fmadd_pd_y(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) { %x = fmul <4 x double> %a0, %a1 %res = fadd <4 x double> %x, %a2 @@ -84,8 +110,11 @@ define <4 x double> @test_x86_fmadd_pd_y(<4 x double> %a0, <4 x double> %a1, <4 } ; CHECK: test_x86_fmsub_pd_y -; CHECK: vfmsub213pd %ymm2, %ymm0, %ymm1 +; CHECK: vfmsub213pd %ymm2, %ymm1, %ymm0 ; CHECK: ret +; CHECK_FMA4: test_x86_fmsub_pd_y +; CHECK_FMA4: vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0 +; CHECK_FMA4: ret define <4 x double> @test_x86_fmsub_pd_y(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) { %x = fmul <4 x double> %a0, %a1 %res = fsub <4 x double> %x, %a2 @@ -93,8 +122,11 @@ 
define <4 x double> @test_x86_fmsub_pd_y(<4 x double> %a0, <4 x double> %a1, <4 } ; CHECK: test_x86_fmsub_pd -; CHECK: vfmsub213pd %xmm2, %xmm0, %xmm1 +; CHECK: vfmsub213pd %xmm2, %xmm1, %xmm0 ; CHECK: ret +; CHECK_FMA4: test_x86_fmsub_pd +; CHECK_FMA4: vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0 +; CHECK_FMA4: ret define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) { %x = fmul <2 x double> %a0, %a1 %res = fsub <2 x double> %x, %a2 @@ -102,8 +134,11 @@ define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x } ; CHECK: test_x86_fnmadd_ss -; CHECK: vfnmadd213ss %xmm2, %xmm0, %xmm1 +; CHECK: vfnmadd213ss %xmm2, %xmm1, %xmm0 ; CHECK: ret +; CHECK_FMA4: test_x86_fnmadd_ss +; CHECK_FMA4: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0 +; CHECK_FMA4: ret define float @test_x86_fnmadd_ss(float %a0, float %a1, float %a2) { %x = fmul float %a0, %a1 %res = fsub float %a2, %x @@ -111,8 +146,11 @@ define float @test_x86_fnmadd_ss(float %a0, float %a1, float %a2) { } ; CHECK: test_x86_fnmadd_sd -; CHECK: vfnmadd213sd %xmm2, %xmm0, %xmm1 +; CHECK: vfnmadd213sd %xmm2, %xmm1, %xmm0 ; CHECK: ret +; CHECK_FMA4: test_x86_fnmadd_sd +; CHECK_FMA4: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0 +; CHECK_FMA4: ret define double @test_x86_fnmadd_sd(double %a0, double %a1, double %a2) { %x = fmul double %a0, %a1 %res = fsub double %a2, %x @@ -120,8 +158,11 @@ define double @test_x86_fnmadd_sd(double %a0, double %a1, double %a2) { } ; CHECK: test_x86_fmsub_sd -; CHECK: vfmsub213sd %xmm2, %xmm0, %xmm1 +; CHECK: vfmsub213sd %xmm2, %xmm1, %xmm0 ; CHECK: ret +; CHECK_FMA4: test_x86_fmsub_sd +; CHECK_FMA4: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0 +; CHECK_FMA4: ret define double @test_x86_fmsub_sd(double %a0, double %a1, double %a2) { %x = fmul double %a0, %a1 %res = fsub double %x, %a2 @@ -129,11 +170,43 @@ define double @test_x86_fmsub_sd(double %a0, double %a1, double %a2) { } ; CHECK: test_x86_fnmsub_ss -; CHECK: vfnmsub213ss %xmm2, %xmm0, %xmm1 +; CHECK: 
vfnmsub213ss %xmm2, %xmm1, %xmm0 ; CHECK: ret +; CHECK_FMA4: test_x86_fnmsub_ss +; CHECK_FMA4: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0 +; CHECK_FMA4: ret define float @test_x86_fnmsub_ss(float %a0, float %a1, float %a2) { %x = fsub float -0.000000e+00, %a0 %y = fmul float %x, %a1 %res = fsub float %y, %a2 ret float %res } + +; CHECK: test_x86_fmadd_ps +; CHECK: vmovaps (%rdi), %xmm2 +; CHECK: vfmadd213ps %xmm1, %xmm0, %xmm2 +; CHECK: ret +; CHECK_FMA4: test_x86_fmadd_ps +; CHECK_FMA4: vfmaddps %xmm1, (%rdi), %xmm0, %xmm0 +; CHECK_FMA4: ret +define <4 x float> @test_x86_fmadd_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) { + %x = load <4 x float>* %a0 + %y = fmul <4 x float> %x, %a1 + %res = fadd <4 x float> %y, %a2 + ret <4 x float> %res +} + +; CHECK: test_x86_fmsub_ps +; CHECK: vmovaps (%rdi), %xmm2 +; CHECK: fmsub213ps %xmm1, %xmm0, %xmm2 +; CHECK: ret +; CHECK_FMA4: test_x86_fmsub_ps +; CHECK_FMA4: vfmsubps %xmm1, (%rdi), %xmm0, %xmm0 +; CHECK_FMA4: ret +define <4 x float> @test_x86_fmsub_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) { + %x = load <4 x float>* %a0 + %y = fmul <4 x float> %x, %a1 + %res = fsub <4 x float> %y, %a2 + ret <4 x float> %res +} + diff --git a/test/CodeGen/X86/fold-load.ll b/test/CodeGen/X86/fold-load.ll index c961f75..d836665 100644 --- a/test/CodeGen/X86/fold-load.ll +++ b/test/CodeGen/X86/fold-load.ll @@ -57,13 +57,13 @@ entry: %0 = load i32* %P, align 4 %1 = load i32* %Q, align 4 %2 = xor i32 %0, %1 - %3 = and i32 %2, 65535 + %3 = and i32 %2, 89947 %4 = icmp eq i32 %3, 0 br i1 %4, label %exit, label %land.end exit: %shr.i.i19 = xor i32 %1, %0 - %5 = and i32 %shr.i.i19, 2147418112 + %5 = and i32 %shr.i.i19, 3456789123 %6 = icmp eq i32 %5, 0 br label %land.end diff --git a/test/CodeGen/X86/fp-fast.ll b/test/CodeGen/X86/fp-fast.ll new file mode 100644 index 0000000..d70aa7d --- /dev/null +++ b/test/CodeGen/X86/fp-fast.ll @@ -0,0 +1,57 @@ +; RUN: llc -march=x86-64 -mattr=+avx,-fma4 -mtriple=x86_64-apple-darwin 
-enable-unsafe-fp-math < %s | FileCheck %s + +; CHECK: test1 +define float @test1(float %a) { +; CHECK-NOT: addss +; CHECK: mulss +; CHECK-NOT: addss +; CHECK: ret + %t1 = fadd float %a, %a + %r = fadd float %t1, %t1 + ret float %r +} + +; CHECK: test2 +define float @test2(float %a) { +; CHECK-NOT: addss +; CHECK: mulss +; CHECK-NOT: addss +; CHECK: ret + %t1 = fmul float 4.0, %a + %t2 = fadd float %a, %a + %r = fadd float %t1, %t2 + ret float %r +} + +; CHECK: test3 +define float @test3(float %a) { +; CHECK-NOT: addss +; CHECK: xorps +; CHECK-NOT: addss +; CHECK: ret + %t1 = fmul float 2.0, %a + %t2 = fadd float %a, %a + %r = fsub float %t1, %t2 + ret float %r +} + +; CHECK: test4 +define float @test4(float %a) { +; CHECK-NOT: fma +; CHECK-NOT mul +; CHECK-NOT: add +; CHECK: ret + %t1 = fmul float %a, 0.0 + %t2 = fadd float %a, %t1 + ret float %t2 +} + +; CHECK: test5 +define float @test5(float %a) { +; CHECK-NOT: add +; CHECK: vxorps +; CHECK: ret + %t1 = fsub float -0.0, %a + %t2 = fadd float %a, %t1 + ret float %t2 +} diff --git a/test/CodeGen/X86/fp-load-trunc.ll b/test/CodeGen/X86/fp-load-trunc.ll new file mode 100644 index 0000000..2ae65c9 --- /dev/null +++ b/test/CodeGen/X86/fp-load-trunc.ll @@ -0,0 +1,61 @@ +; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s +; RUN: llc < %s -march=x86 -mcpu=core-avx-i | FileCheck %s --check-prefix=AVX + +define <1 x float> @test1(<1 x double>* %p) nounwind { +; CHECK: test1 +; CHECK: cvtsd2ss +; CHECK: ret +; AVX: test1 +; AVX: vcvtsd2ss +; AVX: ret + %x = load <1 x double>* %p + %y = fptrunc <1 x double> %x to <1 x float> + ret <1 x float> %y +} + +define <2 x float> @test2(<2 x double>* %p) nounwind { +; CHECK: test2 +; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}}) +; CHECK: ret +; AVX: test2 +; AVX: vcvtpd2psx {{[0-9]*}}(%{{.*}}) +; AVX: ret + %x = load <2 x double>* %p + %y = fptrunc <2 x double> %x to <2 x float> + ret <2 x float> %y +} + +define <4 x float> @test3(<4 x double>* %p) nounwind { +; CHECK: test3 +; CHECK: 
cvtpd2ps {{[0-9]*}}(%{{.*}}) +; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}}) +; CHECK: movlhps +; CHECK: ret +; AVX: test3 +; AVX: vcvtpd2psy {{[0-9]*}}(%{{.*}}) +; AVX: ret + %x = load <4 x double>* %p + %y = fptrunc <4 x double> %x to <4 x float> + ret <4 x float> %y +} + +define <8 x float> @test4(<8 x double>* %p) nounwind { +; CHECK: test4 +; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}}) +; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}}) +; CHECK: movlhps +; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}}) +; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}}) +; CHECK: movlhps +; CHECK: ret +; AVX: test4 +; AVX: vcvtpd2psy {{[0-9]*}}(%{{.*}}) +; AVX: vcvtpd2psy {{[0-9]*}}(%{{.*}}) +; AVX: vinsertf128 +; AVX: ret + %x = load <8 x double>* %p + %y = fptrunc <8 x double> %x to <8 x float> + ret <8 x float> %y +} + + diff --git a/test/CodeGen/X86/fp-trunc.ll b/test/CodeGen/X86/fp-trunc.ll index 170637a..25442fc 100644 --- a/test/CodeGen/X86/fp-trunc.ll +++ b/test/CodeGen/X86/fp-trunc.ll @@ -1,33 +1,56 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2,-avx | FileCheck %s +; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s +; RUN: llc < %s -march=x86 -mcpu=core-avx-i | FileCheck %s --check-prefix=AVX define <1 x float> @test1(<1 x double> %x) nounwind { +; CHECK: test1 ; CHECK: cvtsd2ss ; CHECK: ret +; AVX: test1 +; AVX: vcvtsd2ss +; AVX: ret %y = fptrunc <1 x double> %x to <1 x float> ret <1 x float> %y } - define <2 x float> @test2(<2 x double> %x) nounwind { -; FIXME: It would be nice if this compiled down to a cvtpd2ps -; CHECK: cvtsd2ss -; CHECK: cvtsd2ss +; CHECK: test2 +; CHECK: cvtpd2ps ; CHECK: ret +; AVX: test2 +; AVX-NOT: vcvtpd2psy +; AVX: vcvtpd2ps +; AVX: ret %y = fptrunc <2 x double> %x to <2 x float> ret <2 x float> %y } -define <8 x float> @test3(<8 x double> %x) nounwind { -; FIXME: It would be nice if this compiled down to a series of cvtpd2ps -; CHECK: cvtsd2ss -; CHECK: cvtsd2ss -; CHECK: cvtsd2ss -; CHECK: cvtsd2ss -; CHECK: cvtsd2ss -; CHECK: cvtsd2ss -; CHECK: cvtsd2ss -; CHECK: cvtsd2ss +define <4 x 
float> @test3(<4 x double> %x) nounwind { +; CHECK: test3 +; CHECK: cvtpd2ps +; CHECK: cvtpd2ps +; CHECK: movlhps +; CHECK: ret +; AVX: test3 +; AVX: vcvtpd2psy +; AVX: ret + %y = fptrunc <4 x double> %x to <4 x float> + ret <4 x float> %y +} + +define <8 x float> @test4(<8 x double> %x) nounwind { +; CHECK: test4 +; CHECK: cvtpd2ps +; CHECK: cvtpd2ps +; CHECK: movlhps +; CHECK: cvtpd2ps +; CHECK: cvtpd2ps +; CHECK: movlhps ; CHECK: ret +; AVX: test4 +; AVX: vcvtpd2psy +; AVX: vcvtpd2psy +; AVX: vinsertf128 +; AVX: ret %y = fptrunc <8 x double> %x to <8 x float> ret <8 x float> %y } diff --git a/test/CodeGen/X86/handle-move.ll b/test/CodeGen/X86/handle-move.ll new file mode 100644 index 0000000..e9f7a96 --- /dev/null +++ b/test/CodeGen/X86/handle-move.ll @@ -0,0 +1,74 @@ +; RUN: llc -march=x86-64 -mcpu=core2 -fast-isel -enable-misched -misched=shuffle -misched-bottomup -verify-machineinstrs < %s +; RUN: llc -march=x86-64 -mcpu=core2 -fast-isel -enable-misched -misched=shuffle -misched-topdown -verify-machineinstrs < %s +; REQUIRES: asserts +; +; Test the LiveIntervals::handleMove() function. +; +; Moving the DIV32r instruction exercises the regunit update code because +; %EDX has a live range into the function and is used by the DIV32r. 
+; +; Here sinking a kill + dead def: +; 144B -> 180B: DIV32r %vreg4, %EAX<imp-def>, %EDX<imp-def,dead>, %EFLAGS<imp-def,dead>, %EAX<imp-use,kill>, %EDX<imp-use> +; %vreg4: [48r,144r:0) 0@48r +; --> [48r,180r:0) 0@48r +; DH: [0B,16r:0)[128r,144r:2)[144r,144d:1) 0@0B-phi 1@144r 2@128r +; --> [0B,16r:0)[128r,180r:2)[180r,180d:1) 0@0B-phi 1@180r 2@128r +; DL: [0B,16r:0)[128r,144r:2)[144r,144d:1) 0@0B-phi 1@144r 2@128r +; --> [0B,16r:0)[128r,180r:2)[180r,180d:1) 0@0B-phi 1@180r 2@128r +; +define i32 @f1(i32 %a, i32 %b, i32 %c, i32 %d) nounwind uwtable readnone ssp { +entry: + %y = add i32 %c, 1 + %x = udiv i32 %b, %a + %add = add nsw i32 %y, %x + ret i32 %add +} + +; Same as above, but moving a kill + live def: +; 144B -> 180B: DIV32r %vreg4, %EAX<imp-def,dead>, %EDX<imp-def>, %EFLAGS<imp-def,dead>, %EAX<imp-use,kill>, %EDX<imp-use> +; %vreg4: [48r,144r:0) 0@48r +; --> [48r,180r:0) 0@48r +; DH: [0B,16r:0)[128r,144r:2)[144r,184r:1) 0@0B-phi 1@144r 2@128r +; --> [0B,16r:0)[128r,180r:2)[180r,184r:1) 0@0B-phi 1@180r 2@128r +; DL: [0B,16r:0)[128r,144r:2)[144r,184r:1) 0@0B-phi 1@144r 2@128r +; --> [0B,16r:0)[128r,180r:2)[180r,184r:1) 0@0B-phi 1@180r 2@128r +; +define i32 @f2(i32 %a, i32 %b, i32 %c, i32 %d) nounwind uwtable readnone ssp { +entry: + %y = sub i32 %c, %d + %x = urem i32 %b, %a + %add = add nsw i32 %x, %y + ret i32 %add +} + +; Moving a use below the existing kill (%vreg5): +; Moving a tied virtual register def (%vreg11): +; +; 96B -> 120B: %vreg11<def,tied1> = SUB32rr %vreg11<tied0>, %vreg5 +; %vreg11: [80r,96r:1)[96r,144r:0) 0@96r 1@80r +; --> [80r,120r:1)[120r,144r:0) 0@120r 1@80r +; %vreg5: [16r,112r:0) 0@16r +; --> [16r,120r:0) 0@16r +; +define i32 @f3(i32 %a, i32 %b, i32 %c, i32 %d) nounwind uwtable readnone ssp { +entry: + %y = sub i32 %a, %b + %x = add i32 %a, %b + %r = mul i32 %x, %y + ret i32 %r +} + +; Move EFLAGS dead def across another def: +; handleMove 208B -> 36B: %EDX<def> = MOV32r0 %EFLAGS<imp-def,dead> +; EFLAGS: 
[20r,20d:4)[160r,160d:3)[208r,208d:0)[224r,224d:1)[272r,272d:2)[304r,304d:5) 0@208r 1@224r 2@272r 3@160r 4@20r 5@304r +; --> [20r,20d:4)[36r,36d:0)[160r,160d:3)[224r,224d:1)[272r,272d:2)[304r,304d:5) 0@36r 1@224r 2@272r 3@160r 4@20r 5@304r +; +define i32 @f4(i32 %a, i32 %b, i32 %c, i32 %d) nounwind uwtable readnone ssp { +entry: + %x = sub i32 %a, %b + %y = sub i32 %b, %c + %z = sub i32 %c, %d + %r1 = udiv i32 %x, %y + %r2 = mul i32 %z, %r1 + ret i32 %r2 +} diff --git a/test/CodeGen/X86/inline-asm-tied.ll b/test/CodeGen/X86/inline-asm-tied.ll index 91576fb..597236e 100644 --- a/test/CodeGen/X86/inline-asm-tied.ll +++ b/test/CodeGen/X86/inline-asm-tied.ll @@ -19,3 +19,12 @@ entry: %1 = load i64* %retval ; <i64> [#uses=1] ret i64 %1 } + +; The tied operands are not necessarily in the same order as the defs. +; PR13742 +define i64 @swapped(i64 %x, i64 %y) nounwind { +entry: + %x0 = call { i64, i64 } asm "foo", "=r,=r,1,0,~{dirflag},~{fpsr},~{flags}"(i64 %x, i64 %y) nounwind + %x1 = extractvalue { i64, i64 } %x0, 0 + ret i64 %x1 +} diff --git a/test/CodeGen/X86/inline-asm.ll b/test/CodeGen/X86/inline-asm.ll index e6eb9ef..d201ebd 100644 --- a/test/CodeGen/X86/inline-asm.ll +++ b/test/CodeGen/X86/inline-asm.ll @@ -52,3 +52,10 @@ entry: %0 = call { i32, i32, i32, i32, i32 } asm sideeffect "", "=&r,=&r,=&r,=&r,=&q,r,~{ecx},~{memory},~{dirflag},~{fpsr},~{flags}"(i8* %h) nounwind ret void } + +; Mix normal and EC defs of the same register. 
+define i32 @pr14376() nounwind noinline { +entry: + %asm = tail call i32 asm sideeffect "", "={ax},i,~{eax},~{flags},~{rax}"(i64 61) nounwind + ret i32 %asm +} diff --git a/test/CodeGen/X86/inlineasm-sched-bug.ll b/test/CodeGen/X86/inlineasm-sched-bug.ll new file mode 100644 index 0000000..08de0c0 --- /dev/null +++ b/test/CodeGen/X86/inlineasm-sched-bug.ll @@ -0,0 +1,13 @@ +; PR13504 +; RUN: llc -march=x86 -mcpu=atom <%s | FileCheck %s +; CHECK: bsfl +; CHECK-NOT: movl + +define i32 @foo(i32 %treemap) nounwind uwtable { +entry: + %sub = sub i32 0, %treemap + %and = and i32 %treemap, %sub + %0 = tail call i32 asm "bsfl $1,$0\0A\09", "=r,rm,~{dirflag},~{fpsr},~{flags}"(i32 %and) nounwind + ret i32 %0 +} + diff --git a/test/CodeGen/X86/jump_sign.ll b/test/CodeGen/X86/jump_sign.ll index 48e2106..0e34222 100644 --- a/test/CodeGen/X86/jump_sign.ll +++ b/test/CodeGen/X86/jump_sign.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=x86 -mcpu=pentiumpro | FileCheck %s +; RUN: llc < %s -march=x86 -mcpu=pentiumpro -verify-machineinstrs | FileCheck %s define i32 @f(i32 %X) { entry: @@ -219,7 +219,6 @@ entry: ; by sbb, we should not optimize cmp away. define i32 @q(i32 %j.4, i32 %w, i32 %el) { ; CHECK: q: -; CHECK: sub ; CHECK: cmp ; CHECK-NEXT: sbb %tmp532 = add i32 %j.4, %w @@ -253,3 +252,56 @@ return: %retval.0 = phi i8* [ %add.ptr, %if.end ], [ null, %entry ] ret i8* %retval.0 } + +; Test optimizations of dec/inc. 
+define i32 @dec(i32 %a) nounwind { +entry: +; CHECK: dec: +; CHECK: decl +; CHECK-NOT: test +; CHECK: cmovsl + %sub = sub nsw i32 %a, 1 + %cmp = icmp sgt i32 %sub, 0 + %cond = select i1 %cmp, i32 %sub, i32 0 + ret i32 %cond +} + +define i32 @inc(i32 %a) nounwind { +entry: +; CHECK: inc: +; CHECK: incl +; CHECK-NOT: test +; CHECK: cmovsl + %add = add nsw i32 %a, 1 + %cmp = icmp sgt i32 %add, 0 + %cond = select i1 %cmp, i32 %add, i32 0 + ret i32 %cond +} + +; PR13966 +@b = common global i32 0, align 4 +@a = common global i32 0, align 4 +define i32 @test1(i32 %p1) nounwind uwtable { +entry: +; CHECK: test1: +; CHECK: testb +; CHECK: j +; CHECK: ret + %0 = load i32* @b, align 4 + %cmp = icmp ult i32 %0, %p1 + %conv = zext i1 %cmp to i32 + %1 = load i32* @a, align 4 + %and = and i32 %conv, %1 + %conv1 = trunc i32 %and to i8 + %2 = urem i8 %conv1, 3 + %tobool = icmp eq i8 %2, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: + %dec = add nsw i32 %1, -1 + store i32 %dec, i32* @a, align 4 + br label %if.end + +if.end: + ret i32 undef +} diff --git a/test/CodeGen/X86/misched-balance.ll b/test/CodeGen/X86/misched-balance.ll new file mode 100644 index 0000000..2184d9e --- /dev/null +++ b/test/CodeGen/X86/misched-balance.ll @@ -0,0 +1,230 @@ +; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \ +; RUN: -verify-machineinstrs | FileCheck %s +; +; Verify that misched resource/latency balancy heuristics are sane. + +define void @unrolled_mmult1(i32* %tmp55, i32* %tmp56, i32* %pre, i32* %pre94, + i32* %pre95, i32* %pre96, i32* %pre97, i32* %pre98, i32* %pre99, + i32* %pre100, i32* %pre101, i32* %pre102, i32* %pre103, i32* %pre104) + nounwind uwtable ssp { +entry: + br label %for.body + +; imull folded loads should be in order and interleaved with addl, never +; adjacent. Also check that we have no spilling. +; +; Since mmult1 IR is already in good order, this effectively ensure +; the scheduler maintains source order. 
+; +; CHECK: %for.body +; CHECK-NOT: %rsp +; CHECK: imull 4 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 8 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 12 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 16 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 20 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 24 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 28 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 32 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 36 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK-NOT: {{imull|rsp}} +; CHECK: %end +for.body: + %indvars.iv42.i = phi i64 [ %indvars.iv.next43.i, %for.body ], [ 0, %entry ] + %tmp57 = load i32* %tmp56, align 4 + %arrayidx12.us.i61 = getelementptr inbounds i32* %pre, i64 %indvars.iv42.i + %tmp58 = load i32* %arrayidx12.us.i61, align 4 + %mul.us.i = mul nsw i32 %tmp58, %tmp57 + %arrayidx8.us.i.1 = getelementptr inbounds i32* %tmp56, i64 1 + %tmp59 = load i32* %arrayidx8.us.i.1, align 4 + %arrayidx12.us.i61.1 = getelementptr inbounds i32* %pre94, i64 %indvars.iv42.i + %tmp60 = load i32* %arrayidx12.us.i61.1, align 4 + %mul.us.i.1 = mul nsw i32 %tmp60, %tmp59 + %add.us.i.1 = add nsw i32 %mul.us.i.1, %mul.us.i + %arrayidx8.us.i.2 = getelementptr inbounds i32* %tmp56, i64 2 + %tmp61 = load i32* %arrayidx8.us.i.2, align 4 + %arrayidx12.us.i61.2 = getelementptr inbounds i32* %pre95, i64 %indvars.iv42.i + %tmp62 = load i32* %arrayidx12.us.i61.2, align 4 + %mul.us.i.2 = mul nsw i32 %tmp62, %tmp61 + %add.us.i.2 = add nsw i32 %mul.us.i.2, %add.us.i.1 + %arrayidx8.us.i.3 = getelementptr inbounds i32* %tmp56, i64 3 + %tmp63 = load i32* %arrayidx8.us.i.3, align 4 + %arrayidx12.us.i61.3 = getelementptr inbounds i32* %pre96, i64 %indvars.iv42.i + %tmp64 = load i32* %arrayidx12.us.i61.3, align 4 + %mul.us.i.3 = mul nsw i32 %tmp64, %tmp63 + %add.us.i.3 = add nsw i32 %mul.us.i.3, %add.us.i.2 + %arrayidx8.us.i.4 = getelementptr inbounds 
i32* %tmp56, i64 4 + %tmp65 = load i32* %arrayidx8.us.i.4, align 4 + %arrayidx12.us.i61.4 = getelementptr inbounds i32* %pre97, i64 %indvars.iv42.i + %tmp66 = load i32* %arrayidx12.us.i61.4, align 4 + %mul.us.i.4 = mul nsw i32 %tmp66, %tmp65 + %add.us.i.4 = add nsw i32 %mul.us.i.4, %add.us.i.3 + %arrayidx8.us.i.5 = getelementptr inbounds i32* %tmp56, i64 5 + %tmp67 = load i32* %arrayidx8.us.i.5, align 4 + %arrayidx12.us.i61.5 = getelementptr inbounds i32* %pre98, i64 %indvars.iv42.i + %tmp68 = load i32* %arrayidx12.us.i61.5, align 4 + %mul.us.i.5 = mul nsw i32 %tmp68, %tmp67 + %add.us.i.5 = add nsw i32 %mul.us.i.5, %add.us.i.4 + %arrayidx8.us.i.6 = getelementptr inbounds i32* %tmp56, i64 6 + %tmp69 = load i32* %arrayidx8.us.i.6, align 4 + %arrayidx12.us.i61.6 = getelementptr inbounds i32* %pre99, i64 %indvars.iv42.i + %tmp70 = load i32* %arrayidx12.us.i61.6, align 4 + %mul.us.i.6 = mul nsw i32 %tmp70, %tmp69 + %add.us.i.6 = add nsw i32 %mul.us.i.6, %add.us.i.5 + %arrayidx8.us.i.7 = getelementptr inbounds i32* %tmp56, i64 7 + %tmp71 = load i32* %arrayidx8.us.i.7, align 4 + %arrayidx12.us.i61.7 = getelementptr inbounds i32* %pre100, i64 %indvars.iv42.i + %tmp72 = load i32* %arrayidx12.us.i61.7, align 4 + %mul.us.i.7 = mul nsw i32 %tmp72, %tmp71 + %add.us.i.7 = add nsw i32 %mul.us.i.7, %add.us.i.6 + %arrayidx8.us.i.8 = getelementptr inbounds i32* %tmp56, i64 8 + %tmp73 = load i32* %arrayidx8.us.i.8, align 4 + %arrayidx12.us.i61.8 = getelementptr inbounds i32* %pre101, i64 %indvars.iv42.i + %tmp74 = load i32* %arrayidx12.us.i61.8, align 4 + %mul.us.i.8 = mul nsw i32 %tmp74, %tmp73 + %add.us.i.8 = add nsw i32 %mul.us.i.8, %add.us.i.7 + %arrayidx8.us.i.9 = getelementptr inbounds i32* %tmp56, i64 9 + %tmp75 = load i32* %arrayidx8.us.i.9, align 4 + %arrayidx12.us.i61.9 = getelementptr inbounds i32* %pre102, i64 %indvars.iv42.i + %tmp76 = load i32* %arrayidx12.us.i61.9, align 4 + %mul.us.i.9 = mul nsw i32 %tmp76, %tmp75 + %add.us.i.9 = add nsw i32 %mul.us.i.9, %add.us.i.8 + 
%arrayidx16.us.i = getelementptr inbounds i32* %tmp55, i64 %indvars.iv42.i + store i32 %add.us.i.9, i32* %arrayidx16.us.i, align 4 + %indvars.iv.next43.i = add i64 %indvars.iv42.i, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next43.i to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 10 + br i1 %exitcond, label %end, label %for.body + +end: + ret void +} + +; Unlike the above loop, this IR starts out bad and must be +; rescheduled. +; +; CHECK: %for.body +; CHECK-NOT: %rsp +; CHECK: imull 4 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 8 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 12 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 16 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 20 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 24 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 28 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 32 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK: imull 36 +; CHECK-NOT: {{imull|rsp}} +; CHECK: addl +; CHECK-NOT: {{imull|rsp}} +; CHECK: %end +define void @unrolled_mmult2(i32* %tmp55, i32* %tmp56, i32* %pre, i32* %pre94, + i32* %pre95, i32* %pre96, i32* %pre97, i32* %pre98, i32* %pre99, + i32* %pre100, i32* %pre101, i32* %pre102, i32* %pre103, i32* %pre104) + nounwind uwtable ssp { +entry: + br label %for.body +for.body: + %indvars.iv42.i = phi i64 [ %indvars.iv.next43.i, %for.body ], [ 0, %entry ] + %tmp57 = load i32* %tmp56, align 4 + %arrayidx12.us.i61 = getelementptr inbounds i32* %pre, i64 %indvars.iv42.i + %tmp58 = load i32* %arrayidx12.us.i61, align 4 + %arrayidx8.us.i.1 = getelementptr inbounds i32* %tmp56, i64 1 + %tmp59 = load i32* %arrayidx8.us.i.1, align 4 + %arrayidx12.us.i61.1 = getelementptr inbounds i32* %pre94, i64 %indvars.iv42.i + %tmp60 = load i32* %arrayidx12.us.i61.1, align 4 + %arrayidx8.us.i.2 = getelementptr inbounds i32* %tmp56, i64 2 + %tmp61 = load i32* %arrayidx8.us.i.2, align 4 + %arrayidx12.us.i61.2 = getelementptr inbounds 
i32* %pre95, i64 %indvars.iv42.i + %tmp62 = load i32* %arrayidx12.us.i61.2, align 4 + %arrayidx8.us.i.3 = getelementptr inbounds i32* %tmp56, i64 3 + %tmp63 = load i32* %arrayidx8.us.i.3, align 4 + %arrayidx12.us.i61.3 = getelementptr inbounds i32* %pre96, i64 %indvars.iv42.i + %tmp64 = load i32* %arrayidx12.us.i61.3, align 4 + %arrayidx8.us.i.4 = getelementptr inbounds i32* %tmp56, i64 4 + %tmp65 = load i32* %arrayidx8.us.i.4, align 4 + %arrayidx12.us.i61.4 = getelementptr inbounds i32* %pre97, i64 %indvars.iv42.i + %tmp66 = load i32* %arrayidx12.us.i61.4, align 4 + %arrayidx8.us.i.5 = getelementptr inbounds i32* %tmp56, i64 5 + %tmp67 = load i32* %arrayidx8.us.i.5, align 4 + %arrayidx12.us.i61.5 = getelementptr inbounds i32* %pre98, i64 %indvars.iv42.i + %tmp68 = load i32* %arrayidx12.us.i61.5, align 4 + %arrayidx8.us.i.6 = getelementptr inbounds i32* %tmp56, i64 6 + %tmp69 = load i32* %arrayidx8.us.i.6, align 4 + %arrayidx12.us.i61.6 = getelementptr inbounds i32* %pre99, i64 %indvars.iv42.i + %tmp70 = load i32* %arrayidx12.us.i61.6, align 4 + %mul.us.i = mul nsw i32 %tmp58, %tmp57 + %arrayidx8.us.i.7 = getelementptr inbounds i32* %tmp56, i64 7 + %tmp71 = load i32* %arrayidx8.us.i.7, align 4 + %arrayidx12.us.i61.7 = getelementptr inbounds i32* %pre100, i64 %indvars.iv42.i + %tmp72 = load i32* %arrayidx12.us.i61.7, align 4 + %arrayidx8.us.i.8 = getelementptr inbounds i32* %tmp56, i64 8 + %tmp73 = load i32* %arrayidx8.us.i.8, align 4 + %arrayidx12.us.i61.8 = getelementptr inbounds i32* %pre101, i64 %indvars.iv42.i + %tmp74 = load i32* %arrayidx12.us.i61.8, align 4 + %arrayidx8.us.i.9 = getelementptr inbounds i32* %tmp56, i64 9 + %tmp75 = load i32* %arrayidx8.us.i.9, align 4 + %arrayidx12.us.i61.9 = getelementptr inbounds i32* %pre102, i64 %indvars.iv42.i + %tmp76 = load i32* %arrayidx12.us.i61.9, align 4 + %mul.us.i.1 = mul nsw i32 %tmp60, %tmp59 + %add.us.i.1 = add nsw i32 %mul.us.i.1, %mul.us.i + %mul.us.i.2 = mul nsw i32 %tmp62, %tmp61 + %add.us.i.2 = add nsw 
i32 %mul.us.i.2, %add.us.i.1 + %mul.us.i.3 = mul nsw i32 %tmp64, %tmp63 + %add.us.i.3 = add nsw i32 %mul.us.i.3, %add.us.i.2 + %mul.us.i.4 = mul nsw i32 %tmp66, %tmp65 + %add.us.i.4 = add nsw i32 %mul.us.i.4, %add.us.i.3 + %mul.us.i.5 = mul nsw i32 %tmp68, %tmp67 + %add.us.i.5 = add nsw i32 %mul.us.i.5, %add.us.i.4 + %mul.us.i.6 = mul nsw i32 %tmp70, %tmp69 + %add.us.i.6 = add nsw i32 %mul.us.i.6, %add.us.i.5 + %mul.us.i.7 = mul nsw i32 %tmp72, %tmp71 + %add.us.i.7 = add nsw i32 %mul.us.i.7, %add.us.i.6 + %mul.us.i.8 = mul nsw i32 %tmp74, %tmp73 + %add.us.i.8 = add nsw i32 %mul.us.i.8, %add.us.i.7 + %mul.us.i.9 = mul nsw i32 %tmp76, %tmp75 + %add.us.i.9 = add nsw i32 %mul.us.i.9, %add.us.i.8 + %arrayidx16.us.i = getelementptr inbounds i32* %tmp55, i64 %indvars.iv42.i + store i32 %add.us.i.9, i32* %arrayidx16.us.i, align 4 + %indvars.iv.next43.i = add i64 %indvars.iv42.i, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next43.i to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 10 + br i1 %exitcond, label %end, label %for.body + +end: + ret void +} diff --git a/test/CodeGen/X86/misched-ilp.ll b/test/CodeGen/X86/misched-ilp.ll new file mode 100644 index 0000000..c6cedb7 --- /dev/null +++ b/test/CodeGen/X86/misched-ilp.ll @@ -0,0 +1,25 @@ +; RUN: llc < %s -mtriple=x86_64-apple-macosx -mcpu=core2 -enable-misched -misched=ilpmax | FileCheck -check-prefix=MAX %s +; RUN: llc < %s -mtriple=x86_64-apple-macosx -mcpu=core2 -enable-misched -misched=ilpmin | FileCheck -check-prefix=MIN %s +; +; Basic verification of the ScheduleDAGILP metric. 
+; +; MAX: addss +; MAX: addss +; MAX: addss +; MAX: subss +; MAX: addss +; +; MIN: addss +; MIN: addss +; MIN: subss +; MIN: addss +; MIN: addss +define float @ilpsched(float %a, float %b, float %c, float %d, float %e, float %f) nounwind uwtable readnone ssp { +entry: + %add = fadd float %a, %b + %add1 = fadd float %c, %d + %add2 = fadd float %e, %f + %add3 = fsub float %add1, %add2 + %add4 = fadd float %add, %add3 + ret float %add4 +} diff --git a/test/CodeGen/X86/misched-new.ll b/test/CodeGen/X86/misched-new.ll index 8f2f6f7..cec04b5 100644 --- a/test/CodeGen/X86/misched-new.ll +++ b/test/CodeGen/X86/misched-new.ll @@ -1,4 +1,6 @@ -; RUN: llc -march=x86-64 -mcpu=core2 -enable-misched -misched=shuffle -misched-bottomup < %s +; RUN: llc < %s -march=x86-64 -mcpu=core2 -x86-early-ifcvt -enable-misched \ +; RUN: -misched=shuffle -misched-bottomup -verify-machineinstrs \ +; RUN: | FileCheck %s ; REQUIRES: asserts ; ; Interesting MachineScheduler cases. @@ -25,3 +27,27 @@ for.cond.preheader: ; preds = %entry if.end: ; preds = %entry ret void } + +; The machine verifier checks that EFLAGS kill flags are updated when +; the scheduler reorders cmovel instructions. 
+; +; CHECK: test +; CHECK: cmovel +; CHECK: cmovel +; CHECK: call +define void @foo(i32 %b) nounwind uwtable ssp { +entry: + %tobool = icmp ne i32 %b, 0 + br i1 %tobool, label %if.then, label %if.end + +if.then: ; preds = %entry + br label %if.end + +if.end: ; preds = %if.then, %entry + %v1 = phi i32 [1, %entry], [2, %if.then] + %v2 = phi i32 [3, %entry], [4, %if.then] + call void @bar(i32 %v1, i32 %v2) + ret void +} + +declare void @bar(i32,i32) diff --git a/test/CodeGen/X86/mmx-builtins.ll b/test/CodeGen/X86/mmx-builtins.ll index 8b7200d..a8d33f4 100644 --- a/test/CodeGen/X86/mmx-builtins.ll +++ b/test/CodeGen/X86/mmx-builtins.ll @@ -1043,6 +1043,20 @@ entry: ret i64 %5 } +define i32 @test21_2(<1 x i64> %a) nounwind readnone optsize ssp { +; CHECK: test21_2 +; CHECK: pshufw +; CHECK: movd +entry: + %0 = bitcast <1 x i64> %a to <4 x i16> + %1 = bitcast <4 x i16> %0 to x86_mmx + %2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 3) nounwind readnone + %3 = bitcast x86_mmx %2 to <4 x i16> + %4 = bitcast <4 x i16> %3 to <2 x i32> + %5 = extractelement <2 x i32> %4, i32 0 + ret i32 %5 +} + declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx) nounwind readnone define i64 @test20(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp { diff --git a/test/CodeGen/X86/ms-inline-asm.ll b/test/CodeGen/X86/ms-inline-asm.ll new file mode 100644 index 0000000..24d28ad --- /dev/null +++ b/test/CodeGen/X86/ms-inline-asm.ll @@ -0,0 +1,63 @@ +; RUN: llc < %s -march=x86 | FileCheck %s + +define i32 @t1() nounwind { +entry: + %0 = tail call i32 asm sideeffect inteldialect "mov eax, $1\0A\09mov $0, eax", "=r,r,~{eax},~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind + ret i32 %0 +; CHECK: t1 +; CHECK: {{## InlineAsm Start|#APP}} +; CHECK: .intel_syntax +; CHECK: mov eax, ecx +; CHECK: mov ecx, eax +; CHECK: .att_syntax +; CHECK: {{## InlineAsm End|#NO_APP}} +} + +define void @t2() nounwind { +entry: + call void asm sideeffect inteldialect "mov eax, $$1", 
"~{eax},~{dirflag},~{fpsr},~{flags}"() nounwind + ret void +; CHECK: t2 +; CHECK: {{## InlineAsm Start|#APP}} +; CHECK: .intel_syntax +; CHECK: mov eax, 1 +; CHECK: .att_syntax +; CHECK: {{## InlineAsm End|#NO_APP}} +} + +define void @t3(i32 %V) nounwind { +entry: + %V.addr = alloca i32, align 4 + store i32 %V, i32* %V.addr, align 4 + call void asm sideeffect inteldialect "mov eax, DWORD PTR [$0]", "*m,~{eax},~{dirflag},~{fpsr},~{flags}"(i32* %V.addr) nounwind + ret void +; CHECK: t3 +; CHECK: {{## InlineAsm Start|#APP}} +; CHECK: .intel_syntax +; CHECK: mov eax, DWORD PTR {{[[esp]}} +; CHECK: .att_syntax +; CHECK: {{## InlineAsm End|#NO_APP}} +} + +%struct.t18_type = type { i32, i32 } + +define i32 @t18() nounwind { +entry: + %foo = alloca %struct.t18_type, align 4 + %a = getelementptr inbounds %struct.t18_type* %foo, i32 0, i32 0 + store i32 1, i32* %a, align 4 + %b = getelementptr inbounds %struct.t18_type* %foo, i32 0, i32 1 + store i32 2, i32* %b, align 4 + call void asm sideeffect inteldialect "lea ebx, foo\0A\09mov eax, [ebx].0\0A\09mov [ebx].4, ecx", "~{eax},~{dirflag},~{fpsr},~{flags}"() nounwind + %b1 = getelementptr inbounds %struct.t18_type* %foo, i32 0, i32 1 + %0 = load i32* %b1, align 4 + ret i32 %0 +; CHECK: t18 +; CHECK: {{## InlineAsm Start|#APP}} +; CHECK: .intel_syntax +; CHECK: lea ebx, foo +; CHECK: mov eax, [ebx].0 +; CHECK: mov [ebx].4, ecx +; CHECK: .att_syntax +; CHECK: {{## InlineAsm End|#NO_APP}} +} diff --git a/test/CodeGen/X86/mulx32.ll b/test/CodeGen/X86/mulx32.ll new file mode 100644 index 0000000..b75ac00 --- /dev/null +++ b/test/CodeGen/X86/mulx32.ll @@ -0,0 +1,22 @@ +; RUN: llc -mcpu=core-avx2 -march=x86 < %s | FileCheck %s + +define i64 @f1(i32 %a, i32 %b) { + %x = zext i32 %a to i64 + %y = zext i32 %b to i64 + %r = mul i64 %x, %y +; CHECK: f1 +; CHECK: mulxl +; CHECK: ret + ret i64 %r +} + +define i64 @f2(i32 %a, i32* %p) { + %b = load i32* %p + %x = zext i32 %a to i64 + %y = zext i32 %b to i64 + %r = mul i64 %x, %y +; CHECK: f2 
+; CHECK: mulxl ({{.+}}), %{{.+}}, %{{.+}} +; CHECK: ret + ret i64 %r +} diff --git a/test/CodeGen/X86/mulx64.ll b/test/CodeGen/X86/mulx64.ll new file mode 100644 index 0000000..d573028 --- /dev/null +++ b/test/CodeGen/X86/mulx64.ll @@ -0,0 +1,22 @@ +; RUN: llc -mcpu=core-avx2 -march=x86-64 < %s | FileCheck %s + +define i128 @f1(i64 %a, i64 %b) { + %x = zext i64 %a to i128 + %y = zext i64 %b to i128 + %r = mul i128 %x, %y +; CHECK: f1 +; CHECK: mulxq +; CHECK: ret + ret i128 %r +} + +define i128 @f2(i64 %a, i64* %p) { + %b = load i64* %p + %x = zext i64 %a to i128 + %y = zext i64 %b to i128 + %r = mul i128 %x, %y +; CHECK: f2 +; CHECK: mulxq ({{.+}}), %{{.+}}, %{{.+}} +; CHECK: ret + ret i128 %r +} diff --git a/test/CodeGen/X86/phys_subreg_coalesce-3.ll b/test/CodeGen/X86/phys_subreg_coalesce-3.ll index 984d7e5..2a20e7a 100644 --- a/test/CodeGen/X86/phys_subreg_coalesce-3.ll +++ b/test/CodeGen/X86/phys_subreg_coalesce-3.ll @@ -1,14 +1,10 @@ -; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s -; XFAIL: * +; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=corei7 | FileCheck %s ; rdar://5571034 ; This requires physreg joining, %vreg13 is live everywhere: ; 304L %CL<def> = COPY %vreg13:sub_8bit; GR32_ABCD:%vreg13 ; 320L %vreg15<def> = COPY %vreg19; GR32:%vreg15 GR32_NOSP:%vreg19 ; 336L %vreg15<def> = SAR32rCL %vreg15, %EFLAGS<imp-def,dead>, %CL<imp-use,kill>; GR32:%vreg15 -; -; This test is XFAIL until the register allocator understands trivial physreg -; interference. 
<rdar://9802098> define void @foo(i32* nocapture %quadrant, i32* nocapture %ptr, i32 %bbSize, i32 %bbStart, i32 %shifts) nounwind ssp { ; CHECK: foo: diff --git a/test/CodeGen/X86/pic_jumptable.ll b/test/CodeGen/X86/pic_jumptable.ll index 8c16dc6..bdd8859 100644 --- a/test/CodeGen/X86/pic_jumptable.ll +++ b/test/CodeGen/X86/pic_jumptable.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -relocation-model=pic -mtriple=i386-linux-gnu -asm-verbose=false \ ; RUN: | FileCheck %s --check-prefix=CHECK-LINUX +; RUN: llc < %s -relocation-model=pic -mark-data-regions -mtriple=i686-apple-darwin -asm-verbose=false \ +; RUN: | FileCheck %s --check-prefix=CHECK-DATA ; RUN: llc < %s -relocation-model=pic -mtriple=i686-apple-darwin -asm-verbose=false \ ; RUN: | FileCheck %s ; RUN: llc < %s -mtriple=x86_64-apple-darwin | not grep 'lJTI' @@ -16,6 +18,16 @@ entry: ; CHECK: Ltmp0 = LJTI0_0-L0$pb ; CHECK-NEXT: addl Ltmp0(%eax,%ecx,4) ; CHECK-NEXT: jmpl *%eax + +;; When data-in-code markers are enabled, we should see them around the jump +;; table. +; CHECK-DATA: .data_region jt32 +; CHECK-DATA: LJTI0_0 +; CHECK-DATA: .end_data_region + +;; When they're not enabled, make sure we don't see them at all. 
+; CHECK-NOT: .data_region +; CHECK-LINUX-NOT: .data_region %Y_addr = alloca i32 ; <i32*> [#uses=2] %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0] store i32 %Y, i32* %Y_addr diff --git a/test/CodeGen/X86/pmovext.ll b/test/CodeGen/X86/pmovext.ll new file mode 100644 index 0000000..16e9c28 --- /dev/null +++ b/test/CodeGen/X86/pmovext.ll @@ -0,0 +1,22 @@ +; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s + +; rdar://11897677 + +;CHECK: intrin_pmov +;CHECK: pmovzxbw (%{{.*}}), %xmm0 +;CHECK-NEXT: movdqu +;CHECK-NEXT: ret +define void @intrin_pmov(i16* noalias %dest, i8* noalias %src) nounwind uwtable ssp { + %1 = bitcast i8* %src to <2 x i64>* + %2 = load <2 x i64>* %1, align 16 + %3 = bitcast <2 x i64> %2 to <16 x i8> + %4 = tail call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %3) nounwind + %5 = bitcast i16* %dest to i8* + %6 = bitcast <8 x i16> %4 to <16 x i8> + tail call void @llvm.x86.sse2.storeu.dq(i8* %5, <16 x i8> %6) nounwind + ret void +} + +declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone + +declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind diff --git a/test/CodeGen/X86/pointer-vector.ll b/test/CodeGen/X86/pointer-vector.ll index 800fbed..58423d1 100644 --- a/test/CodeGen/X86/pointer-vector.ll +++ b/test/CodeGen/X86/pointer-vector.ll @@ -81,8 +81,7 @@ define <4 x i32*> @INT2PTR1(<4 x i8>* %p) nounwind { entry: %G = load <4 x i8>* %p ;CHECK: movl -;CHECK: movd -;CHECK: pshufb +;CHECK: pmovzxbd ;CHECK: pand %K = inttoptr <4 x i8> %G to <4 x i32*> ;CHECK: ret @@ -105,7 +104,7 @@ define <2 x i32*> @BITCAST1(<2 x i8*>* %p) nounwind { entry: %G = load <2 x i8*>* %p ;CHECK: movl -;CHECK: movsd +;CHECK: pmovzxdq %T = bitcast <2 x i8*> %G to <2 x i32*> ;CHECK: ret ret <2 x i32*> %T diff --git a/test/CodeGen/X86/pr11334.ll b/test/CodeGen/X86/pr11334.ll index 5b7b5ea..e7e29e0 100644 --- a/test/CodeGen/X86/pr11334.ll +++ b/test/CodeGen/X86/pr11334.ll @@ -54,3 +54,11 @@ entry: %f1 = fpext <8 x float> %v1 to <8 x 
double> ret <8 x double> %f1 } + +define void @test_vector_creation() nounwind { + %1 = insertelement <4 x double> undef, double 0.000000e+00, i32 2 + %2 = load double addrspace(1)* null + %3 = insertelement <4 x double> %1, double %2, i32 3 + store <4 x double> %3, <4 x double>* undef + ret void +} diff --git a/test/CodeGen/X86/pr11985.ll b/test/CodeGen/X86/pr11985.ll new file mode 100644 index 0000000..fa37850 --- /dev/null +++ b/test/CodeGen/X86/pr11985.ll @@ -0,0 +1,19 @@ +; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=prescott | FileCheck %s + +define float @foo(i8* nocapture %buf, float %a, float %b) nounwind uwtable { +entry: + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %buf, i8* blockaddress(@foo, %out), i64 22, i32 1, i1 false) + br label %out + +out: ; preds = %entry + %add = fadd float %a, %b + ret float %add +; CHECK: foo +; CHECK: movw .L{{.*}}+20(%rip), %{{.*}} +; CHECK: movl .L{{.*}}+16(%rip), %{{.*}} +; CHECK: movq .L{{.*}}+8(%rip), %{{.*}} +; CHECK: movq .L{{.*}}(%rip), %{{.*}} +; CHECK: ret +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind diff --git a/test/CodeGen/X86/pr12312.ll b/test/CodeGen/X86/pr12312.ll new file mode 100644 index 0000000..087b8d7 --- /dev/null +++ b/test/CodeGen/X86/pr12312.ll @@ -0,0 +1,155 @@ +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse41,-avx < %s | FileCheck %s --check-prefix SSE41 +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx,-avx2 < %s | FileCheck %s --check-prefix AVX + +define i32 @veccond128(<4 x i32> %input) { +entry: + %0 = bitcast <4 x i32> %input to i128 + %1 = icmp ne i128 %0, 0 + br i1 %1, label %if-true-block, label %endif-block + +if-true-block: ; preds = %entry + ret i32 0 +endif-block: ; preds = %entry, + ret i32 1 +; SSE41: veccond128 +; SSE41: ptest +; SSE41: ret +; AVX: veccond128 +; AVX: vptest %xmm{{.*}}, %xmm{{.*}} +; AVX: ret +} + +define i32 @veccond256(<8 x i32> %input) { +entry: + %0 = bitcast <8 x i32> %input to i256 + %1 = 
icmp ne i256 %0, 0 + br i1 %1, label %if-true-block, label %endif-block + +if-true-block: ; preds = %entry + ret i32 0 +endif-block: ; preds = %entry, + ret i32 1 +; SSE41: veccond256 +; SSE41: por +; SSE41: ptest +; SSE41: ret +; AVX: veccond256 +; AVX: vptest %ymm{{.*}}, %ymm{{.*}} +; AVX: ret +} + +define i32 @veccond512(<16 x i32> %input) { +entry: + %0 = bitcast <16 x i32> %input to i512 + %1 = icmp ne i512 %0, 0 + br i1 %1, label %if-true-block, label %endif-block + +if-true-block: ; preds = %entry + ret i32 0 +endif-block: ; preds = %entry, + ret i32 1 +; SSE41: veccond512 +; SSE41: por +; SSE41: por +; SSE41: por +; SSE41: ptest +; SSE41: ret +; AVX: veccond512 +; AVX: vorps +; AVX: vptest %ymm{{.*}}, %ymm{{.*}} +; AVX: ret +} + +define i32 @vectest128(<4 x i32> %input) { +entry: + %0 = bitcast <4 x i32> %input to i128 + %1 = icmp ne i128 %0, 0 + %2 = zext i1 %1 to i32 + ret i32 %2 +; SSE41: vectest128 +; SSE41: ptest +; SSE41: ret +; AVX: vectest128 +; AVX: vptest %xmm{{.*}}, %xmm{{.*}} +; AVX: ret +} + +define i32 @vectest256(<8 x i32> %input) { +entry: + %0 = bitcast <8 x i32> %input to i256 + %1 = icmp ne i256 %0, 0 + %2 = zext i1 %1 to i32 + ret i32 %2 +; SSE41: vectest256 +; SSE41: por +; SSE41: ptest +; SSE41: ret +; AVX: vectest256 +; AVX: vptest %ymm{{.*}}, %ymm{{.*}} +; AVX: ret +} + +define i32 @vectest512(<16 x i32> %input) { +entry: + %0 = bitcast <16 x i32> %input to i512 + %1 = icmp ne i512 %0, 0 + %2 = zext i1 %1 to i32 + ret i32 %2 +; SSE41: vectest512 +; SSE41: por +; SSE41: por +; SSE41: por +; SSE41: ptest +; SSE41: ret +; AVX: vectest512 +; AVX: vorps +; AVX: vptest %ymm{{.*}}, %ymm{{.*}} +; AVX: ret +} + +define i32 @vecsel128(<4 x i32> %input, i32 %a, i32 %b) { +entry: + %0 = bitcast <4 x i32> %input to i128 + %1 = icmp ne i128 %0, 0 + %2 = select i1 %1, i32 %a, i32 %b + ret i32 %2 +; SSE41: vecsel128 +; SSE41: ptest +; SSE41: ret +; AVX: vecsel128 +; AVX: vptest %xmm{{.*}}, %xmm{{.*}} +; AVX: ret +} + +define i32 @vecsel256(<8 x i32> 
%input, i32 %a, i32 %b) { +entry: + %0 = bitcast <8 x i32> %input to i256 + %1 = icmp ne i256 %0, 0 + %2 = select i1 %1, i32 %a, i32 %b + ret i32 %2 +; SSE41: vecsel256 +; SSE41: por +; SSE41: ptest +; SSE41: ret +; AVX: vecsel256 +; AVX: vptest %ymm{{.*}}, %ymm{{.*}} +; AVX: ret +} + +define i32 @vecsel512(<16 x i32> %input, i32 %a, i32 %b) { +entry: + %0 = bitcast <16 x i32> %input to i512 + %1 = icmp ne i512 %0, 0 + %2 = select i1 %1, i32 %a, i32 %b + ret i32 %2 +; SSE41: vecsel512 +; SSE41: por +; SSE41: por +; SSE41: por +; SSE41: ptest +; SSE41: ret +; AVX: vecsel512 +; AVX: vorps +; AVX: vptest %ymm{{.*}}, %ymm{{.*}} +; AVX: ret +} diff --git a/test/CodeGen/X86/pr12359.ll b/test/CodeGen/X86/pr12359.ll new file mode 100644 index 0000000..024b163 --- /dev/null +++ b/test/CodeGen/X86/pr12359.ll @@ -0,0 +1,10 @@ +; RUN: llc -asm-verbose -mtriple=x86_64-unknown-unknown -mcpu=corei7 < %s | FileCheck %s +define <16 x i8> @shuf(<16 x i8> %inval1) { +entry: + %0 = shufflevector <16 x i8> %inval1, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 4, i32 3, i32 2, i32 16, i32 16, i32 3, i32 4, i32 0, i32 4, i32 3, i32 2, i32 16, i32 16, i32 3, i32 4> + ret <16 x i8> %0 +; CHECK: shuf +; CHECK: # BB#0: # %entry +; CHECK-NEXT: pshufb +; CHECK-NEXT: ret +} diff --git a/test/CodeGen/X86/pr13458.ll b/test/CodeGen/X86/pr13458.ll new file mode 100644 index 0000000..55548b3 --- /dev/null +++ b/test/CodeGen/X86/pr13458.ll @@ -0,0 +1,14 @@ +; RUN: llc < %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-darwin11.4.2" + +%v8_uniform_Stats.0.2.4.10 = type { i64, i64, i32, i32, i32, i32, i32, i32, i32, i32, i64, i64, i64, i32, i32, i32, i32, i32, i32, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i32, i64, [7 x i32], [7 x i64] } + +@globalStats = 
external global %v8_uniform_Stats.0.2.4.10 + +define void @MergeStats() nounwind { +allocas: + %r.i.i720 = atomicrmw max i64* getelementptr inbounds (%v8_uniform_Stats.0.2.4.10* @globalStats, i64 0, i32 30), i64 0 seq_cst + ret void +} diff --git a/test/CodeGen/X86/pr13859.ll b/test/CodeGen/X86/pr13859.ll new file mode 100644 index 0000000..719721d --- /dev/null +++ b/test/CodeGen/X86/pr13859.ll @@ -0,0 +1,28 @@ +; RUN: llc < %s +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128" +target triple = "i386-apple-macosx10.7.0" + +define void @_Z17FilterYUVRows_MMXi(i32 %af) nounwind ssp { +entry: + %aMyAlloca = alloca i32, align 32 + %dest = alloca <1 x i64>, align 32 + + %a32 = load i32* %aMyAlloca, align 4 + %aconv = trunc i32 %a32 to i16 + %a36 = insertelement <4 x i16> undef, i16 %aconv, i32 0 + %a37 = insertelement <4 x i16> %a36, i16 %aconv, i32 1 + %a38 = insertelement <4 x i16> %a37, i16 %aconv, i32 2 + %a39 = insertelement <4 x i16> %a38, i16 %aconv, i32 3 + %a40 = bitcast <4 x i16> %a39 to x86_mmx + %a41 = bitcast x86_mmx %a40 to <1 x i64> + + %a47 = trunc i32 %a32 to i1 + br i1 %a47, label %a48, label %a49 + +a48: + unreachable + +a49: + store <1 x i64> %a41, <1 x i64>* %dest, align 8 ; !!! 
+ ret void +} diff --git a/test/CodeGen/X86/pr13899.ll b/test/CodeGen/X86/pr13899.ll new file mode 100644 index 0000000..bc81e34 --- /dev/null +++ b/test/CodeGen/X86/pr13899.ll @@ -0,0 +1,58 @@ +; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=corei7 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 | FileCheck %s --check-prefix=X64 + +; ModuleID = 'a.bc' +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:128:128-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S32" +target triple = "i386-pc-win32" + +%v4_varying_big_struct = type { [4 x <4 x i32>] } + +declare <4 x i32> @"foo"(%v4_varying_big_struct, <4 x i32>) nounwind + +define <4 x i32> @"bar"(%v4_varying_big_struct %s, <4 x i32> %__mask) nounwind { +allocas: + %calltmp = call <4 x i32> @"foo"(%v4_varying_big_struct %s, <4 x i32> %__mask) + ret <4 x i32> %calltmp +; CHECK: bar +; CHECK: andl +; CHECK: call +; CHECK: ret +} + +declare <8 x float> @bar64(<8 x float> %i0, <8 x float> %i1, + <8 x float> %i2, <8 x float> %i3, + <8 x float> %i4, <8 x float> %i5, + <8 x float> %i6, <8 x float> %i7, + <8 x float> %i8, <8 x float> %i9) + +define <8 x float> @foo64(<8 x float>* %p) { + %1 = load <8 x float>* %p + %idx1 = getelementptr inbounds <8 x float>* %p, i64 1 + %2 = load <8 x float>* %idx1 + %idx2 = getelementptr inbounds <8 x float>* %p, i64 2 + %3 = load <8 x float>* %idx2 + %idx3 = getelementptr inbounds <8 x float>* %p, i64 3 + %4 = load <8 x float>* %idx3 + %idx4 = getelementptr inbounds <8 x float>* %p, i64 4 + %5 = load <8 x float>* %idx4 + %idx5 = getelementptr inbounds <8 x float>* %p, i64 5 + %6 = load <8 x float>* %idx5 + %idx6 = getelementptr inbounds <8 x float>* %p, i64 6 + %7 = load <8 x float>* %idx6 + %idx7 = getelementptr inbounds <8 x float>* %p, i64 7 + %8 = load <8 x float>* %idx7 + %idx8 = getelementptr inbounds <8 x float>* %p, i64 8 + %9 = load <8 x float>* %idx8 + %idx9 = getelementptr inbounds <8 x float>* 
%p, i64 9 + %10 = load <8 x float>* %idx9 + %r = tail call <8 x float> @bar64(<8 x float> %1, <8 x float> %2, + <8 x float> %3, <8 x float> %4, + <8 x float> %5, <8 x float> %6, + <8 x float> %7, <8 x float> %8, + <8 x float> %9, <8 x float> %10) + ret <8 x float> %r +; X64: foo +; X64: and +; X64: call +; X64: ret +} diff --git a/test/CodeGen/X86/pr14088.ll b/test/CodeGen/X86/pr14088.ll new file mode 100644 index 0000000..505e3b5 --- /dev/null +++ b/test/CodeGen/X86/pr14088.ll @@ -0,0 +1,25 @@ +; RUN: llc -mtriple x86_64-linux -mcpu core2 -verify-machineinstrs %s -o - | FileCheck %s +define i32 @f(i1 %foo, i16* %tm_year2, i8* %bar, i16 %zed, i32 %zed2) { +entry: + br i1 %foo, label %return, label %if.end + +if.end: + %rem = srem i32 %zed2, 100 + %conv3 = trunc i32 %rem to i16 + store i16 %conv3, i16* %tm_year2 + %sext = shl i32 %rem, 16 + %conv5 = ashr exact i32 %sext, 16 + %div = sdiv i32 %conv5, 10 + %conv6 = trunc i32 %div to i8 + store i8 %conv6, i8* %bar + br label %return + +return: + %retval.0 = phi i32 [ 0, %if.end ], [ -1, %entry ] + ret i32 %retval.0 +} + +; We were miscompiling this and using %ax instead of %cx in the movw. 
+; CHECK: movswl %cx, %ecx +; CHECK: movw %cx, (%rsi) +; CHECK: movslq %ecx, %rcx diff --git a/test/CodeGen/X86/pr14090.ll b/test/CodeGen/X86/pr14090.ll new file mode 100644 index 0000000..d76b912 --- /dev/null +++ b/test/CodeGen/X86/pr14090.ll @@ -0,0 +1,76 @@ +; RUN: llc < %s -march=x86-64 -print-before=stack-coloring -print-after=stack-coloring >%t 2>&1 && FileCheck <%t %s + +define void @foo(i64* %retval.i, i32 %call, i32* %.ph.i80, i32 %fourteen, i32* %out.lo, i32* %out.hi) nounwind align 2 { +entry: + %_Tmp.i39 = alloca i64, align 8 + %retval.i33 = alloca i64, align 8 + %_Tmp.i = alloca i64, align 8 + %retval.i.i = alloca i64, align 8 + %_First.i = alloca i64, align 8 + + %0 = load i64* %retval.i, align 8 + + %1 = load i64* %retval.i, align 8 + + %_Tmp.i39.0.cast73 = bitcast i64* %_Tmp.i39 to i8* + call void @llvm.lifetime.start(i64 8, i8* %_Tmp.i39.0.cast73) + store i64 %1, i64* %_Tmp.i39, align 8 + %cmp.i.i.i40 = icmp slt i32 %call, 0 + %2 = lshr i64 %1, 32 + %3 = trunc i64 %2 to i32 + %sub.i.i.i44 = sub i32 0, %call + %cmp2.i.i.i45 = icmp ult i32 %3, %sub.i.i.i44 + %or.cond.i.i.i46 = and i1 %cmp.i.i.i40, %cmp2.i.i.i45 + %add.i.i.i47 = add i32 %3, %call + %sub5.i.i.i48 = lshr i32 %add.i.i.i47, 5 + %trunc.i50 = trunc i64 %1 to i32 + %inttoptr.i51 = inttoptr i32 %trunc.i50 to i32* + %add61617.i.i.i52 = or i32 %sub5.i.i.i48, -134217728 + %add61617.i.sub5.i.i.i53 = select i1 %or.cond.i.i.i46, i32 %add61617.i.i.i52, i32 %sub5.i.i.i48 + %storemerge2.i.i54 = getelementptr inbounds i32* %inttoptr.i51, i32 %add61617.i.sub5.i.i.i53 + %_Tmp.i39.0.cast74 = bitcast i64* %_Tmp.i39 to i32** + store i32* %storemerge2.i.i54, i32** %_Tmp.i39.0.cast74, align 8 + %storemerge.i.i55 = and i32 %add.i.i.i47, 31 + %_Tmp.i39.4.raw_idx = getelementptr inbounds i8* %_Tmp.i39.0.cast73, i32 4 + %_Tmp.i39.4.cast = bitcast i8* %_Tmp.i39.4.raw_idx to i32* + store i32 %storemerge.i.i55, i32* %_Tmp.i39.4.cast, align 4 + %srcval.i56 = load i64* %_Tmp.i39, align 8 + call void 
@llvm.lifetime.end(i64 8, i8* %_Tmp.i39.0.cast73) + +; CHECK: Before Merge disjoint stack slots +; CHECK: [[PREFIX15:MOV64mr.*<fi#]]{{[0-9]}}[[SUFFIX15:.*;]] mem:ST8[%fifteen] +; CHECK: [[PREFIX87:MOV32mr.*;]] mem:ST4[%sunkaddr87] + +; CHECK: After Merge disjoint stack slots +; CHECK: [[PREFIX15]]{{[0-9]}}[[SUFFIX15]] mem:ST8[%_Tmp.i39] +; CHECK: [[PREFIX87]] mem:ST4[<unknown>] + + %fifteen = bitcast i64* %retval.i.i to i32** + %sixteen = bitcast i64* %retval.i.i to i8* + call void @llvm.lifetime.start(i64 8, i8* %sixteen) + store i32* %.ph.i80, i32** %fifteen, align 8, !tbaa !0 + %sunkaddr = ptrtoint i64* %retval.i.i to i32 + %sunkaddr86 = add i32 %sunkaddr, 4 + %sunkaddr87 = inttoptr i32 %sunkaddr86 to i32* + store i32 %fourteen, i32* %sunkaddr87, align 4, !tbaa !3 + %seventeen = load i64* %retval.i.i, align 8 + call void @llvm.lifetime.end(i64 8, i8* %sixteen) + %eighteen = lshr i64 %seventeen, 32 + %nineteen = trunc i64 %eighteen to i32 + %shl.i.i.i = shl i32 1, %nineteen + + store i32 %shl.i.i.i, i32* %out.lo, align 8 + store i32 %nineteen, i32* %out.hi, align 8 + + ret void +} + +declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind + +declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind + +!0 = metadata !{metadata !"int", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} +!3 = metadata !{metadata !"any pointer", metadata !1} +!4 = metadata !{metadata !"vtable pointer", metadata !2} diff --git a/test/CodeGen/X86/pr14098.ll b/test/CodeGen/X86/pr14098.ll new file mode 100644 index 0000000..6ce2449 --- /dev/null +++ b/test/CodeGen/X86/pr14098.ll @@ -0,0 +1,23 @@ +; RUN: llc -mtriple i386-unknown-linux-gnu -relocation-model=pic -verify-machineinstrs < %s +; We used to crash on this. 
+ +declare void @foo() +declare void @foo3(i1 %x) +define void @bar(i1 %a1, i16 %a2) nounwind align 2 { +bb0: + %a3 = trunc i16 %a2 to i8 + %a4 = lshr i16 %a2, 8 + %a5 = trunc i16 %a4 to i8 + br i1 %a1, label %bb1, label %bb2 +bb1: + br label %bb2 +bb2: + %a6 = phi i8 [ 3, %bb0 ], [ %a5, %bb1 ] + %a7 = phi i8 [ 9, %bb0 ], [ %a3, %bb1 ] + %a8 = icmp eq i8 %a6, 1 + call void @foo() + %a9 = icmp eq i8 %a7, 0 + call void @foo3(i1 %a9) + call void @foo3(i1 %a8) + ret void +} diff --git a/test/CodeGen/X86/pr14161.ll b/test/CodeGen/X86/pr14161.ll new file mode 100644 index 0000000..ff4532e --- /dev/null +++ b/test/CodeGen/X86/pr14161.ll @@ -0,0 +1,38 @@ +; RUN: llc < %s -mtriple=x86_64-linux-pc -mcpu=corei7 | FileCheck %s + +declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) + +define <2 x i16> @good(<4 x i32>*, <4 x i8>*) { +entry: + %2 = load <4 x i32>* %0, align 16 + %3 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %2, <4 x i32> <i32 127, i32 127, i32 127, i32 127>) + %4 = extractelement <4 x i32> %3, i32 0 + %5 = extractelement <4 x i32> %3, i32 1 + %6 = extractelement <4 x i32> %3, i32 2 + %7 = extractelement <4 x i32> %3, i32 3 + %8 = bitcast i32 %4 to <2 x i16> + %9 = bitcast i32 %5 to <2 x i16> + ret <2 x i16> %8 +; CHECK: good +; CHECK: pminud +; CHECK-NEXT: pmovzxwq +; CHECK: ret +} + +define <2 x i16> @bad(<4 x i32>*, <4 x i8>*) { +entry: + %2 = load <4 x i32>* %0, align 16 + %3 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %2, <4 x i32> <i32 127, i32 127, i32 127, i32 127>) + %4 = extractelement <4 x i32> %3, i32 0 + %5 = extractelement <4 x i32> %3, i32 1 + %6 = extractelement <4 x i32> %3, i32 2 + %7 = extractelement <4 x i32> %3, i32 3 + %8 = bitcast i32 %4 to <2 x i16> + %9 = bitcast i32 %5 to <2 x i16> + ret <2 x i16> %9 +; CHECK: bad +; CHECK: pminud +; CHECK: pextrd +; CHECK: pmovzxwq +; CHECK: ret +} diff --git a/test/CodeGen/X86/pr14204.ll b/test/CodeGen/X86/pr14204.ll new file mode 100644 index 0000000..42e362b --- /dev/null +++ 
b/test/CodeGen/X86/pr14204.ll @@ -0,0 +1,15 @@ +; RUN: llc < %s -mtriple=x86_64-linux-pc -mcpu=core-avx2 | FileCheck %s + +; FIXME: vpmovsxwd should be generated instead of vpmovzxwd followed by +; SLL/SRA. + +define <8 x i32> @foo(<8 x i1> %bar) nounwind readnone { +entry: + %s = sext <8 x i1> %bar to <8 x i32> + ret <8 x i32> %s +; CHECK: foo +; CHECK: vpmovzxwd +; CHECK: vpslld +; CHECK: vpsrad +; CHECK: ret +} diff --git a/test/CodeGen/X86/pr14314.ll b/test/CodeGen/X86/pr14314.ll new file mode 100644 index 0000000..5388a4b --- /dev/null +++ b/test/CodeGen/X86/pr14314.ll @@ -0,0 +1,13 @@ +; RUN: llc < %s -mtriple=i386-pc-linux -mcpu=corei7 | FileCheck %s + +define i64 @atomicSub(i64* %a, i64 %b) nounwind { +entry: + %0 = atomicrmw sub i64* %a, i64 %b seq_cst + ret i64 %0 +; CHECK: atomicSub +; movl %eax, %ebx +; subl {{%[a-z]+}}, %ebx +; movl %edx, %ecx +; sbbl {{%[a-z]+}}, %ecx +; CHECK: ret +} diff --git a/test/CodeGen/X86/pr14333.ll b/test/CodeGen/X86/pr14333.ll new file mode 100644 index 0000000..86c12ef --- /dev/null +++ b/test/CodeGen/X86/pr14333.ll @@ -0,0 +1,12 @@ +; RUN: llc -mtriple=x86_64-unknown-unknown < %s +%foo = type { i64, i64 } +define void @bar(%foo* %zed) { + %tmp = getelementptr inbounds %foo* %zed, i64 0, i32 0 + store i64 0, i64* %tmp, align 8 + %tmp2 = getelementptr inbounds %foo* %zed, i64 0, i32 1 + store i64 0, i64* %tmp2, align 8 + %tmp3 = bitcast %foo* %zed to i8* + call void @llvm.memset.p0i8.i64(i8* %tmp3, i8 0, i64 16, i32 8, i1 false) + ret void +} +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind diff --git a/test/CodeGen/X86/pr5145.ll b/test/CodeGen/X86/pr5145.ll new file mode 100644 index 0000000..d048db8 --- /dev/null +++ b/test/CodeGen/X86/pr5145.ll @@ -0,0 +1,35 @@ +; RUN: llc -march=x86-64 < %s | FileCheck %s +@sc8 = external global i8 + +define void @atomic_maxmin_i8() { +; CHECK: atomic_maxmin_i8 + %1 = atomicrmw max i8* @sc8, i8 5 acquire +; CHECK: [[LABEL1:\.?LBB[0-9]+_[0-9]+]]: +; CHECK: 
cmpb +; CHECK: cmovl +; CHECK: lock +; CHECK-NEXT: cmpxchgb +; CHECK: jne [[LABEL1]] + %2 = atomicrmw min i8* @sc8, i8 6 acquire +; CHECK: [[LABEL3:\.?LBB[0-9]+_[0-9]+]]: +; CHECK: cmpb +; CHECK: cmovg +; CHECK: lock +; CHECK-NEXT: cmpxchgb +; CHECK: jne [[LABEL3]] + %3 = atomicrmw umax i8* @sc8, i8 7 acquire +; CHECK: [[LABEL5:\.?LBB[0-9]+_[0-9]+]]: +; CHECK: cmpb +; CHECK: cmovb +; CHECK: lock +; CHECK-NEXT: cmpxchgb +; CHECK: jne [[LABEL5]] + %4 = atomicrmw umin i8* @sc8, i8 8 acquire +; CHECK: [[LABEL7:\.?LBB[0-9]+_[0-9]+]]: +; CHECK: cmpb +; CHECK: cmova +; CHECK: lock +; CHECK-NEXT: cmpxchgb +; CHECK: jne [[LABEL7]] + ret void +} diff --git a/test/CodeGen/X86/promote.ll b/test/CodeGen/X86/promote.ll index 8b30dc7..283f48c 100644 --- a/test/CodeGen/X86/promote.ll +++ b/test/CodeGen/X86/promote.ll @@ -20,7 +20,7 @@ entry: ; CHECK: shuff_f define i32 @shuff_f(<4 x i8>* %A) { entry: -; CHECK: pshufb +; CHECK: pmovzxbd ; CHECK: paddd ; CHECK: pshufb %0 = load <4 x i8>* %A, align 8 diff --git a/test/CodeGen/X86/ptr-rotate.ll b/test/CodeGen/X86/ptr-rotate.ll index 6debd16..fbd13b5 100644 --- a/test/CodeGen/X86/ptr-rotate.ll +++ b/test/CodeGen/X86/ptr-rotate.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=i386-apple-darwin -o - < %s | FileCheck %s +; RUN: llc -mtriple=i386-apple-darwin -mcpu=corei7 -o - < %s | FileCheck %s define i32 @func(i8* %A) nounwind readnone { entry: diff --git a/test/CodeGen/X86/red-zone2.ll b/test/CodeGen/X86/red-zone2.ll index f092163..3e9c790 100644 --- a/test/CodeGen/X86/red-zone2.ll +++ b/test/CodeGen/X86/red-zone2.ll @@ -1,6 +1,7 @@ -; RUN: llc < %s -mcpu=generic -march=x86-64 > %t -; RUN: grep subq %t | count 1 -; RUN: grep addq %t | count 1 +; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux | FileCheck %s +; CHECK: f0: +; CHECK: subq +; CHECK: addq define x86_fp80 @f0(float %f) nounwind readnone noredzone { entry: diff --git a/test/CodeGen/X86/rot32.ll b/test/CodeGen/X86/rot32.ll index 99602fd..e95a734 100644 --- a/test/CodeGen/X86/rot32.ll 
+++ b/test/CodeGen/X86/rot32.ll @@ -1,4 +1,5 @@ -; RUN: llc < %s -march=x86 | FileCheck %s +; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s +; RUN: llc < %s -march=x86 -mcpu=core-avx2 | FileCheck %s --check-prefix=BMI2 define i32 @foo(i32 %x, i32 %y, i32 %z) nounwind readnone { entry: @@ -48,12 +49,25 @@ define i32 @xfoo(i32 %x, i32 %y, i32 %z) nounwind readnone { entry: ; CHECK: xfoo: ; CHECK: roll $7 +; BMI2: xfoo: +; BMI2: rorxl $25 %0 = lshr i32 %x, 25 %1 = shl i32 %x, 7 %2 = or i32 %0, %1 ret i32 %2 } +define i32 @xfoop(i32* %p) nounwind readnone { +entry: +; BMI2: xfoop: +; BMI2: rorxl $25, ({{.+}}), %{{.+}} + %x = load i32* %p + %a = lshr i32 %x, 25 + %b = shl i32 %x, 7 + %c = or i32 %a, %b + ret i32 %c +} + define i32 @xbar(i32 %x, i32 %y, i32 %z) nounwind readnone { entry: ; CHECK: xbar: @@ -68,12 +82,25 @@ define i32 @xun(i32 %x, i32 %y, i32 %z) nounwind readnone { entry: ; CHECK: xun: ; CHECK: roll $25 +; BMI2: xun: +; BMI2: rorxl $7 %0 = lshr i32 %x, 7 %1 = shl i32 %x, 25 %2 = or i32 %0, %1 ret i32 %2 } +define i32 @xunp(i32* %p) nounwind readnone { +entry: +; BMI2: xunp: +; BMI2: rorxl $7, ({{.+}}), %{{.+}} + %x = load i32* %p + %a = lshr i32 %x, 7 + %b = shl i32 %x, 25 + %c = or i32 %a, %b + ret i32 %c +} + define i32 @xbu(i32 %x, i32 %y, i32 %z) nounwind readnone { entry: ; CHECK: xbu: diff --git a/test/CodeGen/X86/rot64.ll b/test/CodeGen/X86/rot64.ll index 4e082bb..7fa982d 100644 --- a/test/CodeGen/X86/rot64.ll +++ b/test/CodeGen/X86/rot64.ll @@ -1,8 +1,9 @@ -; RUN: llc < %s -march=x86-64 > %t -; RUN: grep rol %t | count 3 +; RUN: llc < %s -march=x86-64 -mcpu=corei7 > %t +; RUN: grep rol %t | count 5 ; RUN: grep ror %t | count 1 ; RUN: grep shld %t | count 2 ; RUN: grep shrd %t | count 2 +; RUN: llc < %s -march=x86-64 -mcpu=core-avx2 | FileCheck %s --check-prefix=BMI2 define i64 @foo(i64 %x, i64 %y, i64 %z) nounwind readnone { entry: @@ -42,12 +43,25 @@ entry: define i64 @xfoo(i64 %x, i64 %y, i64 %z) nounwind readnone { entry: +; BMI2: xfoo: 
+; BMI2: rorxq $57 %0 = lshr i64 %x, 57 %1 = shl i64 %x, 7 %2 = or i64 %0, %1 ret i64 %2 } +define i64 @xfoop(i64* %p) nounwind readnone { +entry: +; BMI2: xfoop: +; BMI2: rorxq $57, ({{.+}}), %{{.+}} + %x = load i64* %p + %a = lshr i64 %x, 57 + %b = shl i64 %x, 7 + %c = or i64 %a, %b + ret i64 %c +} + define i64 @xbar(i64 %x, i64 %y, i64 %z) nounwind readnone { entry: %0 = shl i64 %y, 7 @@ -58,12 +72,25 @@ entry: define i64 @xun(i64 %x, i64 %y, i64 %z) nounwind readnone { entry: +; BMI2: xun: +; BMI2: rorxq $7 %0 = lshr i64 %x, 7 %1 = shl i64 %x, 57 %2 = or i64 %0, %1 ret i64 %2 } +define i64 @xunp(i64* %p) nounwind readnone { +entry: +; BMI2: xunp: +; BMI2: rorxq $7, ({{.+}}), %{{.+}} + %x = load i64* %p + %a = lshr i64 %x, 7 + %b = shl i64 %x, 57 + %c = or i64 %a, %b + ret i64 %c +} + define i64 @xbu(i64 %x, i64 %y, i64 %z) nounwind readnone { entry: %0 = lshr i64 %y, 7 diff --git a/test/CodeGen/X86/rotate2.ll b/test/CodeGen/X86/rotate2.ll index 2eea399..2316c70 100644 --- a/test/CodeGen/X86/rotate2.ll +++ b/test/CodeGen/X86/rotate2.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=x86-64 | grep rol | count 2 +; RUN: llc < %s -march=x86-64 -mcpu=corei7 | grep rol | count 2 define i64 @test1(i64 %x) nounwind { entry: diff --git a/test/CodeGen/X86/rtm.ll b/test/CodeGen/X86/rtm.ll new file mode 100644 index 0000000..76eb951 --- /dev/null +++ b/test/CodeGen/X86/rtm.ll @@ -0,0 +1,30 @@ +; RUN: llc < %s -mattr=+rtm -mtriple=x86_64-unknown-unknown | FileCheck %s + +declare i32 @llvm.x86.xbegin() nounwind +declare void @llvm.x86.xend() nounwind +declare void @llvm.x86.xabort(i8) noreturn nounwind + +define i32 @test_xbegin() nounwind uwtable { +entry: + %0 = tail call i32 @llvm.x86.xbegin() nounwind + ret i32 %0 +; CHECK: test_xbegin +; CHECK: xbegin [[LABEL:.*BB.*]] +; CHECK: [[LABEL]]: +} + +define void @test_xend() nounwind uwtable { +entry: + tail call void @llvm.x86.xend() nounwind + ret void +; CHECK: test_xend +; CHECK: xend +} + +define void @test_xabort() nounwind 
uwtable { +entry: + tail call void @llvm.x86.xabort(i8 2) + unreachable +; CHECK: test_xabort +; CHECK: xabort $2 +} diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll index 2e39473..3bec3ac 100644 --- a/test/CodeGen/X86/select.ll +++ b/test/CodeGen/X86/select.ll @@ -344,3 +344,16 @@ entry: ; ATOM: negw ; ATOM: sbbw } + +define i8 @test18(i32 %x, i8 zeroext %a, i8 zeroext %b) nounwind { + %cmp = icmp slt i32 %x, 15 + %sel = select i1 %cmp, i8 %a, i8 %b + ret i8 %sel +; CHECK: test18: +; CHECK: cmpl $15, %edi +; CHECK: cmovgel %edx + +; ATOM: test18: +; ATOM: cmpl $15, %edi +; ATOM: cmovgel %edx +} diff --git a/test/CodeGen/X86/select_const.ll b/test/CodeGen/X86/select_const.ll new file mode 100644 index 0000000..5b2409d --- /dev/null +++ b/test/CodeGen/X86/select_const.ll @@ -0,0 +1,16 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mcpu=corei7 | FileCheck %s + +define i64 @test1(i64 %x) nounwind { +entry: + %cmp = icmp eq i64 %x, 2 + %add = add i64 %x, 1 + %retval.0 = select i1 %cmp, i64 2, i64 %add + ret i64 %retval.0 + +; CHECK: test1: +; CHECK: leaq 1(%rdi), %rax +; CHECK: cmpq $2, %rdi +; CHECK: cmoveq %rdi, %rax +; CHECK: ret + +} diff --git a/test/CodeGen/X86/shift-bmi2.ll b/test/CodeGen/X86/shift-bmi2.ll new file mode 100644 index 0000000..d1f321f --- /dev/null +++ b/test/CodeGen/X86/shift-bmi2.ll @@ -0,0 +1,178 @@ +; RUN: llc -mtriple=i386-unknown-unknown -mcpu=core-avx2 < %s | FileCheck --check-prefix=BMI2 %s +; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=core-avx2 < %s | FileCheck --check-prefix=BMI264 %s + +define i32 @shl32(i32 %x, i32 %shamt) nounwind uwtable readnone { +entry: + %shl = shl i32 %x, %shamt +; BMI2: shl32 +; BMI2: shlxl +; BMI2: ret +; BMI264: shl32 +; BMI264: shlxl +; BMI264: ret + ret i32 %shl +} + +define i32 @shl32i(i32 %x) nounwind uwtable readnone { +entry: + %shl = shl i32 %x, 5 +; BMI2: shl32i +; BMI2-NOT: shlxl +; BMI2: ret +; BMI264: shl32i +; BMI264-NOT: shlxl +; BMI264: ret + ret i32 %shl +} + 
+define i32 @shl32p(i32* %p, i32 %shamt) nounwind uwtable readnone { +entry: + %x = load i32* %p + %shl = shl i32 %x, %shamt +; BMI2: shl32p +; BMI2: shlxl %{{.+}}, ({{.+}}), %{{.+}} +; BMI2: ret +; BMI264: shl32p +; BMI264: shlxl %{{.+}}, ({{.+}}), %{{.+}} +; BMI264: ret + ret i32 %shl +} + +define i32 @shl32pi(i32* %p) nounwind uwtable readnone { +entry: + %x = load i32* %p + %shl = shl i32 %x, 5 +; BMI2: shl32pi +; BMI2-NOT: shlxl +; BMI2: ret +; BMI264: shl32pi +; BMI264-NOT: shlxl +; BMI264: ret + ret i32 %shl +} + +define i64 @shl64(i64 %x, i64 %shamt) nounwind uwtable readnone { +entry: + %shl = shl i64 %x, %shamt +; BMI264: shl64 +; BMI264: shlxq +; BMI264: ret + ret i64 %shl +} + +define i64 @shl64i(i64 %x) nounwind uwtable readnone { +entry: + %shl = shl i64 %x, 7 +; BMI264: shl64i +; BMI264-NOT: shlxq +; BMI264: ret + ret i64 %shl +} + +define i64 @shl64p(i64* %p, i64 %shamt) nounwind uwtable readnone { +entry: + %x = load i64* %p + %shl = shl i64 %x, %shamt +; BMI264: shl64p +; BMI264: shlxq %{{.+}}, ({{.+}}), %{{.+}} +; BMI264: ret + ret i64 %shl +} + +define i64 @shl64pi(i64* %p) nounwind uwtable readnone { +entry: + %x = load i64* %p + %shl = shl i64 %x, 7 +; BMI264: shl64p +; BMI264-NOT: shlxq +; BMI264: ret + ret i64 %shl +} + +define i32 @lshr32(i32 %x, i32 %shamt) nounwind uwtable readnone { +entry: + %shl = lshr i32 %x, %shamt +; BMI2: lshr32 +; BMI2: shrxl +; BMI2: ret +; BMI264: lshr32 +; BMI264: shrxl +; BMI264: ret + ret i32 %shl +} + +define i32 @lshr32p(i32* %p, i32 %shamt) nounwind uwtable readnone { +entry: + %x = load i32* %p + %shl = lshr i32 %x, %shamt +; BMI2: lshr32p +; BMI2: shrxl %{{.+}}, ({{.+}}), %{{.+}} +; BMI2: ret +; BMI264: lshr32 +; BMI264: shrxl %{{.+}}, ({{.+}}), %{{.+}} +; BMI264: ret + ret i32 %shl +} + +define i64 @lshr64(i64 %x, i64 %shamt) nounwind uwtable readnone { +entry: + %shl = lshr i64 %x, %shamt +; BMI264: lshr64 +; BMI264: shrxq +; BMI264: ret + ret i64 %shl +} + +define i64 @lshr64p(i64* %p, i64 %shamt) 
nounwind uwtable readnone { +entry: + %x = load i64* %p + %shl = lshr i64 %x, %shamt +; BMI264: lshr64p +; BMI264: shrxq %{{.+}}, ({{.+}}), %{{.+}} +; BMI264: ret + ret i64 %shl +} + +define i32 @ashr32(i32 %x, i32 %shamt) nounwind uwtable readnone { +entry: + %shl = ashr i32 %x, %shamt +; BMI2: ashr32 +; BMI2: sarxl +; BMI2: ret +; BMI264: ashr32 +; BMI264: sarxl +; BMI264: ret + ret i32 %shl +} + +define i32 @ashr32p(i32* %p, i32 %shamt) nounwind uwtable readnone { +entry: + %x = load i32* %p + %shl = ashr i32 %x, %shamt +; BMI2: ashr32p +; BMI2: sarxl %{{.+}}, ({{.+}}), %{{.+}} +; BMI2: ret +; BMI264: ashr32 +; BMI264: sarxl %{{.+}}, ({{.+}}), %{{.+}} +; BMI264: ret + ret i32 %shl +} + +define i64 @ashr64(i64 %x, i64 %shamt) nounwind uwtable readnone { +entry: + %shl = ashr i64 %x, %shamt +; BMI264: ashr64 +; BMI264: sarxq +; BMI264: ret + ret i64 %shl +} + +define i64 @ashr64p(i64* %p, i64 %shamt) nounwind uwtable readnone { +entry: + %x = load i64* %p + %shl = ashr i64 %x, %shamt +; BMI264: ashr64p +; BMI264: sarxq %{{.+}}, ({{.+}}), %{{.+}} +; BMI264: ret + ret i64 %shl +} diff --git a/test/CodeGen/X86/sincos.ll b/test/CodeGen/X86/sincos.ll index 1479be1..734f48a 100644 --- a/test/CodeGen/X86/sincos.ll +++ b/test/CodeGen/X86/sincos.ll @@ -1,6 +1,7 @@ ; Make sure this testcase codegens to the sin and cos instructions, not calls ; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math | FileCheck %s --check-prefix=SIN ; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math | FileCheck %s --check-prefix=COS +; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 | FileCheck %s --check-prefix=SAFE declare float @sinf(float) readonly @@ -17,6 +18,9 @@ define float @test1(float %X) { ; SIN-NOT: fsin +; SAFE: test1 +; SAFE-NOT: fsin + ; SIN: test2: define double @test2(double %X) { %Y = call double @sin(double %X) readonly @@ -26,6 +30,9 @@ define double @test2(double %X) { ; SIN-NOT: 
fsin +; SAFE: test2 +; SAFE-NOT: fsin + ; SIN: test3: define x86_fp80 @test3(x86_fp80 %X) { %Y = call x86_fp80 @sinl(x86_fp80 %X) readonly @@ -50,12 +57,18 @@ define float @test4(float %X) { } ; COS: {{^[ \t]*fcos}} +; SAFE: test4 +; SAFE-NOT: fcos + define double @test5(double %X) { %Y = call double @cos(double %X) readonly ret double %Y } ; COS: {{^[ \t]*fcos}} +; SAFE: test5 +; SAFE-NOT: fcos + define x86_fp80 @test6(x86_fp80 %X) { %Y = call x86_fp80 @cosl(x86_fp80 %X) readonly ret x86_fp80 %Y diff --git a/test/CodeGen/X86/sjlj.ll b/test/CodeGen/X86/sjlj.ll new file mode 100644 index 0000000..681db00 --- /dev/null +++ b/test/CodeGen/X86/sjlj.ll @@ -0,0 +1,60 @@ +; RUN: llc < %s -mtriple=i386-pc-linux -mcpu=corei7 -relocation-model=static | FileCheck --check-prefix=X86 %s +; RUN: llc < %s -mtriple=i386-pc-linux -mcpu=corei7 -relocation-model=pic | FileCheck --check-prefix=PIC86 %s +; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=corei7 -relocation-model=static | FileCheck --check-prefix=X64 %s +; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=corei7 -relocation-model=pic | FileCheck --check-prefix=PIC64 %s + +@buf = internal global [5 x i8*] zeroinitializer + +declare i8* @llvm.frameaddress(i32) nounwind readnone + +declare i8* @llvm.stacksave() nounwind + +declare i32 @llvm.eh.sjlj.setjmp(i8*) nounwind + +declare void @llvm.eh.sjlj.longjmp(i8*) nounwind + +define i32 @sj0() nounwind { + %fp = tail call i8* @llvm.frameaddress(i32 0) + store i8* %fp, i8** getelementptr inbounds ([5 x i8*]* @buf, i64 0, i64 0), align 16 + %sp = tail call i8* @llvm.stacksave() + store i8* %sp, i8** getelementptr inbounds ([5 x i8*]* @buf, i64 0, i64 2), align 16 + %r = tail call i32 @llvm.eh.sjlj.setjmp(i8* bitcast ([5 x i8*]* @buf to i8*)) + ret i32 %r +; X86: sj0 +; x86: movl %ebp, buf +; X86: movl %esp, buf+8 +; x86: movl ${{.*LBB.*}}, buf+4 +; X86: ret +; PIC86: sj0 +; PIC86: movl %ebp, buf@GOTOFF(%[[GOT:.*]]) +; PIC86: movl %esp, buf@GOTOFF+8(%[[GOT]]) +; PIC86: leal 
{{.*LBB.*}}@GOTOFF(%[[GOT]]), %[[LREG:.*]] +; PIC86: movl %[[LREG]], buf@GOTOFF+4 +; PIC86: ret +; X64: sj0 +; x64: movq %rbp, buf(%rip) +; x64: movq ${{.*LBB.*}}, buf+8(%rip) +; X64: movq %rsp, buf+16(%rip) +; X64: ret +; PIC64: sj0 +; PIC64: movq %rbp, buf(%rip) +; PIC64: movq %rsp, buf+16(%rip) +; PIC64: leaq {{.*LBB.*}}(%rip), %[[LREG:.*]] +; PIC64: movq %[[LREG]], buf+8(%rip) +; PIC64: ret +} + +define void @lj0() nounwind { + tail call void @llvm.eh.sjlj.longjmp(i8* bitcast ([5 x i8*]* @buf to i8*)) + unreachable +; X86: lj0 +; X86: movl buf, %ebp +; X86: movl buf+4, %[[REG32:.*]] +; X86: movl buf+8, %esp +; X86: jmpl *%[[REG32]] +; X64: lj0 +; X64: movq buf(%rip), %rbp +; X64: movq buf+8(%rip), %[[REG64:.*]] +; X64: movq buf+16(%rip), %rsp +; X64: jmpq *%[[REG64]] +} diff --git a/test/CodeGen/X86/smul-with-overflow.ll b/test/CodeGen/X86/smul-with-overflow.ll index 7ac3840..2d0b2f7 100644 --- a/test/CodeGen/X86/smul-with-overflow.ll +++ b/test/CodeGen/X86/smul-with-overflow.ll @@ -67,3 +67,17 @@ entry: ; CHECK: mull ; CHECK-NEXT: ret } + +declare { i63, i1 } @llvm.smul.with.overflow.i63(i63, i63) nounwind readnone + +define i1 @test5() nounwind { +entry: + %res = call { i63, i1 } @llvm.smul.with.overflow.i63(i63 4, i63 4611686018427387903) + %sum = extractvalue { i63, i1 } %res, 0 + %overflow = extractvalue { i63, i1 } %res, 1 + ret i1 %overflow +; Was returning false, should return true (not constant folded yet though). 
+; PR13991 +; CHECK: test5: +; CHECK-NOT: xorb +} diff --git a/test/CodeGen/X86/sse-intel-ocl.ll b/test/CodeGen/X86/sse-intel-ocl.ll new file mode 100644 index 0000000..1885050 --- /dev/null +++ b/test/CodeGen/X86/sse-intel-ocl.ll @@ -0,0 +1,93 @@ +; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=nehalem | FileCheck -check-prefix=WIN32 %s +; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=nehalem | FileCheck -check-prefix=WIN64 %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=nehalem | FileCheck -check-prefix=NOT_WIN %s + +declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *) +declare <16 x float> @func_float16(<16 x float>, <16 x float>) +; WIN64: testf16_inp +; WIN64: addps {{.*}}, {{%xmm[0-3]}} +; WIN64: addps {{.*}}, {{%xmm[0-3]}} +; WIN64: addps {{.*}}, {{%xmm[0-3]}} +; WIN64: addps {{.*}}, {{%xmm[0-3]}} +; WIN64: leaq {{.*}}(%rsp), %rcx +; WIN64: call +; WIN64: ret + +; WIN32: testf16_inp +; WIN32: movl %eax, (%esp) +; WIN32: addps {{.*}}, {{%xmm[0-3]}} +; WIN32: addps {{.*}}, {{%xmm[0-3]}} +; WIN32: addps {{.*}}, {{%xmm[0-3]}} +; WIN32: addps {{.*}}, {{%xmm[0-3]}} +; WIN32: call +; WIN32: ret + +; NOT_WIN: testf16_inp +; NOT_WIN: addps {{.*}}, {{%xmm[0-3]}} +; NOT_WIN: addps {{.*}}, {{%xmm[0-3]}} +; NOT_WIN: addps {{.*}}, {{%xmm[0-3]}} +; NOT_WIN: addps {{.*}}, {{%xmm[0-3]}} +; NOT_WIN: leaq {{.*}}(%rsp), %rdi +; NOT_WIN: call +; NOT_WIN: ret + +;test calling conventions - input parameters +define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind { + %y = alloca <16 x float>, align 16 + %x = fadd <16 x float> %a, %b + %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y) + %2 = load <16 x float>* %y, align 16 + %3 = fadd <16 x float> %2, %1 + ret <16 x float> %3 +} + +;test calling conventions - preserved registers + +; preserved xmm6-xmm15 +; WIN64: testf16_regs +; WIN64: call +; WIN64: addps {{%xmm[6-9]}}, {{.*}} +; WIN64: addps {{%xmm[6-9]}}, {{.*}} +; WIN64: ret + +; preserved xmm8-xmm15 
+; NOT_WIN: testf16_regs +; NOT_WIN: call +; NOT_WIN: addps {{%xmm([8-9]|1[0-1])}}, {{.*}} +; NOT_WIN: addps {{%xmm([8-9]|1[0-1])}}, {{.*}} +; NOT_WIN: addps {{%xmm([8-9]|1[0-1])}}, {{.*}} +; NOT_WIN: addps {{%xmm([8-9]|1[0-1])}}, {{.*}} +; NOT_WIN: ret + +define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind { + %y = alloca <16 x float>, align 16 + %x = fadd <16 x float> %a, %b + %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y) + %2 = load <16 x float>* %y, align 16 + %3 = fadd <16 x float> %1, %b + %4 = fadd <16 x float> %2, %3 + ret <16 x float> %4 +} + +; test calling conventions - prolog and epilog +; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill +; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill +; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill +; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill +; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill +; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill +; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill +; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill +; NOT_WIN: call +; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload +; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload +; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload +; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload +; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload +; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload +; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload +; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload +define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind 
{ + %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b) + ret <16 x float> %c +} diff --git a/test/CodeGen/X86/sse-minmax.ll b/test/CodeGen/X86/sse-minmax.ll index 3839e87..0ba0215 100644 --- a/test/CodeGen/X86/sse-minmax.ll +++ b/test/CodeGen/X86/sse-minmax.ll @@ -47,8 +47,7 @@ define double @olt(double %x, double %y) nounwind { ; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0 ; CHECK-NEXT: ret ; UNSAFE: ogt_inverse: -; UNSAFE-NEXT: minsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: minsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE: ogt_inverse: ; FINITE-NEXT: minsd %xmm0, %xmm1 @@ -65,8 +64,7 @@ define double @ogt_inverse(double %x, double %y) nounwind { ; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0 ; CHECK-NEXT: ret ; UNSAFE: olt_inverse: -; UNSAFE-NEXT: maxsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: maxsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE: olt_inverse: ; FINITE-NEXT: maxsd %xmm0, %xmm1 @@ -107,8 +105,7 @@ define double @ole(double %x, double %y) nounwind { ; CHECK: oge_inverse: ; CHECK-NEXT: ucomisd %xmm1, %xmm0 ; UNSAFE: oge_inverse: -; UNSAFE-NEXT: minsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: minsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE: oge_inverse: ; FINITE-NEXT: minsd %xmm0, %xmm1 @@ -123,8 +120,7 @@ define double @oge_inverse(double %x, double %y) nounwind { ; CHECK: ole_inverse: ; CHECK-NEXT: ucomisd %xmm0, %xmm1 ; UNSAFE: ole_inverse: -; UNSAFE-NEXT: maxsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: maxsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE: ole_inverse: ; FINITE-NEXT: maxsd %xmm0, %xmm1 @@ -142,7 +138,8 @@ define double @ole_inverse(double %x, double %y) nounwind { ; CHECK-NEXT: ret ; UNSAFE: ogt_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: maxsd %xmm1, %xmm0 +; UNSAFE-NEXT: maxsd %xmm0, %xmm1 +; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE: ogt_x: ; FINITE-NEXT: xorp{{[sd]}} 
%xmm1, %xmm1 @@ -160,7 +157,8 @@ define double @ogt_x(double %x) nounwind { ; CHECK-NEXT: ret ; UNSAFE: olt_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: minsd %xmm1, %xmm0 +; UNSAFE-NEXT: minsd %xmm0, %xmm1 +; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE: olt_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -218,7 +216,8 @@ define double @olt_inverse_x(double %x) nounwind { ; CHECK: ucomisd %xmm1, %xmm0 ; UNSAFE: oge_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: maxsd %xmm1, %xmm0 +; UNSAFE-NEXT: maxsd %xmm0, %xmm1 +; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE: oge_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -234,7 +233,8 @@ define double @oge_x(double %x) nounwind { ; CHECK: ucomisd %xmm0, %xmm1 ; UNSAFE: ole_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: minsd %xmm1, %xmm0 +; UNSAFE-NEXT: minsd %xmm0, %xmm1 +; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE: ole_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -313,8 +313,7 @@ define double @ult(double %x, double %y) nounwind { ; CHECK: ugt_inverse: ; CHECK: ucomisd %xmm0, %xmm1 ; UNSAFE: ugt_inverse: -; UNSAFE-NEXT: minsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: minsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE: ugt_inverse: ; FINITE-NEXT: minsd %xmm0, %xmm1 @@ -329,8 +328,7 @@ define double @ugt_inverse(double %x, double %y) nounwind { ; CHECK: ult_inverse: ; CHECK: ucomisd %xmm1, %xmm0 ; UNSAFE: ult_inverse: -; UNSAFE-NEXT: maxsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: maxsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE: ult_inverse: ; FINITE-NEXT: maxsd %xmm0, %xmm1 @@ -378,8 +376,7 @@ define double @ule(double %x, double %y) nounwind { ; CHECK-NEXT: minsd %xmm1, %xmm0 ; CHECK-NEXT: ret ; UNSAFE: uge_inverse: -; UNSAFE-NEXT: minsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: minsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE: 
uge_inverse: ; FINITE-NEXT: minsd %xmm0, %xmm1 @@ -395,8 +392,7 @@ define double @uge_inverse(double %x, double %y) nounwind { ; CHECK-NEXT: maxsd %xmm1, %xmm0 ; CHECK-NEXT: ret ; UNSAFE: ule_inverse: -; UNSAFE-NEXT: maxsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: maxsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE: ule_inverse: ; FINITE-NEXT: maxsd %xmm0, %xmm1 @@ -412,7 +408,8 @@ define double @ule_inverse(double %x, double %y) nounwind { ; CHECK: ucomisd %xmm0, %xmm1 ; UNSAFE: ugt_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: maxsd %xmm1, %xmm0 +; UNSAFE-NEXT: maxsd %xmm0, %xmm1 +; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE: ugt_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -428,7 +425,8 @@ define double @ugt_x(double %x) nounwind { ; CHECK: ucomisd %xmm1, %xmm0 ; UNSAFE: ult_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: minsd %xmm1, %xmm0 +; UNSAFE-NEXT: minsd %xmm0, %xmm1 +; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE: ult_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -483,7 +481,8 @@ define double @ult_inverse_x(double %x) nounwind { ; CHECK-NEXT: ret ; UNSAFE: uge_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: maxsd %xmm1, %xmm0 +; UNSAFE-NEXT: maxsd %xmm0, %xmm1 +; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE: uge_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -502,7 +501,8 @@ define double @uge_x(double %x) nounwind { ; CHECK-NEXT: ret ; UNSAFE: ule_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: minsd %xmm1, %xmm0 +; UNSAFE-NEXT: minsd %xmm0, %xmm1 +; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE: ule_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -590,9 +590,7 @@ define double @olt_y(double %x) nounwind { ; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0 ; CHECK-NEXT: ret ; UNSAFE: ogt_inverse_y: -; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1 -; UNSAFE-NEXT: minsd %xmm0, %xmm1 -; UNSAFE-NEXT: 
movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: minsd {{[^,]*}}, %xmm0 ; UNSAFE-NEXT: ret ; FINITE: ogt_inverse_y: ; FINITE-NEXT: movsd {{[^,]*}}, %xmm1 @@ -611,9 +609,7 @@ define double @ogt_inverse_y(double %x) nounwind { ; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0 ; CHECK-NEXT: ret ; UNSAFE: olt_inverse_y: -; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1 -; UNSAFE-NEXT: maxsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: maxsd {{[^,]*}}, %xmm0 ; UNSAFE-NEXT: ret ; FINITE: olt_inverse_y: ; FINITE-NEXT: movsd {{[^,]*}}, %xmm1 @@ -657,9 +653,7 @@ define double @ole_y(double %x) nounwind { ; CHECK: oge_inverse_y: ; CHECK: ucomisd %xmm ; UNSAFE: oge_inverse_y: -; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1 -; UNSAFE-NEXT: minsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: minsd {{[^,]*}}, %xmm0 ; UNSAFE-NEXT: ret ; FINITE: oge_inverse_y: ; FINITE-NEXT: movsd {{[^,]*}}, %xmm1 @@ -675,9 +669,7 @@ define double @oge_inverse_y(double %x) nounwind { ; CHECK: ole_inverse_y: ; CHECK: ucomisd %xmm ; UNSAFE: ole_inverse_y: -; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1 -; UNSAFE-NEXT: maxsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: maxsd {{[^,]*}}, %xmm0 ; UNSAFE-NEXT: ret ; FINITE: ole_inverse_y: ; FINITE-NEXT: movsd {{[^,]*}}, %xmm1 @@ -721,9 +713,7 @@ define double @ult_y(double %x) nounwind { ; CHECK: ugt_inverse_y: ; CHECK: ucomisd %xmm ; UNSAFE: ugt_inverse_y: -; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1 -; UNSAFE-NEXT: minsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: minsd {{[^,]*}}, %xmm0 ; UNSAFE-NEXT: ret ; FINITE: ugt_inverse_y: ; FINITE-NEXT: movsd {{[^,]*}}, %xmm1 @@ -739,9 +729,7 @@ define double @ugt_inverse_y(double %x) nounwind { ; CHECK: ult_inverse_y: ; CHECK: ucomisd %xmm ; UNSAFE: ult_inverse_y: -; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1 -; UNSAFE-NEXT: maxsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: maxsd {{[^,]*}}, %xmm0 ; UNSAFE-NEXT: ret ; FINITE: 
ult_inverse_y: ; FINITE-NEXT: movsd {{[^,]*}}, %xmm1 @@ -792,9 +780,7 @@ define double @ule_y(double %x) nounwind { ; CHECK-NEXT: minsd {{[^,]*}}, %xmm0 ; CHECK-NEXT: ret ; UNSAFE: uge_inverse_y: -; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1 -; UNSAFE-NEXT: minsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: minsd {{[^,]*}}, %xmm0 ; UNSAFE-NEXT: ret ; FINITE: uge_inverse_y: ; FINITE-NEXT: movsd {{[^,]*}}, %xmm1 @@ -811,9 +797,7 @@ define double @uge_inverse_y(double %x) nounwind { ; CHECK-NEXT: maxsd {{[^,]*}}, %xmm0 ; CHECK-NEXT: ret ; UNSAFE: ule_inverse_y: -; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1 -; UNSAFE-NEXT: maxsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: maxsd {{[^,]*}}, %xmm0 ; UNSAFE-NEXT: ret ; FINITE: ule_inverse_y: ; FINITE-NEXT: movsd {{[^,]*}}, %xmm1 diff --git a/test/CodeGen/X86/sse_partial_update.ll b/test/CodeGen/X86/sse_partial_update.ll new file mode 100644 index 0000000..655f758 --- /dev/null +++ b/test/CodeGen/X86/sse_partial_update.ll @@ -0,0 +1,36 @@ +; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse2 -mcpu=nehalem | FileCheck %s + +; rdar: 12558838 +; PR14221 +; There is a mismatch between the intrinsic and the actual instruction. +; The actual instruction has a partial update of dest, while the intrinsic +; passes through the upper FP values. Here, we make sure the source and +; destination of rsqrtss are the same. 
+define void @t1(<4 x float> %a) nounwind uwtable ssp { +entry: +; CHECK: t1: +; CHECK: rsqrtss %xmm0, %xmm0 + %0 = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a) nounwind + %a.addr.0.extract = extractelement <4 x float> %0, i32 0 + %conv = fpext float %a.addr.0.extract to double + %a.addr.4.extract = extractelement <4 x float> %0, i32 1 + %conv3 = fpext float %a.addr.4.extract to double + tail call void @callee(double %conv, double %conv3) nounwind + ret void +} +declare void @callee(double, double) +declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone + +define void @t2(<4 x float> %a) nounwind uwtable ssp { +entry: +; CHECK: t2: +; CHECK: rcpss %xmm0, %xmm0 + %0 = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a) nounwind + %a.addr.0.extract = extractelement <4 x float> %0, i32 0 + %conv = fpext float %a.addr.0.extract to double + %a.addr.4.extract = extractelement <4 x float> %0, i32 1 + %conv3 = fpext float %a.addr.4.extract to double + tail call void @callee(double %conv, double %conv3) nounwind + ret void +} +declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone diff --git a/test/CodeGen/X86/tailcall-64.ll b/test/CodeGen/X86/tailcall-64.ll index 7030753..ecc253b 100644 --- a/test/CodeGen/X86/tailcall-64.ll +++ b/test/CodeGen/X86/tailcall-64.ll @@ -1,6 +1,4 @@ -; RUN: llc < %s | FileCheck %s -target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -target triple = "x86_64-apple-darwin11.4.0" +; RUN: llc -mtriple=x86_64-apple-macosx -mcpu=core2 < %s | FileCheck %s declare i64 @testi() @@ -93,4 +91,67 @@ define { i64, i64 } @crash(i8* %this) { ret { i64, i64 } %mrv7 } +; Check that we can fold an indexed load into a tail call instruction. 
+; CHECK: fold_indexed_load +; CHECK: leaq (%rsi,%rsi,4), %[[RAX:r..]] +; CHECK: jmpq *16(%{{r..}},%[[RAX]],8) # TAILCALL +%struct.funcs = type { i32 (i8*, i32*, i32)*, i32 (i8*)*, i32 (i8*)*, i32 (i8*, i32)*, i32 } +@func_table = external global [0 x %struct.funcs] +define void @fold_indexed_load(i8* %mbstr, i64 %idxprom) nounwind uwtable ssp { +entry: + %dsplen = getelementptr inbounds [0 x %struct.funcs]* @func_table, i64 0, i64 %idxprom, i32 2 + %x1 = load i32 (i8*)** %dsplen, align 8 + %call = tail call i32 %x1(i8* %mbstr) nounwind + ret void +} + +; <rdar://problem/12282281> Fold an indexed load into the tail call instruction. +; Calling a varargs function with 6 arguments requires 7 registers (%al is the +; vector count for varargs functions). This leaves %r11 as the only available +; scratch register. +; +; It is not possible to fold an indexed load into TCRETURNmi64 in that case. +; +; typedef int (*funcptr)(void*, ...); +; extern const funcptr funcs[]; +; int f(int n) { +; return funcs[n](0, 0, 0, 0, 0, 0); +; } +; +; CHECK: rdar12282281 +; CHECK: jmpq *%r11 # TAILCALL +@funcs = external constant [0 x i32 (i8*, ...)*] + +define i32 @rdar12282281(i32 %n) nounwind uwtable ssp { +entry: + %idxprom = sext i32 %n to i64 + %arrayidx = getelementptr inbounds [0 x i32 (i8*, ...)*]* @funcs, i64 0, i64 %idxprom + %0 = load i32 (i8*, ...)** %arrayidx, align 8 + %call = tail call i32 (i8*, ...)* %0(i8* null, i32 0, i32 0, i32 0, i32 0, i32 0) nounwind + ret i32 %call +} + +define x86_fp80 @fp80_call(x86_fp80 %x) nounwind { +entry: +; CHECK: fp80_call: +; CHECK: jmp _fp80_callee + %call = tail call x86_fp80 @fp80_callee(x86_fp80 %x) nounwind + ret x86_fp80 %call +} + +declare x86_fp80 @fp80_callee(x86_fp80) + +; rdar://12229511 +define x86_fp80 @trunc_fp80(x86_fp80 %x) nounwind { +entry: +; CHECK: trunc_fp80 +; CHECK: callq _trunc +; CHECK-NOT: jmp _trunc +; CHECK: ret + %conv = fptrunc x86_fp80 %x to double + %call = tail call double @trunc(double %conv) nounwind 
readnone + %conv1 = fpext double %call to x86_fp80 + ret x86_fp80 %conv1 +} +declare double @trunc(double) nounwind readnone diff --git a/test/CodeGen/X86/targetLoweringGeneric.ll b/test/CodeGen/X86/targetLoweringGeneric.ll index ba5f8f8..a773e9d 100644 --- a/test/CodeGen/X86/targetLoweringGeneric.ll +++ b/test/CodeGen/X86/targetLoweringGeneric.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=i386-apple-darwin9 -fast-isel=false -O0 < %s | FileCheck %s +; RUN: llc -mtriple=i386-apple-darwin9 -mcpu=corei7 -fast-isel=false -O0 < %s | FileCheck %s ; Gather non-machine specific tests for the transformations in ; CodeGen/SelectionDAG/TargetLowering. Currently, these diff --git a/test/CodeGen/X86/tls-pic.ll b/test/CodeGen/X86/tls-pic.ll index 51c3d23..b823f0a 100644 --- a/test/CodeGen/X86/tls-pic.ll +++ b/test/CodeGen/X86/tls-pic.ll @@ -76,12 +76,12 @@ entry: ; X32: f5: ; X32: leal {{[jk]}}@TLSLDM(%ebx) -; X32-NEXT: calll ___tls_get_addr@PLT -; X32-NEXT: movl {{[jk]}}@DTPOFF(%eax) -; X32-NEXT: addl {{[jk]}}@DTPOFF(%eax) +; X32: calll ___tls_get_addr@PLT +; X32: movl {{[jk]}}@DTPOFF(%e +; X32: addl {{[jk]}}@DTPOFF(%e ; X64: f5: ; X64: leaq {{[jk]}}@TLSLD(%rip), %rdi -; X64-NEXT: callq __tls_get_addr@PLT -; X64-NEXT: movl {{[jk]}}@DTPOFF(%rax) -; X64-NEXT: addl {{[jk]}}@DTPOFF(%rax) +; X64: callq __tls_get_addr@PLT +; X64: movl {{[jk]}}@DTPOFF(%r +; X64: addl {{[jk]}}@DTPOFF(%r diff --git a/test/CodeGen/X86/trunc-ext-ld-st.ll b/test/CodeGen/X86/trunc-ext-ld-st.ll index 9877d7b..1d22a18 100644 --- a/test/CodeGen/X86/trunc-ext-ld-st.ll +++ b/test/CodeGen/X86/trunc-ext-ld-st.ll @@ -2,8 +2,7 @@ ;CHECK: load_2_i8 ; A single 16-bit load -;CHECK: movzwl -;CHECK: pshufb +;CHECK: pmovzxbq ;CHECK: paddq ;CHECK: pshufb ; A single 16-bit store @@ -19,8 +18,7 @@ define void @load_2_i8(<2 x i8>* %A) { ;CHECK: load_2_i16 ; Read 32-bits -;CHECK: movd -;CHECK: pshufb +;CHECK: pmovzxwq ;CHECK: paddq ;CHECK: pshufb ;CHECK: movd @@ -33,7 +31,7 @@ define void @load_2_i16(<2 x i16>* %A) { } ;CHECK: 
load_2_i32 -;CHECK: pshufd +;CHECK: pmovzxdq ;CHECK: paddq ;CHECK: pshufd ;CHECK: ret @@ -45,8 +43,7 @@ define void @load_2_i32(<2 x i32>* %A) { } ;CHECK: load_4_i8 -;CHECK: movd -;CHECK: pshufb +;CHECK: pmovzxbd ;CHECK: paddd ;CHECK: pshufb ;CHECK: ret @@ -58,7 +55,7 @@ define void @load_4_i8(<4 x i8>* %A) { } ;CHECK: load_4_i16 -;CHECK: punpcklwd +;CHECK: pmovzxwd ;CHECK: paddd ;CHECK: pshufb ;CHECK: ret @@ -70,7 +67,7 @@ define void @load_4_i16(<4 x i16>* %A) { } ;CHECK: load_8_i8 -;CHECK: punpcklbw +;CHECK: pmovzxbw ;CHECK: paddw ;CHECK: pshufb ;CHECK: ret diff --git a/test/CodeGen/X86/vec_compare-2.ll b/test/CodeGen/X86/vec_compare-2.ll index 46d6a23..4da7953 100644 --- a/test/CodeGen/X86/vec_compare-2.ll +++ b/test/CodeGen/X86/vec_compare-2.ll @@ -10,8 +10,7 @@ define void @blackDespeckle_wrapper(i8** %args_list, i64* %gtid, i64 %xend) { entry: ; CHECK: cfi_def_cfa_offset ; CHECK-NOT: set -; CHECK: punpcklwd -; CHECK: pshufd +; CHECK: pmovzxwq ; CHECK: pshufb %shr.i = ashr <4 x i32> zeroinitializer, <i32 3, i32 3, i32 3, i32 3> ; <<4 x i32>> [#uses=1] %cmp318.i = sext <4 x i1> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=1] diff --git a/test/CodeGen/X86/vec_fabs.ll b/test/CodeGen/X86/vec_fabs.ll new file mode 100644 index 0000000..82517cb --- /dev/null +++ b/test/CodeGen/X86/vec_fabs.ll @@ -0,0 +1,38 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s + + +define <2 x double> @fabs_v2f64(<2 x double> %p) +{ + ; CHECK: fabs_v2f64 + ; CHECK: vandps + %t = call <2 x double> @llvm.fabs.v2f64(<2 x double> %p) + ret <2 x double> %t +} +declare <2 x double> @llvm.fabs.v2f64(<2 x double> %p) + +define <4 x float> @fabs_v4f32(<4 x float> %p) +{ + ; CHECK: fabs_v4f32 + ; CHECK: vandps + %t = call <4 x float> @llvm.fabs.v4f32(<4 x float> %p) + ret <4 x float> %t +} +declare <4 x float> @llvm.fabs.v4f32(<4 x float> %p) + +define <4 x double> @fabs_v4f64(<4 x double> %p) +{ + ; CHECK: fabs_v4f64 + ; CHECK: vandps + %t = call <4 
x double> @llvm.fabs.v4f64(<4 x double> %p) + ret <4 x double> %t +} +declare <4 x double> @llvm.fabs.v4f64(<4 x double> %p) + +define <8 x float> @fabs_v8f32(<8 x float> %p) +{ + ; CHECK: fabs_v8f32 + ; CHECK: vandps + %t = call <8 x float> @llvm.fabs.v8f32(<8 x float> %p) + ret <8 x float> %t +} +declare <8 x float> @llvm.fabs.v8f32(<8 x float> %p) diff --git a/test/CodeGen/X86/vec_floor.ll b/test/CodeGen/X86/vec_floor.ll new file mode 100644 index 0000000..5e0160b --- /dev/null +++ b/test/CodeGen/X86/vec_floor.ll @@ -0,0 +1,38 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s + + +define <2 x double> @floor_v2f64(<2 x double> %p) +{ + ; CHECK: floor_v2f64 + ; CHECK: vroundpd + %t = call <2 x double> @llvm.floor.v2f64(<2 x double> %p) + ret <2 x double> %t +} +declare <2 x double> @llvm.floor.v2f64(<2 x double> %p) + +define <4 x float> @floor_v4f32(<4 x float> %p) +{ + ; CHECK: floor_v4f32 + ; CHECK: vroundps + %t = call <4 x float> @llvm.floor.v4f32(<4 x float> %p) + ret <4 x float> %t +} +declare <4 x float> @llvm.floor.v4f32(<4 x float> %p) + +define <4 x double> @floor_v4f64(<4 x double> %p) +{ + ; CHECK: floor_v4f64 + ; CHECK: vroundpd + %t = call <4 x double> @llvm.floor.v4f64(<4 x double> %p) + ret <4 x double> %t +} +declare <4 x double> @llvm.floor.v4f64(<4 x double> %p) + +define <8 x float> @floor_v8f32(<8 x float> %p) +{ + ; CHECK: floor_v8f32 + ; CHECK: vroundps + %t = call <8 x float> @llvm.floor.v8f32(<8 x float> %p) + ret <8 x float> %t +} +declare <8 x float> @llvm.floor.v8f32(<8 x float> %p) diff --git a/test/CodeGen/X86/vec_fpext.ll b/test/CodeGen/X86/vec_fpext.ll index 05b263e..dc0464f 100644 --- a/test/CodeGen/X86/vec_fpext.ll +++ b/test/CodeGen/X86/vec_fpext.ll @@ -1,14 +1,38 @@ ; RUN: llc < %s -march=x86 -mattr=+sse41,-avx | FileCheck %s +; RUN: llc < %s -march=x86 -mattr=+avx | FileCheck --check-prefix=AVX %s ; PR11674 define void @fpext_frommem(<2 x float>* %in, <2 x double>* %out) { entry: -; 
TODO: We should be able to generate cvtps2pd for the load. -; For now, just check that we generate something sane. -; CHECK: cvtss2sd -; CHECK: cvtss2sd +; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}} +; AVX: vcvtps2pd (%{{.+}}), %xmm{{[0-9]+}} %0 = load <2 x float>* %in, align 8 %1 = fpext <2 x float> %0 to <2 x double> store <2 x double> %1, <2 x double>* %out, align 1 ret void } + +define void @fpext_frommem4(<4 x float>* %in, <4 x double>* %out) { +entry: +; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}} +; CHECK: cvtps2pd 8(%{{.+}}), %xmm{{[0-9]+}} +; AVX: vcvtps2pd (%{{.+}}), %ymm{{[0-9]+}} + %0 = load <4 x float>* %in + %1 = fpext <4 x float> %0 to <4 x double> + store <4 x double> %1, <4 x double>* %out, align 1 + ret void +} + +define void @fpext_frommem8(<8 x float>* %in, <8 x double>* %out) { +entry: +; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}} +; CHECK: cvtps2pd 8(%{{.+}}), %xmm{{[0-9]+}} +; CHECK: cvtps2pd 16(%{{.+}}), %xmm{{[0-9]+}} +; CHECK: cvtps2pd 24(%{{.+}}), %xmm{{[0-9]+}} +; AVX: vcvtps2pd (%{{.+}}), %ymm{{[0-9]+}} +; AVX: vcvtps2pd 16(%{{.+}}), %ymm{{[0-9]+}} + %0 = load <8 x float>* %in + %1 = fpext <8 x float> %0 to <8 x double> + store <8 x double> %1, <8 x double>* %out, align 1 + ret void +} diff --git a/test/CodeGen/X86/vec_shuffle-26.ll b/test/CodeGen/X86/vec_shuffle-26.ll index 086af6b..4c56f84 100644 --- a/test/CodeGen/X86/vec_shuffle-26.ll +++ b/test/CodeGen/X86/vec_shuffle-26.ll @@ -1,6 +1,5 @@ -; RUN: llc < %s -march=x86 -mattr=sse41 -o %t -; RUN: grep unpcklps %t | count 1 -; RUN: grep unpckhps %t | count 3 +; RUN: llc < %s -march=x86 -mcpu=generic -mattr=sse41 | FileCheck %s +; RUN: llc < %s -march=x86 -mcpu=atom | FileCheck -check-prefix=ATOM %s ; Transpose example using the more generic vector shuffle. 
Return float8 ; instead of float16 @@ -14,6 +13,17 @@ target triple = "i386-apple-cl.1.0" define <8 x float> @__transpose2(<4 x float> %p0, <4 x float> %p1, <4 x float> %p2, <4 x float> %p3) nounwind { entry: +; CHECK: transpose2 +; CHECK: unpckhps +; CHECK: unpckhps +; CHECK: unpcklps +; CHECK: unpckhps +; Different instruction order for Atom. +; ATOM: transpose2 +; ATOM: unpckhps +; ATOM: unpckhps +; ATOM: unpckhps +; ATOM: unpcklps %unpcklps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=2] %unpckhps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=2] %unpcklps8 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=2] @@ -27,3 +37,32 @@ entry: ; %r3 = shufflevector <8 x float> %r1, <8 x float> %r2, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15 >; ret <8 x float> %r2 } + +define <2 x i64> @lo_hi_shift(float* nocapture %x, float* nocapture %y) nounwind { +entry: +; movhps should happen before extractps to assure it gets the correct value. 
+; CHECK: lo_hi_shift +; CHECK: movhps ([[BASEREG:%[a-z]+]]), +; CHECK: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]]) +; CHECK: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]]) +; ATOM: lo_hi_shift +; ATOM: movhps ([[BASEREG:%[a-z]+]]), +; ATOM: movd %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]]) +; ATOM: movd %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]]) + %v.i = bitcast float* %y to <4 x float>* + %0 = load <4 x float>* %v.i, align 1 + %1 = bitcast float* %x to <1 x i64>* + %.val = load <1 x i64>* %1, align 1 + %2 = bitcast <1 x i64> %.val to <2 x float> + %shuffle.i = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> + %shuffle1.i = shufflevector <4 x float> %0, <4 x float> %shuffle.i, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + %cast.i = bitcast <4 x float> %0 to <2 x i64> + %extract.i = extractelement <2 x i64> %cast.i, i32 1 + %3 = bitcast float* %x to i64* + store i64 %extract.i, i64* %3, align 4 + %4 = bitcast <4 x float> %0 to <16 x i8> + %5 = bitcast <4 x float> %shuffle1.i to <16 x i8> + %palignr = shufflevector <16 x i8> %5, <16 x i8> %4, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> + %6 = bitcast <16 x i8> %palignr to <2 x i64> + ret <2 x i64> %6 +} diff --git a/test/CodeGen/X86/vec_shuffle-30.ll b/test/CodeGen/X86/vec_shuffle-30.ll index 1651c4c..f5f8842 100644 --- a/test/CodeGen/X86/vec_shuffle-30.ll +++ b/test/CodeGen/X86/vec_shuffle-30.ll @@ -1,21 +1,25 @@ -; RUN: llc < %s -march=x86 -mattr=sse41 -o %t -; RUN: grep pshufhw %t | grep -- -95 | count 1 -; RUN: grep shufps %t | count 1 -; RUN: not grep pslldq %t +; RUN: llc < %s -march=x86 -mattr=+avx | FileCheck %s +; CHECK: test ; Test case when creating pshufhw, we incorrectly set the higher order bit ; for an undef, define void @test(<8 x i16>* %dest, <8 x i16> %in) nounwind { entry: +; CHECK-NOT: vmovaps +; CHECK: vmovlpd +; CHECK: vpshufhw $-95 
%0 = load <8 x i16>* %dest %1 = shufflevector <8 x i16> %0, <8 x i16> %in, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 13, i32 undef, i32 14, i32 14> store <8 x i16> %1, <8 x i16>* %dest ret void -} +} +; CHECK: test2 ; A test case where we shouldn't generate a punpckldq but a pshufd and a pslldq define void @test2(<4 x i32>* %dest, <4 x i32> %in) nounwind { entry: +; CHECK-NOT: pslldq +; CHECK: shufps %0 = shufflevector <4 x i32> %in, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> < i32 undef, i32 5, i32 undef, i32 2> store <4 x i32> %0, <4 x i32>* %dest ret void diff --git a/test/CodeGen/X86/widen_cast-1.ll b/test/CodeGen/X86/widen_cast-1.ll index ebdfea9..56c6364 100644 --- a/test/CodeGen/X86/widen_cast-1.ll +++ b/test/CodeGen/X86/widen_cast-1.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=x86 -mcpu=generic -mattr=+sse42 < %s | FileCheck %s -; RUN: llc -march=x86 -mcpu=atom -mattr=+sse42 < %s | FileCheck -check-prefix=ATOM %s +; RUN: llc -march=x86 -mcpu=atom < %s | FileCheck -check-prefix=ATOM %s ; CHECK: paddd ; CHECK: movl diff --git a/test/CodeGen/X86/widen_load-1.ll b/test/CodeGen/X86/widen_load-1.ll index 9705d14..dfaa3d6 100644 --- a/test/CodeGen/X86/widen_load-1.ll +++ b/test/CodeGen/X86/widen_load-1.ll @@ -1,12 +1,17 @@ -; RUN: llc %s -o - -march=x86-64 -mtriple=x86_64-unknown-linux-gnu | FileCheck %s +; RUN: llc %s -o - -march=x86-64 -mattr=-avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=SSE +; RUN: llc %s -o - -march=x86-64 -mattr=+avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=AVX ; PR4891 ; PR5626 ; This load should be before the call, not after. 
-; CHECK: movaps compl+128(%rip), %xmm0 -; CHECK: movaps %xmm0, (%rsp) -; CHECK: callq killcommon +; SSE: movaps compl+128(%rip), %xmm0 +; SSE: movaps %xmm0, (%rsp) +; SSE: callq killcommon + +; AVX: vmovapd compl+128(%rip), %xmm0 +; AVX: vmovapd %xmm0, (%rsp) +; AVX: callq killcommon @compl = linkonce global [20 x i64] zeroinitializer, align 64 ; <[20 x i64]*> [#uses=1] diff --git a/test/CodeGen/X86/widen_load-2.ll b/test/CodeGen/X86/widen_load-2.ll index 79aa000..224898c 100644 --- a/test/CodeGen/X86/widen_load-2.ll +++ b/test/CodeGen/X86/widen_load-2.ll @@ -170,7 +170,7 @@ define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp ; CHECK: rot %i8vec3pack = type { <3 x i8>, i8 } define %i8vec3pack @rot() nounwind { -; CHECK: movd {{-?[0-9]+}}(%rsp), {{%xmm[0-9]}} +; CHECK: pmovzxbd {{-?[0-9]+}}(%rsp), {{%xmm[0-9]}} entry: %X = alloca %i8vec3pack, align 4 %rot = alloca %i8vec3pack, align 4 diff --git a/test/CodeGen/X86/xmulo.ll b/test/CodeGen/X86/xmulo.ll new file mode 100644 index 0000000..486dafe --- /dev/null +++ b/test/CodeGen/X86/xmulo.ll @@ -0,0 +1,50 @@ +; RUN: llc %s -o - | FileCheck %s +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128" +target triple = "i386-apple-macosx10.8.0" + +declare {i64, i1} @llvm.umul.with.overflow.i64(i64, i64) nounwind readnone +declare i32 @printf(i8*, ...) 
+ +@.str = private unnamed_addr constant [10 x i8] c"%llx, %d\0A\00", align 1 + +define i32 @t1() nounwind { +; CHECK: t1: +; CHECK: movl $0, 12(%esp) +; CHECK: movl $0, 8(%esp) +; CHECK: movl $72, 4(%esp) + + %1 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 9, i64 8) + %2 = extractvalue {i64, i1} %1, 0 + %3 = extractvalue {i64, i1} %1, 1 + %4 = zext i1 %3 to i32 + %5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([10 x i8]* @.str, i32 0, i32 0), i64 %2, i32 %4) + ret i32 0 +} + +define i32 @t2() nounwind { +; CHECK: t2: +; CHECK: movl $0, 12(%esp) +; CHECK: movl $0, 8(%esp) +; CHECK: movl $0, 4(%esp) + + %1 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 9, i64 0) + %2 = extractvalue {i64, i1} %1, 0 + %3 = extractvalue {i64, i1} %1, 1 + %4 = zext i1 %3 to i32 + %5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([10 x i8]* @.str, i32 0, i32 0), i64 %2, i32 %4) + ret i32 0 +} + +define i32 @t3() nounwind { +; CHECK: t3: +; CHECK: movl $1, 12(%esp) +; CHECK: movl $-1, 8(%esp) +; CHECK: movl $-9, 4(%esp) + + %1 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 9, i64 -1) + %2 = extractvalue {i64, i1} %1, 0 + %3 = extractvalue {i64, i1} %1, 1 + %4 = zext i1 %3 to i32 + %5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([10 x i8]* @.str, i32 0, i32 0), i64 %2, i32 %4) + ret i32 0 +} diff --git a/test/DebugInfo/Inputs/dwarfdump-inl-test.elf-x86-64 b/test/DebugInfo/Inputs/dwarfdump-inl-test.elf-x86-64 Binary files differnew file mode 100755 index 0000000..9a1d538 --- /dev/null +++ b/test/DebugInfo/Inputs/dwarfdump-inl-test.elf-x86-64 diff --git a/test/DebugInfo/2010-04-13-PubType.ll b/test/DebugInfo/X86/2010-04-13-PubType.ll index db7bb0a..559f032 100644 --- a/test/DebugInfo/2010-04-13-PubType.ll +++ b/test/DebugInfo/X86/2010-04-13-PubType.ll @@ -1,6 +1,6 @@ -; RUN: llc -O0 -asm-verbose < %s > %t -; RUN: grep "External Name" %t | grep -v X -; RUN: grep "External Name" %t | grep Y | count 1 +; RUN: llc -O0 -asm-verbose 
-mtriple=x86_64-macosx < %s | FileCheck %s +; CHECK-NOT: .asciz "X" ## External Name +; CHECK: .asciz "Y" ## External Name ; Test to check type with no definition is listed in pubtypes section. %struct.X = type opaque %struct.Y = type { i32 } diff --git a/test/DebugInfo/X86/DW_AT_byte_size.ll b/test/DebugInfo/X86/DW_AT_byte_size.ll index 59280e0..25b5f00 100644 --- a/test/DebugInfo/X86/DW_AT_byte_size.ll +++ b/test/DebugInfo/X86/DW_AT_byte_size.ll @@ -4,7 +4,8 @@ ; Checks that we don't emit a size for a pointer type. ; CHECK: DW_TAG_pointer_type ; CHECK-NEXT: DW_AT_type -; CHECK-NOT-NEXT: DW_AT_byte_size +; CHECK-NOT: DW_AT_byte_size +; CHECK: .debug_info contents %struct.A = type { i32 } diff --git a/test/DebugInfo/X86/DW_AT_object_pointer.ll b/test/DebugInfo/X86/DW_AT_object_pointer.ll new file mode 100644 index 0000000..163a1e7 --- /dev/null +++ b/test/DebugInfo/X86/DW_AT_object_pointer.ll @@ -0,0 +1,79 @@ +; RUN: llc -mtriple=x86_64-apple-darwin %s -o %t -filetype=obj +; RUN: llvm-dwarfdump %t | FileCheck %s + +; CHECK: DW_AT_object_pointer [DW_FORM_ref4] (cu + 0x00bf => {0x000000bf}) +; CHECK: 0x000000bf: DW_TAG_formal_parameter [12] +; CHECK-NEXT: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000085] = "this") + +%class.A = type { i32 } + +define i32 @_Z3foov() nounwind uwtable ssp { +entry: + %a = alloca %class.A, align 4 + call void @llvm.dbg.declare(metadata !{%class.A* %a}, metadata !21), !dbg !23 + call void @_ZN1AC1Ev(%class.A* %a), !dbg !24 + %m_a = getelementptr inbounds %class.A* %a, i32 0, i32 0, !dbg !25 + %0 = load i32* %m_a, align 4, !dbg !25 + ret i32 %0, !dbg !25 +} + +declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone + +define linkonce_odr void @_ZN1AC1Ev(%class.A* %this) unnamed_addr nounwind uwtable ssp align 2 { +entry: + %this.addr = alloca %class.A*, align 8 + store %class.A* %this, %class.A** %this.addr, align 8 + call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !26), !dbg !28 + %this1 = load 
%class.A** %this.addr + call void @_ZN1AC2Ev(%class.A* %this1), !dbg !29 + ret void, !dbg !29 +} + +define linkonce_odr void @_ZN1AC2Ev(%class.A* %this) unnamed_addr nounwind uwtable ssp align 2 { +entry: + %this.addr = alloca %class.A*, align 8 + store %class.A* %this, %class.A** %this.addr, align 8 + call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !30), !dbg !31 + %this1 = load %class.A** %this.addr + %m_a = getelementptr inbounds %class.A* %this1, i32 0, i32 0, !dbg !32 + store i32 0, i32* %m_a, align 4, !dbg !32 + ret void, !dbg !34 +} + +!llvm.dbg.cu = !{!0} + +!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"bar.cpp", metadata !"/Users/echristo/debug-tests", metadata !"clang version 3.2 (trunk 163586) (llvm/trunk 163570)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [/Users/echristo/debug-tests/bar.cpp] [DW_LANG_C_plus_plus] +!1 = metadata !{metadata !2} +!2 = metadata !{i32 0} +!3 = metadata !{metadata !4} +!4 = metadata !{metadata !5, metadata !10, metadata !20} +!5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"foo", metadata !"foo", metadata !"_Z3foov", metadata !6, i32 7, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z3foov, null, null, metadata !1, i32 7} ; [ DW_TAG_subprogram ] [line 7] [def] [foo] +!6 = metadata !{i32 786473, metadata !"bar.cpp", metadata !"/Users/echristo/debug-tests", null} ; [ DW_TAG_file_type ] +!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!8 = metadata !{metadata !9} +!9 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed] +!10 = metadata !{i32 786478, i32 0, null, metadata !"A", metadata !"A", metadata 
!"_ZN1AC1Ev", metadata !6, i32 3, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.A*)* @_ZN1AC1Ev, null, metadata !17, metadata !1, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [A] +!11 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!12 = metadata !{null, metadata !13} +!13 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !14} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A] +!14 = metadata !{i32 786434, null, metadata !"A", metadata !6, i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !15, i32 0, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 32, align 32, offset 0] [from ] +!15 = metadata !{metadata !16, metadata !17} +!16 = metadata !{i32 786445, metadata !14, metadata !"m_a", metadata !6, i32 4, i64 32, i64 32, i64 0, i32 0, metadata !9} ; [ DW_TAG_member ] [m_a] [line 4, size 32, align 32, offset 0] [from int] +!17 = metadata !{i32 786478, i32 0, metadata !14, metadata !"A", metadata !"A", metadata !"", metadata !6, i32 3, metadata !11, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !18, i32 3} ; [ DW_TAG_subprogram ] [line 3] [A] +!18 = metadata !{metadata !19} +!19 = metadata !{i32 786468} ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0] +!20 = metadata !{i32 786478, i32 0, null, metadata !"A", metadata !"A", metadata !"_ZN1AC2Ev", metadata !6, i32 3, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.A*)* @_ZN1AC2Ev, null, metadata !17, metadata !1, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [A] +!21 = metadata !{i32 786688, metadata !22, metadata !"a", metadata !6, i32 8, metadata !14, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [a] [line 8] +!22 = metadata !{i32 786443, metadata !5, i32 7, i32 11, 
metadata !6, i32 0} ; [ DW_TAG_lexical_block ] [/Users/echristo/debug-tests/bar.cpp] +!23 = metadata !{i32 8, i32 5, metadata !22, null} +!24 = metadata !{i32 8, i32 6, metadata !22, null} +!25 = metadata !{i32 9, i32 3, metadata !22, null} +!26 = metadata !{i32 786689, metadata !10, metadata !"this", metadata !6, i32 16777219, metadata !27, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 3] +!27 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !14} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A] +!28 = metadata !{i32 3, i32 3, metadata !10, null} +!29 = metadata !{i32 3, i32 18, metadata !10, null} +!30 = metadata !{i32 786689, metadata !20, metadata !"this", metadata !6, i32 16777219, metadata !27, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 3] +!31 = metadata !{i32 3, i32 3, metadata !20, null} +!32 = metadata !{i32 3, i32 9, metadata !33, null} +!33 = metadata !{i32 786443, metadata !20, i32 3, i32 7, metadata !6, i32 1} ; [ DW_TAG_lexical_block ] [/Users/echristo/debug-tests/bar.cpp] +!34 = metadata !{i32 3, i32 18, metadata !33, null} diff --git a/test/DebugInfo/X86/concrete_out_of_line.ll b/test/DebugInfo/X86/concrete_out_of_line.ll index a227071..58fb055 100644 --- a/test/DebugInfo/X86/concrete_out_of_line.ll +++ b/test/DebugInfo/X86/concrete_out_of_line.ll @@ -7,16 +7,15 @@ ; first check that we have a TAG_subprogram at a given offset and it has ; AT_inline. -; CHECK: 0x00000134: DW_TAG_subprogram [18] -; CHECK-NEXT: DW_AT_MIPS_linkage_name +; CHECK: 0x0000011e: DW_TAG_subprogram [18] ; CHECK-NEXT: DW_AT_specification ; CHECK-NEXT: DW_AT_inline ; and then that a TAG_subprogram refers to it with AT_abstract_origin. 
-; CHECK: 0x00000184: DW_TAG_subprogram [20] -; CHECK-NEXT: DW_AT_abstract_origin [DW_FORM_ref4] (cu + 0x0134 => {0x00000134}) +; CHECK: 0x0000015f: DW_TAG_subprogram [20] +; CHECK-NEXT: DW_AT_abstract_origin [DW_FORM_ref4] (cu + 0x011e => {0x0000011e}) define i32 @_ZN17nsAutoRefCnt7ReleaseEv() { entry: diff --git a/test/DebugInfo/X86/elf-names.ll b/test/DebugInfo/X86/elf-names.ll new file mode 100644 index 0000000..b908bce --- /dev/null +++ b/test/DebugInfo/X86/elf-names.ll @@ -0,0 +1,109 @@ +; RUN: llc -mtriple=x86_64-unknown-linux-gnu %s -o %t -filetype=obj +; RUN: llvm-dwarfdump %t | FileCheck %s + +; CHECK: 0x0000000b: DW_TAG_compile_unit +; CHECK: 0x00000012: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000035] = "foo.cpp") +; CHECK: 0x0000003c: DW_TAG_class_type +; CHECK: 0x0000003d: DW_AT_name [DW_FORM_strp] ( .debug_str[0x0000006d] = "D") +; CHECK: 0x00000044: DW_TAG_member +; CHECK: 0x00000045: DW_AT_name [DW_FORM_strp] ( .debug_str[0x0000005d] = "c1") + +%class.D = type { i32, i32, i32, i32 } + +@_ZN1DC1Ev = alias void (%class.D*)* @_ZN1DC2Ev +@_ZN1DC1ERKS_ = alias void (%class.D*, %class.D*)* @_ZN1DC2ERKS_ + +define void @_ZN1DC2Ev(%class.D* nocapture %this) unnamed_addr nounwind uwtable align 2 { +entry: + tail call void @llvm.dbg.value(metadata !{%class.D* %this}, i64 0, metadata !29), !dbg !36 + %c1 = getelementptr inbounds %class.D* %this, i64 0, i32 0, !dbg !37 + store i32 1, i32* %c1, align 4, !dbg !37, !tbaa !39 + %c2 = getelementptr inbounds %class.D* %this, i64 0, i32 1, !dbg !42 + store i32 2, i32* %c2, align 4, !dbg !42, !tbaa !39 + %c3 = getelementptr inbounds %class.D* %this, i64 0, i32 2, !dbg !43 + store i32 3, i32* %c3, align 4, !dbg !43, !tbaa !39 + %c4 = getelementptr inbounds %class.D* %this, i64 0, i32 3, !dbg !44 + store i32 4, i32* %c4, align 4, !dbg !44, !tbaa !39 + ret void, !dbg !45 +} + +define void @_ZN1DC2ERKS_(%class.D* nocapture %this, %class.D* nocapture %d) unnamed_addr nounwind uwtable align 2 { +entry: + tail call void 
@llvm.dbg.value(metadata !{%class.D* %this}, i64 0, metadata !34), !dbg !46 + tail call void @llvm.dbg.value(metadata !{%class.D* %d}, i64 0, metadata !35), !dbg !46 + %c1 = getelementptr inbounds %class.D* %d, i64 0, i32 0, !dbg !47 + %0 = load i32* %c1, align 4, !dbg !47, !tbaa !39 + %c12 = getelementptr inbounds %class.D* %this, i64 0, i32 0, !dbg !47 + store i32 %0, i32* %c12, align 4, !dbg !47, !tbaa !39 + %c2 = getelementptr inbounds %class.D* %d, i64 0, i32 1, !dbg !49 + %1 = load i32* %c2, align 4, !dbg !49, !tbaa !39 + %c23 = getelementptr inbounds %class.D* %this, i64 0, i32 1, !dbg !49 + store i32 %1, i32* %c23, align 4, !dbg !49, !tbaa !39 + %c3 = getelementptr inbounds %class.D* %d, i64 0, i32 2, !dbg !50 + %2 = load i32* %c3, align 4, !dbg !50, !tbaa !39 + %c34 = getelementptr inbounds %class.D* %this, i64 0, i32 2, !dbg !50 + store i32 %2, i32* %c34, align 4, !dbg !50, !tbaa !39 + %c4 = getelementptr inbounds %class.D* %d, i64 0, i32 3, !dbg !51 + %3 = load i32* %c4, align 4, !dbg !51, !tbaa !39 + %c45 = getelementptr inbounds %class.D* %this, i64 0, i32 3, !dbg !51 + store i32 %3, i32* %c45, align 4, !dbg !51, !tbaa !39 + ret void, !dbg !52 +} + +declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone + +!llvm.dbg.cu = !{!0} + +!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"foo.cpp", metadata !"/usr/local/google/home/echristo", metadata !"clang version 3.2 (trunk 167506) (llvm/trunk 167505)", i1 true, i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/foo.cpp] [DW_LANG_C_plus_plus] +!1 = metadata !{metadata !2} +!2 = metadata !{i32 0} +!3 = metadata !{metadata !4} +!4 = metadata !{metadata !5, metadata !31} +!5 = metadata !{i32 786478, i32 0, null, metadata !"D", metadata !"D", metadata !"_ZN1DC2Ev", metadata !6, i32 12, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (%class.D*)* @_ZN1DC2Ev, null, metadata !17, 
metadata !27, i32 12} ; [ DW_TAG_subprogram ] [line 12] [def] [D] +!6 = metadata !{i32 786473, metadata !"foo.cpp", metadata !"/usr/local/google/home/echristo", null} ; [ DW_TAG_file_type ] +!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!8 = metadata !{null, metadata !9} +!9 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from D] +!10 = metadata !{i32 786434, null, metadata !"D", metadata !6, i32 1, i64 128, i64 32, i32 0, i32 0, null, metadata !11, i32 0, null, null} ; [ DW_TAG_class_type ] [D] [line 1, size 128, align 32, offset 0] [from ] +!11 = metadata !{metadata !12, metadata !14, metadata !15, metadata !16, metadata !17, metadata !20} +!12 = metadata !{i32 786445, metadata !10, metadata !"c1", metadata !6, i32 6, i64 32, i64 32, i64 0, i32 1, metadata !13} ; [ DW_TAG_member ] [c1] [line 6, size 32, align 32, offset 0] [private] [from int] +!13 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed] +!14 = metadata !{i32 786445, metadata !10, metadata !"c2", metadata !6, i32 7, i64 32, i64 32, i64 32, i32 1, metadata !13} ; [ DW_TAG_member ] [c2] [line 7, size 32, align 32, offset 32] [private] [from int] +!15 = metadata !{i32 786445, metadata !10, metadata !"c3", metadata !6, i32 8, i64 32, i64 32, i64 64, i32 1, metadata !13} ; [ DW_TAG_member ] [c3] [line 8, size 32, align 32, offset 64] [private] [from int] +!16 = metadata !{i32 786445, metadata !10, metadata !"c4", metadata !6, i32 9, i64 32, i64 32, i64 96, i32 1, metadata !13} ; [ DW_TAG_member ] [c4] [line 9, size 32, align 32, offset 96] [private] [from int] +!17 = metadata !{i32 786478, i32 0, metadata 
!10, metadata !"D", metadata !"D", metadata !"", metadata !6, i32 3, metadata !7, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, metadata !18, i32 3} ; [ DW_TAG_subprogram ] [line 3] [D] +!18 = metadata !{metadata !19} +!19 = metadata !{i32 786468} ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0] +!20 = metadata !{i32 786478, i32 0, metadata !10, metadata !"D", metadata !"D", metadata !"", metadata !6, i32 4, metadata !21, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, metadata !25, i32 4} ; [ DW_TAG_subprogram ] [line 4] [D] +!21 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !22, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!22 = metadata !{null, metadata !9, metadata !23} +!23 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !24} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ] +!24 = metadata !{i32 786470, null, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !10} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from D] +!25 = metadata !{metadata !26} +!26 = metadata !{i32 786468} ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0] +!27 = metadata !{metadata !28} +!28 = metadata !{metadata !29} +!29 = metadata !{i32 786689, metadata !5, metadata !"this", metadata !6, i32 16777228, metadata !30, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 12] +!30 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from D] +!31 = metadata !{i32 786478, i32 0, null, metadata !"D", metadata !"D", metadata !"_ZN1DC2ERKS_", metadata !6, i32 19, metadata !21, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (%class.D*, %class.D*)* @_ZN1DC2ERKS_, null, metadata !20, metadata !32, i32 19} 
; [ DW_TAG_subprogram ] [line 19] [def] [D] +!32 = metadata !{metadata !33} +!33 = metadata !{metadata !34, metadata !35} +!34 = metadata !{i32 786689, metadata !31, metadata !"this", metadata !6, i32 16777235, metadata !30, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 19] +!35 = metadata !{i32 786689, metadata !31, metadata !"d", metadata !6, i32 33554451, metadata !23, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [d] [line 19] +!36 = metadata !{i32 12, i32 0, metadata !5, null} +!37 = metadata !{i32 13, i32 0, metadata !38, null} +!38 = metadata !{i32 786443, metadata !5, i32 12, i32 0, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/foo.cpp] +!39 = metadata !{metadata !"int", metadata !40} +!40 = metadata !{metadata !"omnipotent char", metadata !41} +!41 = metadata !{metadata !"Simple C/C++ TBAA"} +!42 = metadata !{i32 14, i32 0, metadata !38, null} +!43 = metadata !{i32 15, i32 0, metadata !38, null} +!44 = metadata !{i32 16, i32 0, metadata !38, null} +!45 = metadata !{i32 17, i32 0, metadata !38, null} +!46 = metadata !{i32 19, i32 0, metadata !31, null} +!47 = metadata !{i32 20, i32 0, metadata !48, null} +!48 = metadata !{i32 786443, metadata !31, i32 19, i32 0, metadata !6, i32 1} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/foo.cpp] +!49 = metadata !{i32 21, i32 0, metadata !48, null} +!50 = metadata !{i32 22, i32 0, metadata !48, null} +!51 = metadata !{i32 23, i32 0, metadata !48, null} +!52 = metadata !{i32 24, i32 0, metadata !48, null} diff --git a/test/DebugInfo/X86/enum-fwd-decl.ll b/test/DebugInfo/X86/enum-fwd-decl.ll index c2dacea..0902430 100644 --- a/test/DebugInfo/X86/enum-fwd-decl.ll +++ b/test/DebugInfo/X86/enum-fwd-decl.ll @@ -5,16 +5,14 @@ !llvm.dbg.cu = !{!0} -!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"foo.cpp", metadata !"/Users/echristo/tmp", metadata !"clang version 3.2 (trunk 157772) (llvm/trunk 157761)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !6, 
metadata !6, metadata !7} ; [ DW_TAG_compile_unit ] +!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"foo.cpp", metadata !"/tmp", metadata !"clang version 3.2 (trunk 165274) (llvm/trunk 165272)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3} ; [ DW_TAG_compile_unit ] [/tmp/foo.cpp] [DW_LANG_C_plus_plus] !1 = metadata !{metadata !2} -!2 = metadata !{metadata !3} -!3 = metadata !{i32 786436, null, metadata !"E", metadata !4, i32 1, i64 16, i64 16, i32 0, i32 4, null, metadata !5, i32 0, i32 0} ; [ DW_TAG_enumeration_type ] -!4 = metadata !{i32 786473, metadata !"foo.cpp", metadata !"/Users/echristo/tmp", null} ; [ DW_TAG_file_type ] -!5 = metadata !{i32 0} -!6 = metadata !{metadata !5} -!7 = metadata !{metadata !8} -!8 = metadata !{metadata !9} -!9 = metadata !{i32 786484, i32 0, null, metadata !"e", metadata !"e", metadata !"", metadata !4, i32 2, metadata !3, i32 0, i32 1, i16* @e} ; [ DW_TAG_variable ] +!2 = metadata !{i32 0} +!3 = metadata !{metadata !4} +!4 = metadata !{metadata !5} +!5 = metadata !{i32 786484, i32 0, null, metadata !"e", metadata !"e", metadata !"", metadata !6, i32 2, metadata !7, i32 0, i32 1, i16* @e} ; [ DW_TAG_variable ] [e] [line 2] [def] +!6 = metadata !{i32 786473, metadata !"foo.cpp", metadata !"/tmp", null} ; [ DW_TAG_file_type ] +!7 = metadata !{i32 786436, null, metadata !"E", metadata !6, i32 1, i64 16, i64 16, i32 0, i32 4, null, null, i32 0} ; [ DW_TAG_enumeration_type ] [E] [line 1, size 16, align 16, offset 0] [fwd] [from ] ; CHECK: DW_TAG_enumeration_type ; CHECK-NEXT: DW_AT_name diff --git a/test/DebugInfo/X86/linkage-name.ll b/test/DebugInfo/X86/linkage-name.ll new file mode 100644 index 0000000..b984923 --- /dev/null +++ b/test/DebugInfo/X86/linkage-name.ll @@ -0,0 +1,56 @@ +; RUN: llc -mtriple=x86_64-macosx -darwin-gdb-compat=Disable %s -o %t -filetype=obj +; RUN: llvm-dwarfdump %t | FileCheck %s + +; CHECK: DW_TAG_subprogram [9] * +; CHECK-NOT: DW_AT_MIPS_linkage_name +; 
CHECK: DW_AT_specification + +%class.A = type { i8 } + +@a = global %class.A zeroinitializer, align 1 + +define i32 @_ZN1A1aEi(%class.A* %this, i32 %b) nounwind uwtable ssp align 2 { +entry: + %this.addr = alloca %class.A*, align 8 + %b.addr = alloca i32, align 4 + store %class.A* %this, %class.A** %this.addr, align 8 + call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !21), !dbg !23 + store i32 %b, i32* %b.addr, align 4 + call void @llvm.dbg.declare(metadata !{i32* %b.addr}, metadata !24), !dbg !25 + %this1 = load %class.A** %this.addr + %0 = load i32* %b.addr, align 4, !dbg !26 + ret i32 %0, !dbg !26 +} + +declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone + +!llvm.dbg.cu = !{!0} + +!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"foo.cpp", metadata !"/Users/echristo", metadata !"clang version 3.1 (trunk 152691) (llvm/trunk 152692)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !18} ; [ DW_TAG_compile_unit ] +!1 = metadata !{metadata !2} +!2 = metadata !{i32 0} +!3 = metadata !{metadata !4} +!4 = metadata !{metadata !5} +!5 = metadata !{i32 786478, i32 0, null, metadata !"a", metadata !"a", metadata !"_ZN1A1aEi", metadata !6, i32 5, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (%class.A*, i32)* @_ZN1A1aEi, null, metadata !13, metadata !16} ; [ DW_TAG_subprogram ] +!6 = metadata !{i32 786473, metadata !"foo.cpp", metadata !"/Users/echristo", null} ; [ DW_TAG_file_type ] +!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] +!8 = metadata !{metadata !9, metadata !10, metadata !9} +!9 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] +!10 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !11} ; [ DW_TAG_pointer_type ] +!11 = metadata 
!{i32 786434, null, metadata !"A", metadata !6, i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !12, i32 0, null, null} ; [ DW_TAG_class_type ] +!12 = metadata !{metadata !13} +!13 = metadata !{i32 786478, i32 0, metadata !11, metadata !"a", metadata !"a", metadata !"_ZN1A1aEi", metadata !6, i32 2, metadata !7, i1 false, i1 false, i32 0, i32 0, null, i32 257, i1 false, null, null, i32 0, metadata !14} ; [ DW_TAG_subprogram ] +!14 = metadata !{metadata !15} +!15 = metadata !{i32 786468} ; [ DW_TAG_base_type ] +!16 = metadata !{metadata !17} +!17 = metadata !{i32 786468} ; [ DW_TAG_base_type ] +!18 = metadata !{metadata !19} +!19 = metadata !{metadata !20} +!20 = metadata !{i32 786484, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !6, i32 9, metadata !11, i32 0, i32 1, %class.A* @a} ; [ DW_TAG_variable ] +!21 = metadata !{i32 786689, metadata !5, metadata !"this", metadata !6, i32 16777221, metadata !22, i32 64, i32 0} ; [ DW_TAG_arg_variable ] +!22 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ] +!23 = metadata !{i32 5, i32 8, metadata !5, null} +!24 = metadata !{i32 786689, metadata !5, metadata !"b", metadata !6, i32 33554437, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] +!25 = metadata !{i32 5, i32 14, metadata !5, null} +!26 = metadata !{i32 6, i32 4, metadata !27, null} +!27 = metadata !{i32 786443, metadata !5, i32 5, i32 17, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] diff --git a/test/DebugInfo/X86/pr13303.ll b/test/DebugInfo/X86/pr13303.ll deleted file mode 100644 index e820cb5..0000000 --- a/test/DebugInfo/X86/pr13303.ll +++ /dev/null @@ -1,28 +0,0 @@ -; RUN: llc %s -o %t -filetype=obj -mtriple=x86_64-unknown-linux-gnu -; RUN: llvm-dwarfdump %t | FileCheck %s -; PR13303 - -; Check that the prologue ends with is_stmt here. 
-; CHECK: 0x0000000000000000 {{.*}} is_stmt - -define i32 @main() nounwind uwtable { -entry: - %retval = alloca i32, align 4 - store i32 0, i32* %retval - ret i32 0, !dbg !10 -} - -!llvm.dbg.cu = !{!0} - -!0 = metadata !{i32 786449, i32 0, i32 12, metadata !"PR13303.c", metadata !"/home/probinson", metadata !"clang version 3.2 (trunk 160143)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [/home/probinson/PR13303.c] [DW_LANG_C99] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5} -!5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"main", metadata !"main", metadata !"", metadata !6, i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @main, null, null, metadata !1, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [main] -!6 = metadata !{i32 786473, metadata !"PR13303.c", metadata !"/home/probinson", null} ; [ DW_TAG_file_type ] -!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] -!8 = metadata !{metadata !9} -!9 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed] -!10 = metadata !{i32 1, i32 14, metadata !11, null} -!11 = metadata !{i32 786443, metadata !5, i32 1, i32 12, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] [/home/probinson/PR13303.c] diff --git a/test/DebugInfo/X86/prologue-stack.ll b/test/DebugInfo/X86/prologue-stack.ll new file mode 100644 index 0000000..929db51 --- /dev/null +++ b/test/DebugInfo/X86/prologue-stack.ll @@ -0,0 +1,35 @@ +; RUN: llc -disable-fp-elim -O0 %s -mtriple x86_64-unknown-linux-gnu -o - | FileCheck %s + +; int callme(int); +; int isel_line_test2() { +; callme(400); +; return 0; +; } + 
+define i32 @isel_line_test2() nounwind uwtable { + ; The stack adjustment should be part of the prologue. + ; CHECK: isel_line_test2: + ; CHECK: {{subq|leaq}} {{.*}}, %rsp + ; CHECK: .loc 1 5 3 prologue_end +entry: + %call = call i32 @callme(i32 400), !dbg !10 + ret i32 0, !dbg !12 +} + +declare i32 @callme(i32) + +!llvm.dbg.cu = !{!0} + +!0 = metadata !{i32 786449, i32 0, i32 12, metadata !"bar.c", metadata !"/usr/local/google/home/echristo/tmp", metadata !"clang version 3.2 (trunk 164980) (llvm/trunk 164979)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/bar.c] [DW_LANG_C99] +!1 = metadata !{metadata !2} +!2 = metadata !{i32 0} +!3 = metadata !{metadata !4} +!4 = metadata !{metadata !5} +!5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"isel_line_test2", metadata !"isel_line_test2", metadata !"", metadata !6, i32 3, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @isel_line_test2, null, null, metadata !1, i32 4} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 4] [isel_line_test2] +!6 = metadata !{i32 786473, metadata !"bar.c", metadata !"/usr/local/google/home/echristo/tmp", null} ; [ DW_TAG_file_type ] +!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!8 = metadata !{metadata !9} +!9 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed] +!10 = metadata !{i32 5, i32 3, metadata !11, null} +!11 = metadata !{i32 786443, metadata !5, i32 4, i32 1, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/bar.c] +!12 = metadata !{i32 6, i32 3, metadata !11, null} diff --git a/test/DebugInfo/X86/stringpool.ll 
b/test/DebugInfo/X86/stringpool.ll index 2cd1001..caf12c2 100644 --- a/test/DebugInfo/X86/stringpool.ll +++ b/test/DebugInfo/X86/stringpool.ll @@ -16,8 +16,8 @@ ; Verify that we refer to 'yyyy' with a relocation. ; LINUX: .long .Lstring3 # DW_AT_name -; LINUX-NEXT: .long 39 # DW_AT_type -; LINUX-NEXT: .byte 1 # DW_AT_external +; LINUX-NEXT: .long 38 # DW_AT_type +; LINUX-NEXT: # DW_AT_external ; LINUX-NEXT: .byte 1 # DW_AT_decl_file ; LINUX-NEXT: .byte 1 # DW_AT_decl_line ; LINUX-NEXT: .byte 9 # DW_AT_location diff --git a/test/DebugInfo/bug_null_debuginfo.ll b/test/DebugInfo/bug_null_debuginfo.ll index a7fdf70..b17affe 100644 --- a/test/DebugInfo/bug_null_debuginfo.ll +++ b/test/DebugInfo/bug_null_debuginfo.ll @@ -1,5 +1,4 @@ -; RUN: llc - +; RUN: llc < %s !llvm.dbg.cu = !{!0} diff --git a/test/DebugInfo/dwarfdump-inlining.test b/test/DebugInfo/dwarfdump-inlining.test new file mode 100644 index 0000000..d3a7e12 --- /dev/null +++ b/test/DebugInfo/dwarfdump-inlining.test @@ -0,0 +1,28 @@ +RUN: llvm-dwarfdump %p/Inputs/dwarfdump-inl-test.elf-x86-64 --address=0x613 \ +RUN: --inlining --functions | FileCheck %s -check-prefix DEEP_STACK +RUN: llvm-dwarfdump %p/Inputs/dwarfdump-inl-test.elf-x86-64 --address=0x6de \ +RUN: --inlining | FileCheck %s -check-prefix SHORTER_STACK +RUN: llvm-dwarfdump %p/Inputs/dwarfdump-inl-test.elf-x86-64 --address=0x685 \ +RUN: --inlining | FileCheck %s -check-prefix SHORT_STACK +RUN: llvm-dwarfdump %p/Inputs/dwarfdump-inl-test.elf-x86-64 --address=0x640 \ +RUN: --functions | FileCheck %s -check-prefix INL_FUNC_NAME + +DEEP_STACK: inlined_h +DEEP_STACK-NEXT: header.h:2:21 +DEEP_STACK-NEXT: inlined_g +DEEP_STACK-NEXT: header.h:7 +DEEP_STACK-NEXT: inlined_f +DEEP_STACK-NEXT: main.cc:3 +DEEP_STACK-NEXT: main +DEEP_STACK-NEXT: main.cc:8 + +SHORTER_STACK: header.h:7:20 +SHORTER_STACK-NEXT: main.cc:3 +SHORTER_STACK-NEXT: main.cc:8 + +SHORT_STACK: main.cc:3:20 +SHORT_STACK-NEXT: main.cc:8 + +INL_FUNC_NAME: inlined_g +INL_FUNC_NAME-NEXT: 
header.h:7:20 + diff --git a/test/DebugInfo/dwarfdump-test.test b/test/DebugInfo/dwarfdump-test.test index de23dcd..973c344 100644 --- a/test/DebugInfo/dwarfdump-test.test +++ b/test/DebugInfo/dwarfdump-test.test @@ -17,6 +17,8 @@ RUN: --address=0x56d --functions | FileCheck %s -check-prefix INCLUDE_TEST_2 RUN: llvm-dwarfdump %p/Inputs/dwarfdump-test4.elf-x86-64 \ RUN: --address=0x55c --functions \ RUN: | FileCheck %s -check-prefix MANY_SEQ_IN_LINE_TABLE +RUN: llvm-dwarfdump %p/Inputs/dwarfdump-test4.elf-x86-64 \ +RUN: | FileCheck %s -check-prefix DEBUG_RANGES MAIN: main MAIN-NEXT: /tmp/dbginfo{{[/\\]}}dwarfdump-test.cc:16:10 @@ -44,3 +46,11 @@ INCLUDE_TEST_2-NEXT: /tmp/include{{[/\\]}}decl.h:5:0 MANY_SEQ_IN_LINE_TABLE: _Z1cv MANY_SEQ_IN_LINE_TABLE-NEXT: /tmp/dbginfo/sequences{{[/\\]}}c.cc:2:0 + +DEBUG_RANGES: .debug_ranges contents: +DEBUG_RANGES-NEXT: 00000000 000000000000055c 0000000000000567 +DEBUG_RANGES-NEXT: 00000000 0000000000000567 000000000000056d +DEBUG_RANGES-NEXT: 00000000 <End of list> +DEBUG_RANGES-NEXT: 00000030 0000000000000570 000000000000057b +DEBUG_RANGES-NEXT: 00000030 0000000000000567 000000000000056d +DEBUG_RANGES-NEXT: 00000030 <End of list> diff --git a/test/ExecutionEngine/2002-12-16-ArgTest.ll b/test/ExecutionEngine/2002-12-16-ArgTest.ll index eb2fe8c..4c03519 100644 --- a/test/ExecutionEngine/2002-12-16-ArgTest.ll +++ b/test/ExecutionEngine/2002-12-16-ArgTest.ll @@ -1,4 +1,5 @@ ; RUN: %lli %s > /dev/null +; XFAIL: arm @.LC0 = internal global [10 x i8] c"argc: %d\0A\00" ; <[10 x i8]*> [#uses=1] diff --git a/test/ExecutionEngine/MCJIT/2002-12-16-ArgTest.ll b/test/ExecutionEngine/MCJIT/2002-12-16-ArgTest.ll index 46273d3..28cc54a 100644 --- a/test/ExecutionEngine/MCJIT/2002-12-16-ArgTest.ll +++ b/test/ExecutionEngine/MCJIT/2002-12-16-ArgTest.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null @.LC0 = internal global [10 x i8] c"argc: %d\0A\00" ; <[10 x i8]*> [#uses=1] diff 
--git a/test/ExecutionEngine/MCJIT/2003-01-04-ArgumentBug.ll b/test/ExecutionEngine/MCJIT/2003-01-04-ArgumentBug.ll index 88bfbb3..9f89598 100644 --- a/test/ExecutionEngine/MCJIT/2003-01-04-ArgumentBug.ll +++ b/test/ExecutionEngine/MCJIT/2003-01-04-ArgumentBug.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null define i32 @foo(i32 %X, i32 %Y, double %A) { %cond212 = fcmp une double %A, 1.000000e+00 ; <i1> [#uses=1] diff --git a/test/ExecutionEngine/MCJIT/2003-01-04-LoopTest.ll b/test/ExecutionEngine/MCJIT/2003-01-04-LoopTest.ll index d5f860d..997b2a9 100644 --- a/test/ExecutionEngine/MCJIT/2003-01-04-LoopTest.ll +++ b/test/ExecutionEngine/MCJIT/2003-01-04-LoopTest.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null define i32 @main() { call i32 @mylog( i32 4 ) ; <i32>:1 [#uses=0] diff --git a/test/ExecutionEngine/MCJIT/2003-01-04-PhiTest.ll b/test/ExecutionEngine/MCJIT/2003-01-04-PhiTest.ll index 721f2e8..ba35b5b 100644 --- a/test/ExecutionEngine/MCJIT/2003-01-04-PhiTest.ll +++ b/test/ExecutionEngine/MCJIT/2003-01-04-PhiTest.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null define i32 @main() { ; <label>:0 diff --git a/test/ExecutionEngine/MCJIT/2003-01-09-SARTest.ll b/test/ExecutionEngine/MCJIT/2003-01-09-SARTest.ll index d17df99..f3c88ad 100644 --- a/test/ExecutionEngine/MCJIT/2003-01-09-SARTest.ll +++ b/test/ExecutionEngine/MCJIT/2003-01-09-SARTest.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null ; We were accidentally inverting the signedness of right shifts. Whoops. 
diff --git a/test/ExecutionEngine/MCJIT/2003-01-10-FUCOM.ll b/test/ExecutionEngine/MCJIT/2003-01-10-FUCOM.ll index e55cb06..f925e79 100644 --- a/test/ExecutionEngine/MCJIT/2003-01-10-FUCOM.ll +++ b/test/ExecutionEngine/MCJIT/2003-01-10-FUCOM.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null define i32 @main() { %X = fadd double 0.000000e+00, 1.000000e+00 ; <double> [#uses=1] diff --git a/test/ExecutionEngine/MCJIT/2003-01-15-AlignmentTest.ll b/test/ExecutionEngine/MCJIT/2003-01-15-AlignmentTest.ll index 663dc40..5b426f6 100644 --- a/test/ExecutionEngine/MCJIT/2003-01-15-AlignmentTest.ll +++ b/test/ExecutionEngine/MCJIT/2003-01-15-AlignmentTest.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null define i32 @bar(i8* %X) { ; pointer should be 4 byte aligned! diff --git a/test/ExecutionEngine/MCJIT/2003-05-06-LivenessClobber.ll b/test/ExecutionEngine/MCJIT/2003-05-06-LivenessClobber.ll index e95294b..c0a7393 100644 --- a/test/ExecutionEngine/MCJIT/2003-05-06-LivenessClobber.ll +++ b/test/ExecutionEngine/MCJIT/2003-05-06-LivenessClobber.ll @@ -1,6 +1,6 @@ ; This testcase should return with an exit code of 1. 
; -; RUN: not %lli -use-mcjit %s +; RUN: not %lli -mtriple=%mcjit_triple -use-mcjit %s @test = global i64 0 ; <i64*> [#uses=1] diff --git a/test/ExecutionEngine/MCJIT/2003-05-07-ArgumentTest.ll b/test/ExecutionEngine/MCJIT/2003-05-07-ArgumentTest.ll index a237194..d3e6204 100644 --- a/test/ExecutionEngine/MCJIT/2003-05-07-ArgumentTest.ll +++ b/test/ExecutionEngine/MCJIT/2003-05-07-ArgumentTest.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s test +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s test declare i32 @puts(i8*) diff --git a/test/ExecutionEngine/MCJIT/2003-05-11-PHIRegAllocBug.ll b/test/ExecutionEngine/MCJIT/2003-05-11-PHIRegAllocBug.ll index 70464a3..55a1697 100644 --- a/test/ExecutionEngine/MCJIT/2003-05-11-PHIRegAllocBug.ll +++ b/test/ExecutionEngine/MCJIT/2003-05-11-PHIRegAllocBug.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null target datalayout = "e-p:32:32" diff --git a/test/ExecutionEngine/MCJIT/2003-06-04-bzip2-bug.ll b/test/ExecutionEngine/MCJIT/2003-06-04-bzip2-bug.ll index 58d423f..79c6e7f 100644 --- a/test/ExecutionEngine/MCJIT/2003-06-04-bzip2-bug.ll +++ b/test/ExecutionEngine/MCJIT/2003-06-04-bzip2-bug.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null ; Testcase distilled from 256.bzip2. diff --git a/test/ExecutionEngine/MCJIT/2003-06-05-PHIBug.ll b/test/ExecutionEngine/MCJIT/2003-06-05-PHIBug.ll index a22fe07..ffd6df6 100644 --- a/test/ExecutionEngine/MCJIT/2003-06-05-PHIBug.ll +++ b/test/ExecutionEngine/MCJIT/2003-06-05-PHIBug.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null ; Testcase distilled from 256.bzip2. 
diff --git a/test/ExecutionEngine/MCJIT/2003-08-15-AllocaAssertion.ll b/test/ExecutionEngine/MCJIT/2003-08-15-AllocaAssertion.ll index b3c6d8a..90839e9 100644 --- a/test/ExecutionEngine/MCJIT/2003-08-15-AllocaAssertion.ll +++ b/test/ExecutionEngine/MCJIT/2003-08-15-AllocaAssertion.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null ; This testcase failed to work because two variable sized allocas confused the ; local register allocator. diff --git a/test/ExecutionEngine/MCJIT/2003-08-21-EnvironmentTest.ll b/test/ExecutionEngine/MCJIT/2003-08-21-EnvironmentTest.ll index bd32f30..29ef2c5 100644 --- a/test/ExecutionEngine/MCJIT/2003-08-21-EnvironmentTest.ll +++ b/test/ExecutionEngine/MCJIT/2003-08-21-EnvironmentTest.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null ; ; Regression Test: EnvironmentTest.ll diff --git a/test/ExecutionEngine/MCJIT/2003-08-23-RegisterAllocatePhysReg.ll b/test/ExecutionEngine/MCJIT/2003-08-23-RegisterAllocatePhysReg.ll index 1959534..2adb608 100644 --- a/test/ExecutionEngine/MCJIT/2003-08-23-RegisterAllocatePhysReg.ll +++ b/test/ExecutionEngine/MCJIT/2003-08-23-RegisterAllocatePhysReg.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null ; This testcase exposes a bug in the local register allocator where it runs out ; of registers (due to too many overlapping live ranges), but then attempts to diff --git a/test/ExecutionEngine/MCJIT/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll b/test/ExecutionEngine/MCJIT/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll index 1f8343f..91bde46 100644 --- a/test/ExecutionEngine/MCJIT/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll +++ b/test/ExecutionEngine/MCJIT/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli 
-mtriple=%mcjit_triple -use-mcjit %s > /dev/null @A = global i32 0 ; <i32*> [#uses=1] diff --git a/test/ExecutionEngine/MCJIT/2005-12-02-TailCallBug.ll b/test/ExecutionEngine/MCJIT/2005-12-02-TailCallBug.ll index 79a7d58..a7462d9 100644 --- a/test/ExecutionEngine/MCJIT/2005-12-02-TailCallBug.ll +++ b/test/ExecutionEngine/MCJIT/2005-12-02-TailCallBug.ll @@ -1,5 +1,5 @@ ; PR672 -; RUN: %lli -use-mcjit %s +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s ; XFAIL: mcjit-ia32 define i32 @main() { diff --git a/test/ExecutionEngine/MCJIT/2007-12-10-APIntLoadStore.ll b/test/ExecutionEngine/MCJIT/2007-12-10-APIntLoadStore.ll index 52cef4d..2406596 100644 --- a/test/ExecutionEngine/MCJIT/2007-12-10-APIntLoadStore.ll +++ b/test/ExecutionEngine/MCJIT/2007-12-10-APIntLoadStore.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit -force-interpreter %s +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -force-interpreter %s ; PR1836 define i32 @main() { diff --git a/test/ExecutionEngine/MCJIT/2008-06-05-APInt-OverAShr.ll b/test/ExecutionEngine/MCJIT/2008-06-05-APInt-OverAShr.ll index a6e917f..d429d51 100644 --- a/test/ExecutionEngine/MCJIT/2008-06-05-APInt-OverAShr.ll +++ b/test/ExecutionEngine/MCJIT/2008-06-05-APInt-OverAShr.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit -force-interpreter=true %s | grep 1 +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -force-interpreter=true %s | grep 1 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32" target triple = "i686-pc-linux-gnu" diff --git a/test/ExecutionEngine/MCJIT/2010-01-15-UndefValue.ll b/test/ExecutionEngine/MCJIT/2010-01-15-UndefValue.ll index 524a724..a6d18e7 100644 --- a/test/ExecutionEngine/MCJIT/2010-01-15-UndefValue.ll +++ b/test/ExecutionEngine/MCJIT/2010-01-15-UndefValue.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit -force-interpreter=true %s +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -force-interpreter=true %s > /dev/null define i32 @main() { %a = 
add i32 0, undef diff --git a/test/ExecutionEngine/MCJIT/fpbitcast.ll b/test/ExecutionEngine/MCJIT/fpbitcast.ll index 9da908f..bb4957e 100644 --- a/test/ExecutionEngine/MCJIT/fpbitcast.ll +++ b/test/ExecutionEngine/MCJIT/fpbitcast.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit -force-interpreter=true %s | grep 40091eb8 +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -force-interpreter=true %s | grep 40091eb8 ; define i32 @test(double %x) { entry: diff --git a/test/ExecutionEngine/MCJIT/hello.ll b/test/ExecutionEngine/MCJIT/hello.ll index a52b6d4..ceb9c12 100644 --- a/test/ExecutionEngine/MCJIT/hello.ll +++ b/test/ExecutionEngine/MCJIT/hello.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null @.LC0 = internal global [12 x i8] c"Hello World\00" ; <[12 x i8]*> [#uses=1] diff --git a/test/ExecutionEngine/MCJIT/hello2.ll b/test/ExecutionEngine/MCJIT/hello2.ll index 670a6dd..756fcad 100644 --- a/test/ExecutionEngine/MCJIT/hello2.ll +++ b/test/ExecutionEngine/MCJIT/hello2.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null @X = global i32 7 ; <i32*> [#uses=0] @msg = internal global [13 x i8] c"Hello World\0A\00" ; <[13 x i8]*> [#uses=1] diff --git a/test/ExecutionEngine/MCJIT/lit.local.cfg b/test/ExecutionEngine/MCJIT/lit.local.cfg index 2980ce7..fc29f65 100644 --- a/test/ExecutionEngine/MCJIT/lit.local.cfg +++ b/test/ExecutionEngine/MCJIT/lit.local.cfg @@ -8,13 +8,17 @@ def getRoot(config): root = getRoot(config) targets = set(root.targets_to_build.split()) -if ('X86' in targets) | ('ARM' in targets) | ('Mips' in targets): +if ('X86' in targets) | ('ARM' in targets) | ('Mips' in targets) | \ + ('PowerPC' in targets): config.unsupported = False else: config.unsupported = True -if root.host_arch not in ['x86', 'x86_64', 'ARM', 'Mips']: +if root.host_arch not in ['x86', 'x86_64', 'ARM', 'Mips', 'PowerPC']: config.unsupported = True -if 
root.host_os in ['Win32', 'Cygwin', 'MingW', 'Windows', 'Darwin']: +if root.host_os in ['Darwin']: + config.unsupported = True + +if 'powerpc' in root.target_triple and not 'powerpc64' in root.target_triple: config.unsupported = True diff --git a/test/ExecutionEngine/MCJIT/pr13727.ll b/test/ExecutionEngine/MCJIT/pr13727.ll new file mode 100644 index 0000000..c33bf32 --- /dev/null +++ b/test/ExecutionEngine/MCJIT/pr13727.ll @@ -0,0 +1,88 @@ +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -O0 -disable-lazy-compilation=false %s + +; The intention of this test is to verify that symbols mapped to COMMON in ELF +; work as expected. +; +; Compiled from this C code: +; +; int zero_int; +; double zero_double; +; int zero_arr[10]; +; +; int main() +; { +; zero_arr[zero_int + 5] = 40; +; +; if (zero_double < 1.1) +; zero_arr[zero_int + 2] = 70; +; +; for (int i = 1; i < 10; ++i) { +; zero_arr[i] = zero_arr[i - 1] + zero_arr[i]; +; } +; return zero_arr[9] == 110 ? 0 : -1; +; } + +@zero_int = common global i32 0, align 4 +@zero_arr = common global [10 x i32] zeroinitializer, align 16 +@zero_double = common global double 0.000000e+00, align 8 + +define i32 @main() nounwind { +entry: + %retval = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 0, i32* %retval + %0 = load i32* @zero_int, align 4 + %add = add nsw i32 %0, 5 + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom + store i32 40, i32* %arrayidx, align 4 + %1 = load double* @zero_double, align 8 + %cmp = fcmp olt double %1, 1.100000e+00 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %2 = load i32* @zero_int, align 4 + %add1 = add nsw i32 %2, 2 + %idxprom2 = sext i32 %add1 to i64 + %arrayidx3 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom2 + store i32 70, i32* %arrayidx3, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + store i32 1, i32* %i, align 4 + br label %for.cond + +for.cond: ; 
preds = %for.inc, %if.end + %3 = load i32* %i, align 4 + %cmp4 = icmp slt i32 %3, 10 + br i1 %cmp4, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %4 = load i32* %i, align 4 + %sub = sub nsw i32 %4, 1 + %idxprom5 = sext i32 %sub to i64 + %arrayidx6 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom5 + %5 = load i32* %arrayidx6, align 4 + %6 = load i32* %i, align 4 + %idxprom7 = sext i32 %6 to i64 + %arrayidx8 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom7 + %7 = load i32* %arrayidx8, align 4 + %add9 = add nsw i32 %5, %7 + %8 = load i32* %i, align 4 + %idxprom10 = sext i32 %8 to i64 + %arrayidx11 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom10 + store i32 %add9, i32* %arrayidx11, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %9 = load i32* %i, align 4 + %inc = add nsw i32 %9, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %10 = load i32* getelementptr inbounds ([10 x i32]* @zero_arr, i32 0, i64 9), align 4 + %cmp12 = icmp eq i32 %10, 110 + %cond = select i1 %cmp12, i32 0, i32 -1 + ret i32 %cond +} diff --git a/test/ExecutionEngine/MCJIT/simplesttest.ll b/test/ExecutionEngine/MCJIT/simplesttest.ll index a6688c2..02ad006 100644 --- a/test/ExecutionEngine/MCJIT/simplesttest.ll +++ b/test/ExecutionEngine/MCJIT/simplesttest.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null define i32 @main() { ret i32 0 diff --git a/test/ExecutionEngine/MCJIT/simpletest.ll b/test/ExecutionEngine/MCJIT/simpletest.ll index 4562aa6..958b783 100644 --- a/test/ExecutionEngine/MCJIT/simpletest.ll +++ b/test/ExecutionEngine/MCJIT/simpletest.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null define i32 @bar() { ret i32 0 diff --git a/test/ExecutionEngine/MCJIT/stubs.ll b/test/ExecutionEngine/MCJIT/stubs.ll index 
b285b0e..9e5d5b2 100644 --- a/test/ExecutionEngine/MCJIT/stubs.ll +++ b/test/ExecutionEngine/MCJIT/stubs.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit -disable-lazy-compilation=false %s +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -disable-lazy-compilation=false %s define i32 @main() nounwind { entry: diff --git a/test/ExecutionEngine/MCJIT/test-arith.ll b/test/ExecutionEngine/MCJIT/test-arith.ll index 3177760..b73227f 100644 --- a/test/ExecutionEngine/MCJIT/test-arith.ll +++ b/test/ExecutionEngine/MCJIT/test-arith.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null define i32 @main() { %A = add i8 0, 12 ; <i8> [#uses=1] diff --git a/test/ExecutionEngine/MCJIT/test-branch.ll b/test/ExecutionEngine/MCJIT/test-branch.ll index 702c110..8f3c727 100644 --- a/test/ExecutionEngine/MCJIT/test-branch.ll +++ b/test/ExecutionEngine/MCJIT/test-branch.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null ; test unconditional branch define i32 @main() { diff --git a/test/ExecutionEngine/MCJIT/test-call-no-external-funcs.ll b/test/ExecutionEngine/MCJIT/test-call-no-external-funcs.ll index 6f28405..20150b2 100644 --- a/test/ExecutionEngine/MCJIT/test-call-no-external-funcs.ll +++ b/test/ExecutionEngine/MCJIT/test-call-no-external-funcs.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null define i32 @_Z14func_exit_codev() nounwind uwtable { entry: diff --git a/test/ExecutionEngine/MCJIT/test-call.ll b/test/ExecutionEngine/MCJIT/test-call.ll index 7a244ee..51d19fe 100644 --- a/test/ExecutionEngine/MCJIT/test-call.ll +++ b/test/ExecutionEngine/MCJIT/test-call.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null declare void @exit(i32) diff --git a/test/ExecutionEngine/MCJIT/test-cast.ll 
b/test/ExecutionEngine/MCJIT/test-cast.ll index 75e7d1b..dcc97f4 100644 --- a/test/ExecutionEngine/MCJIT/test-cast.ll +++ b/test/ExecutionEngine/MCJIT/test-cast.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null define i32 @foo() { ret i32 0 diff --git a/test/ExecutionEngine/MCJIT/test-common-symbols-alignment.ll b/test/ExecutionEngine/MCJIT/test-common-symbols-alignment.ll new file mode 100644 index 0000000..d666a2a --- /dev/null +++ b/test/ExecutionEngine/MCJIT/test-common-symbols-alignment.ll @@ -0,0 +1,32 @@ +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -O0 %s + +; This test checks that common symbols have been allocated addresses honouring +; the alignment requirement. + +@CS1 = common global i32 0, align 16 +@CS2 = common global i8 0, align 1 +@CS3 = common global i32 0, align 16 + +define i32 @main() nounwind { +entry: + %retval = alloca i32, align 4 + %ptr = alloca i32, align 4 + store i32 0, i32* %retval + store i32 ptrtoint (i32* @CS3 to i32), i32* %ptr, align 4 + %0 = load i32* %ptr, align 4 + %and = and i32 %0, 15 + %tobool = icmp ne i32 %and, 0 + br i1 %tobool, label %if.then, label %if.else + +if.then: ; preds = %entry + store i32 1, i32* %retval + br label %return + +if.else: ; preds = %entry + store i32 0, i32* %retval + br label %return + +return: ; preds = %if.else, %if.then + %1 = load i32* %retval + ret i32 %1 +} diff --git a/test/ExecutionEngine/MCJIT/test-common-symbols.ll b/test/ExecutionEngine/MCJIT/test-common-symbols.ll index ac1d9ac..8c81902 100644 --- a/test/ExecutionEngine/MCJIT/test-common-symbols.ll +++ b/test/ExecutionEngine/MCJIT/test-common-symbols.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit -O0 -disable-lazy-compilation=false %s +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -O0 -disable-lazy-compilation=false %s ; The intention of this test is to verify that symbols mapped to COMMON in ELF ; work as expected. 
diff --git a/test/ExecutionEngine/MCJIT/test-constantexpr.ll b/test/ExecutionEngine/MCJIT/test-constantexpr.ll index 6b46639..56c1290 100644 --- a/test/ExecutionEngine/MCJIT/test-constantexpr.ll +++ b/test/ExecutionEngine/MCJIT/test-constantexpr.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null ; This tests to make sure that we can evaluate weird constant expressions diff --git a/test/ExecutionEngine/MCJIT/test-data-align.ll b/test/ExecutionEngine/MCJIT/test-data-align.ll new file mode 100644 index 0000000..0493cba --- /dev/null +++ b/test/ExecutionEngine/MCJIT/test-data-align.ll @@ -0,0 +1,15 @@ +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -O0 %s + +; Check that a variable is always aligned as specified. + +@var = global i32 0, align 32 +define i32 @main() { + %addr = ptrtoint i32* @var to i64 + %mask = and i64 %addr, 31 + %tst = icmp eq i64 %mask, 0 + br i1 %tst, label %good, label %bad +good: + ret i32 0 +bad: + ret i32 1 +} diff --git a/test/ExecutionEngine/MCJIT/test-fp-no-external-funcs.ll b/test/ExecutionEngine/MCJIT/test-fp-no-external-funcs.ll index 35491df..7af1d8b 100644 --- a/test/ExecutionEngine/MCJIT/test-fp-no-external-funcs.ll +++ b/test/ExecutionEngine/MCJIT/test-fp-no-external-funcs.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null define double @test(double* %DP, double %Arg) { %D = load double* %DP ; <double> [#uses=1] diff --git a/test/ExecutionEngine/MCJIT/test-fp.ll b/test/ExecutionEngine/MCJIT/test-fp.ll index 6fc5a50..f7e6fb9 100644 --- a/test/ExecutionEngine/MCJIT/test-fp.ll +++ b/test/ExecutionEngine/MCJIT/test-fp.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null define double @test(double* %DP, double %Arg) { %D = load double* %DP ; <double> [#uses=1] diff --git a/test/ExecutionEngine/MCJIT/test-global-init-nonzero.ll 
b/test/ExecutionEngine/MCJIT/test-global-init-nonzero.ll index 4a790c6..ec6cbad 100644 --- a/test/ExecutionEngine/MCJIT/test-global-init-nonzero.ll +++ b/test/ExecutionEngine/MCJIT/test-global-init-nonzero.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null @count = global i32 1, align 4 diff --git a/test/ExecutionEngine/MCJIT/test-global.ll b/test/ExecutionEngine/MCJIT/test-global.ll index 94e0250..e7972f9 100644 --- a/test/ExecutionEngine/MCJIT/test-global.ll +++ b/test/ExecutionEngine/MCJIT/test-global.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null @count = global i32 0, align 4 diff --git a/test/ExecutionEngine/MCJIT/test-loadstore.ll b/test/ExecutionEngine/MCJIT/test-loadstore.ll index e917149..f450d0a 100644 --- a/test/ExecutionEngine/MCJIT/test-loadstore.ll +++ b/test/ExecutionEngine/MCJIT/test-loadstore.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null define void @test(i8* %P, i16* %P.upgrd.1, i32* %P.upgrd.2, i64* %P.upgrd.3) { %V = load i8* %P ; <i8> [#uses=1] diff --git a/test/ExecutionEngine/MCJIT/test-local.ll b/test/ExecutionEngine/MCJIT/test-local.ll index 4f5ae47..d4e9f44 100644 --- a/test/ExecutionEngine/MCJIT/test-local.ll +++ b/test/ExecutionEngine/MCJIT/test-local.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null define i32 @main() nounwind uwtable { entry: diff --git a/test/ExecutionEngine/MCJIT/test-logical.ll b/test/ExecutionEngine/MCJIT/test-logical.ll index 0540c22..32f45ef 100644 --- a/test/ExecutionEngine/MCJIT/test-logical.ll +++ b/test/ExecutionEngine/MCJIT/test-logical.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null define i32 @main() { %A = and i8 4, 8 ; <i8> [#uses=2] diff --git 
a/test/ExecutionEngine/MCJIT/test-loop.ll b/test/ExecutionEngine/MCJIT/test-loop.ll index b1dbf40..ebc6896 100644 --- a/test/ExecutionEngine/MCJIT/test-loop.ll +++ b/test/ExecutionEngine/MCJIT/test-loop.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null define i32 @main() { ; <label>:0 diff --git a/test/ExecutionEngine/MCJIT/test-phi.ll b/test/ExecutionEngine/MCJIT/test-phi.ll index fbc0808..1408533 100644 --- a/test/ExecutionEngine/MCJIT/test-phi.ll +++ b/test/ExecutionEngine/MCJIT/test-phi.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null ; test phi node @Y = global i32 6 ; <i32*> [#uses=1] diff --git a/test/ExecutionEngine/MCJIT/test-ptr-reloc.ll b/test/ExecutionEngine/MCJIT/test-ptr-reloc.ll new file mode 100644 index 0000000..93b6a6d --- /dev/null +++ b/test/ExecutionEngine/MCJIT/test-ptr-reloc.ll @@ -0,0 +1,16 @@ +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -O0 %s + +@.str = private unnamed_addr constant [6 x i8] c"data1\00", align 1 +@ptr = global i8* getelementptr inbounds ([6 x i8]* @.str, i32 0, i32 0), align 4 +@.str1 = private unnamed_addr constant [6 x i8] c"data2\00", align 1 +@ptr2 = global i8* getelementptr inbounds ([6 x i8]* @.str1, i32 0, i32 0), align 4 + +define i32 @main(i32 %argc, i8** nocapture %argv) nounwind readonly { +entry: + %0 = load i8** @ptr, align 4 + %1 = load i8** @ptr2, align 4 + %cmp = icmp eq i8* %0, %1 + %. = zext i1 %cmp to i32 + ret i32 %. 
+} + diff --git a/test/ExecutionEngine/MCJIT/test-ret.ll b/test/ExecutionEngine/MCJIT/test-ret.ll index 1b90ee0..af28292 100644 --- a/test/ExecutionEngine/MCJIT/test-ret.ll +++ b/test/ExecutionEngine/MCJIT/test-ret.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null ; test return instructions define void @test1() { diff --git a/test/ExecutionEngine/MCJIT/test-return.ll b/test/ExecutionEngine/MCJIT/test-return.ll index 9c399ca..67f7107 100644 --- a/test/ExecutionEngine/MCJIT/test-return.ll +++ b/test/ExecutionEngine/MCJIT/test-return.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null define i32 @main() nounwind uwtable { entry: diff --git a/test/ExecutionEngine/MCJIT/test-setcond-fp.ll b/test/ExecutionEngine/MCJIT/test-setcond-fp.ll index 030ff31..a8f4bd8 100644 --- a/test/ExecutionEngine/MCJIT/test-setcond-fp.ll +++ b/test/ExecutionEngine/MCJIT/test-setcond-fp.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null define i32 @main() { diff --git a/test/ExecutionEngine/MCJIT/test-setcond-int.ll b/test/ExecutionEngine/MCJIT/test-setcond-int.ll index 1113efe..ed52b50 100644 --- a/test/ExecutionEngine/MCJIT/test-setcond-int.ll +++ b/test/ExecutionEngine/MCJIT/test-setcond-int.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null define i32 @main() { %int1 = add i32 0, 0 ; <i32> [#uses=6] diff --git a/test/ExecutionEngine/MCJIT/test-shift.ll b/test/ExecutionEngine/MCJIT/test-shift.ll index 2da824f..5a5c10d 100644 --- a/test/ExecutionEngine/MCJIT/test-shift.ll +++ b/test/ExecutionEngine/MCJIT/test-shift.ll @@ -1,4 +1,4 @@ -; RUN: %lli -use-mcjit %s > /dev/null +; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null define i32 @main() { %shamt = add i8 0, 1 ; <i8> [#uses=8] diff --git 
a/test/ExecutionEngine/lit.local.cfg b/test/ExecutionEngine/lit.local.cfg index 19eebc0..f034326 100644 --- a/test/ExecutionEngine/lit.local.cfg +++ b/test/ExecutionEngine/lit.local.cfg @@ -1 +1,12 @@ config.suffixes = ['.ll', '.c', '.cpp'] + +def getRoot(config): + if not config.parent: + return config + return getRoot(config.parent) + +root = getRoot(config) + +if root.host_arch in ['PowerPC']: + config.unsupported = True + diff --git a/test/ExecutionEngine/test-fp-no-external-funcs.ll b/test/ExecutionEngine/test-fp-no-external-funcs.ll index 61b12c2..139b2ef 100644 --- a/test/ExecutionEngine/test-fp-no-external-funcs.ll +++ b/test/ExecutionEngine/test-fp-no-external-funcs.ll @@ -1,4 +1,5 @@ ; RUN: %lli %s > /dev/null +; XFAIL: arm define double @test(double* %DP, double %Arg) { %D = load double* %DP ; <double> [#uses=1] diff --git a/test/ExecutionEngine/test-fp.ll b/test/ExecutionEngine/test-fp.ll index 2bf0210..c906450 100644 --- a/test/ExecutionEngine/test-fp.ll +++ b/test/ExecutionEngine/test-fp.ll @@ -1,4 +1,5 @@ ; RUN: %lli %s > /dev/null +; XFAIL: arm define double @test(double* %DP, double %Arg) { %D = load double* %DP ; <double> [#uses=1] diff --git a/test/Feature/linker_private_linkages.ll b/test/Feature/linker_private_linkages.ll index f9f2908..19bcbb4 100644 --- a/test/Feature/linker_private_linkages.ll +++ b/test/Feature/linker_private_linkages.ll @@ -4,4 +4,3 @@ @foo = linker_private hidden global i32 0 @bar = linker_private_weak hidden global i32 0 -@qux = linker_private_weak_def_auto global i32 0 diff --git a/test/Feature/minsize_attr.ll b/test/Feature/minsize_attr.ll new file mode 100644 index 0000000..51b133c --- /dev/null +++ b/test/Feature/minsize_attr.ll @@ -0,0 +1,7 @@ +; RUN: llvm-as < %s | llvm-dis | FileCheck %s + +define void @test1() minsize { +; CHECK: define void @test1() minsize + ret void +} + diff --git a/test/Instrumentation/AddressSanitizer/basic.ll b/test/Instrumentation/AddressSanitizer/basic.ll index d190001..655f69c 100644 
--- a/test/Instrumentation/AddressSanitizer/basic.ll +++ b/test/Instrumentation/AddressSanitizer/basic.ll @@ -69,3 +69,23 @@ entry: store i32 42, i32* %a ret void } + +; Check that asan leaves just one alloca. + +declare void @alloca_test_use([10 x i8]*) +define void @alloca_test() address_safety { +entry: + %x = alloca [10 x i8], align 1 + %y = alloca [10 x i8], align 1 + %z = alloca [10 x i8], align 1 + call void @alloca_test_use([10 x i8]* %x) + call void @alloca_test_use([10 x i8]* %y) + call void @alloca_test_use([10 x i8]* %z) + ret void +} + +; CHECK: define void @alloca_test() +; CHECK: = alloca +; CHECK-NOT: = alloca +; CHECK: ret void + diff --git a/test/Instrumentation/AddressSanitizer/do-not-instrument-internal-globals.ll b/test/Instrumentation/AddressSanitizer/do-not-instrument-internal-globals.ll new file mode 100644 index 0000000..28d4ac0 --- /dev/null +++ b/test/Instrumentation/AddressSanitizer/do-not-instrument-internal-globals.ll @@ -0,0 +1,19 @@ +; This test checks that we are not instrumenting globals +; that we created ourselves. +; RUN: opt < %s -asan -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @_Z3barv() uwtable address_safety { +entry: + %a = alloca i32, align 4 + call void @_Z3fooPi(i32* %a) + ret void +} + +declare void @_Z3fooPi(i32*) +; We create one global string constant for the stack frame above. +; Make sure we don't create any other global constants. 
+; CHECK: = private constant +; CHECK-NOT: = private constant diff --git a/test/Instrumentation/AddressSanitizer/instrument_global.ll b/test/Instrumentation/AddressSanitizer/instrument_global.ll index ba8d65a..3d92946 100644 --- a/test/Instrumentation/AddressSanitizer/instrument_global.ll +++ b/test/Instrumentation/AddressSanitizer/instrument_global.ll @@ -6,8 +6,8 @@ target triple = "x86_64-unknown-linux-gnu" ; If a global is present, __asan_[un]register_globals should be called from ; module ctor/dtor -; CHECK: llvm.global_dtors ; CHECK: llvm.global_ctors +; CHECK: llvm.global_dtors ; CHECK: define internal void @asan.module_ctor ; CHECK-NOT: ret diff --git a/test/Instrumentation/AddressSanitizer/instrument_initializer_metadata.ll b/test/Instrumentation/AddressSanitizer/instrument_initializer_metadata.ll new file mode 100644 index 0000000..4725516 --- /dev/null +++ b/test/Instrumentation/AddressSanitizer/instrument_initializer_metadata.ll @@ -0,0 +1,36 @@ +; RUN: opt < %s -asan -asan-initialization-order -S | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" +@xxx = global i32 0, align 4 +; Clang will emit the following metadata identifying @xxx as dynamically +; initialized. +!0 = metadata !{i32* @xxx} +!llvm.asan.dynamically_initialized_globals = !{!0} + +define i32 @initializer() uwtable { +entry: + ret i32 42 +} + +define internal void @__cxx_global_var_init() section ".text.startup" { +entry: + %call = call i32 @initializer() + store i32 %call, i32* @xxx, align 4 + ret void +} + +define internal void @_GLOBAL__I_a() address_safety section ".text.startup" { +entry: + call void @__cxx_global_var_init() + ret void +} + +; Clang indicated that @xxx was dynamically initailized. 
+; __asan_{before,after}_dynamic_init should be called from _GLOBAL__I_a + +; CHECK: define internal void @_GLOBAL__I_a +; CHECK-NOT: ret +; CHECK: call void @__asan_before_dynamic_init +; CHECK: call void @__cxx_global_var_init +; CHECK: call void @__asan_after_dynamic_init +; CHECK: ret diff --git a/test/Instrumentation/ThreadSanitizer/atomic.ll b/test/Instrumentation/ThreadSanitizer/atomic.ll index 02bf215..107dbdc 100644 --- a/test/Instrumentation/ThreadSanitizer/atomic.ll +++ b/test/Instrumentation/ThreadSanitizer/atomic.ll @@ -8,7 +8,7 @@ entry: ret i8 %0 } ; CHECK: atomic8_load_unordered -; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 1) +; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 0) define i8 @atomic8_load_monotonic(i8* %a) nounwind uwtable { entry: @@ -16,7 +16,7 @@ entry: ret i8 %0 } ; CHECK: atomic8_load_monotonic -; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 1) +; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 0) define i8 @atomic8_load_acquire(i8* %a) nounwind uwtable { entry: @@ -24,7 +24,7 @@ entry: ret i8 %0 } ; CHECK: atomic8_load_acquire -; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 4) +; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 2) define i8 @atomic8_load_seq_cst(i8* %a) nounwind uwtable { entry: @@ -32,7 +32,7 @@ entry: ret i8 %0 } ; CHECK: atomic8_load_seq_cst -; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 32) +; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 5) define void @atomic8_store_unordered(i8* %a) nounwind uwtable { entry: @@ -40,7 +40,7 @@ entry: ret void } ; CHECK: atomic8_store_unordered -; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 1) +; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 0) define void @atomic8_store_monotonic(i8* %a) nounwind uwtable { entry: @@ -48,7 +48,7 @@ entry: ret void } ; CHECK: atomic8_store_monotonic -; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 1) +; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 0) define void 
@atomic8_store_release(i8* %a) nounwind uwtable { entry: @@ -56,7 +56,7 @@ entry: ret void } ; CHECK: atomic8_store_release -; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 8) +; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 3) define void @atomic8_store_seq_cst(i8* %a) nounwind uwtable { entry: @@ -64,7 +64,287 @@ entry: ret void } ; CHECK: atomic8_store_seq_cst -; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 32) +; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 5) + +define void @atomic8_xchg_monotonic(i8* %a) nounwind uwtable { +entry: + atomicrmw xchg i8* %a, i8 0 monotonic + ret void +} +; CHECK: atomic8_xchg_monotonic +; CHECK: call i8 @__tsan_atomic8_exchange(i8* %a, i8 0, i32 0) + +define void @atomic8_add_monotonic(i8* %a) nounwind uwtable { +entry: + atomicrmw add i8* %a, i8 0 monotonic + ret void +} +; CHECK: atomic8_add_monotonic +; CHECK: call i8 @__tsan_atomic8_fetch_add(i8* %a, i8 0, i32 0) + +define void @atomic8_sub_monotonic(i8* %a) nounwind uwtable { +entry: + atomicrmw sub i8* %a, i8 0 monotonic + ret void +} +; CHECK: atomic8_sub_monotonic +; CHECK: call i8 @__tsan_atomic8_fetch_sub(i8* %a, i8 0, i32 0) + +define void @atomic8_and_monotonic(i8* %a) nounwind uwtable { +entry: + atomicrmw and i8* %a, i8 0 monotonic + ret void +} +; CHECK: atomic8_and_monotonic +; CHECK: call i8 @__tsan_atomic8_fetch_and(i8* %a, i8 0, i32 0) + +define void @atomic8_or_monotonic(i8* %a) nounwind uwtable { +entry: + atomicrmw or i8* %a, i8 0 monotonic + ret void +} +; CHECK: atomic8_or_monotonic +; CHECK: call i8 @__tsan_atomic8_fetch_or(i8* %a, i8 0, i32 0) + +define void @atomic8_xor_monotonic(i8* %a) nounwind uwtable { +entry: + atomicrmw xor i8* %a, i8 0 monotonic + ret void +} +; CHECK: atomic8_xor_monotonic +; CHECK: call i8 @__tsan_atomic8_fetch_xor(i8* %a, i8 0, i32 0) + +define void @atomic8_xchg_acquire(i8* %a) nounwind uwtable { +entry: + atomicrmw xchg i8* %a, i8 0 acquire + ret void +} +; CHECK: 
atomic8_xchg_acquire +; CHECK: call i8 @__tsan_atomic8_exchange(i8* %a, i8 0, i32 2) + +define void @atomic8_add_acquire(i8* %a) nounwind uwtable { +entry: + atomicrmw add i8* %a, i8 0 acquire + ret void +} +; CHECK: atomic8_add_acquire +; CHECK: call i8 @__tsan_atomic8_fetch_add(i8* %a, i8 0, i32 2) + +define void @atomic8_sub_acquire(i8* %a) nounwind uwtable { +entry: + atomicrmw sub i8* %a, i8 0 acquire + ret void +} +; CHECK: atomic8_sub_acquire +; CHECK: call i8 @__tsan_atomic8_fetch_sub(i8* %a, i8 0, i32 2) + +define void @atomic8_and_acquire(i8* %a) nounwind uwtable { +entry: + atomicrmw and i8* %a, i8 0 acquire + ret void +} +; CHECK: atomic8_and_acquire +; CHECK: call i8 @__tsan_atomic8_fetch_and(i8* %a, i8 0, i32 2) + +define void @atomic8_or_acquire(i8* %a) nounwind uwtable { +entry: + atomicrmw or i8* %a, i8 0 acquire + ret void +} +; CHECK: atomic8_or_acquire +; CHECK: call i8 @__tsan_atomic8_fetch_or(i8* %a, i8 0, i32 2) + +define void @atomic8_xor_acquire(i8* %a) nounwind uwtable { +entry: + atomicrmw xor i8* %a, i8 0 acquire + ret void +} +; CHECK: atomic8_xor_acquire +; CHECK: call i8 @__tsan_atomic8_fetch_xor(i8* %a, i8 0, i32 2) + +define void @atomic8_xchg_release(i8* %a) nounwind uwtable { +entry: + atomicrmw xchg i8* %a, i8 0 release + ret void +} +; CHECK: atomic8_xchg_release +; CHECK: call i8 @__tsan_atomic8_exchange(i8* %a, i8 0, i32 3) + +define void @atomic8_add_release(i8* %a) nounwind uwtable { +entry: + atomicrmw add i8* %a, i8 0 release + ret void +} +; CHECK: atomic8_add_release +; CHECK: call i8 @__tsan_atomic8_fetch_add(i8* %a, i8 0, i32 3) + +define void @atomic8_sub_release(i8* %a) nounwind uwtable { +entry: + atomicrmw sub i8* %a, i8 0 release + ret void +} +; CHECK: atomic8_sub_release +; CHECK: call i8 @__tsan_atomic8_fetch_sub(i8* %a, i8 0, i32 3) + +define void @atomic8_and_release(i8* %a) nounwind uwtable { +entry: + atomicrmw and i8* %a, i8 0 release + ret void +} +; CHECK: atomic8_and_release +; CHECK: call i8 
@__tsan_atomic8_fetch_and(i8* %a, i8 0, i32 3) + +define void @atomic8_or_release(i8* %a) nounwind uwtable { +entry: + atomicrmw or i8* %a, i8 0 release + ret void +} +; CHECK: atomic8_or_release +; CHECK: call i8 @__tsan_atomic8_fetch_or(i8* %a, i8 0, i32 3) + +define void @atomic8_xor_release(i8* %a) nounwind uwtable { +entry: + atomicrmw xor i8* %a, i8 0 release + ret void +} +; CHECK: atomic8_xor_release +; CHECK: call i8 @__tsan_atomic8_fetch_xor(i8* %a, i8 0, i32 3) + +define void @atomic8_xchg_acq_rel(i8* %a) nounwind uwtable { +entry: + atomicrmw xchg i8* %a, i8 0 acq_rel + ret void +} +; CHECK: atomic8_xchg_acq_rel +; CHECK: call i8 @__tsan_atomic8_exchange(i8* %a, i8 0, i32 4) + +define void @atomic8_add_acq_rel(i8* %a) nounwind uwtable { +entry: + atomicrmw add i8* %a, i8 0 acq_rel + ret void +} +; CHECK: atomic8_add_acq_rel +; CHECK: call i8 @__tsan_atomic8_fetch_add(i8* %a, i8 0, i32 4) + +define void @atomic8_sub_acq_rel(i8* %a) nounwind uwtable { +entry: + atomicrmw sub i8* %a, i8 0 acq_rel + ret void +} +; CHECK: atomic8_sub_acq_rel +; CHECK: call i8 @__tsan_atomic8_fetch_sub(i8* %a, i8 0, i32 4) + +define void @atomic8_and_acq_rel(i8* %a) nounwind uwtable { +entry: + atomicrmw and i8* %a, i8 0 acq_rel + ret void +} +; CHECK: atomic8_and_acq_rel +; CHECK: call i8 @__tsan_atomic8_fetch_and(i8* %a, i8 0, i32 4) + +define void @atomic8_or_acq_rel(i8* %a) nounwind uwtable { +entry: + atomicrmw or i8* %a, i8 0 acq_rel + ret void +} +; CHECK: atomic8_or_acq_rel +; CHECK: call i8 @__tsan_atomic8_fetch_or(i8* %a, i8 0, i32 4) + +define void @atomic8_xor_acq_rel(i8* %a) nounwind uwtable { +entry: + atomicrmw xor i8* %a, i8 0 acq_rel + ret void +} +; CHECK: atomic8_xor_acq_rel +; CHECK: call i8 @__tsan_atomic8_fetch_xor(i8* %a, i8 0, i32 4) + +define void @atomic8_xchg_seq_cst(i8* %a) nounwind uwtable { +entry: + atomicrmw xchg i8* %a, i8 0 seq_cst + ret void +} +; CHECK: atomic8_xchg_seq_cst +; CHECK: call i8 @__tsan_atomic8_exchange(i8* %a, i8 0, i32 5) + 
+define void @atomic8_add_seq_cst(i8* %a) nounwind uwtable { +entry: + atomicrmw add i8* %a, i8 0 seq_cst + ret void +} +; CHECK: atomic8_add_seq_cst +; CHECK: call i8 @__tsan_atomic8_fetch_add(i8* %a, i8 0, i32 5) + +define void @atomic8_sub_seq_cst(i8* %a) nounwind uwtable { +entry: + atomicrmw sub i8* %a, i8 0 seq_cst + ret void +} +; CHECK: atomic8_sub_seq_cst +; CHECK: call i8 @__tsan_atomic8_fetch_sub(i8* %a, i8 0, i32 5) + +define void @atomic8_and_seq_cst(i8* %a) nounwind uwtable { +entry: + atomicrmw and i8* %a, i8 0 seq_cst + ret void +} +; CHECK: atomic8_and_seq_cst +; CHECK: call i8 @__tsan_atomic8_fetch_and(i8* %a, i8 0, i32 5) + +define void @atomic8_or_seq_cst(i8* %a) nounwind uwtable { +entry: + atomicrmw or i8* %a, i8 0 seq_cst + ret void +} +; CHECK: atomic8_or_seq_cst +; CHECK: call i8 @__tsan_atomic8_fetch_or(i8* %a, i8 0, i32 5) + +define void @atomic8_xor_seq_cst(i8* %a) nounwind uwtable { +entry: + atomicrmw xor i8* %a, i8 0 seq_cst + ret void +} +; CHECK: atomic8_xor_seq_cst +; CHECK: call i8 @__tsan_atomic8_fetch_xor(i8* %a, i8 0, i32 5) + +define void @atomic8_cas_monotonic(i8* %a) nounwind uwtable { +entry: + cmpxchg i8* %a, i8 0, i8 1 monotonic + ret void +} +; CHECK: atomic8_cas_monotonic +; CHECK: call i8 @__tsan_atomic8_compare_exchange_val(i8* %a, i8 0, i8 1, i32 0) + +define void @atomic8_cas_acquire(i8* %a) nounwind uwtable { +entry: + cmpxchg i8* %a, i8 0, i8 1 acquire + ret void +} +; CHECK: atomic8_cas_acquire +; CHECK: call i8 @__tsan_atomic8_compare_exchange_val(i8* %a, i8 0, i8 1, i32 2) + +define void @atomic8_cas_release(i8* %a) nounwind uwtable { +entry: + cmpxchg i8* %a, i8 0, i8 1 release + ret void +} +; CHECK: atomic8_cas_release +; CHECK: call i8 @__tsan_atomic8_compare_exchange_val(i8* %a, i8 0, i8 1, i32 3) + +define void @atomic8_cas_acq_rel(i8* %a) nounwind uwtable { +entry: + cmpxchg i8* %a, i8 0, i8 1 acq_rel + ret void +} +; CHECK: atomic8_cas_acq_rel +; CHECK: call i8 @__tsan_atomic8_compare_exchange_val(i8* 
%a, i8 0, i8 1, i32 4) + +define void @atomic8_cas_seq_cst(i8* %a) nounwind uwtable { +entry: + cmpxchg i8* %a, i8 0, i8 1 seq_cst + ret void +} +; CHECK: atomic8_cas_seq_cst +; CHECK: call i8 @__tsan_atomic8_compare_exchange_val(i8* %a, i8 0, i8 1, i32 5) define i16 @atomic16_load_unordered(i16* %a) nounwind uwtable { entry: @@ -72,7 +352,7 @@ entry: ret i16 %0 } ; CHECK: atomic16_load_unordered -; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 1) +; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 0) define i16 @atomic16_load_monotonic(i16* %a) nounwind uwtable { entry: @@ -80,7 +360,7 @@ entry: ret i16 %0 } ; CHECK: atomic16_load_monotonic -; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 1) +; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 0) define i16 @atomic16_load_acquire(i16* %a) nounwind uwtable { entry: @@ -88,7 +368,7 @@ entry: ret i16 %0 } ; CHECK: atomic16_load_acquire -; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 4) +; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 2) define i16 @atomic16_load_seq_cst(i16* %a) nounwind uwtable { entry: @@ -96,7 +376,7 @@ entry: ret i16 %0 } ; CHECK: atomic16_load_seq_cst -; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 32) +; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 5) define void @atomic16_store_unordered(i16* %a) nounwind uwtable { entry: @@ -104,7 +384,7 @@ entry: ret void } ; CHECK: atomic16_store_unordered -; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 1) +; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 0) define void @atomic16_store_monotonic(i16* %a) nounwind uwtable { entry: @@ -112,7 +392,7 @@ entry: ret void } ; CHECK: atomic16_store_monotonic -; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 1) +; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 0) define void @atomic16_store_release(i16* %a) nounwind uwtable { entry: @@ -120,7 +400,7 @@ entry: ret void } ; CHECK: atomic16_store_release -; CHECK: call void 
@__tsan_atomic16_store(i16* %a, i16 0, i32 8) +; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 3) define void @atomic16_store_seq_cst(i16* %a) nounwind uwtable { entry: @@ -128,7 +408,287 @@ entry: ret void } ; CHECK: atomic16_store_seq_cst -; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 32) +; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 5) + +define void @atomic16_xchg_monotonic(i16* %a) nounwind uwtable { +entry: + atomicrmw xchg i16* %a, i16 0 monotonic + ret void +} +; CHECK: atomic16_xchg_monotonic +; CHECK: call i16 @__tsan_atomic16_exchange(i16* %a, i16 0, i32 0) + +define void @atomic16_add_monotonic(i16* %a) nounwind uwtable { +entry: + atomicrmw add i16* %a, i16 0 monotonic + ret void +} +; CHECK: atomic16_add_monotonic +; CHECK: call i16 @__tsan_atomic16_fetch_add(i16* %a, i16 0, i32 0) + +define void @atomic16_sub_monotonic(i16* %a) nounwind uwtable { +entry: + atomicrmw sub i16* %a, i16 0 monotonic + ret void +} +; CHECK: atomic16_sub_monotonic +; CHECK: call i16 @__tsan_atomic16_fetch_sub(i16* %a, i16 0, i32 0) + +define void @atomic16_and_monotonic(i16* %a) nounwind uwtable { +entry: + atomicrmw and i16* %a, i16 0 monotonic + ret void +} +; CHECK: atomic16_and_monotonic +; CHECK: call i16 @__tsan_atomic16_fetch_and(i16* %a, i16 0, i32 0) + +define void @atomic16_or_monotonic(i16* %a) nounwind uwtable { +entry: + atomicrmw or i16* %a, i16 0 monotonic + ret void +} +; CHECK: atomic16_or_monotonic +; CHECK: call i16 @__tsan_atomic16_fetch_or(i16* %a, i16 0, i32 0) + +define void @atomic16_xor_monotonic(i16* %a) nounwind uwtable { +entry: + atomicrmw xor i16* %a, i16 0 monotonic + ret void +} +; CHECK: atomic16_xor_monotonic +; CHECK: call i16 @__tsan_atomic16_fetch_xor(i16* %a, i16 0, i32 0) + +define void @atomic16_xchg_acquire(i16* %a) nounwind uwtable { +entry: + atomicrmw xchg i16* %a, i16 0 acquire + ret void +} +; CHECK: atomic16_xchg_acquire +; CHECK: call i16 @__tsan_atomic16_exchange(i16* %a, i16 0, 
i32 2) + +define void @atomic16_add_acquire(i16* %a) nounwind uwtable { +entry: + atomicrmw add i16* %a, i16 0 acquire + ret void +} +; CHECK: atomic16_add_acquire +; CHECK: call i16 @__tsan_atomic16_fetch_add(i16* %a, i16 0, i32 2) + +define void @atomic16_sub_acquire(i16* %a) nounwind uwtable { +entry: + atomicrmw sub i16* %a, i16 0 acquire + ret void +} +; CHECK: atomic16_sub_acquire +; CHECK: call i16 @__tsan_atomic16_fetch_sub(i16* %a, i16 0, i32 2) + +define void @atomic16_and_acquire(i16* %a) nounwind uwtable { +entry: + atomicrmw and i16* %a, i16 0 acquire + ret void +} +; CHECK: atomic16_and_acquire +; CHECK: call i16 @__tsan_atomic16_fetch_and(i16* %a, i16 0, i32 2) + +define void @atomic16_or_acquire(i16* %a) nounwind uwtable { +entry: + atomicrmw or i16* %a, i16 0 acquire + ret void +} +; CHECK: atomic16_or_acquire +; CHECK: call i16 @__tsan_atomic16_fetch_or(i16* %a, i16 0, i32 2) + +define void @atomic16_xor_acquire(i16* %a) nounwind uwtable { +entry: + atomicrmw xor i16* %a, i16 0 acquire + ret void +} +; CHECK: atomic16_xor_acquire +; CHECK: call i16 @__tsan_atomic16_fetch_xor(i16* %a, i16 0, i32 2) + +define void @atomic16_xchg_release(i16* %a) nounwind uwtable { +entry: + atomicrmw xchg i16* %a, i16 0 release + ret void +} +; CHECK: atomic16_xchg_release +; CHECK: call i16 @__tsan_atomic16_exchange(i16* %a, i16 0, i32 3) + +define void @atomic16_add_release(i16* %a) nounwind uwtable { +entry: + atomicrmw add i16* %a, i16 0 release + ret void +} +; CHECK: atomic16_add_release +; CHECK: call i16 @__tsan_atomic16_fetch_add(i16* %a, i16 0, i32 3) + +define void @atomic16_sub_release(i16* %a) nounwind uwtable { +entry: + atomicrmw sub i16* %a, i16 0 release + ret void +} +; CHECK: atomic16_sub_release +; CHECK: call i16 @__tsan_atomic16_fetch_sub(i16* %a, i16 0, i32 3) + +define void @atomic16_and_release(i16* %a) nounwind uwtable { +entry: + atomicrmw and i16* %a, i16 0 release + ret void +} +; CHECK: atomic16_and_release +; CHECK: call i16 
@__tsan_atomic16_fetch_and(i16* %a, i16 0, i32 3) + +define void @atomic16_or_release(i16* %a) nounwind uwtable { +entry: + atomicrmw or i16* %a, i16 0 release + ret void +} +; CHECK: atomic16_or_release +; CHECK: call i16 @__tsan_atomic16_fetch_or(i16* %a, i16 0, i32 3) + +define void @atomic16_xor_release(i16* %a) nounwind uwtable { +entry: + atomicrmw xor i16* %a, i16 0 release + ret void +} +; CHECK: atomic16_xor_release +; CHECK: call i16 @__tsan_atomic16_fetch_xor(i16* %a, i16 0, i32 3) + +define void @atomic16_xchg_acq_rel(i16* %a) nounwind uwtable { +entry: + atomicrmw xchg i16* %a, i16 0 acq_rel + ret void +} +; CHECK: atomic16_xchg_acq_rel +; CHECK: call i16 @__tsan_atomic16_exchange(i16* %a, i16 0, i32 4) + +define void @atomic16_add_acq_rel(i16* %a) nounwind uwtable { +entry: + atomicrmw add i16* %a, i16 0 acq_rel + ret void +} +; CHECK: atomic16_add_acq_rel +; CHECK: call i16 @__tsan_atomic16_fetch_add(i16* %a, i16 0, i32 4) + +define void @atomic16_sub_acq_rel(i16* %a) nounwind uwtable { +entry: + atomicrmw sub i16* %a, i16 0 acq_rel + ret void +} +; CHECK: atomic16_sub_acq_rel +; CHECK: call i16 @__tsan_atomic16_fetch_sub(i16* %a, i16 0, i32 4) + +define void @atomic16_and_acq_rel(i16* %a) nounwind uwtable { +entry: + atomicrmw and i16* %a, i16 0 acq_rel + ret void +} +; CHECK: atomic16_and_acq_rel +; CHECK: call i16 @__tsan_atomic16_fetch_and(i16* %a, i16 0, i32 4) + +define void @atomic16_or_acq_rel(i16* %a) nounwind uwtable { +entry: + atomicrmw or i16* %a, i16 0 acq_rel + ret void +} +; CHECK: atomic16_or_acq_rel +; CHECK: call i16 @__tsan_atomic16_fetch_or(i16* %a, i16 0, i32 4) + +define void @atomic16_xor_acq_rel(i16* %a) nounwind uwtable { +entry: + atomicrmw xor i16* %a, i16 0 acq_rel + ret void +} +; CHECK: atomic16_xor_acq_rel +; CHECK: call i16 @__tsan_atomic16_fetch_xor(i16* %a, i16 0, i32 4) + +define void @atomic16_xchg_seq_cst(i16* %a) nounwind uwtable { +entry: + atomicrmw xchg i16* %a, i16 0 seq_cst + ret void +} +; CHECK: 
atomic16_xchg_seq_cst +; CHECK: call i16 @__tsan_atomic16_exchange(i16* %a, i16 0, i32 5) + +define void @atomic16_add_seq_cst(i16* %a) nounwind uwtable { +entry: + atomicrmw add i16* %a, i16 0 seq_cst + ret void +} +; CHECK: atomic16_add_seq_cst +; CHECK: call i16 @__tsan_atomic16_fetch_add(i16* %a, i16 0, i32 5) + +define void @atomic16_sub_seq_cst(i16* %a) nounwind uwtable { +entry: + atomicrmw sub i16* %a, i16 0 seq_cst + ret void +} +; CHECK: atomic16_sub_seq_cst +; CHECK: call i16 @__tsan_atomic16_fetch_sub(i16* %a, i16 0, i32 5) + +define void @atomic16_and_seq_cst(i16* %a) nounwind uwtable { +entry: + atomicrmw and i16* %a, i16 0 seq_cst + ret void +} +; CHECK: atomic16_and_seq_cst +; CHECK: call i16 @__tsan_atomic16_fetch_and(i16* %a, i16 0, i32 5) + +define void @atomic16_or_seq_cst(i16* %a) nounwind uwtable { +entry: + atomicrmw or i16* %a, i16 0 seq_cst + ret void +} +; CHECK: atomic16_or_seq_cst +; CHECK: call i16 @__tsan_atomic16_fetch_or(i16* %a, i16 0, i32 5) + +define void @atomic16_xor_seq_cst(i16* %a) nounwind uwtable { +entry: + atomicrmw xor i16* %a, i16 0 seq_cst + ret void +} +; CHECK: atomic16_xor_seq_cst +; CHECK: call i16 @__tsan_atomic16_fetch_xor(i16* %a, i16 0, i32 5) + +define void @atomic16_cas_monotonic(i16* %a) nounwind uwtable { +entry: + cmpxchg i16* %a, i16 0, i16 1 monotonic + ret void +} +; CHECK: atomic16_cas_monotonic +; CHECK: call i16 @__tsan_atomic16_compare_exchange_val(i16* %a, i16 0, i16 1, i32 0) + +define void @atomic16_cas_acquire(i16* %a) nounwind uwtable { +entry: + cmpxchg i16* %a, i16 0, i16 1 acquire + ret void +} +; CHECK: atomic16_cas_acquire +; CHECK: call i16 @__tsan_atomic16_compare_exchange_val(i16* %a, i16 0, i16 1, i32 2) + +define void @atomic16_cas_release(i16* %a) nounwind uwtable { +entry: + cmpxchg i16* %a, i16 0, i16 1 release + ret void +} +; CHECK: atomic16_cas_release +; CHECK: call i16 @__tsan_atomic16_compare_exchange_val(i16* %a, i16 0, i16 1, i32 3) + +define void @atomic16_cas_acq_rel(i16* 
%a) nounwind uwtable { +entry: + cmpxchg i16* %a, i16 0, i16 1 acq_rel + ret void +} +; CHECK: atomic16_cas_acq_rel +; CHECK: call i16 @__tsan_atomic16_compare_exchange_val(i16* %a, i16 0, i16 1, i32 4) + +define void @atomic16_cas_seq_cst(i16* %a) nounwind uwtable { +entry: + cmpxchg i16* %a, i16 0, i16 1 seq_cst + ret void +} +; CHECK: atomic16_cas_seq_cst +; CHECK: call i16 @__tsan_atomic16_compare_exchange_val(i16* %a, i16 0, i16 1, i32 5) define i32 @atomic32_load_unordered(i32* %a) nounwind uwtable { entry: @@ -136,7 +696,7 @@ entry: ret i32 %0 } ; CHECK: atomic32_load_unordered -; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 1) +; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 0) define i32 @atomic32_load_monotonic(i32* %a) nounwind uwtable { entry: @@ -144,7 +704,7 @@ entry: ret i32 %0 } ; CHECK: atomic32_load_monotonic -; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 1) +; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 0) define i32 @atomic32_load_acquire(i32* %a) nounwind uwtable { entry: @@ -152,7 +712,7 @@ entry: ret i32 %0 } ; CHECK: atomic32_load_acquire -; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 4) +; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 2) define i32 @atomic32_load_seq_cst(i32* %a) nounwind uwtable { entry: @@ -160,7 +720,7 @@ entry: ret i32 %0 } ; CHECK: atomic32_load_seq_cst -; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 32) +; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 5) define void @atomic32_store_unordered(i32* %a) nounwind uwtable { entry: @@ -168,7 +728,7 @@ entry: ret void } ; CHECK: atomic32_store_unordered -; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 1) +; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 0) define void @atomic32_store_monotonic(i32* %a) nounwind uwtable { entry: @@ -176,7 +736,7 @@ entry: ret void } ; CHECK: atomic32_store_monotonic -; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 1) +; CHECK: call void 
@__tsan_atomic32_store(i32* %a, i32 0, i32 0) define void @atomic32_store_release(i32* %a) nounwind uwtable { entry: @@ -184,7 +744,7 @@ entry: ret void } ; CHECK: atomic32_store_release -; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 8) +; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 3) define void @atomic32_store_seq_cst(i32* %a) nounwind uwtable { entry: @@ -192,7 +752,287 @@ entry: ret void } ; CHECK: atomic32_store_seq_cst -; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 32) +; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 5) + +define void @atomic32_xchg_monotonic(i32* %a) nounwind uwtable { +entry: + atomicrmw xchg i32* %a, i32 0 monotonic + ret void +} +; CHECK: atomic32_xchg_monotonic +; CHECK: call i32 @__tsan_atomic32_exchange(i32* %a, i32 0, i32 0) + +define void @atomic32_add_monotonic(i32* %a) nounwind uwtable { +entry: + atomicrmw add i32* %a, i32 0 monotonic + ret void +} +; CHECK: atomic32_add_monotonic +; CHECK: call i32 @__tsan_atomic32_fetch_add(i32* %a, i32 0, i32 0) + +define void @atomic32_sub_monotonic(i32* %a) nounwind uwtable { +entry: + atomicrmw sub i32* %a, i32 0 monotonic + ret void +} +; CHECK: atomic32_sub_monotonic +; CHECK: call i32 @__tsan_atomic32_fetch_sub(i32* %a, i32 0, i32 0) + +define void @atomic32_and_monotonic(i32* %a) nounwind uwtable { +entry: + atomicrmw and i32* %a, i32 0 monotonic + ret void +} +; CHECK: atomic32_and_monotonic +; CHECK: call i32 @__tsan_atomic32_fetch_and(i32* %a, i32 0, i32 0) + +define void @atomic32_or_monotonic(i32* %a) nounwind uwtable { +entry: + atomicrmw or i32* %a, i32 0 monotonic + ret void +} +; CHECK: atomic32_or_monotonic +; CHECK: call i32 @__tsan_atomic32_fetch_or(i32* %a, i32 0, i32 0) + +define void @atomic32_xor_monotonic(i32* %a) nounwind uwtable { +entry: + atomicrmw xor i32* %a, i32 0 monotonic + ret void +} +; CHECK: atomic32_xor_monotonic +; CHECK: call i32 @__tsan_atomic32_fetch_xor(i32* %a, i32 0, i32 0) + +define void 
@atomic32_xchg_acquire(i32* %a) nounwind uwtable { +entry: + atomicrmw xchg i32* %a, i32 0 acquire + ret void +} +; CHECK: atomic32_xchg_acquire +; CHECK: call i32 @__tsan_atomic32_exchange(i32* %a, i32 0, i32 2) + +define void @atomic32_add_acquire(i32* %a) nounwind uwtable { +entry: + atomicrmw add i32* %a, i32 0 acquire + ret void +} +; CHECK: atomic32_add_acquire +; CHECK: call i32 @__tsan_atomic32_fetch_add(i32* %a, i32 0, i32 2) + +define void @atomic32_sub_acquire(i32* %a) nounwind uwtable { +entry: + atomicrmw sub i32* %a, i32 0 acquire + ret void +} +; CHECK: atomic32_sub_acquire +; CHECK: call i32 @__tsan_atomic32_fetch_sub(i32* %a, i32 0, i32 2) + +define void @atomic32_and_acquire(i32* %a) nounwind uwtable { +entry: + atomicrmw and i32* %a, i32 0 acquire + ret void +} +; CHECK: atomic32_and_acquire +; CHECK: call i32 @__tsan_atomic32_fetch_and(i32* %a, i32 0, i32 2) + +define void @atomic32_or_acquire(i32* %a) nounwind uwtable { +entry: + atomicrmw or i32* %a, i32 0 acquire + ret void +} +; CHECK: atomic32_or_acquire +; CHECK: call i32 @__tsan_atomic32_fetch_or(i32* %a, i32 0, i32 2) + +define void @atomic32_xor_acquire(i32* %a) nounwind uwtable { +entry: + atomicrmw xor i32* %a, i32 0 acquire + ret void +} +; CHECK: atomic32_xor_acquire +; CHECK: call i32 @__tsan_atomic32_fetch_xor(i32* %a, i32 0, i32 2) + +define void @atomic32_xchg_release(i32* %a) nounwind uwtable { +entry: + atomicrmw xchg i32* %a, i32 0 release + ret void +} +; CHECK: atomic32_xchg_release +; CHECK: call i32 @__tsan_atomic32_exchange(i32* %a, i32 0, i32 3) + +define void @atomic32_add_release(i32* %a) nounwind uwtable { +entry: + atomicrmw add i32* %a, i32 0 release + ret void +} +; CHECK: atomic32_add_release +; CHECK: call i32 @__tsan_atomic32_fetch_add(i32* %a, i32 0, i32 3) + +define void @atomic32_sub_release(i32* %a) nounwind uwtable { +entry: + atomicrmw sub i32* %a, i32 0 release + ret void +} +; CHECK: atomic32_sub_release +; CHECK: call i32 
@__tsan_atomic32_fetch_sub(i32* %a, i32 0, i32 3) + +define void @atomic32_and_release(i32* %a) nounwind uwtable { +entry: + atomicrmw and i32* %a, i32 0 release + ret void +} +; CHECK: atomic32_and_release +; CHECK: call i32 @__tsan_atomic32_fetch_and(i32* %a, i32 0, i32 3) + +define void @atomic32_or_release(i32* %a) nounwind uwtable { +entry: + atomicrmw or i32* %a, i32 0 release + ret void +} +; CHECK: atomic32_or_release +; CHECK: call i32 @__tsan_atomic32_fetch_or(i32* %a, i32 0, i32 3) + +define void @atomic32_xor_release(i32* %a) nounwind uwtable { +entry: + atomicrmw xor i32* %a, i32 0 release + ret void +} +; CHECK: atomic32_xor_release +; CHECK: call i32 @__tsan_atomic32_fetch_xor(i32* %a, i32 0, i32 3) + +define void @atomic32_xchg_acq_rel(i32* %a) nounwind uwtable { +entry: + atomicrmw xchg i32* %a, i32 0 acq_rel + ret void +} +; CHECK: atomic32_xchg_acq_rel +; CHECK: call i32 @__tsan_atomic32_exchange(i32* %a, i32 0, i32 4) + +define void @atomic32_add_acq_rel(i32* %a) nounwind uwtable { +entry: + atomicrmw add i32* %a, i32 0 acq_rel + ret void +} +; CHECK: atomic32_add_acq_rel +; CHECK: call i32 @__tsan_atomic32_fetch_add(i32* %a, i32 0, i32 4) + +define void @atomic32_sub_acq_rel(i32* %a) nounwind uwtable { +entry: + atomicrmw sub i32* %a, i32 0 acq_rel + ret void +} +; CHECK: atomic32_sub_acq_rel +; CHECK: call i32 @__tsan_atomic32_fetch_sub(i32* %a, i32 0, i32 4) + +define void @atomic32_and_acq_rel(i32* %a) nounwind uwtable { +entry: + atomicrmw and i32* %a, i32 0 acq_rel + ret void +} +; CHECK: atomic32_and_acq_rel +; CHECK: call i32 @__tsan_atomic32_fetch_and(i32* %a, i32 0, i32 4) + +define void @atomic32_or_acq_rel(i32* %a) nounwind uwtable { +entry: + atomicrmw or i32* %a, i32 0 acq_rel + ret void +} +; CHECK: atomic32_or_acq_rel +; CHECK: call i32 @__tsan_atomic32_fetch_or(i32* %a, i32 0, i32 4) + +define void @atomic32_xor_acq_rel(i32* %a) nounwind uwtable { +entry: + atomicrmw xor i32* %a, i32 0 acq_rel + ret void +} +; CHECK: 
atomic32_xor_acq_rel +; CHECK: call i32 @__tsan_atomic32_fetch_xor(i32* %a, i32 0, i32 4) + +define void @atomic32_xchg_seq_cst(i32* %a) nounwind uwtable { +entry: + atomicrmw xchg i32* %a, i32 0 seq_cst + ret void +} +; CHECK: atomic32_xchg_seq_cst +; CHECK: call i32 @__tsan_atomic32_exchange(i32* %a, i32 0, i32 5) + +define void @atomic32_add_seq_cst(i32* %a) nounwind uwtable { +entry: + atomicrmw add i32* %a, i32 0 seq_cst + ret void +} +; CHECK: atomic32_add_seq_cst +; CHECK: call i32 @__tsan_atomic32_fetch_add(i32* %a, i32 0, i32 5) + +define void @atomic32_sub_seq_cst(i32* %a) nounwind uwtable { +entry: + atomicrmw sub i32* %a, i32 0 seq_cst + ret void +} +; CHECK: atomic32_sub_seq_cst +; CHECK: call i32 @__tsan_atomic32_fetch_sub(i32* %a, i32 0, i32 5) + +define void @atomic32_and_seq_cst(i32* %a) nounwind uwtable { +entry: + atomicrmw and i32* %a, i32 0 seq_cst + ret void +} +; CHECK: atomic32_and_seq_cst +; CHECK: call i32 @__tsan_atomic32_fetch_and(i32* %a, i32 0, i32 5) + +define void @atomic32_or_seq_cst(i32* %a) nounwind uwtable { +entry: + atomicrmw or i32* %a, i32 0 seq_cst + ret void +} +; CHECK: atomic32_or_seq_cst +; CHECK: call i32 @__tsan_atomic32_fetch_or(i32* %a, i32 0, i32 5) + +define void @atomic32_xor_seq_cst(i32* %a) nounwind uwtable { +entry: + atomicrmw xor i32* %a, i32 0 seq_cst + ret void +} +; CHECK: atomic32_xor_seq_cst +; CHECK: call i32 @__tsan_atomic32_fetch_xor(i32* %a, i32 0, i32 5) + +define void @atomic32_cas_monotonic(i32* %a) nounwind uwtable { +entry: + cmpxchg i32* %a, i32 0, i32 1 monotonic + ret void +} +; CHECK: atomic32_cas_monotonic +; CHECK: call i32 @__tsan_atomic32_compare_exchange_val(i32* %a, i32 0, i32 1, i32 0) + +define void @atomic32_cas_acquire(i32* %a) nounwind uwtable { +entry: + cmpxchg i32* %a, i32 0, i32 1 acquire + ret void +} +; CHECK: atomic32_cas_acquire +; CHECK: call i32 @__tsan_atomic32_compare_exchange_val(i32* %a, i32 0, i32 1, i32 2) + +define void @atomic32_cas_release(i32* %a) nounwind 
uwtable { +entry: + cmpxchg i32* %a, i32 0, i32 1 release + ret void +} +; CHECK: atomic32_cas_release +; CHECK: call i32 @__tsan_atomic32_compare_exchange_val(i32* %a, i32 0, i32 1, i32 3) + +define void @atomic32_cas_acq_rel(i32* %a) nounwind uwtable { +entry: + cmpxchg i32* %a, i32 0, i32 1 acq_rel + ret void +} +; CHECK: atomic32_cas_acq_rel +; CHECK: call i32 @__tsan_atomic32_compare_exchange_val(i32* %a, i32 0, i32 1, i32 4) + +define void @atomic32_cas_seq_cst(i32* %a) nounwind uwtable { +entry: + cmpxchg i32* %a, i32 0, i32 1 seq_cst + ret void +} +; CHECK: atomic32_cas_seq_cst +; CHECK: call i32 @__tsan_atomic32_compare_exchange_val(i32* %a, i32 0, i32 1, i32 5) define i64 @atomic64_load_unordered(i64* %a) nounwind uwtable { entry: @@ -200,7 +1040,7 @@ entry: ret i64 %0 } ; CHECK: atomic64_load_unordered -; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 1) +; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 0) define i64 @atomic64_load_monotonic(i64* %a) nounwind uwtable { entry: @@ -208,7 +1048,7 @@ entry: ret i64 %0 } ; CHECK: atomic64_load_monotonic -; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 1) +; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 0) define i64 @atomic64_load_acquire(i64* %a) nounwind uwtable { entry: @@ -216,7 +1056,7 @@ entry: ret i64 %0 } ; CHECK: atomic64_load_acquire -; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 4) +; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 2) define i64 @atomic64_load_seq_cst(i64* %a) nounwind uwtable { entry: @@ -224,7 +1064,7 @@ entry: ret i64 %0 } ; CHECK: atomic64_load_seq_cst -; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 32) +; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 5) define void @atomic64_store_unordered(i64* %a) nounwind uwtable { entry: @@ -232,7 +1072,7 @@ entry: ret void } ; CHECK: atomic64_store_unordered -; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 1) +; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 0) define void 
@atomic64_store_monotonic(i64* %a) nounwind uwtable { entry: @@ -240,7 +1080,7 @@ entry: ret void } ; CHECK: atomic64_store_monotonic -; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 1) +; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 0) define void @atomic64_store_release(i64* %a) nounwind uwtable { entry: @@ -248,7 +1088,7 @@ entry: ret void } ; CHECK: atomic64_store_release -; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 8) +; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 3) define void @atomic64_store_seq_cst(i64* %a) nounwind uwtable { entry: @@ -256,7 +1096,287 @@ entry: ret void } ; CHECK: atomic64_store_seq_cst -; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 32) +; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 5) + +define void @atomic64_xchg_monotonic(i64* %a) nounwind uwtable { +entry: + atomicrmw xchg i64* %a, i64 0 monotonic + ret void +} +; CHECK: atomic64_xchg_monotonic +; CHECK: call i64 @__tsan_atomic64_exchange(i64* %a, i64 0, i32 0) + +define void @atomic64_add_monotonic(i64* %a) nounwind uwtable { +entry: + atomicrmw add i64* %a, i64 0 monotonic + ret void +} +; CHECK: atomic64_add_monotonic +; CHECK: call i64 @__tsan_atomic64_fetch_add(i64* %a, i64 0, i32 0) + +define void @atomic64_sub_monotonic(i64* %a) nounwind uwtable { +entry: + atomicrmw sub i64* %a, i64 0 monotonic + ret void +} +; CHECK: atomic64_sub_monotonic +; CHECK: call i64 @__tsan_atomic64_fetch_sub(i64* %a, i64 0, i32 0) + +define void @atomic64_and_monotonic(i64* %a) nounwind uwtable { +entry: + atomicrmw and i64* %a, i64 0 monotonic + ret void +} +; CHECK: atomic64_and_monotonic +; CHECK: call i64 @__tsan_atomic64_fetch_and(i64* %a, i64 0, i32 0) + +define void @atomic64_or_monotonic(i64* %a) nounwind uwtable { +entry: + atomicrmw or i64* %a, i64 0 monotonic + ret void +} +; CHECK: atomic64_or_monotonic +; CHECK: call i64 @__tsan_atomic64_fetch_or(i64* %a, i64 0, i32 0) + +define void 
@atomic64_xor_monotonic(i64* %a) nounwind uwtable { +entry: + atomicrmw xor i64* %a, i64 0 monotonic + ret void +} +; CHECK: atomic64_xor_monotonic +; CHECK: call i64 @__tsan_atomic64_fetch_xor(i64* %a, i64 0, i32 0) + +define void @atomic64_xchg_acquire(i64* %a) nounwind uwtable { +entry: + atomicrmw xchg i64* %a, i64 0 acquire + ret void +} +; CHECK: atomic64_xchg_acquire +; CHECK: call i64 @__tsan_atomic64_exchange(i64* %a, i64 0, i32 2) + +define void @atomic64_add_acquire(i64* %a) nounwind uwtable { +entry: + atomicrmw add i64* %a, i64 0 acquire + ret void +} +; CHECK: atomic64_add_acquire +; CHECK: call i64 @__tsan_atomic64_fetch_add(i64* %a, i64 0, i32 2) + +define void @atomic64_sub_acquire(i64* %a) nounwind uwtable { +entry: + atomicrmw sub i64* %a, i64 0 acquire + ret void +} +; CHECK: atomic64_sub_acquire +; CHECK: call i64 @__tsan_atomic64_fetch_sub(i64* %a, i64 0, i32 2) + +define void @atomic64_and_acquire(i64* %a) nounwind uwtable { +entry: + atomicrmw and i64* %a, i64 0 acquire + ret void +} +; CHECK: atomic64_and_acquire +; CHECK: call i64 @__tsan_atomic64_fetch_and(i64* %a, i64 0, i32 2) + +define void @atomic64_or_acquire(i64* %a) nounwind uwtable { +entry: + atomicrmw or i64* %a, i64 0 acquire + ret void +} +; CHECK: atomic64_or_acquire +; CHECK: call i64 @__tsan_atomic64_fetch_or(i64* %a, i64 0, i32 2) + +define void @atomic64_xor_acquire(i64* %a) nounwind uwtable { +entry: + atomicrmw xor i64* %a, i64 0 acquire + ret void +} +; CHECK: atomic64_xor_acquire +; CHECK: call i64 @__tsan_atomic64_fetch_xor(i64* %a, i64 0, i32 2) + +define void @atomic64_xchg_release(i64* %a) nounwind uwtable { +entry: + atomicrmw xchg i64* %a, i64 0 release + ret void +} +; CHECK: atomic64_xchg_release +; CHECK: call i64 @__tsan_atomic64_exchange(i64* %a, i64 0, i32 3) + +define void @atomic64_add_release(i64* %a) nounwind uwtable { +entry: + atomicrmw add i64* %a, i64 0 release + ret void +} +; CHECK: atomic64_add_release +; CHECK: call i64 
@__tsan_atomic64_fetch_add(i64* %a, i64 0, i32 3) + +define void @atomic64_sub_release(i64* %a) nounwind uwtable { +entry: + atomicrmw sub i64* %a, i64 0 release + ret void +} +; CHECK: atomic64_sub_release +; CHECK: call i64 @__tsan_atomic64_fetch_sub(i64* %a, i64 0, i32 3) + +define void @atomic64_and_release(i64* %a) nounwind uwtable { +entry: + atomicrmw and i64* %a, i64 0 release + ret void +} +; CHECK: atomic64_and_release +; CHECK: call i64 @__tsan_atomic64_fetch_and(i64* %a, i64 0, i32 3) + +define void @atomic64_or_release(i64* %a) nounwind uwtable { +entry: + atomicrmw or i64* %a, i64 0 release + ret void +} +; CHECK: atomic64_or_release +; CHECK: call i64 @__tsan_atomic64_fetch_or(i64* %a, i64 0, i32 3) + +define void @atomic64_xor_release(i64* %a) nounwind uwtable { +entry: + atomicrmw xor i64* %a, i64 0 release + ret void +} +; CHECK: atomic64_xor_release +; CHECK: call i64 @__tsan_atomic64_fetch_xor(i64* %a, i64 0, i32 3) + +define void @atomic64_xchg_acq_rel(i64* %a) nounwind uwtable { +entry: + atomicrmw xchg i64* %a, i64 0 acq_rel + ret void +} +; CHECK: atomic64_xchg_acq_rel +; CHECK: call i64 @__tsan_atomic64_exchange(i64* %a, i64 0, i32 4) + +define void @atomic64_add_acq_rel(i64* %a) nounwind uwtable { +entry: + atomicrmw add i64* %a, i64 0 acq_rel + ret void +} +; CHECK: atomic64_add_acq_rel +; CHECK: call i64 @__tsan_atomic64_fetch_add(i64* %a, i64 0, i32 4) + +define void @atomic64_sub_acq_rel(i64* %a) nounwind uwtable { +entry: + atomicrmw sub i64* %a, i64 0 acq_rel + ret void +} +; CHECK: atomic64_sub_acq_rel +; CHECK: call i64 @__tsan_atomic64_fetch_sub(i64* %a, i64 0, i32 4) + +define void @atomic64_and_acq_rel(i64* %a) nounwind uwtable { +entry: + atomicrmw and i64* %a, i64 0 acq_rel + ret void +} +; CHECK: atomic64_and_acq_rel +; CHECK: call i64 @__tsan_atomic64_fetch_and(i64* %a, i64 0, i32 4) + +define void @atomic64_or_acq_rel(i64* %a) nounwind uwtable { +entry: + atomicrmw or i64* %a, i64 0 acq_rel + ret void +} +; CHECK: 
atomic64_or_acq_rel +; CHECK: call i64 @__tsan_atomic64_fetch_or(i64* %a, i64 0, i32 4) + +define void @atomic64_xor_acq_rel(i64* %a) nounwind uwtable { +entry: + atomicrmw xor i64* %a, i64 0 acq_rel + ret void +} +; CHECK: atomic64_xor_acq_rel +; CHECK: call i64 @__tsan_atomic64_fetch_xor(i64* %a, i64 0, i32 4) + +define void @atomic64_xchg_seq_cst(i64* %a) nounwind uwtable { +entry: + atomicrmw xchg i64* %a, i64 0 seq_cst + ret void +} +; CHECK: atomic64_xchg_seq_cst +; CHECK: call i64 @__tsan_atomic64_exchange(i64* %a, i64 0, i32 5) + +define void @atomic64_add_seq_cst(i64* %a) nounwind uwtable { +entry: + atomicrmw add i64* %a, i64 0 seq_cst + ret void +} +; CHECK: atomic64_add_seq_cst +; CHECK: call i64 @__tsan_atomic64_fetch_add(i64* %a, i64 0, i32 5) + +define void @atomic64_sub_seq_cst(i64* %a) nounwind uwtable { +entry: + atomicrmw sub i64* %a, i64 0 seq_cst + ret void +} +; CHECK: atomic64_sub_seq_cst +; CHECK: call i64 @__tsan_atomic64_fetch_sub(i64* %a, i64 0, i32 5) + +define void @atomic64_and_seq_cst(i64* %a) nounwind uwtable { +entry: + atomicrmw and i64* %a, i64 0 seq_cst + ret void +} +; CHECK: atomic64_and_seq_cst +; CHECK: call i64 @__tsan_atomic64_fetch_and(i64* %a, i64 0, i32 5) + +define void @atomic64_or_seq_cst(i64* %a) nounwind uwtable { +entry: + atomicrmw or i64* %a, i64 0 seq_cst + ret void +} +; CHECK: atomic64_or_seq_cst +; CHECK: call i64 @__tsan_atomic64_fetch_or(i64* %a, i64 0, i32 5) + +define void @atomic64_xor_seq_cst(i64* %a) nounwind uwtable { +entry: + atomicrmw xor i64* %a, i64 0 seq_cst + ret void +} +; CHECK: atomic64_xor_seq_cst +; CHECK: call i64 @__tsan_atomic64_fetch_xor(i64* %a, i64 0, i32 5) + +define void @atomic64_cas_monotonic(i64* %a) nounwind uwtable { +entry: + cmpxchg i64* %a, i64 0, i64 1 monotonic + ret void +} +; CHECK: atomic64_cas_monotonic +; CHECK: call i64 @__tsan_atomic64_compare_exchange_val(i64* %a, i64 0, i64 1, i32 0) + +define void @atomic64_cas_acquire(i64* %a) nounwind uwtable { +entry: + 
cmpxchg i64* %a, i64 0, i64 1 acquire + ret void +} +; CHECK: atomic64_cas_acquire +; CHECK: call i64 @__tsan_atomic64_compare_exchange_val(i64* %a, i64 0, i64 1, i32 2) + +define void @atomic64_cas_release(i64* %a) nounwind uwtable { +entry: + cmpxchg i64* %a, i64 0, i64 1 release + ret void +} +; CHECK: atomic64_cas_release +; CHECK: call i64 @__tsan_atomic64_compare_exchange_val(i64* %a, i64 0, i64 1, i32 3) + +define void @atomic64_cas_acq_rel(i64* %a) nounwind uwtable { +entry: + cmpxchg i64* %a, i64 0, i64 1 acq_rel + ret void +} +; CHECK: atomic64_cas_acq_rel +; CHECK: call i64 @__tsan_atomic64_compare_exchange_val(i64* %a, i64 0, i64 1, i32 4) + +define void @atomic64_cas_seq_cst(i64* %a) nounwind uwtable { +entry: + cmpxchg i64* %a, i64 0, i64 1 seq_cst + ret void +} +; CHECK: atomic64_cas_seq_cst +; CHECK: call i64 @__tsan_atomic64_compare_exchange_val(i64* %a, i64 0, i64 1, i32 5) define i128 @atomic128_load_unordered(i128* %a) nounwind uwtable { entry: @@ -264,7 +1384,7 @@ entry: ret i128 %0 } ; CHECK: atomic128_load_unordered -; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 1) +; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 0) define i128 @atomic128_load_monotonic(i128* %a) nounwind uwtable { entry: @@ -272,7 +1392,7 @@ entry: ret i128 %0 } ; CHECK: atomic128_load_monotonic -; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 1) +; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 0) define i128 @atomic128_load_acquire(i128* %a) nounwind uwtable { entry: @@ -280,7 +1400,7 @@ entry: ret i128 %0 } ; CHECK: atomic128_load_acquire -; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 4) +; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 2) define i128 @atomic128_load_seq_cst(i128* %a) nounwind uwtable { entry: @@ -288,7 +1408,7 @@ entry: ret i128 %0 } ; CHECK: atomic128_load_seq_cst -; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 32) +; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 5) define void 
@atomic128_store_unordered(i128* %a) nounwind uwtable { entry: @@ -296,7 +1416,7 @@ entry: ret void } ; CHECK: atomic128_store_unordered -; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 1) +; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 0) define void @atomic128_store_monotonic(i128* %a) nounwind uwtable { entry: @@ -304,7 +1424,7 @@ entry: ret void } ; CHECK: atomic128_store_monotonic -; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 1) +; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 0) define void @atomic128_store_release(i128* %a) nounwind uwtable { entry: @@ -312,7 +1432,7 @@ entry: ret void } ; CHECK: atomic128_store_release -; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 8) +; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 3) define void @atomic128_store_seq_cst(i128* %a) nounwind uwtable { entry: @@ -320,4 +1440,348 @@ entry: ret void } ; CHECK: atomic128_store_seq_cst -; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 32) +; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 5) + +define void @atomic128_xchg_monotonic(i128* %a) nounwind uwtable { +entry: + atomicrmw xchg i128* %a, i128 0 monotonic + ret void +} +; CHECK: atomic128_xchg_monotonic +; CHECK: call i128 @__tsan_atomic128_exchange(i128* %a, i128 0, i32 0) + +define void @atomic128_add_monotonic(i128* %a) nounwind uwtable { +entry: + atomicrmw add i128* %a, i128 0 monotonic + ret void +} +; CHECK: atomic128_add_monotonic +; CHECK: call i128 @__tsan_atomic128_fetch_add(i128* %a, i128 0, i32 0) + +define void @atomic128_sub_monotonic(i128* %a) nounwind uwtable { +entry: + atomicrmw sub i128* %a, i128 0 monotonic + ret void +} +; CHECK: atomic128_sub_monotonic +; CHECK: call i128 @__tsan_atomic128_fetch_sub(i128* %a, i128 0, i32 0) + +define void @atomic128_and_monotonic(i128* %a) nounwind uwtable { +entry: + atomicrmw and i128* %a, i128 0 monotonic + ret void +} +; CHECK: 
atomic128_and_monotonic +; CHECK: call i128 @__tsan_atomic128_fetch_and(i128* %a, i128 0, i32 0) + +define void @atomic128_or_monotonic(i128* %a) nounwind uwtable { +entry: + atomicrmw or i128* %a, i128 0 monotonic + ret void +} +; CHECK: atomic128_or_monotonic +; CHECK: call i128 @__tsan_atomic128_fetch_or(i128* %a, i128 0, i32 0) + +define void @atomic128_xor_monotonic(i128* %a) nounwind uwtable { +entry: + atomicrmw xor i128* %a, i128 0 monotonic + ret void +} +; CHECK: atomic128_xor_monotonic +; CHECK: call i128 @__tsan_atomic128_fetch_xor(i128* %a, i128 0, i32 0) + +define void @atomic128_xchg_acquire(i128* %a) nounwind uwtable { +entry: + atomicrmw xchg i128* %a, i128 0 acquire + ret void +} +; CHECK: atomic128_xchg_acquire +; CHECK: call i128 @__tsan_atomic128_exchange(i128* %a, i128 0, i32 2) + +define void @atomic128_add_acquire(i128* %a) nounwind uwtable { +entry: + atomicrmw add i128* %a, i128 0 acquire + ret void +} +; CHECK: atomic128_add_acquire +; CHECK: call i128 @__tsan_atomic128_fetch_add(i128* %a, i128 0, i32 2) + +define void @atomic128_sub_acquire(i128* %a) nounwind uwtable { +entry: + atomicrmw sub i128* %a, i128 0 acquire + ret void +} +; CHECK: atomic128_sub_acquire +; CHECK: call i128 @__tsan_atomic128_fetch_sub(i128* %a, i128 0, i32 2) + +define void @atomic128_and_acquire(i128* %a) nounwind uwtable { +entry: + atomicrmw and i128* %a, i128 0 acquire + ret void +} +; CHECK: atomic128_and_acquire +; CHECK: call i128 @__tsan_atomic128_fetch_and(i128* %a, i128 0, i32 2) + +define void @atomic128_or_acquire(i128* %a) nounwind uwtable { +entry: + atomicrmw or i128* %a, i128 0 acquire + ret void +} +; CHECK: atomic128_or_acquire +; CHECK: call i128 @__tsan_atomic128_fetch_or(i128* %a, i128 0, i32 2) + +define void @atomic128_xor_acquire(i128* %a) nounwind uwtable { +entry: + atomicrmw xor i128* %a, i128 0 acquire + ret void +} +; CHECK: atomic128_xor_acquire +; CHECK: call i128 @__tsan_atomic128_fetch_xor(i128* %a, i128 0, i32 2) + +define void 
@atomic128_xchg_release(i128* %a) nounwind uwtable { +entry: + atomicrmw xchg i128* %a, i128 0 release + ret void +} +; CHECK: atomic128_xchg_release +; CHECK: call i128 @__tsan_atomic128_exchange(i128* %a, i128 0, i32 3) + +define void @atomic128_add_release(i128* %a) nounwind uwtable { +entry: + atomicrmw add i128* %a, i128 0 release + ret void +} +; CHECK: atomic128_add_release +; CHECK: call i128 @__tsan_atomic128_fetch_add(i128* %a, i128 0, i32 3) + +define void @atomic128_sub_release(i128* %a) nounwind uwtable { +entry: + atomicrmw sub i128* %a, i128 0 release + ret void +} +; CHECK: atomic128_sub_release +; CHECK: call i128 @__tsan_atomic128_fetch_sub(i128* %a, i128 0, i32 3) + +define void @atomic128_and_release(i128* %a) nounwind uwtable { +entry: + atomicrmw and i128* %a, i128 0 release + ret void +} +; CHECK: atomic128_and_release +; CHECK: call i128 @__tsan_atomic128_fetch_and(i128* %a, i128 0, i32 3) + +define void @atomic128_or_release(i128* %a) nounwind uwtable { +entry: + atomicrmw or i128* %a, i128 0 release + ret void +} +; CHECK: atomic128_or_release +; CHECK: call i128 @__tsan_atomic128_fetch_or(i128* %a, i128 0, i32 3) + +define void @atomic128_xor_release(i128* %a) nounwind uwtable { +entry: + atomicrmw xor i128* %a, i128 0 release + ret void +} +; CHECK: atomic128_xor_release +; CHECK: call i128 @__tsan_atomic128_fetch_xor(i128* %a, i128 0, i32 3) + +define void @atomic128_xchg_acq_rel(i128* %a) nounwind uwtable { +entry: + atomicrmw xchg i128* %a, i128 0 acq_rel + ret void +} +; CHECK: atomic128_xchg_acq_rel +; CHECK: call i128 @__tsan_atomic128_exchange(i128* %a, i128 0, i32 4) + +define void @atomic128_add_acq_rel(i128* %a) nounwind uwtable { +entry: + atomicrmw add i128* %a, i128 0 acq_rel + ret void +} +; CHECK: atomic128_add_acq_rel +; CHECK: call i128 @__tsan_atomic128_fetch_add(i128* %a, i128 0, i32 4) + +define void @atomic128_sub_acq_rel(i128* %a) nounwind uwtable { +entry: + atomicrmw sub i128* %a, i128 0 acq_rel + ret void +} +; 
CHECK: atomic128_sub_acq_rel +; CHECK: call i128 @__tsan_atomic128_fetch_sub(i128* %a, i128 0, i32 4) + +define void @atomic128_and_acq_rel(i128* %a) nounwind uwtable { +entry: + atomicrmw and i128* %a, i128 0 acq_rel + ret void +} +; CHECK: atomic128_and_acq_rel +; CHECK: call i128 @__tsan_atomic128_fetch_and(i128* %a, i128 0, i32 4) + +define void @atomic128_or_acq_rel(i128* %a) nounwind uwtable { +entry: + atomicrmw or i128* %a, i128 0 acq_rel + ret void +} +; CHECK: atomic128_or_acq_rel +; CHECK: call i128 @__tsan_atomic128_fetch_or(i128* %a, i128 0, i32 4) + +define void @atomic128_xor_acq_rel(i128* %a) nounwind uwtable { +entry: + atomicrmw xor i128* %a, i128 0 acq_rel + ret void +} +; CHECK: atomic128_xor_acq_rel +; CHECK: call i128 @__tsan_atomic128_fetch_xor(i128* %a, i128 0, i32 4) + +define void @atomic128_xchg_seq_cst(i128* %a) nounwind uwtable { +entry: + atomicrmw xchg i128* %a, i128 0 seq_cst + ret void +} +; CHECK: atomic128_xchg_seq_cst +; CHECK: call i128 @__tsan_atomic128_exchange(i128* %a, i128 0, i32 5) + +define void @atomic128_add_seq_cst(i128* %a) nounwind uwtable { +entry: + atomicrmw add i128* %a, i128 0 seq_cst + ret void +} +; CHECK: atomic128_add_seq_cst +; CHECK: call i128 @__tsan_atomic128_fetch_add(i128* %a, i128 0, i32 5) + +define void @atomic128_sub_seq_cst(i128* %a) nounwind uwtable { +entry: + atomicrmw sub i128* %a, i128 0 seq_cst + ret void +} +; CHECK: atomic128_sub_seq_cst +; CHECK: call i128 @__tsan_atomic128_fetch_sub(i128* %a, i128 0, i32 5) + +define void @atomic128_and_seq_cst(i128* %a) nounwind uwtable { +entry: + atomicrmw and i128* %a, i128 0 seq_cst + ret void +} +; CHECK: atomic128_and_seq_cst +; CHECK: call i128 @__tsan_atomic128_fetch_and(i128* %a, i128 0, i32 5) + +define void @atomic128_or_seq_cst(i128* %a) nounwind uwtable { +entry: + atomicrmw or i128* %a, i128 0 seq_cst + ret void +} +; CHECK: atomic128_or_seq_cst +; CHECK: call i128 @__tsan_atomic128_fetch_or(i128* %a, i128 0, i32 5) + +define void 
@atomic128_xor_seq_cst(i128* %a) nounwind uwtable { +entry: + atomicrmw xor i128* %a, i128 0 seq_cst + ret void +} +; CHECK: atomic128_xor_seq_cst +; CHECK: call i128 @__tsan_atomic128_fetch_xor(i128* %a, i128 0, i32 5) + +define void @atomic128_cas_monotonic(i128* %a) nounwind uwtable { +entry: + cmpxchg i128* %a, i128 0, i128 1 monotonic + ret void +} +; CHECK: atomic128_cas_monotonic +; CHECK: call i128 @__tsan_atomic128_compare_exchange_val(i128* %a, i128 0, i128 1, i32 0) + +define void @atomic128_cas_acquire(i128* %a) nounwind uwtable { +entry: + cmpxchg i128* %a, i128 0, i128 1 acquire + ret void +} +; CHECK: atomic128_cas_acquire +; CHECK: call i128 @__tsan_atomic128_compare_exchange_val(i128* %a, i128 0, i128 1, i32 2) + +define void @atomic128_cas_release(i128* %a) nounwind uwtable { +entry: + cmpxchg i128* %a, i128 0, i128 1 release + ret void +} +; CHECK: atomic128_cas_release +; CHECK: call i128 @__tsan_atomic128_compare_exchange_val(i128* %a, i128 0, i128 1, i32 3) + +define void @atomic128_cas_acq_rel(i128* %a) nounwind uwtable { +entry: + cmpxchg i128* %a, i128 0, i128 1 acq_rel + ret void +} +; CHECK: atomic128_cas_acq_rel +; CHECK: call i128 @__tsan_atomic128_compare_exchange_val(i128* %a, i128 0, i128 1, i32 4) + +define void @atomic128_cas_seq_cst(i128* %a) nounwind uwtable { +entry: + cmpxchg i128* %a, i128 0, i128 1 seq_cst + ret void +} +; CHECK: atomic128_cas_seq_cst +; CHECK: call i128 @__tsan_atomic128_compare_exchange_val(i128* %a, i128 0, i128 1, i32 5) + +define void @atomic_signal_fence_acquire() nounwind uwtable { +entry: + fence singlethread acquire + ret void +} +; CHECK: atomic_signal_fence_acquire +; CHECK: call void @__tsan_atomic_signal_fence(i32 2) + +define void @atomic_thread_fence_acquire() nounwind uwtable { +entry: + fence acquire + ret void +} +; CHECK: atomic_thread_fence_acquire +; CHECK: call void @__tsan_atomic_thread_fence(i32 2) + +define void @atomic_signal_fence_release() nounwind uwtable { +entry: + fence 
singlethread release + ret void +} +; CHECK: atomic_signal_fence_release +; CHECK: call void @__tsan_atomic_signal_fence(i32 3) + +define void @atomic_thread_fence_release() nounwind uwtable { +entry: + fence release + ret void +} +; CHECK: atomic_thread_fence_release +; CHECK: call void @__tsan_atomic_thread_fence(i32 3) + +define void @atomic_signal_fence_acq_rel() nounwind uwtable { +entry: + fence singlethread acq_rel + ret void +} +; CHECK: atomic_signal_fence_acq_rel +; CHECK: call void @__tsan_atomic_signal_fence(i32 4) + +define void @atomic_thread_fence_acq_rel() nounwind uwtable { +entry: + fence acq_rel + ret void +} +; CHECK: atomic_thread_fence_acq_rel +; CHECK: call void @__tsan_atomic_thread_fence(i32 4) + +define void @atomic_signal_fence_seq_cst() nounwind uwtable { +entry: + fence singlethread seq_cst + ret void +} +; CHECK: atomic_signal_fence_seq_cst +; CHECK: call void @__tsan_atomic_signal_fence(i32 5) + +define void @atomic_thread_fence_seq_cst() nounwind uwtable { +entry: + fence seq_cst + ret void +} +; CHECK: atomic_thread_fence_seq_cst +; CHECK: call void @__tsan_atomic_thread_fence(i32 5) diff --git a/test/MC/ARM/arm-arithmetic-aliases.s b/test/MC/ARM/arm-arithmetic-aliases.s index 9895cfc..3ed4448 100644 --- a/test/MC/ARM/arm-arithmetic-aliases.s +++ b/test/MC/ARM/arm-arithmetic-aliases.s @@ -124,3 +124,7 @@ bicseq r2, r3 @ CHECK: bicseq r2, r2, #6 @ encoding: [0x06,0x20,0xd2,0x03] @ CHECK: bicseq r2, r2, r3 @ encoding: [0x03,0x20,0xd2,0x01] @ CHECK: bicseq r2, r2, r3 @ encoding: [0x03,0x20,0xd2,0x01] + +add r0, pc, #123 + +@ CHECK: adr r0, #123 @ encoding: [0x7b,0x00,0x8f,0xe2] diff --git a/test/MC/ARM/arm-shift-encoding.s b/test/MC/ARM/arm-shift-encoding.s new file mode 100644 index 0000000..3c57b67 --- /dev/null +++ b/test/MC/ARM/arm-shift-encoding.s @@ -0,0 +1,119 @@ +@ RUN: llvm-mc -mcpu=cortex-a8 -triple armv7 -show-encoding < %s | FileCheck %s + + ldr r0, [r0, r0] + ldr r0, [r0, r0, lsr #32] + ldr r0, [r0, r0, lsr #16] + ldr r0, 
[r0, r0, lsl #0] + ldr r0, [r0, r0, lsl #16] + ldr r0, [r0, r0, asr #32] + ldr r0, [r0, r0, asr #16] + ldr r0, [r0, r0, rrx] + ldr r0, [r0, r0, ror #16] + +@ CHECK: ldr r0, [r0, r0] @ encoding: [0x00,0x00,0x90,0xe7] +@ CHECK: ldr r0, [r0, r0, lsr #32] @ encoding: [0x20,0x00,0x90,0xe7] +@ CHECK: ldr r0, [r0, r0, lsr #16] @ encoding: [0x20,0x08,0x90,0xe7] +@ CHECK: ldr r0, [r0, r0] @ encoding: [0x00,0x00,0x90,0xe7] +@ CHECK: ldr r0, [r0, r0, lsl #16] @ encoding: [0x00,0x08,0x90,0xe7] +@ CHECK: ldr r0, [r0, r0, asr #32] @ encoding: [0x40,0x00,0x90,0xe7] +@ CHECK: ldr r0, [r0, r0, asr #16] @ encoding: [0x40,0x08,0x90,0xe7] +@ CHECK: ldr r0, [r0, r0, rrx] @ encoding: [0x60,0x00,0x90,0xe7] +@ CHECK: ldr r0, [r0, r0, ror #16] @ encoding: [0x60,0x08,0x90,0xe7] + + pld [r0, r0] + pld [r0, r0, lsr #32] + pld [r0, r0, lsr #16] + pld [r0, r0, lsl #0] + pld [r0, r0, lsl #16] + pld [r0, r0, asr #32] + pld [r0, r0, asr #16] + pld [r0, r0, rrx] + pld [r0, r0, ror #16] + +@ CHECK: [r0, r0] @ encoding: [0x00,0xf0,0xd0,0xf7] +@ CHECK: [r0, r0, lsr #32] @ encoding: [0x20,0xf0,0xd0,0xf7] +@ CHECK: [r0, r0, lsr #16] @ encoding: [0x20,0xf8,0xd0,0xf7] +@ CHECK: [r0, r0] @ encoding: [0x00,0xf0,0xd0,0xf7] +@ CHECK: [r0, r0, lsl #16] @ encoding: [0x00,0xf8,0xd0,0xf7] +@ CHECK: [r0, r0, asr #32] @ encoding: [0x40,0xf0,0xd0,0xf7] +@ CHECK: [r0, r0, asr #16] @ encoding: [0x40,0xf8,0xd0,0xf7] +@ CHECK: [r0, r0, rrx] @ encoding: [0x60,0xf0,0xd0,0xf7] +@ CHECK: [r0, r0, ror #16] @ encoding: [0x60,0xf8,0xd0,0xf7] + + str r0, [r0, r0] + str r0, [r0, r0, lsr #32] + str r0, [r0, r0, lsr #16] + str r0, [r0, r0, lsl #0] + str r0, [r0, r0, lsl #16] + str r0, [r0, r0, asr #32] + str r0, [r0, r0, asr #16] + str r0, [r0, r0, rrx] + str r0, [r0, r0, ror #16] + +@ CHECK: str r0, [r0, r0] @ encoding: [0x00,0x00,0x80,0xe7] +@ CHECK: str r0, [r0, r0, lsr #32] @ encoding: [0x20,0x00,0x80,0xe7] +@ CHECK: str r0, [r0, r0, lsr #16] @ encoding: [0x20,0x08,0x80,0xe7] +@ CHECK: str r0, [r0, r0] @ encoding: 
[0x00,0x00,0x80,0xe7] +@ CHECK: str r0, [r0, r0, lsl #16] @ encoding: [0x00,0x08,0x80,0xe7] +@ CHECK: str r0, [r0, r0, asr #32] @ encoding: [0x40,0x00,0x80,0xe7] +@ CHECK: str r0, [r0, r0, asr #16] @ encoding: [0x40,0x08,0x80,0xe7] +@ CHECK: str r0, [r0, r0, rrx] @ encoding: [0x60,0x00,0x80,0xe7] +@ CHECK: str r0, [r0, r0, ror #16] @ encoding: [0x60,0x08,0x80,0xe7] + +@ Uses printAddrMode2OffsetOperand(), used by LDRBT_POST_IMM LDRBT_POST_REG +@ LDRB_POST_IMM LDRB_POST_REG LDRT_POST_IMM LDRT_POST_REG LDR_POST_IMM +@ LDR_POST_REG STRBT_POST_IMM STRBT_POST_REG STRB_POST_IMM STRB_POST_REG +@ STRT_POST_IMM STRT_POST_REG STR_POST_IMM STR_POST_REG + + ldr r0, [r1], r2, rrx + ldr r3, [r4], r5, ror #0 + str r6, [r7], r8, lsl #0 + str r9, [r10], r11 + +@ CHECK: ldr r0, [r1], r2, rrx @ encoding: [0x62,0x00,0x91,0xe6] +@ CHECK: ldr r3, [r4], r5 @ encoding: [0x05,0x30,0x94,0xe6] +@ CHECK: str r6, [r7], r8 @ encoding: [0x08,0x60,0x87,0xe6] +@ CHECK: str r9, [r10], r11 @ encoding: [0x0b,0x90,0x8a,0xe6] + +@ Uses printSORegImmOperand(), used by ADCrsi ADDrsi ANDrsi BICrsi EORrsi +@ ORRrsi RSBrsi RSCrsi SBCrsi SUBrsi CMNzrsi CMPrsi MOVsi MVNsi TEQrsi TSTrsi + + adc sp, lr, pc + adc r1, r8, r9, lsr #32 + adc r2, r7, pc, lsr #16 + adc r3, r6, r10, lsl #0 + adc r4, r5, lr, lsl #16 + adc r5, r4, r11, asr #32 + adc r6, r3, sp, asr #16 + adc r7, r2, r12, rrx + adc r8, r1, r0, ror #16 + +@ CHECK: adc sp, lr, pc @ encoding: [0x0f,0xd0,0xae,0xe0] +@ CHECK: adc r1, r8, r9, lsr #32 @ encoding: [0x29,0x10,0xa8,0xe0] +@ CHECK: adc r2, r7, pc, lsr #16 @ encoding: [0x2f,0x28,0xa7,0xe0] +@ CHECK: adc r3, r6, r10 @ encoding: [0x0a,0x30,0xa6,0xe0] +@ CHECK: adc r4, r5, lr, lsl #16 @ encoding: [0x0e,0x48,0xa5,0xe0] +@ CHECK: adc r5, r4, r11, asr #32 @ encoding: [0x4b,0x50,0xa4,0xe0] +@ CHECK: adc r6, r3, sp, asr #16 @ encoding: [0x4d,0x68,0xa3,0xe0] +@ CHECK: adc r7, r2, r12, rrx @ encoding: [0x6c,0x70,0xa2,0xe0] +@ CHECK: adc r8, r1, r0, ror #16 @ encoding: [0x60,0x88,0xa1,0xe0] + + cmp sp, lr + 
cmp r1, r8, lsr #32 + cmp r2, r7, lsr #16 + cmp r3, r6, lsl #0 + cmp r4, r5, lsl #16 + cmp r5, r4, asr #32 + cmp r6, r3, asr #16 + cmp r7, r2, rrx + cmp r8, r1, ror #16 + +@ CHECK: cmp sp, lr @ encoding: [0x0e,0x00,0x5d,0xe1] +@ CHECK: cmp r1, r8, lsr #32 @ encoding: [0x28,0x00,0x51,0xe1] +@ CHECK: cmp r2, r7, lsr #16 @ encoding: [0x27,0x08,0x52,0xe1] +@ CHECK: cmp r3, r6 @ encoding: [0x06,0x00,0x53,0xe1] +@ CHECK: cmp r4, r5, lsl #16 @ encoding: [0x05,0x08,0x54,0xe1] +@ CHECK: cmp r5, r4, asr #32 @ encoding: [0x44,0x00,0x55,0xe1] +@ CHECK: cmp r6, r3, asr #16 @ encoding: [0x43,0x08,0x56,0xe1] +@ CHECK: cmp r7, r2, rrx @ encoding: [0x62,0x00,0x57,0xe1] +@ CHECK: cmp r8, r1, ror #16 @ encoding: [0x61,0x08,0x58,0xe1] diff --git a/test/MC/ARM/basic-thumb-instructions.s b/test/MC/ARM/basic-thumb-instructions.s index 4ee34ce..22e21da 100644 --- a/test/MC/ARM/basic-thumb-instructions.s +++ b/test/MC/ARM/basic-thumb-instructions.s @@ -259,8 +259,8 @@ _func: @ CHECK: ldr r1, _foo @ encoding: [A,0x49] @ fixup A - offset: 0, value: _foo, kind: fixup_arm_thumb_cp -@ CHECK: ldr r3, #604 @ encoding: [0x97,0x4b] -@ CHECK: ldr r3, #368 @ encoding: [0x5c,0x4b] +@ CHECK: ldr r3, [pc, #604] @ encoding: [0x97,0x4b] +@ CHECK: ldr r3, [pc, #368] @ encoding: [0x5c,0x4b] @------------------------------------------------------------------------------ @ LDR (register) diff --git a/test/MC/ARM/diagnostics.s b/test/MC/ARM/diagnostics.s index 499e05501..d65cfd7 100644 --- a/test/MC/ARM/diagnostics.s +++ b/test/MC/ARM/diagnostics.s @@ -47,7 +47,47 @@ @ CHECK-ERRORS: error: immediate shift value out of range @ CHECK-ERRORS: adc r4, r5, r6, ror #32 + @ Out of range shift immediate values for load/store. 
+ str r1, [r2, r3, lsl #invalid] + ldr r4, [r5], r6, lsl #-1 + pld r4, [r5, r6, lsl #32] + str r4, [r5], r6, lsr #-1 + ldr r4, [r5, r6, lsr #33] + pld r4, [r5, r6, asr #-1] + str r4, [r5, r6, asr #33] + ldr r4, [r5, r6, ror #-1] + pld r4, [r5, r6, ror #32] + pld r4, [r5, r6, rrx #0] +@ CHECK-ERRORS: error: shift amount must be an immediate +@ CHECK-ERRORS: str r1, [r2, r3, lsl #invalid] +@ CHECK-ERRORS: ^ +@ CHECK-ERRORS: error: immediate shift value out of range +@ CHECK-ERRORS: ldr r4, [r5], r6, lsl #-1 +@ CHECK-ERRORS: ^ +@ CHECK-ERRORS: error: immediate shift value out of range +@ CHECK-ERRORS: pld r4, [r5, r6, lsl #32] +@ CHECK-ERRORS: ^ +@ CHECK-ERRORS: error: immediate shift value out of range +@ CHECK-ERRORS: str r4, [r5], r6, lsr #-1 +@ CHECK-ERRORS: ^ +@ CHECK-ERRORS: error: immediate shift value out of range +@ CHECK-ERRORS: ldr r4, [r5, r6, lsr #33] +@ CHECK-ERRORS: ^ +@ CHECK-ERRORS: error: immediate shift value out of range +@ CHECK-ERRORS: pld r4, [r5, r6, asr #-1] +@ CHECK-ERRORS: ^ +@ CHECK-ERRORS: error: immediate shift value out of range +@ CHECK-ERRORS: str r4, [r5, r6, asr #33] +@ CHECK-ERRORS: ^ +@ CHECK-ERRORS: error: immediate shift value out of range +@ CHECK-ERRORS: ldr r4, [r5, r6, ror #-1] +@ CHECK-ERRORS: ^ +@ CHECK-ERRORS: error: immediate shift value out of range +@ CHECK-ERRORS: pld r4, [r5, r6, ror #32] +@ CHECK-ERRORS: error: ']' expected +@ CHECK-ERRORS: pld r4, [r5, r6, rrx #0] + @ Out of range 16-bit immediate on BKPT bkpt #65536 @@ -321,3 +361,13 @@ @ CHECK-ERRORS: error: invalid operand for instruction @ CHECK-ERRORS: cps f,#1 @ CHECK-ERRORS: ^ + + @ Bad operands for msr + msr #0, #0 + msr foo, #0 +@ CHECK-ERRORS: error: invalid operand for instruction +@ CHECK-ERRORS: msr #0, #0 +@ CHECK-ERRORS: ^ +@ CHECK-ERRORS: error: invalid operand for instruction +@ CHECK-ERRORS: msr foo, #0 +@ CHECK-ERRORS: ^ diff --git a/test/MC/ARM/elf-jump24-fixup.s b/test/MC/ARM/elf-jump24-fixup.s new file mode 100644 index 0000000..75a4b86 --- 
/dev/null +++ b/test/MC/ARM/elf-jump24-fixup.s @@ -0,0 +1,9 @@ +@ RUN: llvm-mc %s -triple=thumbv7-linux-gnueabi -filetype=obj -o - < %s | llvm-objdump -r - | FileCheck %s + .syntax unified + .text + .code 16 + .thumb_func +foo: + b.w bar + +@ CHECK: {{[0-9]+}} R_ARM_THM_JUMP24 bar diff --git a/test/MC/ARM/thumb-shift-encoding.s b/test/MC/ARM/thumb-shift-encoding.s new file mode 100644 index 0000000..5428413 --- /dev/null +++ b/test/MC/ARM/thumb-shift-encoding.s @@ -0,0 +1,45 @@ +@ RUN: llvm-mc -mcpu=cortex-a8 -triple thumbv7 -show-encoding < %s | FileCheck %s + +@ Uses printT2SOOperand(), used by t2ADCrs t2ADDrs t2ANDrs t2BICrs t2EORrs +@ t2ORNrs t2ORRrs t2RSBrs t2SBCrs t2SUBrs t2CMNzrs t2CMPrs t2MOVSsi t2MOVsi +@ t2MVNs t2TEQrs t2TSTrs + + sbc.w r12, lr, r0 + sbc.w r1, r8, r9, lsr #32 + sbc.w r2, r7, pc, lsr #16 + sbc.w r3, r6, r10, lsl #0 + sbc.w r4, r5, lr, lsl #16 + sbc.w r5, r4, r11, asr #32 + sbc.w r6, r3, sp, asr #16 + sbc.w r7, r2, r12, rrx + sbc.w r8, r1, r0, ror #16 + +@ CHECK: sbc.w r12, lr, r0 @ encoding: [0x6e,0xeb,0x00,0x0c] +@ CHECK: sbc.w r1, r8, r9, lsr #32 @ encoding: [0x68,0xeb,0x19,0x01] +@ CHECK: sbc.w r2, r7, pc, lsr #16 @ encoding: [0x67,0xeb,0x1f,0x42] +@ CHECK: sbc.w r3, r6, r10 @ encoding: [0x66,0xeb,0x0a,0x03] +@ CHECK: sbc.w r4, r5, lr, lsl #16 @ encoding: [0x65,0xeb,0x0e,0x44] +@ CHECK: sbc.w r5, r4, r11, asr #32 @ encoding: [0x64,0xeb,0x2b,0x05] +@ CHECK: sbc.w r6, r3, sp, asr #16 @ encoding: [0x63,0xeb,0x2d,0x46] +@ CHECK: sbc.w r7, r2, r12, rrx @ encoding: [0x62,0xeb,0x3c,0x07] +@ CHECK: sbc.w r8, r1, r0, ror #16 @ encoding: [0x61,0xeb,0x30,0x48] + + and.w r12, lr, r0 + and.w r1, r8, r9, lsr #32 + and.w r2, r7, pc, lsr #16 + and.w r3, r6, r10, lsl #0 + and.w r4, r5, lr, lsl #16 + and.w r5, r4, r11, asr #32 + and.w r6, r3, sp, asr #16 + and.w r7, r2, r12, rrx + and.w r8, r1, r0, ror #16 + +@ CHECK: and.w r12, lr, r0 @ encoding: [0x0e,0xea,0x00,0x0c] +@ CHECK: and.w r1, r8, r9, lsr #32 @ encoding: [0x08,0xea,0x19,0x01] +@ CHECK: and.w 
r2, r7, pc, lsr #16 @ encoding: [0x07,0xea,0x1f,0x42] +@ CHECK: and.w r3, r6, r10 @ encoding: [0x06,0xea,0x0a,0x03] +@ CHECK: and.w r4, r5, lr, lsl #16 @ encoding: [0x05,0xea,0x0e,0x44] +@ CHECK: and.w r5, r4, r11, asr #32 @ encoding: [0x04,0xea,0x2b,0x05] +@ CHECK: and.w r6, r3, sp, asr #16 @ encoding: [0x03,0xea,0x2d,0x46] +@ CHECK: and.w r7, r2, r12, rrx @ encoding: [0x02,0xea,0x3c,0x07] +@ CHECK: and.w r8, r1, r0, ror #16 @ encoding: [0x01,0xea,0x30,0x48] diff --git a/test/MC/ARM/thumb2-b.w-encodingT4.s b/test/MC/ARM/thumb2-b.w-encodingT4.s new file mode 100644 index 0000000..be77b06 --- /dev/null +++ b/test/MC/ARM/thumb2-b.w-encodingT4.s @@ -0,0 +1,12 @@ +@ RUN: llvm-mc -triple=thumbv7-apple-darwin -mcpu=cortex-a8 -show-encoding < %s | FileCheck %s + .syntax unified + .globl _func +.thumb_func _foo +.space 0x37c6 +_foo: +@------------------------------------------------------------------------------ +@ B (thumb2 b.w encoding T4) rdar://12585795 +@------------------------------------------------------------------------------ + b.w 0x3680c + +@ CHECK: b.w #223244 @ encoding: [0x6d,0xf0,0x0c,0xb0] diff --git a/test/MC/AsmParser/bad-macro.s b/test/MC/AsmParser/bad-macro.s new file mode 100644 index 0000000..313607b --- /dev/null +++ b/test/MC/AsmParser/bad-macro.s @@ -0,0 +1,9 @@ +// RUN: not llvm-mc -triple x86_64-apple-darwin10 %s 2>&1 | FileCheck %s + +.macro 23 + +// CHECK: expected identifier in '.macro' directive + +.macro abc 33 + +// CHECK: expected identifier in '.macro' directive diff --git a/test/MC/AsmParser/directive_lcomm.s b/test/MC/AsmParser/directive_lcomm.s index 0a0add5..37a350c 100644 --- a/test/MC/AsmParser/directive_lcomm.s +++ b/test/MC/AsmParser/directive_lcomm.s @@ -1,9 +1,14 @@ # RUN: llvm-mc -triple i386-apple-darwin10 %s | FileCheck %s +# RUN: llvm-mc -triple i386-pc-mingw32 %s | FileCheck %s +# RUN: not llvm-mc -triple i386-linux-gnu %s 2>&1 | FileCheck %s -check-prefix=ERROR # CHECK: TEST0: -# CHECK: .zerofill __DATA,__bss,a,7,4 -# 
CHECK: .zerofill __DATA,__bss,b,8 -# CHECK: .zerofill __DATA,__bss,c,0 +# CHECK: .lcomm a,7,4 +# CHECK: .lcomm b,8 +# CHECK: .lcomm c,0 + +# ELF doesn't like alignment on .lcomm. +# ERROR: alignment not supported on this target TEST0: .lcomm a, 8-1, 4 .lcomm b,8 diff --git a/test/MC/AsmParser/labels.s b/test/MC/AsmParser/labels.s index 5609175..6a9870b 100644 --- a/test/MC/AsmParser/labels.s +++ b/test/MC/AsmParser/labels.s @@ -41,7 +41,7 @@ foo: // CHECK: .comm "a 6",1 .comm "a 6", 1 -// CHECK: .zerofill __DATA,__bss,"a 7",1,0 +// CHECK: .lcomm "a 7",1 .lcomm "a 7", 1 // FIXME: We don't bother to support .lsym. diff --git a/test/MC/AsmParser/macro-args.s b/test/MC/AsmParser/macro-args.s index 6d08421..3269369 100644 --- a/test/MC/AsmParser/macro-args.s +++ b/test/MC/AsmParser/macro-args.s @@ -4,10 +4,18 @@ movl \var@GOTOFF(%ebx),\re2g .endm +.macro GET_DEFAULT var, re2g=%ebx, re3g=%ecx +movl 2(\re2g, \re3g, 2), \var +.endm + +GET is_sse, %eax +// CHECK: movl is_sse@GOTOFF(%ebx), %eax -GET is_sse, %eax +GET_DEFAULT %ebx, , %edx +// CHECK: movl 2(%ebx,%edx,2), %ebx -// CHECK: movl is_sse@GOTOFF(%ebx), %eax +GET_DEFAULT %ebx, %edx +// CHECK: movl 2(%edx,%ecx,2), %ebx .macro bar .long $n diff --git a/test/MC/AsmParser/macro-rept-err1.s b/test/MC/AsmParser/macro-rept-err1.s index db92856..cfa6687 100644 --- a/test/MC/AsmParser/macro-rept-err1.s +++ b/test/MC/AsmParser/macro-rept-err1.s @@ -3,4 +3,4 @@ .endr -// CHECK: unexpected '.endr' directive, no current .rept +// CHECK: unmatched '.endr' directive diff --git a/test/MC/AsmParser/macros-darwin.s b/test/MC/AsmParser/macros-darwin.s new file mode 100644 index 0000000..31b9edb --- /dev/null +++ b/test/MC/AsmParser/macros-darwin.s @@ -0,0 +1,9 @@ +// RUN: not llvm-mc -triple i386-apple-darwin10 %s 2> %t.err | FileCheck %s + +.macro test1 +.globl "$0 $1 $2 $$3 $n" +.endmacro + +// CHECK: .globl "1 23 $3 2" +test1 1, 2 3 + diff --git a/test/MC/AsmParser/macros.s b/test/MC/AsmParser/macros.s index 2957592..b1cb851 100644 
--- a/test/MC/AsmParser/macros.s +++ b/test/MC/AsmParser/macros.s @@ -1,4 +1,4 @@ -// RUN: not llvm-mc -triple x86_64-apple-darwin10 %s 2> %t.err | FileCheck %s +// RUN: not llvm-mc -triple i386-unknown-unknown %s 2> %t.err | FileCheck %s // RUN: FileCheck --check-prefix=CHECK-ERRORS %s < %t.err .macro .test0 @@ -28,12 +28,66 @@ test2 10 .globl "$0 $1 $2 $$3 $n" .endmacro -// CHECK: .globl "1 23 $3 2" -test3 1,2 3 +// CHECK: .globl "1 (23) $3 2" +test3 1, (2 3) + +// CHECK: .globl "1 2 $3 2" +test3 1 2 .macro test4 .globl "$0 -- $1" .endmacro -// CHECK: .globl "ab)(,) -- (cd)" -test4 a b)(,),(cd) +// CHECK: .globl "(ab)(,)) -- (cd)" +test4 (a b)(,)),(cd) + +// CHECK: .globl "(ab)(,)) -- (cd)" +test4 (a b)(,)),(cd) + +.macro test5 _a +.globl "\_a" +.endm + +// CHECK: .globl zed1 +test5 zed1 + +.macro test6 $a +.globl "\$a" +.endm + +// CHECK: .globl zed2 +test6 zed2 + +.macro test7 .a +.globl "\.a" +.endm + +// CHECK: .globl zed3 +test7 zed3 + +.macro test8 _a, _b, _c +.globl "\_a,\_b,\_c" +.endmacro + +.macro test9 _a _b _c +.globl "\_a \_b \_c" +.endmacro + +// CHECK: .globl "a,b,c" +test8 a, b, c +// CHECK: .globl "%1,%2,%3" +test8 %1 %2 %3 #a comment +// CHECK: .globl "x-y,z,1" +test8 x - y z 1 +// CHECK: .globl "1 2 3" +test9 1, 2,3 + +test8 1,2 3 +// CHECK-ERRORS: error: macro argument '_c' is missing +// CHECK-ERRORS-NEXT: test8 1,2 3 +// CHECK-ERRORS-NEXT: ^ + +test8 1 2, 3 +// CHECK-ERRORS: error: expected ' ' for macro argument separator +// CHECK-ERRORS-NEXT:test8 1 2, 3 +// CHECK-ERRORS-NEXT: ^ diff --git a/test/MC/COFF/comm.ll b/test/MC/COFF/comm.ll new file mode 100644 index 0000000..74da557 --- /dev/null +++ b/test/MC/COFF/comm.ll @@ -0,0 +1,13 @@ +; RUN: llc -mtriple i386-pc-mingw32 < %s | FileCheck %s + +@a = internal global i8 0, align 1 +@b = internal global double 0.000000e+00, align 8 +@c = common global i8 0, align 1 +@d = common global double 0.000000e+00, align 8 + +; .lcomm uses byte alignment +; CHECK: .lcomm _a,1 +; CHECK: .lcomm _b,8,8 +; 
.comm uses log2 alignment +; CHECK: .comm _c,1,0 +; CHECK: .comm _d,8,3 diff --git a/test/MC/COFF/global_ctors.ll b/test/MC/COFF/global_ctors_dtors.ll index 4d6b1c7..2a25219 100644 --- a/test/MC/COFF/global_ctors.ll +++ b/test/MC/COFF/global_ctors_dtors.ll @@ -1,14 +1,16 @@ ; Test that global ctors are emitted into the proper COFF section for the ; target. Mingw uses .ctors, whereas MSVC uses .CRT$XC*. -; RUN: llc < %s -mtriple i686-pc-win32 | FileCheck %s --check-prefix WIN32 -; RUN: llc < %s -mtriple x86_64-pc-win32 | FileCheck %s --check-prefix WIN32 -; RUN: llc < %s -mtriple i686-pc-mingw32 | FileCheck %s --check-prefix MINGW32 -; RUN: llc < %s -mtriple x86_64-pc-mingw32 | FileCheck %s --check-prefix MINGW32 +; RUN: llc < %s -mtriple i686-pc-win32 | FileCheck %s --check-prefix WIN32 +; RUN: llc < %s -mtriple x86_64-pc-win32 | FileCheck %s --check-prefix WIN32 +; RUN: llc < %s -mtriple i686-pc-mingw32 | FileCheck %s --check-prefix MINGW32 +; RUN: llc < %s -mtriple x86_64-pc-mingw32 | FileCheck %s --check-prefix MINGW32 @.str = private unnamed_addr constant [13 x i8] c"constructing\00", align 1 -@.str2 = private unnamed_addr constant [5 x i8] c"main\00", align 1 +@.str2 = private unnamed_addr constant [12 x i8] c"destructing\00", align 1 +@.str3 = private unnamed_addr constant [5 x i8] c"main\00", align 1 @llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @a_global_ctor }] +@llvm.global_dtors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @a_global_dtor }] declare i32 @puts(i8*) @@ -17,12 +19,21 @@ define void @a_global_ctor() nounwind { ret void } +define void @a_global_dtor() nounwind { + %1 = call i32 @puts(i8* getelementptr inbounds ([12 x i8]* @.str2, i32 0, i32 0)) + ret void +} + define i32 @main() nounwind { - %1 = call i32 @puts(i8* getelementptr inbounds ([5 x i8]* @.str2, i32 0, i32 0)) + %1 = call i32 @puts(i8* getelementptr inbounds ([5 x i8]* @.str3, i32 0, i32 
0)) ret i32 0 } ; WIN32: .section .CRT$XCU,"r" ; WIN32: a_global_ctor +; WIN32: .section .CRT$XTX,"r" +; WIN32: a_global_dtor ; MINGW32: .section .ctors,"w" ; MINGW32: a_global_ctor +; MINGW32: .section .dtors,"w" +; MINGW32: a_global_dtor diff --git a/test/MC/Disassembler/ARM/invalid-VLD1DUPq8_UPD-arm.txt b/test/MC/Disassembler/ARM/invalid-VLD1DUPq8_UPD-arm.txt index 5ba7d61..00b8526 100644 --- a/test/MC/Disassembler/ARM/invalid-VLD1DUPq8_UPD-arm.txt +++ b/test/MC/Disassembler/ARM/invalid-VLD1DUPq8_UPD-arm.txt @@ -1,5 +1,4 @@ -# RUN: llvm-mc --disassemble %s -triple=armv7-unknown-unknwon -mcpu=cortex-a8 2>&1 | grep "invalid instruction encoding" -# XFAIL: * +# RUN: llvm-mc --disassemble %s -triple=armv7-unknown-unknwon -mcpu=cortex-a8 2>&1 | FileCheck %s # Opcode=737 Name=VLD1DUPq8_UPD Format=ARM_FORMAT_NLdSt(30) # 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 @@ -9,3 +8,4 @@ # # 'a' == 1 and data_size == 8 is invalid 0x3d 0x3c 0xa0 0xf4 +# CHECK: invalid instruction encoding diff --git a/test/MC/Disassembler/ARM/invalid-VLD1LNd32_UPD-thumb.txt b/test/MC/Disassembler/ARM/invalid-VLD1LNd32_UPD-thumb.txt new file mode 100644 index 0000000..9bb0995 --- /dev/null +++ b/test/MC/Disassembler/ARM/invalid-VLD1LNd32_UPD-thumb.txt @@ -0,0 +1,4 @@ +# RUN: llvm-mc -triple thumbv7 -show-encoding -disassemble < %s 2>&1 | FileCheck %s + +0xa0 0xf9 0x10 0x08 +# CHECK: invalid instruction encoding diff --git a/test/MC/Disassembler/ARM/invalid-VLD4DUPd32_UPD-thumb.txt b/test/MC/Disassembler/ARM/invalid-VLD4DUPd32_UPD-thumb.txt new file mode 100644 index 0000000..84c98bf --- /dev/null +++ b/test/MC/Disassembler/ARM/invalid-VLD4DUPd32_UPD-thumb.txt @@ -0,0 +1,4 @@ +# RUN: llvm-mc -triple thumbv7 -show-encoding -disassemble < %s 2>&1 | FileCheck %s + +0xa0 0xf9 0xc0 0x0f +# CHECK: invalid instruction encoding diff --git a/test/MC/Disassembler/ARM/invalid-VLD4LNd32_UPD-thumb.txt b/test/MC/Disassembler/ARM/invalid-VLD4LNd32_UPD-thumb.txt new file 
mode 100644 index 0000000..9024b09 --- /dev/null +++ b/test/MC/Disassembler/ARM/invalid-VLD4LNd32_UPD-thumb.txt @@ -0,0 +1,4 @@ +# RUN: llvm-mc -triple thumbv7 -show-encoding -disassemble < %s 2>&1 | FileCheck %s + +0xa0 0xf9 0x30 0x0b +# CHECK: invalid instruction encoding diff --git a/test/MC/Disassembler/ARM/invalid-VST1LNd32_UPD-thumb.txt b/test/MC/Disassembler/ARM/invalid-VST1LNd32_UPD-thumb.txt new file mode 100644 index 0000000..9462812 --- /dev/null +++ b/test/MC/Disassembler/ARM/invalid-VST1LNd32_UPD-thumb.txt @@ -0,0 +1,4 @@ +# RUN: llvm-mc -triple thumbv7 -show-encoding -disassemble < %s 2>&1 | FileCheck %s + +0x80 0xf9 0x10 0x08 +# CHECK: invalid instruction encoding diff --git a/test/MC/Disassembler/ARM/invalid-VST4LNd32_UPD-thumb.txt b/test/MC/Disassembler/ARM/invalid-VST4LNd32_UPD-thumb.txt new file mode 100644 index 0000000..f6e71bc --- /dev/null +++ b/test/MC/Disassembler/ARM/invalid-VST4LNd32_UPD-thumb.txt @@ -0,0 +1,4 @@ +# RUN: llvm-mc -triple thumbv7 -show-encoding -disassemble < %s 2>&1 | FileCheck %s + +0x80 0xf9 0x30 0x0b +# CHECK: invalid instruction encoding diff --git a/test/MC/Disassembler/ARM/marked-up-thumb.txt b/test/MC/Disassembler/ARM/marked-up-thumb.txt new file mode 100644 index 0000000..65be286 --- /dev/null +++ b/test/MC/Disassembler/ARM/marked-up-thumb.txt @@ -0,0 +1,7 @@ +# RUN: llvm-mc -triple=thumbv7-apple-darwin -mcpu=cortex-a8 -mdis < %s | FileCheck %s +# CHECK: ldr <reg:r4>, <mem:[pc, <imm:#32>]> +0x08 0x4c +# CHECK: push {<reg:r1>, <reg:r2>, <reg:r7>} +0x86 0xb4 +# CHECK: sub <reg:sp>, <imm:#132> +0xa1 0xb0 diff --git a/test/MC/Disassembler/ARM/neont-VLD-reencoding.txt b/test/MC/Disassembler/ARM/neont-VLD-reencoding.txt new file mode 100644 index 0000000..e53739e --- /dev/null +++ b/test/MC/Disassembler/ARM/neont-VLD-reencoding.txt @@ -0,0 +1,77 @@ +# RUN: llvm-mc -triple thumbv7 -show-encoding -disassemble < %s | FileCheck %s + +0xa0 0xf9 0x00 0x00 +0xa0 0xf9 0x20 0x00 +0xa0 0xf9 0x40 0x00 +0xa0 0xf9 0x60 0x00 +0xa0 
0xf9 0x80 0x00 +0xa0 0xf9 0xa0 0x00 +0xa0 0xf9 0xc0 0x00 +0xa0 0xf9 0xe0 0x00 + +# CHECK: vld1.8 {d0[0]}, [r0], r0 @ encoding: [0xa0,0xf9,0x00,0x00] +# CHECK: vld1.8 {d0[1]}, [r0], r0 @ encoding: [0xa0,0xf9,0x20,0x00] +# CHECK: vld1.8 {d0[2]}, [r0], r0 @ encoding: [0xa0,0xf9,0x40,0x00] +# CHECK: vld1.8 {d0[3]}, [r0], r0 @ encoding: [0xa0,0xf9,0x60,0x00] +# CHECK: vld1.8 {d0[4]}, [r0], r0 @ encoding: [0xa0,0xf9,0x80,0x00] +# CHECK: vld1.8 {d0[5]}, [r0], r0 @ encoding: [0xa0,0xf9,0xa0,0x00] +# CHECK: vld1.8 {d0[6]}, [r0], r0 @ encoding: [0xa0,0xf9,0xc0,0x00] +# CHECK: vld1.8 {d0[7]}, [r0], r0 @ encoding: [0xa0,0xf9,0xe0,0x00] + +0xa0 0xf9 0x00 0x04 +0xa0 0xf9 0x10 0x04 +0xa0 0xf9 0x40 0x04 +0xa0 0xf9 0x50 0x04 +0xa0 0xf9 0x80 0x04 +0xa0 0xf9 0x90 0x04 +0xa0 0xf9 0xc0 0x04 +0xa0 0xf9 0xd0 0x04 + +# CHECK: vld1.16 {d0[0]}, [r0], r0 @ encoding: [0xa0,0xf9,0x00,0x04] +# CHECK: vld1.16 {d0[0]}, [r0, :16], r0 @ encoding: [0xa0,0xf9,0x10,0x04] +# CHECK: vld1.16 {d0[1]}, [r0], r0 @ encoding: [0xa0,0xf9,0x40,0x04] +# CHECK: vld1.16 {d0[1]}, [r0, :16], r0 @ encoding: [0xa0,0xf9,0x50,0x04] +# CHECK: vld1.16 {d0[2]}, [r0], r0 @ encoding: [0xa0,0xf9,0x80,0x04] +# CHECK: vld1.16 {d0[2]}, [r0, :16], r0 @ encoding: [0xa0,0xf9,0x90,0x04] +# CHECK: vld1.16 {d0[3]}, [r0], r0 @ encoding: [0xa0,0xf9,0xc0,0x04] +# CHECK: vld1.16 {d0[3]}, [r0, :16], r0 @ encoding: [0xa0,0xf9,0xd0,0x04] + +0xa0 0xf9 0x00 0x08 +0xa0 0xf9 0x30 0x08 +0xa0 0xf9 0x80 0x08 +0xa0 0xf9 0xb0 0x08 + +# CHECK: vld1.32 {d0[0]}, [r0], r0 @ encoding: [0xa0,0xf9,0x00,0x08] +# CHECK: vld1.32 {d0[0]}, [r0, :32], r0 @ encoding: [0xa0,0xf9,0x30,0x08] +# CHECK: vld1.32 {d0[1]}, [r0], r0 @ encoding: [0xa0,0xf9,0x80,0x08] +# CHECK: vld1.32 {d0[1]}, [r0, :32], r0 @ encoding: [0xa0,0xf9,0xb0,0x08] + +0xa0 0xf9 0x1f 0x04 +0xa0 0xf9 0x8f 0x00 + +# CHECK: vld1.16 {d0[0]}, [r0, :16] @ encoding: [0xa0,0xf9,0x1f,0x04] +# CHECK: vld1.8 {d0[4]}, [r0] @ encoding: [0xa0,0xf9,0x8f,0x00] + +0xa0 0xf9 0x1d 0x04 +0xa0 0xf9 0x8d 0x00 + +# CHECK: 
vld1.16 {d0[0]}, [r0, :16]! @ encoding: [0xa0,0xf9,0x1d,0x04] +# CHECK: vld1.8 {d0[4]}, [r0]! @ encoding: [0xa0,0xf9,0x8d,0x00] + +0xa5 0xf9 0x10 0x04 +0xa5 0xf9 0x1a 0x04 +0xae 0xf9 0x1a 0x04 +0xa5 0xf9 0x1a 0x94 + +# CHECK: vld1.16 {d0[0]}, [r5, :16], r0 @ encoding: [0xa5,0xf9,0x10,0x04] +# CHECK: vld1.16 {d0[0]}, [r5, :16], r10 @ encoding: [0xa5,0xf9,0x1a,0x04] +# CHECK: vld1.16 {d0[0]}, [lr, :16], r10 @ encoding: [0xae,0xf9,0x1a,0x04] +# CHECK: vld1.16 {d9[0]}, [r5, :16], r10 @ encoding: [0xa5,0xf9,0x1a,0x94] + +0xa0 0xf9 0x20 0x0b +0xa0 0xf9 0x20 0x07 +0xa0 0xf9 0x20 0x03 + +# CHECK: vld4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0, :128], r0 @ encoding: [0xa0,0xf9,0x20,0x0b] +# CHECK: vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r0 @ encoding: [0xa0,0xf9,0x20,0x07] +# CHECK: vld4.8 {d0[1], d1[1], d2[1], d3[1]}, [r0], r0 @ encoding: [0xa0,0xf9,0x20,0x03] diff --git a/test/MC/Disassembler/ARM/neont-VST-reencoding.txt b/test/MC/Disassembler/ARM/neont-VST-reencoding.txt new file mode 100644 index 0000000..eb3722c --- /dev/null +++ b/test/MC/Disassembler/ARM/neont-VST-reencoding.txt @@ -0,0 +1,77 @@ +# RUN: llvm-mc -triple thumbv7 -show-encoding -disassemble < %s | FileCheck %s + +0x80 0xf9 0x00 0x00 +0x81 0xf9 0x21 0x10 +0x81 0xf9 0x42 0x10 +0x81 0xf9 0x61 0x20 +0x82 0xf9 0x82 0x20 +0x82 0xf9 0xa1 0x10 +0x82 0xf9 0xc2 0x20 +0x83 0xf9 0xe3 0x30 + +# CHECK: vst1.8 {d0[0]}, [r0], r0 @ encoding: [0x80,0xf9,0x00,0x00] +# CHECK: vst1.8 {d1[1]}, [r1], r1 @ encoding: [0x81,0xf9,0x21,0x10] +# CHECK: vst1.8 {d1[2]}, [r1], r2 @ encoding: [0x81,0xf9,0x42,0x10] +# CHECK: vst1.8 {d2[3]}, [r1], r1 @ encoding: [0x81,0xf9,0x61,0x20] +# CHECK: vst1.8 {d2[4]}, [r2], r2 @ encoding: [0x82,0xf9,0x82,0x20] +# CHECK: vst1.8 {d1[5]}, [r2], r1 @ encoding: [0x82,0xf9,0xa1,0x10] +# CHECK: vst1.8 {d2[6]}, [r2], r2 @ encoding: [0x82,0xf9,0xc2,0x20] +# CHECK: vst1.8 {d3[7]}, [r3], r3 @ encoding: [0x83,0xf9,0xe3,0x30] + +0x80 0xf9 0x00 0x04 +0xc3 0xf9 0x13 0x04 +0xc4 0xf9 0x43 0x04 +0xc5 0xf9 0x55 0x04 
+0xc6 0xf9 0x85 0x04 +0xc7 0xf9 0x95 0x74 +0xc8 0xf9 0xc7 0x84 +0xc9 0xf9 0xd9 0x94 + +# CHECK: vst1.16 {d0[0]}, [r0], r0 @ encoding: [0x80,0xf9,0x00,0x04] +# CHECK: vst1.16 {d16[0]}, [r3, :16], r3 @ encoding: [0xc3,0xf9,0x13,0x04] +# CHECK: vst1.16 {d16[1]}, [r4], r3 @ encoding: [0xc4,0xf9,0x43,0x04] +# CHECK: vst1.16 {d16[1]}, [r5, :16], r5 @ encoding: [0xc5,0xf9,0x55,0x04] +# CHECK: vst1.16 {d16[2]}, [r6], r5 @ encoding: [0xc6,0xf9,0x85,0x04] +# CHECK: vst1.16 {d23[2]}, [r7, :16], r5 @ encoding: [0xc7,0xf9,0x95,0x74] +# CHECK: vst1.16 {d24[3]}, [r8], r7 @ encoding: [0xc8,0xf9,0xc7,0x84] +# CHECK: vst1.16 {d25[3]}, [r9, :16], r9 @ encoding: [0xc9,0xf9,0xd9,0x94] + +0x8a 0xf9 0x01 0xa8 +0xcb 0xf9 0x32 0x18 +0x8c 0xf9 0x83 0xb8 +0xcd 0xf9 0xb4 0x28 + +# CHECK: vst1.32 {d10[0]}, [r10], r1 @ encoding: [0x8a,0xf9,0x01,0xa8] +# CHECK: vst1.32 {d17[0]}, [r11, :32], r2 @ encoding: [0xcb,0xf9,0x32,0x18] +# CHECK: vst1.32 {d11[1]}, [r12], r3 @ encoding: [0x8c,0xf9,0x83,0xb8] +# CHECK: vst1.32 {d18[1]}, [sp, :32], r4 @ encoding: [0xcd,0xf9,0xb4,0x28] + +0x81 0xf9 0x1f 0x44 +0x82 0xf9 0x8f 0x30 + +# CHECK: vst1.16 {d4[0]}, [r1, :16] @ encoding: [0x81,0xf9,0x1f,0x44] +# CHECK: vst1.8 {d3[4]}, [r2] @ encoding: [0x82,0xf9,0x8f,0x30] + +0x83 0xf9 0x1d 0x24 +0x84 0xf9 0x8d 0x10 + +# CHECK: vst1.16 {d2[0]}, [r3, :16]! @ encoding: [0x83,0xf9,0x1d,0x24] +# CHECK: vst1.8 {d1[4]}, [r4]! 
@ encoding: [0x84,0xf9,0x8d,0x10] + +0x85 0xf9 0x10 0x04 +0x85 0xf9 0x1a 0x74 +0x8e 0xf9 0x1a 0x84 +0x85 0xf9 0x1a 0x94 + +# CHECK: vst1.16 {d0[0]}, [r5, :16], r0 @ encoding: [0x85,0xf9,0x10,0x04] +# CHECK: vst1.16 {d7[0]}, [r5, :16], r10 @ encoding: [0x85,0xf9,0x1a,0x74] +# CHECK: vst1.16 {d8[0]}, [lr, :16], r10 @ encoding: [0x8e,0xf9,0x1a,0x84] +# CHECK: vst1.16 {d9[0]}, [r5, :16], r10 @ encoding: [0x85,0xf9,0x1a,0x94] + +0x81 0xf9 0x24 0x0b +0x82 0xf9 0x25 0x07 +0x83 0xf9 0x26 0x03 + +# CHECK: vst4.32 {d0[0], d1[0], d2[0], d3[0]}, [r1, :128], r4 @ encoding: [0x81,0xf9,0x24,0x0b] +# CHECK: vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r2], r5 @ encoding: [0x82,0xf9,0x25,0x07] +# CHECK: vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r3], r6 @ encoding: [0x83,0xf9,0x26,0x03] diff --git a/test/MC/Disassembler/ARM/thumb-printf.txt b/test/MC/Disassembler/ARM/thumb-printf.txt index 8158a73..ca82044 100644 --- a/test/MC/Disassembler/ARM/thumb-printf.txt +++ b/test/MC/Disassembler/ARM/thumb-printf.txt @@ -7,17 +7,17 @@ # CHECK-NEXT: add r3, sp, #20 # CHECK-NEXT: ldr r5, [r3], #4 # CHECK-NEXT: str r3, [sp] -# CHECK-NEXT: ldr r3, #52 +# CHECK-NEXT: ldr r3, [pc, #52] # CHECK-NEXT: add r3, pc # CHECK-NEXT: ldr r0, [r3] # CHECK-NEXT: ldr r4, [r0] -# CHECK-NEXT: ldr r0, #48 +# CHECK-NEXT: ldr r0, [pc, #48] # CHECK-NEXT: add r0, pc # CHECK-NEXT: ldr r0, [r0] # CHECK-NEXT: ldr r0, [r0] # CHECK-NEXT: blx #191548 # CHECK-NEXT: cbnz r0, #6 -# CHECK-NEXT: ldr r1, #40 +# CHECK-NEXT: ldr r1, [pc, #40] # CHECK-NEXT: add r1, pc # CHECK-NEXT: ldr r1, [r1] # CHECK-NEXT: b #0 diff --git a/test/MC/Disassembler/ARM/thumb-tests.txt b/test/MC/Disassembler/ARM/thumb-tests.txt index c08585a..757ce6e 100644 --- a/test/MC/Disassembler/ARM/thumb-tests.txt +++ b/test/MC/Disassembler/ARM/thumb-tests.txt @@ -30,7 +30,7 @@ # CHECK: ldm r0!, {r1} 0x02 0xc8 -# CHECK: ldr r5, #432 +# CHECK: ldr r5, [pc, #432] 0x6c 0x4d # CHECK: str r0, [r3] diff --git a/test/MC/Disassembler/ARM/thumb1.txt 
b/test/MC/Disassembler/ARM/thumb1.txt index 5b70262..de9596a 100644 --- a/test/MC/Disassembler/ARM/thumb1.txt +++ b/test/MC/Disassembler/ARM/thumb1.txt @@ -160,6 +160,7 @@ # CHECK: ldr r1, [sp] # CHECK: ldr r2, [sp, #24] # CHECK: ldr r3, [sp, #1020] +# CHECK: ldr r1, [pc, #12] 0x29 0x68 @@ -168,6 +169,7 @@ 0x00 0x99 0x06 0x9a 0xff 0x9b +0x03 0x49 #------------------------------------------------------------------------------ # LDR (register) diff --git a/test/MC/Disassembler/ARM/thumb2.txt b/test/MC/Disassembler/ARM/thumb2.txt index 42ebe58..45dace3 100644 --- a/test/MC/Disassembler/ARM/thumb2.txt +++ b/test/MC/Disassembler/ARM/thumb2.txt @@ -169,6 +169,9 @@ 0x13 0xf5 0xce 0xa9 +# CHECK: b.w #208962 + +0x33 0xf0 0x21 0xb8 # rdar://12585795 #------------------------------------------------------------------------------ # BFC diff --git a/test/MC/Disassembler/Mips/mips64.txt b/test/MC/Disassembler/Mips/mips64.txt index 095ed18..0a88c40 100644 --- a/test/MC/Disassembler/Mips/mips64.txt +++ b/test/MC/Disassembler/Mips/mips64.txt @@ -3,7 +3,7 @@ # CHECK: daddiu $11, $26, 31949
0x67 0x4b 0x7c 0xcd
-# CHECK: daddu $26, $at, $11
+# CHECK: daddu $26, $1, $11
0x00 0x2b 0xd0 0x2d
# CHECK: ddiv $zero, $26, $22
@@ -30,10 +30,10 @@ # CHECK: dsllv $gp, $27, $24
0x03 0x1b 0xe0 0x14
-# CHECK: dsra $at, $at, 30
+# CHECK: dsra $1, $1, 30
0x00 0x01 0x0f 0xbb
-# CHECK: dsrav $at, $at, $fp
+# CHECK: dsrav $1, $1, $fp
0x03 0xc1 0x08 0x17
# CHECK: dsrl $10, $gp, 24
@@ -45,10 +45,10 @@ # CHECK: dsubu $gp, $27, $24
0x03 0x78 0xe0 0x2f
-# CHECK: lw $27, -15155($at)
+# CHECK: lw $27, -15155($1)
0x8c 0x3b 0xc4 0xcd
-# CHECK: lui $at, 1
+# CHECK: lui $1, 1
0x3c 0x01 0x00 0x01
# CHECK: lwu $3, -1746($3)
@@ -57,7 +57,7 @@ # CHECK: lui $ra, 1
0x3c 0x1f 0x00 0x01
-# CHECK: sw $26, -15159($at)
+# CHECK: sw $26, -15159($1)
0xac 0x3a 0xc4 0xc9
# CHECK: ld $26, 3958($zero)
diff --git a/test/MC/Disassembler/Mips/mips64_le.txt b/test/MC/Disassembler/Mips/mips64_le.txt index c4e5591..fe8faff 100644 --- a/test/MC/Disassembler/Mips/mips64_le.txt +++ b/test/MC/Disassembler/Mips/mips64_le.txt @@ -3,7 +3,7 @@ # CHECK: daddiu $11, $26, 31949
0xcd 0x7c 0x4b 0x67
-# CHECK: daddu $26, $at, $11
+# CHECK: daddu $26, $1, $11
0x2d 0xd0 0x2b 0x00
# CHECK: ddiv $zero, $26, $22
@@ -30,10 +30,10 @@ # CHECK: dsllv $gp, $27, $24
0x14 0xe0 0x1b 0x03
-# CHECK: dsra $at, $at, 30
+# CHECK: dsra $1, $1, 30
0xbb 0x0f 0x01 0x00
-# CHECK: dsrav $at, $at, $fp
+# CHECK: dsrav $1, $1, $fp
0x17 0x08 0xc1 0x03
# CHECK: dsrl $10, $gp, 24
@@ -45,10 +45,10 @@ # CHECK: dsubu $gp, $27, $24
0x2f 0xe0 0x78 0x03
-# CHECK: lw $27, -15155($at)
+# CHECK: lw $27, -15155($1)
0xcd 0xc4 0x3b 0x8c
-# CHECK: lui $at, 1
+# CHECK: lui $1, 1
0x01 0x00 0x01 0x3c
# CHECK: lwu $3, -1746($3)
@@ -57,7 +57,7 @@ # CHECK: lui $ra, 1
0x01 0x00 0x1f 0x3c
-# CHECK: sw $26, -15159($at)
+# CHECK: sw $26, -15159($1)
0xc9 0xc4 0x3a 0xac
# CHECK: ld $26, 3958($zero)
diff --git a/test/MC/Disassembler/Mips/mips64r2.txt b/test/MC/Disassembler/Mips/mips64r2.txt index 41808c7..2dfde0d 100644 --- a/test/MC/Disassembler/Mips/mips64r2.txt +++ b/test/MC/Disassembler/Mips/mips64r2.txt @@ -3,7 +3,7 @@ # CHECK: daddiu $11, $26, 31949
0x67 0x4b 0x7c 0xcd
-# CHECK: daddu $26, $at, $11
+# CHECK: daddu $26, $1, $11
0x00 0x2b 0xd0 0x2d
# CHECK: ddiv $zero, $26, $22
@@ -30,10 +30,10 @@ # CHECK: dsllv $gp, $27, $24
0x03 0x1b 0xe0 0x14
-# CHECK: dsra $at, $at, 30
+# CHECK: dsra $1, $1, 30
0x00 0x01 0x0f 0xbb
-# CHECK: dsrav $at, $at, $fp
+# CHECK: dsrav $1, $1, $fp
0x03 0xc1 0x08 0x17
# CHECK: dsrl $10, $gp, 24
@@ -45,10 +45,10 @@ # CHECK: dsubu $gp, $27, $24
0x03 0x78 0xe0 0x2f
-# CHECK: lw $27, -15155($at)
+# CHECK: lw $27, -15155($1)
0x8c 0x3b 0xc4 0xcd
-# CHECK: lui $at, 1
+# CHECK: lui $1, 1
0x3c 0x01 0x00 0x01
# CHECK: lwu $3, -1746($3)
@@ -57,7 +57,7 @@ # CHECK: lui $ra, 1
0x3c 0x1f 0x00 0x01
-# CHECK: sw $26, -15159($at)
+# CHECK: sw $26, -15159($1)
0xac 0x3a 0xc4 0xc9
# CHECK: ld $26, 3958($zero)
diff --git a/test/MC/Disassembler/Mips/mips64r2_le.txt b/test/MC/Disassembler/Mips/mips64r2_le.txt index 4987f80..620d9eb 100644 --- a/test/MC/Disassembler/Mips/mips64r2_le.txt +++ b/test/MC/Disassembler/Mips/mips64r2_le.txt @@ -3,7 +3,7 @@ # CHECK: daddiu $11, $26, 31949
0xcd 0x7c 0x4b 0x67
-# CHECK: daddu $26, $at, $11
+# CHECK: daddu $26, $1, $11
0x2d 0xd0 0x2b 0x00
# CHECK: ddiv $zero, $26, $22
@@ -30,10 +30,10 @@ # CHECK: dsllv $gp, $27, $24
0x14 0xe0 0x1b 0x03
-# CHECK: dsra $at, $at, 30
+# CHECK: dsra $1, $1, 30
0xbb 0x0f 0x01 0x00
-# CHECK: dsrav $at, $at, $fp
+# CHECK: dsrav $1, $1, $fp
0x17 0x08 0xc1 0x03
# CHECK: dsrl $10, $gp, 24
@@ -45,10 +45,10 @@ # CHECK: dsubu $gp, $27, $24
0x2f 0xe0 0x78 0x03
-# CHECK: lw $27, -15155($at)
+# CHECK: lw $27, -15155($1)
0xcd 0xc4 0x3b 0x8c
-# CHECK: lui $at, 1
+# CHECK: lui $1, 1
0x01 0x00 0x01 0x3c
# CHECK: lwu $3, -1746($3)
@@ -57,7 +57,7 @@ # CHECK: lui $ra, 1
0x01 0x00 0x1f 0x3c
-# CHECK: sw $26, -15159($at)
+# CHECK: sw $26, -15159($1)
0xc9 0xc4 0x3a 0xac
# CHECK: ld $26, 3958($zero)
diff --git a/test/MC/Disassembler/X86/marked-up.txt b/test/MC/Disassembler/X86/marked-up.txt new file mode 100644 index 0000000..f0e5125 --- /dev/null +++ b/test/MC/Disassembler/X86/marked-up.txt @@ -0,0 +1,6 @@ +# RUN: llvm-mc --mdis %s -triple=x86_64-apple-darwin9 2>&1 | FileCheck %s + +# CHECK: movq <mem:<reg:%gs>:8>, <reg:%rcx> +0x65 0x48 0x8b 0x0c 0x25 0x08 0x00 0x00 0x00 +# CHECK: xorps <reg:%xmm1>, <reg:%xmm2> +0x0f 0x57 0xd1 diff --git a/test/MC/ELF/cfi-reg.s b/test/MC/ELF/cfi-reg.s new file mode 100644 index 0000000..fd68d6d --- /dev/null +++ b/test/MC/ELF/cfi-reg.s @@ -0,0 +1,18 @@ +// RUN: llvm-mc -triple x86_64-pc-linux-gnu %s -o - | FileCheck %s +// PR13754 + +f: + .cfi_startproc + nop + .cfi_offset 6, -16 + nop + .cfi_offset %rsi, -16 + nop + .cfi_offset rbx, -16 + nop + .cfi_endproc + +// CHECK: f: +// CHECK: .cfi_offset %rbp, -16 +// CHECK: .cfi_offset %rsi, -16 +// CHECK: .cfi_offset %rbx, -16 diff --git a/test/MC/ELF/lcomm.s b/test/MC/ELF/lcomm.s new file mode 100644 index 0000000..ae8d0ba --- /dev/null +++ b/test/MC/ELF/lcomm.s @@ -0,0 +1,21 @@ +// RUN: llvm-mc -triple i386-pc-linux-gnu %s -filetype=obj -o - | elf-dump | FileCheck %s + +.lcomm A, 5 +.lcomm B, 32 << 20 + +// CHECK: (('st_name', 0x00000001) # 'A' +// CHECK: ('st_value', 0x00000000) +// CHECK: ('st_size', 0x00000005) +// CHECK: ('st_bind', 0x0) +// CHECK: ('st_type', 0x1) +// CHECK: ('st_other', 0x00) +// CHECK: ('st_shndx', 0x0003) +// CHECK: ), +// CHECK: (('st_name', 0x00000003) # 'B' +// CHECK: ('st_value', 0x00000005) +// CHECK: ('st_size', 0x02000000) +// CHECK: ('st_bind', 0x0) +// CHECK: ('st_type', 0x1) +// CHECK: ('st_other', 0x00) +// CHECK: ('st_shndx', 0x0003) +// CHECK: ), diff --git a/test/MC/MachO/ARM/long-call-branch-island-relocation.s b/test/MC/MachO/ARM/long-call-branch-island-relocation.s new file mode 100644 index 0000000..8ee7da5 --- /dev/null +++ b/test/MC/MachO/ARM/long-call-branch-island-relocation.s @@ -0,0 +1,43 @@ +@ RUN: llvm-mc -n -triple 
armv7-apple-darwin10 %s -filetype=obj -o %t.o +@ RUN: macho-dump --dump-section-data < %t.o | FileCheck %s + +@ rdar://12359919 + + .syntax unified + .text + + .globl _bar + .align 2 + .code 16 + .thumb_func _bar +_bar: + push {r7, lr} + mov r7, sp + bl _foo + pop {r7, pc} + + +_junk: +@ Make the _foo symbol sufficiently far away to force the 'bl' relocation +@ above to be out of range. On Darwin, the assembler deals with this by +@ generating an external relocation so the linker can create a branch +@ island. + + .space 20000000 + + .section __TEXT,initcode,regular,pure_instructions + + .globl _foo + .align 2 + .code 16 +_foo: + push {r7, lr} + mov r7, sp + pop {r7, pc} + + +@ CHECK: ('_relocations', [ +@ CHECK: # Relocation 0 +@ CHECK: (('word-0', 0x4), +@ CHECK: ('word-1', 0x6d000002)), +@ CHECK: ]) diff --git a/test/MC/MachO/absolute.s b/test/MC/MachO/absolute.s new file mode 100644 index 0000000..784e32a --- /dev/null +++ b/test/MC/MachO/absolute.s @@ -0,0 +1,158 @@ +// RUN: llvm-mc -triple x86_64-apple-darwin10 %s -filetype=obj -o - | macho-dump | FileCheck %s + +_bar: + nop +_foo: + nop + + .set foo_set1, (_foo + 0xffff0000) + .set foo_set2, (_foo - _bar + 0xffff0000) + +foo_equals = (_foo + 0xffff0000) +foo_equals2 = (_foo - _bar + 0xffff0000) + + .globl foo_set1_global; + .set foo_set1_global, (_foo + 0xffff0000) + + .globl foo_set2_global; + .set foo_set2_global, (_foo - _bar + 0xffff0000) + +// CHECK: ('cputype', 16777223) +// CHECK: ('cpusubtype', 3) +// CHECK: ('filetype', 1) +// CHECK: ('num_load_commands', 3) +// CHECK: ('load_commands_size', 256) +// CHECK: ('flag', 0) +// CHECK: ('reserved', 0) +// CHECK: ('load_commands', [ +// CHECK: # Load Command 0 +// CHECK: (('command', 25) +// CHECK: ('size', 152) +// CHECK: ('segment_name', '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00') +// CHECK: ('vm_addr', 0) +// CHECK: ('vm_size', 2) +// CHECK: ('file_offset', 288) +// CHECK: ('file_size', 2) +// CHECK: ('maxprot', 7) +// CHECK: 
('initprot', 7) +// CHECK: ('num_sections', 1) +// CHECK: ('flags', 0) +// CHECK: ('sections', [ +// CHECK: # Section 0 +// CHECK: (('section_name', '__text\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00') +// CHECK: ('segment_name', '__TEXT\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00') +// CHECK: ('address', 0) +// CHECK: ('size', 2) +// CHECK: ('offset', 288) +// CHECK: ('alignment', 0) +// CHECK: ('reloc_offset', 0) +// CHECK: ('num_reloc', 0) +// CHECK: ('flags', 0x80000400) +// CHECK: ('reserved1', 0) +// CHECK: ('reserved2', 0) +// CHECK: ('reserved3', 0) +// CHECK: ), +// CHECK: ('_relocations', [ +// CHECK: ]) +// CHECK: ]) +// CHECK: ), +// CHECK: # Load Command 1 +// CHECK: (('command', 2) +// CHECK: ('size', 24) +// CHECK: ('symoff', 292) +// CHECK: ('nsyms', 8) +// CHECK: ('stroff', 420) +// CHECK: ('strsize', 84) +// CHECK: ('_string_data', '\x00foo_set1_global\x00foo_set2_global\x00_bar\x00_foo\x00foo_set1\x00foo_set2\x00foo_equals\x00foo_equals2\x00') +// CHECK: ('_symbols', [ +// CHECK: # Symbol 0 +// CHECK: (('n_strx', 33) +// CHECK: ('n_type', 0xe) +// CHECK: ('n_sect', 1) +// CHECK: ('n_desc', 0) +// CHECK: ('n_value', 0) +// CHECK: ('_string', '_bar') +// CHECK: ), +// CHECK: # Symbol 1 +// CHECK: (('n_strx', 38) +// CHECK: ('n_type', 0xe) +// CHECK: ('n_sect', 1) +// CHECK: ('n_desc', 0) +// CHECK: ('n_value', 1) +// CHECK: ('_string', '_foo') +// CHECK: ), +// CHECK: # Symbol 2 +// CHECK: (('n_strx', 43) +// CHECK: ('n_type', 0xe) +// CHECK: ('n_sect', 1) +// CHECK: ('n_desc', 32) +// CHECK: ('n_value', 4294901761) +// CHECK: ('_string', 'foo_set1') +// CHECK: ), +// CHECK: # Symbol 3 +// CHECK: (('n_strx', 52) +// CHECK: ('n_type', 0x2) +// CHECK: ('n_sect', 0) +// CHECK: ('n_desc', 32) +// CHECK: ('n_value', 4294901761) +// CHECK: ('_string', 'foo_set2') +// CHECK: ), +// CHECK: # Symbol 4 +// CHECK: (('n_strx', 61) +// CHECK: ('n_type', 0xe) +// CHECK: ('n_sect', 1) +// CHECK: ('n_desc', 0) +// CHECK: ('n_value', 4294901761) +// CHECK: ('_string', 
'foo_equals') +// CHECK: ), +// CHECK: # Symbol 5 +// CHECK: (('n_strx', 72) +// CHECK: ('n_type', 0x2) +// CHECK: ('n_sect', 0) +// CHECK: ('n_desc', 0) +// CHECK: ('n_value', 4294901761) +// CHECK: ('_string', 'foo_equals2') +// CHECK: ), +// CHECK: # Symbol 6 +// CHECK: (('n_strx', 1) +// CHECK: ('n_type', 0xf) +// CHECK: ('n_sect', 1) +// CHECK: ('n_desc', 32) +// CHECK: ('n_value', 4294901761) +// CHECK: ('_string', 'foo_set1_global') +// CHECK: ), +// CHECK: # Symbol 7 +// CHECK: (('n_strx', 17) +// CHECK: ('n_type', 0x3) +// CHECK: ('n_sect', 0) +// CHECK: ('n_desc', 32) +// CHECK: ('n_value', 4294901761) +// CHECK: ('_string', 'foo_set2_global') +// CHECK: ), +// CHECK: ]) +// CHECK: ), +// CHECK: # Load Command 2 +// CHECK: (('command', 11) +// CHECK: ('size', 80) +// CHECK: ('ilocalsym', 0) +// CHECK: ('nlocalsym', 6) +// CHECK: ('iextdefsym', 6) +// CHECK: ('nextdefsym', 2) +// CHECK: ('iundefsym', 8) +// CHECK: ('nundefsym', 0) +// CHECK: ('tocoff', 0) +// CHECK: ('ntoc', 0) +// CHECK: ('modtaboff', 0) +// CHECK: ('nmodtab', 0) +// CHECK: ('extrefsymoff', 0) +// CHECK: ('nextrefsyms', 0) +// CHECK: ('indirectsymoff', 0) +// CHECK: ('nindirectsyms', 0) +// CHECK: ('extreloff', 0) +// CHECK: ('nextrel', 0) +// CHECK: ('locreloff', 0) +// CHECK: ('nlocrel', 0) +// CHECK: ('_indirect_symbols', [ +// CHECK: ]) +// CHECK: ), +// CHECK: ]) diff --git a/test/MC/MachO/gen-dwarf-cpp.s b/test/MC/MachO/gen-dwarf-cpp.s new file mode 100644 index 0000000..cb749f4 --- /dev/null +++ b/test/MC/MachO/gen-dwarf-cpp.s @@ -0,0 +1,22 @@ +// RUN: llvm-mc -g -triple i386-apple-darwin10 %s -filetype=obj -o %t +// RUN: llvm-dwarfdump %t | FileCheck %s + +# 100 "t.s" 1 +.globl _bar +_bar: + movl $0, %eax +L1: leave + ret + +// rdar://9275556 + +// We check that the source name "t.s" is picked up +// CHECK: Dir Mod Time File Len File Name +// CHECK: ---- ---------- ---------- --------------------------- +// CHECK: file_names[ 1] 1 0x00000000 0x00000000 gen-dwarf-cpp.s +// CHECK: 
file_names[ 2] 0 0x00000000 0x00000000 t.s + +// We check that the source line number 100 is picked up before the "movl" +// CHECK: Address Line Column File ISA Flags +// CHECK: ------------------ ------ ------ ------ --- ------------- +// CHECK: 0x0000000000000000 102 0 2 0 is_stmt diff --git a/test/MC/MachO/gen-dwarf-macro-cpp.s b/test/MC/MachO/gen-dwarf-macro-cpp.s new file mode 100644 index 0000000..05a449b --- /dev/null +++ b/test/MC/MachO/gen-dwarf-macro-cpp.s @@ -0,0 +1,17 @@ +// RUN: llvm-mc -g -triple i386-apple-darwin10 %s -filetype=obj -o %t +// RUN: llvm-dwarfdump %t | FileCheck %s + +# 1 "foo.S" 2 +.macro switcher + ljmp *0x38(%ecx) +.endmacro + switcher NaClSwitchNoSSE, 0 + +// PR14264 was a crash in the code caused by the .macro not handled correctly +// rdar://12637628 + +// We check that the source name "foo.S" is picked up +// CHECK: Dir Mod Time File Len File Name +// CHECK: ---- ---------- ---------- --------------------------- +// CHECK: file_names[ 1] 1 0x00000000 0x00000000 gen-dwarf-macro-cpp.s +// CHECK: file_names[ 2] 0 0x00000000 0x00000000 foo.S diff --git a/test/MC/MachO/i386-large-relocations.s b/test/MC/MachO/i386-large-relocations.s new file mode 100644 index 0000000..e5a1cfb --- /dev/null +++ b/test/MC/MachO/i386-large-relocations.s @@ -0,0 +1,36 @@ +// RUN: llvm-mc -triple i386-apple-darwin10 %s -filetype=obj -o - | macho-dump | FileCheck %s + +.space 0x1ed280 + .section __DATA,__const + .align 4 +.space 0x5181020 +_foo: + .long _bar + .long 0 + .long _bar+8 + .long _bar+24 + .long 0 + .long _bar+16 + +.zerofill __DATA,__bss,__dummy,0x5d780 +.zerofill __DATA,__bss,_bar,48,4 + +// Normally scattered relocations are used for sym+offset expressions. When +// the value exceeds 24-bits, however, it's outside what MachO can encode, +// so the assembler falls back to non-scattered relocations. 
+// rdar://12358909 + +// CHECK: ('_relocations', [ +// CHECK: # Relocation 0 +// CHECK: (('word-0', 0x5181034), +// CHECK: ('word-1', 0x4000003)), +// CHECK: # Relocation 1 +// CHECK: (('word-0', 0x518102c), +// CHECK: ('word-1', 0x4000003)), +// CHECK: # Relocation 2 +// CHECK: (('word-0', 0x5181028), +// CHECK: ('word-1', 0x4000003)), +// CHECK: # Relocation 3 +// CHECK: (('word-0', 0x5181020), +// CHECK: ('word-1', 0x4000003)), +// CHECK: ]) diff --git a/test/MC/MachO/lit.local.cfg b/test/MC/MachO/lit.local.cfg index 6c49f08..41a8434 100644 --- a/test/MC/MachO/lit.local.cfg +++ b/test/MC/MachO/lit.local.cfg @@ -1,4 +1,4 @@ -config.suffixes = ['.s'] +config.suffixes = ['.s', '.ll'] targets = set(config.root.targets_to_build.split()) if not 'X86' in targets: diff --git a/test/MC/MachO/x86-data-in-code.ll b/test/MC/MachO/x86-data-in-code.ll new file mode 100644 index 0000000..2410974 --- /dev/null +++ b/test/MC/MachO/x86-data-in-code.ll @@ -0,0 +1,108 @@ +; RUN: llc -O0 -mtriple=x86_64-apple-darwin -filetype=obj -o - %s | macho-dump | FileCheck %s + +; There should not be a data-in-code load command (type 0x29) for x86_64 +; jump tables, even though they are in the text section. 
+; CHECK: 'num_load_commands' +; CHECK-NOT: (('command', 41) + +define void @foo(i32* %ptr) nounwind ssp { + %tmp = load i32* %ptr, align 4 + switch i32 %tmp, label %default [ + i32 11, label %bb0 + i32 10, label %bb1 + i32 8, label %bb2 + i32 4, label %bb3 + i32 2, label %bb4 + i32 6, label %bb5 + i32 9, label %bb6 + i32 15, label %bb7 + i32 1, label %bb8 + i32 3, label %bb9 + i32 5, label %bb10 + i32 30, label %bb11 + i32 31, label %bb12 + i32 13, label %bb13 + i32 14, label %bb14 + i32 20, label %bb15 + i32 19, label %bb16 + i32 17, label %bb17 + i32 18, label %bb18 + i32 21, label %bb19 + i32 22, label %bb20 + i32 16, label %bb21 + i32 24, label %bb22 + i32 25, label %bb23 + i32 26, label %bb24 + i32 27, label %bb25 + i32 28, label %bb26 + i32 23, label %bb27 + i32 12, label %bb28 + ] + +default: + br label %exit +bb0: + br label %exit +bb1: + br label %exit +bb2: + br label %exit +bb3: + br label %exit +bb4: + br label %exit +bb5: + br label %exit +bb6: + br label %exit +bb7: + br label %exit +bb8: + br label %exit +bb9: + br label %exit +bb10: + br label %exit +bb11: + br label %exit +bb12: + br label %exit +bb13: + br label %exit +bb14: + br label %exit +bb15: + br label %exit +bb16: + br label %exit +bb17: + br label %exit +bb18: + br label %exit +bb19: + br label %exit +bb20: + br label %exit +bb21: + br label %exit +bb22: + br label %exit +bb23: + br label %exit +bb24: + br label %exit +bb25: + br label %exit +bb26: + br label %exit +bb27: + br label %exit +bb28: + br label %exit + + +exit: + + ret void +} + diff --git a/test/MC/Markup/basic-markup.mc b/test/MC/Markup/basic-markup.mc new file mode 100644 index 0000000..2fa5ebb --- /dev/null +++ b/test/MC/Markup/basic-markup.mc @@ -0,0 +1,16 @@ +// RUN: llvm-mcmarkup %s | FileCheck %s + + push {<reg:r1>, <reg:r2>, <reg:r7>} + sub <reg:sp>, <imm:#132> + ldr <reg:r0>, <mem:[<reg:r0>, <imm:#4>]> + + +// CHECK: reg +// CHECK: reg +// CHECK: reg +// CHECK: reg +// CHECK: imm +// CHECK: reg +// CHECK: mem +// 
CHECK: reg +// CHECK: imm diff --git a/test/MC/Markup/lit.local.cfg b/test/MC/Markup/lit.local.cfg new file mode 100644 index 0000000..ab28eed --- /dev/null +++ b/test/MC/Markup/lit.local.cfg @@ -0,0 +1,2 @@ +config.suffixes = ['.mc'] + diff --git a/test/MC/Mips/do_switch.ll b/test/MC/Mips/do_switch.ll new file mode 100644 index 0000000..7eda1b4 --- /dev/null +++ b/test/MC/Mips/do_switch.ll @@ -0,0 +1,39 @@ +; This test case will cause an internal EK_GPRel64BlockAddress to be +; produced. This was not handled for direct object and an assertion +; to occur. This is a variation on test case test/CodeGen/Mips/do_switch.ll + +; RUN: llc < %s -filetype=obj -march=mips -relocation-model=static + +; RUN: llc < %s -filetype=obj -march=mips -relocation-model=pic + +; RUN: llc < %s -filetype=obj -march=mips64 -relocation-model=pic -mcpu=mips64 -mattr=n64 + +define i32 @main() nounwind readnone { +entry: + %x = alloca i32, align 4 ; <i32*> [#uses=2] + store volatile i32 2, i32* %x, align 4 + %0 = load volatile i32* %x, align 4 ; <i32> [#uses=1] + + switch i32 %0, label %bb4 [ + i32 0, label %bb5 + i32 1, label %bb1 + i32 2, label %bb2 + i32 3, label %bb3 + ] + +bb1: ; preds = %entry + ret i32 2 + +bb2: ; preds = %entry + ret i32 0 + +bb3: ; preds = %entry + ret i32 3 + +bb4: ; preds = %entry + ret i32 4 + +bb5: ; preds = %entry + ret i32 1 +} + diff --git a/test/MC/Mips/elf-N64.ll b/test/MC/Mips/elf-N64.ll index 23ec53a..ae6de78 100644 --- a/test/MC/Mips/elf-N64.ll +++ b/test/MC/Mips/elf-N64.ll @@ -1,4 +1,4 @@ -; RUN: llc -filetype=obj -march=mips64el -mcpu=mips64 %s -o - | elf-dump --dump-section-data | FileCheck %s +; RUN: llc -filetype=obj -march=mips64el -mcpu=mips64 -disable-mips-delay-filler %s -o - | elf-dump --dump-section-data | FileCheck %s ; Check for N64 relocation production. 
; diff --git a/test/MC/Mips/higher_highest.ll b/test/MC/Mips/higher_highest.ll index 81a89e3..0c66522 100644 --- a/test/MC/Mips/higher_highest.ll +++ b/test/MC/Mips/higher_highest.ll @@ -1,5 +1,8 @@ -; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64 -force-mips-long-branch -filetype=obj < %s -o - | elf-dump --dump-section-data | FileCheck %s - +; DISABLE: llc -march=mips64el -mcpu=mips64 -mattr=n64 -force-mips-long-branch -filetype=obj < %s -o - | elf-dump --dump-section-data | FileCheck %s +; RUN: false +; XFAIL: * +; Disabled because currently we don't have a way to generate these relocations. +; ; Check that the R_MIPS_HIGHER and R_MIPS_HIGHEST relocations were created. ; CHECK: ('r_type', 0x1d) diff --git a/test/MC/Mips/mips-alu-instructions.s b/test/MC/Mips/mips-alu-instructions.s new file mode 100644 index 0000000..2997782 --- /dev/null +++ b/test/MC/Mips/mips-alu-instructions.s @@ -0,0 +1,100 @@ +# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 | FileCheck %s +# Check that the assembler can handle the documented syntax +# for arithmetic and logical instructions. 
+# CHECK: .section __TEXT,__text,regular,pure_instructions +#------------------------------------------------------------------------------ +# Logical instructions +#------------------------------------------------------------------------------ +# CHECK: and $9, $6, $7 # encoding: [0x24,0x48,0xc7,0x00] +# CHECK: andi $9, $6, 17767 # encoding: [0x67,0x45,0xc9,0x30] +# CHECK: andi $9, $6, 17767 # encoding: [0x67,0x45,0xc9,0x30] +# CHECK: clo $6, $7 # encoding: [0x21,0x30,0xe6,0x70] +# CHECK: clz $6, $7 # encoding: [0x20,0x30,0xe6,0x70] +# CHECK: ins $19, $9, 6, 7 # encoding: [0x84,0x61,0x33,0x7d] +# CHECK: nor $9, $6, $7 # encoding: [0x27,0x48,0xc7,0x00] +# CHECK: or $3, $3, $5 # encoding: [0x25,0x18,0x65,0x00] +# CHECK: ori $9, $6, 17767 # encoding: [0x67,0x45,0xc9,0x34] +# CHECK: rotr $9, $6, 7 # encoding: [0xc2,0x49,0x26,0x00] +# CHECK: rotrv $9, $6, $7 # encoding: [0x46,0x48,0xe6,0x00] +# CHECK: sll $4, $3, 7 # encoding: [0xc0,0x21,0x03,0x00] +# CHECK: sllv $2, $3, $5 # encoding: [0x04,0x10,0xa3,0x00] +# CHECK: slt $3, $3, $5 # encoding: [0x2a,0x18,0x65,0x00] +# CHECK: slti $3, $3, 103 # encoding: [0x67,0x00,0x63,0x28] +# CHECK: slti $3, $3, 103 # encoding: [0x67,0x00,0x63,0x28] +# CHECK: sltiu $3, $3, 103 # encoding: [0x67,0x00,0x63,0x2c] +# CHECK: sltu $3, $3, $5 # encoding: [0x2b,0x18,0x65,0x00] +# CHECK: sra $4, $3, 7 # encoding: [0xc3,0x21,0x03,0x00] +# CHECK: srav $2, $3, $5 # encoding: [0x07,0x10,0xa3,0x00] +# CHECK: srl $4, $3, 7 # encoding: [0xc2,0x21,0x03,0x00] +# CHECK: srlv $2, $3, $5 # encoding: [0x06,0x10,0xa3,0x00] +# CHECK: xor $3, $3, $5 # encoding: [0x26,0x18,0x65,0x00] +# CHECK: xori $9, $6, 17767 # encoding: [0x67,0x45,0xc9,0x38] +# CHECK: xori $9, $6, 17767 # encoding: [0x67,0x45,0xc9,0x38] +# CHECK: wsbh $6, $7 # encoding: [0xa0,0x30,0x07,0x7c] +# CHECK: nor $7, $8, $zero # encoding: [0x27,0x38,0x00,0x01] + and $9, $6, $7 + and $9, $6, 17767 + andi $9, $6, 17767 + clo $6, $7 + clz $6, $7 + ins $19, $9, 6,7 + nor $9, $6, $7 + or $3, $3, $5 + 
ori $9, $6, 17767 + rotr $9, $6, 7 + rotrv $9, $6, $7 + sll $4, $3, 7 + sllv $2, $3, $5 + slt $3, $3, $5 + slt $3, $3, 103 + slti $3, $3, 103 + sltiu $3, $3, 103 + sltu $3, $3, $5 + sra $4, $3, 7 + srav $2, $3, $5 + srl $4, $3, 7 + srlv $2, $3, $5 + xor $3, $3, $5 + xor $9, $6, 17767 + xori $9, $6, 17767 + wsbh $6, $7 + not $7 ,$8 + +#------------------------------------------------------------------------------ +# Arithmetic instructions +#------------------------------------------------------------------------------ + +# CHECK: add $9, $6, $7 # encoding: [0x20,0x48,0xc7,0x00] +# CHECK: addi $9, $6, 17767 # encoding: [0x67,0x45,0xc9,0x20] +# CHECK: addiu $9, $6, -15001 # encoding: [0x67,0xc5,0xc9,0x24] +# CHECK: addi $9, $6, 17767 # encoding: [0x67,0x45,0xc9,0x20] +# CHECK: addiu $9, $6, -15001 # encoding: [0x67,0xc5,0xc9,0x24] +# CHECK: addu $9, $6, $7 # encoding: [0x21,0x48,0xc7,0x00] +# CHECK: madd $6, $7 # encoding: [0x00,0x00,0xc7,0x70] +# CHECK: maddu $6, $7 # encoding: [0x01,0x00,0xc7,0x70] +# CHECK: msub $6, $7 # encoding: [0x04,0x00,0xc7,0x70] +# CHECK: msubu $6, $7 # encoding: [0x05,0x00,0xc7,0x70] +# CHECK: mult $3, $5 # encoding: [0x18,0x00,0x65,0x00] +# CHECK: multu $3, $5 # encoding: [0x19,0x00,0x65,0x00] +# CHECK: sub $9, $6, $7 # encoding: [0x22,0x48,0xc7,0x00] +# CHECK: subu $4, $3, $5 # encoding: [0x23,0x20,0x65,0x00] +# CHECK: sub $6, $zero, $7 # encoding: [0x22,0x30,0x07,0x00] +# CHECK: subu $6, $zero, $7 # encoding: [0x23,0x30,0x07,0x00] +# CHECK: add $7, $8, $zero # encoding: [0x20,0x38,0x00,0x01] + add $9,$6,$7 + add $9,$6,17767 + addu $9,$6,-15001 + addi $9,$6,17767 + addiu $9,$6,-15001 + addu $9,$6,$7 + madd $6,$7 + maddu $6,$7 + msub $6,$7 + msubu $6,$7 + mult $3,$5 + multu $3,$5 + sub $9,$6,$7 + subu $4,$3,$5 + neg $6,$7 + negu $6,$7 + move $7,$8 diff --git a/test/MC/Mips/mips-coprocessor-encodings.s b/test/MC/Mips/mips-coprocessor-encodings.s new file mode 100644 index 0000000..bad9163 --- /dev/null +++ 
b/test/MC/Mips/mips-coprocessor-encodings.s @@ -0,0 +1,37 @@ +# RUN: llvm-mc %s -triple=mips64-unknown-freebsd -show-encoding | FileCheck --check-prefix=MIPS64 %s + +# MIPS64: dmtc0 $12, $16, 2 # encoding: [0x40,0xac,0x80,0x02] +# MIPS64: dmtc0 $12, $16, 0 # encoding: [0x40,0xac,0x80,0x00] +# MIPS64: mtc0 $12, $16, 2 # encoding: [0x40,0x8c,0x80,0x02] +# MIPS64: mtc0 $12, $16, 0 # encoding: [0x40,0x8c,0x80,0x00] +# MIPS64: dmfc0 $12, $16, 2 # encoding: [0x40,0x2c,0x80,0x02] +# MIPS64: dmfc0 $12, $16, 0 # encoding: [0x40,0x2c,0x80,0x00] +# MIPS64: mfc0 $12, $16, 2 # encoding: [0x40,0x0c,0x80,0x02] +# MIPS64: mfc0 $12, $16, 0 # encoding: [0x40,0x0c,0x80,0x00] + + dmtc0 $12, $16, 2 + dmtc0 $12, $16 + mtc0 $12, $16, 2 + mtc0 $12, $16 + dmfc0 $12, $16, 2 + dmfc0 $12, $16 + mfc0 $12, $16, 2 + mfc0 $12, $16 + +# MIPS64: dmtc2 $12, $16, 2 # encoding: [0x48,0xac,0x80,0x02] +# MIPS64: dmtc2 $12, $16, 0 # encoding: [0x48,0xac,0x80,0x00] +# MIPS64: mtc2 $12, $16, 2 # encoding: [0x48,0x8c,0x80,0x02] +# MIPS64: mtc2 $12, $16, 0 # encoding: [0x48,0x8c,0x80,0x00] +# MIPS64: dmfc2 $12, $16, 2 # encoding: [0x48,0x2c,0x80,0x02] +# MIPS64: dmfc2 $12, $16, 0 # encoding: [0x48,0x2c,0x80,0x00] +# MIPS64: mfc2 $12, $16, 2 # encoding: [0x48,0x0c,0x80,0x02] +# MIPS64: mfc2 $12, $16, 0 # encoding: [0x48,0x0c,0x80,0x00] + + dmtc2 $12, $16, 2 + dmtc2 $12, $16 + mtc2 $12, $16, 2 + mtc2 $12, $16 + dmfc2 $12, $16, 2 + dmfc2 $12, $16 + mfc2 $12, $16, 2 + mfc2 $12, $16 diff --git a/test/MC/Mips/mips-expansions.s b/test/MC/Mips/mips-expansions.s new file mode 100644 index 0000000..cfc15e8 --- /dev/null +++ b/test/MC/Mips/mips-expansions.s @@ -0,0 +1,27 @@ +# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 | FileCheck %s +# Check that the assembler can handle the documented syntax +# for macro instructions +# CHECK: .section __TEXT,__text,regular,pure_instructions +#------------------------------------------------------------------------------ +# Load immediate instructions 
+#------------------------------------------------------------------------------ +# CHECK: ori $5, $zero, 123 # encoding: [0x7b,0x00,0x05,0x34] +# CHECK: addiu $6, $zero, -2345 # encoding: [0xd7,0xf6,0x06,0x24] +# CHECK: lui $7, 1 # encoding: [0x01,0x00,0x07,0x3c] +# CHECK: ori $7, $7, 2 # encoding: [0x02,0x00,0xe7,0x34] +# CHECK: addiu $4, $zero, 20 # encoding: [0x14,0x00,0x04,0x24] +# CHECK: lui $7, 1 # encoding: [0x01,0x00,0x07,0x3c] +# CHECK: ori $7, $7, 2 # encoding: [0x02,0x00,0xe7,0x34] +# CHECK: addiu $4, $5, 20 # encoding: [0x14,0x00,0xa4,0x24] +# CHECK: lui $7, 1 # encoding: [0x01,0x00,0x07,0x3c] +# CHECK: ori $7, $7, 2 # encoding: [0x02,0x00,0xe7,0x34] +# CHECK: addu $7, $7, $8 # encoding: [0x21,0x38,0xe8,0x00] + + li $5,123 + li $6,-2345 + li $7,65538 + + la $a0, 20 + la $7,65538 + la $a0, 20($a1) + la $7,65538($8) diff --git a/test/MC/Mips/mips-fpu-instructions.s b/test/MC/Mips/mips-fpu-instructions.s new file mode 100644 index 0000000..a126c6f --- /dev/null +++ b/test/MC/Mips/mips-fpu-instructions.s @@ -0,0 +1,178 @@ +# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 | FileCheck %s +# Check that the assembler can handle the documented syntax +# for FPU instructions. 
+# CHECK: .section __TEXT,__text,regular,pure_instructions +#------------------------------------------------------------------------------ +# FP aritmetic instructions +#------------------------------------------------------------------------------ + +# CHECK: abs.d $f12, $f14 # encoding: [0x05,0x73,0x20,0x46] +# CHECK: abs.s $f6, $f7 # encoding: [0x85,0x39,0x00,0x46] +# CHECK: add.d $f8, $f12, $f14 # encoding: [0x00,0x62,0x2e,0x46] +# CHECK: add.s $f9, $f6, $f7 # encoding: [0x40,0x32,0x07,0x46] +# CHECK: floor.w.d $f12, $f14 # encoding: [0x0f,0x73,0x20,0x46] +# CHECK: floor.w.s $f6, $f7 # encoding: [0x8f,0x39,0x00,0x46] +# CHECK: ceil.w.d $f12, $f14 # encoding: [0x0e,0x73,0x20,0x46] +# CHECK: ceil.w.s $f6, $f7 # encoding: [0x8e,0x39,0x00,0x46] +# CHECK: mul.d $f8, $f12, $f14 # encoding: [0x02,0x62,0x2e,0x46] +# CHECK: mul.s $f9, $f6, $f7 # encoding: [0x42,0x32,0x07,0x46] +# CHECK: neg.d $f12, $f14 # encoding: [0x07,0x73,0x20,0x46] +# CHECK: neg.s $f6, $f7 # encoding: [0x87,0x39,0x00,0x46] +# CHECK: round.w.d $f12, $f14 # encoding: [0x0c,0x73,0x20,0x46] +# CHECK: round.w.s $f6, $f7 # encoding: [0x8c,0x39,0x00,0x46] +# CHECK: sqrt.d $f12, $f14 # encoding: [0x04,0x73,0x20,0x46] +# CHECK: sqrt.s $f6, $f7 # encoding: [0x84,0x39,0x00,0x46] +# CHECK: sub.d $f8, $f12, $f14 # encoding: [0x01,0x62,0x2e,0x46] +# CHECK: sub.s $f9, $f6, $f7 # encoding: [0x41,0x32,0x07,0x46] +# CHECK: trunc.w.d $f12, $f14 # encoding: [0x0d,0x73,0x20,0x46] +# CHECK: trunc.w.s $f6, $f7 # encoding: [0x8d,0x39,0x00,0x46] + + abs.d $f12,$f14 + abs.s $f6,$f7 + add.d $f8,$f12,$f14 + add.s $f9,$f6,$f7 + floor.w.d $f12,$f14 + floor.w.s $f6,$f7 + ceil.w.d $f12,$f14 + ceil.w.s $f6,$f7 + mul.d $f8,$f12,$f14 + mul.s $f9,$f6, $f7 + neg.d $f12,$f14 + neg.s $f6,$f7 + round.w.d $f12,$f14 + round.w.s $f6,$f7 + sqrt.d $f12,$f14 + sqrt.s $f6,$f7 + sub.d $f8,$f12,$f14 + sub.s $f9,$f6,$f7 + trunc.w.d $f12,$f14 + trunc.w.s $f6,$f7 + +#------------------------------------------------------------------------------ +# 
FP compare instructions +#------------------------------------------------------------------------------ + +# CHECK: c.eq.d $f12, $f14 # encoding: [0x32,0x60,0x2e,0x46] +# CHECK: c.eq.s $f6, $f7 # encoding: [0x32,0x30,0x07,0x46] +# CHECK: c.f.d $f12, $f14 # encoding: [0x30,0x60,0x2e,0x46] +# CHECK: c.f.s $f6, $f7 # encoding: [0x30,0x30,0x07,0x46] +# CHECK: c.le.d $f12, $f14 # encoding: [0x3e,0x60,0x2e,0x46] +# CHECK: c.le.s $f6, $f7 # encoding: [0x3e,0x30,0x07,0x46] +# CHECK: c.lt.d $f12, $f14 # encoding: [0x3c,0x60,0x2e,0x46] +# CHECK: c.lt.s $f6, $f7 # encoding: [0x3c,0x30,0x07,0x46] +# CHECK: c.nge.d $f12, $f14 # encoding: [0x3d,0x60,0x2e,0x46] +# CHECK: c.nge.s $f6, $f7 # encoding: [0x3d,0x30,0x07,0x46] +# CHECK: c.ngl.d $f12, $f14 # encoding: [0x3b,0x60,0x2e,0x46] +# CHECK: c.ngl.s $f6, $f7 # encoding: [0x3b,0x30,0x07,0x46] +# CHECK: c.ngle.d $f12, $f14 # encoding: [0x39,0x60,0x2e,0x46] +# CHECK: c.ngle.s $f6, $f7 # encoding: [0x39,0x30,0x07,0x46] +# CHECK: c.ngt.d $f12, $f14 # encoding: [0x3f,0x60,0x2e,0x46] +# CHECK: c.ngt.s $f6, $f7 # encoding: [0x3f,0x30,0x07,0x46] +# CHECK: c.ole.d $f12, $f14 # encoding: [0x36,0x60,0x2e,0x46] +# CHECK: c.ole.s $f6, $f7 # encoding: [0x36,0x30,0x07,0x46] +# CHECK: c.olt.d $f12, $f14 # encoding: [0x34,0x60,0x2e,0x46] +# CHECK: c.olt.s $f6, $f7 # encoding: [0x34,0x30,0x07,0x46] +# CHECK: c.seq.d $f12, $f14 # encoding: [0x3a,0x60,0x2e,0x46] +# CHECK: c.seq.s $f6, $f7 # encoding: [0x3a,0x30,0x07,0x46] +# CHECK: c.sf.d $f12, $f14 # encoding: [0x38,0x60,0x2e,0x46] +# CHECK: c.sf.s $f6, $f7 # encoding: [0x38,0x30,0x07,0x46] +# CHECK: c.ueq.d $f12, $f14 # encoding: [0x33,0x60,0x2e,0x46] +# CHECK: c.ueq.s $f28, $f18 # encoding: [0x33,0xe0,0x12,0x46] +# CHECK: c.ule.d $f12, $f14 # encoding: [0x37,0x60,0x2e,0x46] +# CHECK: c.ule.s $f6, $f7 # encoding: [0x37,0x30,0x07,0x46] +# CHECK: c.ult.d $f12, $f14 # encoding: [0x35,0x60,0x2e,0x46] +# CHECK: c.ult.s $f6, $f7 # encoding: [0x35,0x30,0x07,0x46] +# CHECK: c.un.d $f12, $f14 # encoding: 
[0x31,0x60,0x2e,0x46] +# CHECK: c.un.s $f6, $f7 # encoding: [0x31,0x30,0x07,0x46] + + c.eq.d $f12,$f14 + c.eq.s $f6,$f7 + c.f.d $f12,$f14 + c.f.s $f6,$f7 + c.le.d $f12,$f14 + c.le.s $f6,$f7 + c.lt.d $f12,$f14 + c.lt.s $f6,$f7 + c.nge.d $f12,$f14 + c.nge.s $f6,$f7 + c.ngl.d $f12,$f14 + c.ngl.s $f6,$f7 + c.ngle.d $f12,$f14 + c.ngle.s $f6,$f7 + c.ngt.d $f12,$f14 + c.ngt.s $f6,$f7 + c.ole.d $f12,$f14 + c.ole.s $f6,$f7 + c.olt.d $f12,$f14 + c.olt.s $f6,$f7 + c.seq.d $f12,$f14 + c.seq.s $f6,$f7 + c.sf.d $f12,$f14 + c.sf.s $f6,$f7 + c.ueq.d $f12,$f14 + c.ueq.s $f28,$f18 + c.ule.d $f12,$f14 + c.ule.s $f6,$f7 + c.ult.d $f12,$f14 + c.ult.s $f6,$f7 + c.un.d $f12,$f14 + c.un.s $f6,$f7 + +#------------------------------------------------------------------------------ +# FP convert instructions +#------------------------------------------------------------------------------ +# CHECK: cvt.d.s $f6, $f7 # encoding: [0xa1,0x39,0x00,0x46] +# CHECK: cvt.d.w $f12, $f14 # encoding: [0x21,0x73,0x80,0x46] +# CHECK: cvt.s.d $f12, $f14 # encoding: [0x20,0x73,0x20,0x46] +# CHECK: cvt.s.w $f6, $f7 # encoding: [0xa0,0x39,0x80,0x46] +# CHECK: cvt.w.d $f12, $f14 # encoding: [0x24,0x73,0x20,0x46] +# CHECK: cvt.w.s $f6, $f7 # encoding: [0xa4,0x39,0x00,0x46] + + cvt.d.s $f6,$f7 + cvt.d.w $f12,$f14 + cvt.s.d $f12,$f14 + cvt.s.w $f6,$f7 + cvt.w.d $f12,$f14 + cvt.w.s $f6,$f7 + +#------------------------------------------------------------------------------ +# FP move instructions +#------------------------------------------------------------------------------ + +# CHECK: cfc1 $6, $fcc0 # encoding: [0x00,0x00,0x46,0x44] +# CHECK: mfc1 $6, $f7 # encoding: [0x00,0x38,0x06,0x44] +# CHECK: mfhi $5 # encoding: [0x10,0x28,0x00,0x00] +# CHECK: mflo $5 # encoding: [0x12,0x28,0x00,0x00] +# CHECK: mov.d $f6, $f8 # encoding: [0x86,0x41,0x20,0x46] +# CHECK: mov.s $f6, $f7 # encoding: [0x86,0x39,0x00,0x46] +# CHECK: mtc1 $6, $f7 # encoding: [0x00,0x38,0x86,0x44] +# CHECK: mthi $7 # encoding: [0x11,0x00,0xe0,0x00] 
+# CHECK: mtlo $7 # encoding: [0x13,0x00,0xe0,0x00] +# CHECK: swc1 $f9, 9158($7) # encoding: [0xc6,0x23,0xe9,0xe4] +# CHECK: mfc0 $6, $7, 0 # encoding: [0x00,0x38,0x06,0x40] +# CHECK: mtc0 $9, $8, 0 # encoding: [0x00,0x40,0x89,0x40] +# CHECK: mfc2 $5, $7, 0 # encoding: [0x00,0x38,0x05,0x48] +# CHECK: mtc2 $9, $4, 0 # encoding: [0x00,0x20,0x89,0x48] +# CHECK: mfc0 $6, $7, 2 # encoding: [0x02,0x38,0x06,0x40] +# CHECK: mtc0 $9, $8, 3 # encoding: [0x03,0x40,0x89,0x40] +# CHECK: mfc2 $5, $7, 4 # encoding: [0x04,0x38,0x05,0x48] +# CHECK: mtc2 $9, $4, 5 # encoding: [0x05,0x20,0x89,0x48] + + cfc1 $a2,$0 + mfc1 $a2,$f7 + mfhi $a1 + mflo $a1 + mov.d $f6,$f8 + mov.s $f6,$f7 + mtc1 $a2,$f7 + mthi $a3 + mtlo $a3 + swc1 $f9,9158($a3) + mfc0 $6, $7 + mtc0 $9, $8 + mfc2 $5, $7 + mtc2 $9, $4 + mfc0 $6, $7, 2 + mtc0 $9, $8, 3 + mfc2 $5, $7, 4 + mtc2 $9, $4, 5 diff --git a/test/MC/Mips/mips-jump-instructions.s b/test/MC/Mips/mips-jump-instructions.s new file mode 100644 index 0000000..998be41 --- /dev/null +++ b/test/MC/Mips/mips-jump-instructions.s @@ -0,0 +1,72 @@ +# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 | FileCheck %s +# Check that the assembler can handle the documented syntax +# for jumps and branches. 
+# CHECK: .section __TEXT,__text,regular,pure_instructions +#------------------------------------------------------------------------------ +# Branch instructions +#------------------------------------------------------------------------------ +# CHECK: b 1332 # encoding: [0x34,0x05,0x00,0x10] +# CHECK: nop # encoding: [0x00,0x00,0x00,0x00] +# CHECK: bc1f 1332 # encoding: [0x34,0x05,0x00,0x45] +# CHECK: nop # encoding: [0x00,0x00,0x00,0x00] +# CHECK: bc1t 1332 # encoding: [0x34,0x05,0x01,0x45] +# CHECK: nop # encoding: [0x00,0x00,0x00,0x00] +# CHECK: beq $9, $6, 1332 # encoding: [0x34,0x05,0x26,0x11] +# CHECK: nop # encoding: [0x00,0x00,0x00,0x00] +# CHECK: bgez $6, 1332 # encoding: [0x34,0x05,0xc1,0x04] +# CHECK: nop # encoding: [0x00,0x00,0x00,0x00] +# CHECK: bgezal $6, 1332 # encoding: [0x34,0x05,0xd1,0x04] +# CHECK: nop # encoding: [0x00,0x00,0x00,0x00] +# CHECK: bgtz $6, 1332 # encoding: [0x34,0x05,0xc0,0x1c] +# CHECK: nop # encoding: [0x00,0x00,0x00,0x00] +# CHECK: blez $6, 1332 # encoding: [0x34,0x05,0xc0,0x18] +# CHECK: nop # encoding: [0x00,0x00,0x00,0x00] +# CHECK: bne $9, $6, 1332 # encoding: [0x34,0x05,0x26,0x15] +# CHECK: nop # encoding: [0x00,0x00,0x00,0x00] +# CHECK: bal 1332 # encoding: [0x34,0x05,0x00,0x04] +# CHECK: nop # encoding: [0x00,0x00,0x00,0x00] + b 1332 + nop + bc1f 1332 + nop + bc1t 1332 + nop + beq $9,$6,1332 + nop + bgez $6,1332 + nop + bgezal $6,1332 + nop + bgtz $6,1332 + nop + blez $6,1332 + nop + bne $9,$6,1332 + nop + bal 1332 + nop + +end_of_code: +#------------------------------------------------------------------------------ +# Jump instructions +#------------------------------------------------------------------------------ +# CHECK: j 1328 # encoding: [0x30,0x05,0x00,0x08] +# CHECK: nop # encoding: [0x00,0x00,0x00,0x00] +# CHECK: jal 1328 # encoding: [0x30,0x05,0x00,0x0c] +# CHECK: nop # encoding: [0x00,0x00,0x00,0x00] +# CHECK: jalr $6 # encoding: [0x09,0xf8,0xc0,0x00] +# CHECK: nop # encoding: [0x00,0x00,0x00,0x00] +# 
CHECK: jr $7 # encoding: [0x08,0x00,0xe0,0x00] +# CHECK: nop # encoding: [0x00,0x00,0x00,0x00] +# CHECK: jr $7 # encoding: [0x08,0x00,0xe0,0x00] + + + j 1328 + nop + jal 1328 + nop + jalr $6 + nop + jr $7 + nop + j $7 diff --git a/test/MC/Mips/mips-memory-instructions.s b/test/MC/Mips/mips-memory-instructions.s new file mode 100644 index 0000000..b5f1267 --- /dev/null +++ b/test/MC/Mips/mips-memory-instructions.s @@ -0,0 +1,45 @@ +# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 | FileCheck %s +# Check that the assembler can handle the documented syntax +# for loads and stores. +# CHECK: .section __TEXT,__text,regular,pure_instructions +#------------------------------------------------------------------------------ +# Memory store instructions +#------------------------------------------------------------------------------ +# CHECK: sb $4, 16($5) # encoding: [0x10,0x00,0xa4,0xa0] +# CHECK: sc $4, 16($5) # encoding: [0x10,0x00,0xa4,0xe0] +# CHECK: sh $4, 16($5) # encoding: [0x10,0x00,0xa4,0xa4] +# CHECK: sw $4, 16($5) # encoding: [0x10,0x00,0xa4,0xac] +# CHECK: sw $7, 0($5) # encoding: [0x00,0x00,0xa7,0xac] +# CHECK: swc1 $f2, 16($5) # encoding: [0x10,0x00,0xa2,0xe4] +# CHECK: swl $4, 16($5) # encoding: [0x10,0x00,0xa4,0xa8] + sb $4, 16($5) + sc $4, 16($5) + sh $4, 16($5) + sw $4, 16($5) + sw $7, ($5) + swc1 $f2, 16($5) + swl $4, 16($5) + +#------------------------------------------------------------------------------ +# Memory load instructions +#------------------------------------------------------------------------------ + +# CHECK: lb $4, 4($5) # encoding: [0x04,0x00,0xa4,0x80] +# CHECK: lw $4, 4($5) # encoding: [0x04,0x00,0xa4,0x8c] +# CHECK: lbu $4, 4($5) # encoding: [0x04,0x00,0xa4,0x90] +# CHECK: lh $4, 4($5) # encoding: [0x04,0x00,0xa4,0x84] +# CHECK: lhu $4, 4($5) # encoding: [0x04,0x00,0xa4,0x94] +# CHECK: ll $4, 4($5) # encoding: [0x04,0x00,0xa4,0xc0] +# CHECK: lw $4, 4($5) # encoding: [0x04,0x00,0xa4,0x8c] +# CHECK: lw $7, 
0($7) # encoding: [0x00,0x00,0xe7,0x8c] +# CHECK: lw $2, 16($sp) # encoding: [0x10,0x00,0xa2,0x8f] + + lb $4, 4($5) + lw $4, 4($5) + lbu $4, 4($5) + lh $4, 4($5) + lhu $4, 4($5) + ll $4, 4($5) + lw $4, 4($5) + lw $7, ($7) + lw $2, 16($sp) diff --git a/test/MC/Mips/mips-register-names.s b/test/MC/Mips/mips-register-names.s new file mode 100644 index 0000000..26187ce --- /dev/null +++ b/test/MC/Mips/mips-register-names.s @@ -0,0 +1,71 @@ +# RUN: llvm-mc %s -triple=mips-unknown-freebsd -show-encoding | FileCheck %s + +# Check that the register names are mapped to their correct numbers for o32 +# Second byte of addiu with $zero at rt contains the number of the source +# register. + +# CHECK: encoding: [0x24,0x00,0x00,0x00] +# CHECK: encoding: [0x24,0x01,0x00,0x00] +# CHECK: encoding: [0x24,0x02,0x00,0x00] +# CHECK: encoding: [0x24,0x03,0x00,0x00] +# CHECK: encoding: [0x24,0x04,0x00,0x00] +# CHECK: encoding: [0x24,0x05,0x00,0x00] +# CHECK: encoding: [0x24,0x06,0x00,0x00] +# CHECK: encoding: [0x24,0x07,0x00,0x00] +# CHECK: encoding: [0x24,0x08,0x00,0x00] +# CHECK: encoding: [0x24,0x09,0x00,0x00] +# CHECK: encoding: [0x24,0x0a,0x00,0x00] +# CHECK: encoding: [0x24,0x0b,0x00,0x00] +# CHECK: encoding: [0x24,0x0c,0x00,0x00] +# CHECK: encoding: [0x24,0x0d,0x00,0x00] +# CHECK: encoding: [0x24,0x0e,0x00,0x00] +# CHECK: encoding: [0x24,0x0f,0x00,0x00] +# CHECK: encoding: [0x24,0x10,0x00,0x00] +# CHECK: encoding: [0x24,0x11,0x00,0x00] +# CHECK: encoding: [0x24,0x12,0x00,0x00] +# CHECK: encoding: [0x24,0x13,0x00,0x00] +# CHECK: encoding: [0x24,0x14,0x00,0x00] +# CHECK: encoding: [0x24,0x15,0x00,0x00] +# CHECK: encoding: [0x24,0x16,0x00,0x00] +# CHECK: encoding: [0x24,0x17,0x00,0x00] +# CHECK: encoding: [0x24,0x18,0x00,0x00] +# CHECK: encoding: [0x24,0x19,0x00,0x00] +# CHECK: encoding: [0x24,0x1a,0x00,0x00] +# CHECK: encoding: [0x24,0x1b,0x00,0x00] +# CHECK: encoding: [0x24,0x1c,0x00,0x00] +# CHECK: encoding: [0x24,0x1d,0x00,0x00] +# CHECK: encoding: [0x24,0x1e,0x00,0x00] +# CHECK: 
encoding: [0x24,0x1f,0x00,0x00] +addiu $zero, $zero, 0 +addiu $at, $zero, 0 +addiu $v0, $zero, 0 +addiu $v1, $zero, 0 +addiu $a0, $zero, 0 +addiu $a1, $zero, 0 +addiu $a2, $zero, 0 +addiu $a3, $zero, 0 +addiu $t0, $zero, 0 +addiu $t1, $zero, 0 +addiu $t2, $zero, 0 +addiu $t3, $zero, 0 +addiu $t4, $zero, 0 +addiu $t5, $zero, 0 +addiu $t6, $zero, 0 +addiu $t7, $zero, 0 +addiu $s0, $zero, 0 +addiu $s1, $zero, 0 +addiu $s2, $zero, 0 +addiu $s3, $zero, 0 +addiu $s4, $zero, 0 +addiu $s5, $zero, 0 +addiu $s6, $zero, 0 +addiu $s7, $zero, 0 +addiu $t8, $zero, 0 +addiu $t9, $zero, 0 +addiu $k0, $zero, 0 +addiu $k1, $zero, 0 +addiu $gp, $zero, 0 +addiu $sp, $zero, 0 +addiu $fp, $zero, 0 +addiu $sp, $zero, 0 +addiu $ra, $zero, 0 diff --git a/test/MC/Mips/mips-relocations.s b/test/MC/Mips/mips-relocations.s new file mode 100644 index 0000000..ff71c75 --- /dev/null +++ b/test/MC/Mips/mips-relocations.s @@ -0,0 +1,41 @@ +# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 | FileCheck %s +# Check that the assembler can handle the documented syntax +# for relocations. 
+# CHECK: .section __TEXT,__text,regular,pure_instructions +# CHECK: lui $2, %hi(_gp_disp) # encoding: [A,A,0x02,0x3c] +# CHECK: # fixup A - offset: 0, value: _gp_disp@ABS_HI, kind: fixup_Mips_HI16 +# CHECK: addiu $2, $2, %lo(_gp_disp) # encoding: [A,A,0x42,0x24] +# CHECK: # fixup A - offset: 0, value: _gp_disp@ABS_LO, kind: fixup_Mips_LO16 +# CHECK: lw $25, %call16(strchr)($gp) # encoding: [A,A,0x99,0x8f] +# CHECK: # fixup A - offset: 0, value: strchr@GOT_CALL, kind: fixup_Mips_CALL16 +# CHECK: lw $3, %got(loop_1)($2) # encoding: [A,A,0x43,0x8c] +# CHECK: # fixup A - offset: 0, value: loop_1@GOT, kind: fixup_Mips_GOT_Local +# CHECK: lui $2, %dtprel_hi(_gp_disp) # encoding: [A,A,0x02,0x3c] +# CHECK: # fixup A - offset: 0, value: _gp_disp@DTPREL_HI, kind: fixup_Mips_DTPREL_HI +# CHECK: addiu $2, $2, %dtprel_hi(_gp_disp) # encoding: [A,A,0x42,0x24] +# CHECK: # fixup A - offset: 0, value: _gp_disp@DTPREL_HI, kind: fixup_Mips_DTPREL_HI +# CHECK: lw $3, %got(loop_1)($2) # encoding: [A,A,0x43,0x8c] +# CHECK: # fixup A - offset: 0, value: loop_1@GOT, kind: fixup_Mips_GOT_Local +# CHECK: lw $4, %got_disp(loop_2)($3) # encoding: [A,A,0x64,0x8c] +# CHECK: # fixup A - offset: 0, value: loop_2@GOT_DISP, kind: fixup_Mips_GOT_DISP +# CHECK: lw $5, %got_page(loop_3)($4) # encoding: [A,A,0x85,0x8c] +# CHECK: # fixup A - offset: 0, value: loop_3@GOT_PAGE, kind: fixup_Mips_GOT_PAGE +# CHECK: lw $6, %got_ofst(loop_4)($5) # encoding: [A,A,0xa6,0x8c] +# CHECK: # fixup A - offset: 0, value: loop_4@GOT_OFST, kind: fixup_Mips_GOT_OFST +# CHECK: lui $2, %tprel_hi(_gp_disp) # encoding: [A,A,0x02,0x3c] +# CHECK: # fixup A - offset: 0, value: _gp_disp@TPREL_HI, kind: fixup_Mips_TPREL_HI +# CHECK: addiu $2, $2, %tprel_lo(_gp_disp) # encoding: [A,A,0x42,0x24] +# CHECK: # fixup A - offset: 0, value: _gp_disp@TPREL_LO, kind: fixup_Mips_TPREL_LO + + lui $2, %hi(_gp_disp) + addiu $2, $2, %lo(_gp_disp) + lw $25, %call16(strchr)($gp) + lw $3, %got(loop_1)($2) + lui $2, %dtprel_hi(_gp_disp) + addiu 
$2, $2, %dtprel_hi(_gp_disp) + lw $3, %got(loop_1)($2) + lw $4, %got_disp(loop_2)($3) + lw $5, %got_page(loop_3)($4) + lw $6, %got_ofst(loop_4)($5) + lui $2, %tprel_hi(_gp_disp) + addiu $2, $2, %tprel_lo(_gp_disp) diff --git a/test/MC/Mips/mips64-register-names.s b/test/MC/Mips/mips64-register-names.s new file mode 100644 index 0000000..16783ee --- /dev/null +++ b/test/MC/Mips/mips64-register-names.s @@ -0,0 +1,70 @@ +# RUN: llvm-mc %s -triple=mips64-unknown-freebsd -show-encoding | FileCheck %s + +# Check that the register names are mapped to their correct numbers for n64 +# Second byte of addiu with $zero at rt contains the number of the source +# register. + +# CHECK: encoding: [0x64,0x00,0x00,0x00] +# CHECK: encoding: [0x64,0x01,0x00,0x00] +# CHECK: encoding: [0x64,0x02,0x00,0x00] +# CHECK: encoding: [0x64,0x03,0x00,0x00] +# CHECK: encoding: [0x64,0x04,0x00,0x00] +# CHECK: encoding: [0x64,0x05,0x00,0x00] +# CHECK: encoding: [0x64,0x06,0x00,0x00] +# CHECK: encoding: [0x64,0x07,0x00,0x00] +# CHECK: encoding: [0x64,0x08,0x00,0x00] +# CHECK: encoding: [0x64,0x09,0x00,0x00] +# CHECK: encoding: [0x64,0x0a,0x00,0x00] +# CHECK: encoding: [0x64,0x0b,0x00,0x00] +# CHECK: encoding: [0x64,0x0c,0x00,0x00] +# CHECK: encoding: [0x64,0x0d,0x00,0x00] +# CHECK: encoding: [0x64,0x0e,0x00,0x00] +# CHECK: encoding: [0x64,0x0f,0x00,0x00] +# CHECK: encoding: [0x64,0x10,0x00,0x00] +# CHECK: encoding: [0x64,0x11,0x00,0x00] +# CHECK: encoding: [0x64,0x12,0x00,0x00] +# CHECK: encoding: [0x64,0x13,0x00,0x00] +# CHECK: encoding: [0x64,0x14,0x00,0x00] +# CHECK: encoding: [0x64,0x15,0x00,0x00] +# CHECK: encoding: [0x64,0x16,0x00,0x00] +# CHECK: encoding: [0x64,0x17,0x00,0x00] +# CHECK: encoding: [0x64,0x18,0x00,0x00] +# CHECK: encoding: [0x64,0x19,0x00,0x00] +# CHECK: encoding: [0x64,0x1a,0x00,0x00] +# CHECK: encoding: [0x64,0x1b,0x00,0x00] +# CHECK: encoding: [0x64,0x1c,0x00,0x00] +# CHECK: encoding: [0x64,0x1d,0x00,0x00] +# CHECK: encoding: [0x64,0x1e,0x00,0x00] +# CHECK: encoding: 
[0x64,0x1f,0x00,0x00] +daddiu $zero, $zero, 0 +daddiu $at, $zero, 0 +daddiu $v0, $zero, 0 +daddiu $v1, $zero, 0 +daddiu $a0, $zero, 0 +daddiu $a1, $zero, 0 +daddiu $a2, $zero, 0 +daddiu $a3, $zero, 0 +daddiu $a4, $zero, 0 +daddiu $a5, $zero, 0 +daddiu $a6, $zero, 0 +daddiu $a7, $zero, 0 +daddiu $t4, $zero, 0 +daddiu $t5, $zero, 0 +daddiu $t6, $zero, 0 +daddiu $t7, $zero, 0 +daddiu $s0, $zero, 0 +daddiu $s1, $zero, 0 +daddiu $s2, $zero, 0 +daddiu $s3, $zero, 0 +daddiu $s4, $zero, 0 +daddiu $s5, $zero, 0 +daddiu $s6, $zero, 0 +daddiu $s7, $zero, 0 +daddiu $t8, $zero, 0 +daddiu $t9, $zero, 0 +daddiu $kt0, $zero, 0 +daddiu $kt1, $zero, 0 +daddiu $gp, $zero, 0 +daddiu $sp, $zero, 0 +daddiu $s8, $zero, 0 +daddiu $ra, $zero, 0 diff --git a/test/MC/Mips/mips64extins.ll b/test/MC/Mips/mips64extins.ll new file mode 100644 index 0000000..ebe8f86 --- /dev/null +++ b/test/MC/Mips/mips64extins.ll @@ -0,0 +1,57 @@ +; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 -mattr=n64 %s -o - \ +; RUN: | llvm-objdump -disassemble -triple mips64el -mattr +mips64r2 - \ +; RUN: | FileCheck %s + +define i64 @dext(i64 %i) nounwind readnone { +entry: +; CHECK: dext ${{[0-9]+}}, ${{[0-9]+}}, 5, 10 + %shr = lshr i64 %i, 5 + %and = and i64 %shr, 1023 + ret i64 %and +} + +define i64 @dextu(i64 %i) nounwind readnone { +entry: +; CHECK: dextu ${{[0-9]+}}, ${{[0-9]+}}, 2, 6 + %shr = lshr i64 %i, 34 + %and = and i64 %shr, 63 + ret i64 %and +} + +define i64 @dextm(i64 %i) nounwind readnone { +entry: +; CHECK: dextm ${{[0-9]+}}, ${{[0-9]+}}, 5, 2 + %shr = lshr i64 %i, 5 + %and = and i64 %shr, 17179869183 + ret i64 %and +} + +define i64 @dins(i64 %i, i64 %j) nounwind readnone { +entry: +; CHECK: dins ${{[0-9]+}}, ${{[0-9]+}}, 8, 10 + %shl2 = shl i64 %j, 8 + %and = and i64 %shl2, 261888 + %and3 = and i64 %i, -261889 + %or = or i64 %and3, %and + ret i64 %or +} + +define i64 @dinsm(i64 %i, i64 %j) nounwind readnone { +entry: +; CHECK: dinsm ${{[0-9]+}}, ${{[0-9]+}}, 10, 1 + %shl4 = shl i64 %j, 10 + %and 
= and i64 %shl4, 8796093021184 + %and5 = and i64 %i, -8796093021185 + %or = or i64 %and5, %and + ret i64 %or +} + +define i64 @dinsu(i64 %i, i64 %j) nounwind readnone { +entry: +; CHECK: dinsu ${{[0-9]+}}, ${{[0-9]+}}, 8, 13 + %shl4 = shl i64 %j, 40 + %and = and i64 %shl4, 9006099743113216 + %and5 = and i64 %i, -9006099743113217 + %or = or i64 %and5, %and + ret i64 %or +} diff --git a/test/MC/Mips/mips64shift.ll b/test/MC/Mips/mips64shift.ll index 7817b96..99cac7b 100644 --- a/test/MC/Mips/mips64shift.ll +++ b/test/MC/Mips/mips64shift.ll @@ -1,5 +1,8 @@ -; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 %s -o - | llvm-objdump -disassemble -triple mips64el - | FileCheck %s +; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 -disable-mips-delay-filler %s -o - \ +; RUN: | llvm-objdump -disassemble -triple mips64el - | FileCheck %s +; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 %s -o - \ +; RUN: | llvm-objdump -disassemble -triple mips64el - | FileCheck %s define i64 @f3(i64 %a0) nounwind readnone { entry: diff --git a/test/MC/Mips/mips_directives.s b/test/MC/Mips/mips_directives.s new file mode 100644 index 0000000..e2f75a8 --- /dev/null +++ b/test/MC/Mips/mips_directives.s @@ -0,0 +1,16 @@ +# RUN: llvm-mc -triple mips-unknown-unknown %s +#this test produces no output so there isS no FileCheck call +$BB0_2: + .ent directives_test + .frame $sp,0,$ra + .mask 0x00000000,0 + .fmask 0x00000000,0 + .set noreorder + .set nomacro + .set noat +$JTI0_0: + .gpword ($BB0_2) + .set at=$12 + .set macro + .set reorder + .end directives_test diff --git a/test/MC/Mips/multi-64bit-func.ll b/test/MC/Mips/multi-64bit-func.ll index 6e0d784..83577aa 100644 --- a/test/MC/Mips/multi-64bit-func.ll +++ b/test/MC/Mips/multi-64bit-func.ll @@ -1,8 +1,8 @@ ; There is no real check here. If the test doesn't ; assert it passes. 
-; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 < %s +; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 -disable-mips-delay-filler < %s ; Run it again without extra nop in delay slot -; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 -enable-mips-delay-filler < %s +; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 < %s define i32 @bosco1(i32 %x) nounwind readnone { entry: diff --git a/test/MC/Mips/sext_64_32.ll b/test/MC/Mips/sext_64_32.ll index e5c57b8..9e0cfa0 100644 --- a/test/MC/Mips/sext_64_32.ll +++ b/test/MC/Mips/sext_64_32.ll @@ -2,7 +2,7 @@ ; Sign extend from 32 to 64 was creating nonsense opcodes -; CHECK: sll ${{[0-9]+}}, ${{[0-9]+}}, 0 +; CHECK: sll ${{[a-z0-9]+}}, ${{[a-z0-9]+}}, 0 define i64 @foo(i32 %ival) nounwind readnone { entry: @@ -10,7 +10,7 @@ entry: ret i64 %conv } -; CHECK: dsll32 ${{[0-9]+}}, ${{[0-9]+}}, 0 +; CHECK: dsll32 ${{[a-z0-9]+}}, ${{[a-z0-9]+}}, 0 define i64 @foo_2(i32 %ival_2) nounwind readnone { entry: diff --git a/test/MC/PowerPC/lit.local.cfg b/test/MC/PowerPC/lit.local.cfg new file mode 100644 index 0000000..88488cd --- /dev/null +++ b/test/MC/PowerPC/lit.local.cfg @@ -0,0 +1,5 @@ +config.suffixes = ['.ll', '.c', '.cpp', '.s'] + +targets = set(config.root.targets_to_build.split()) +if not 'PowerPC' in targets: + config.unsupported = True diff --git a/test/MC/PowerPC/ppc64-initial-cfa.ll b/test/MC/PowerPC/ppc64-initial-cfa.ll new file mode 100644 index 0000000..3936cf2 --- /dev/null +++ b/test/MC/PowerPC/ppc64-initial-cfa.ll @@ -0,0 +1,41 @@ +;; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -filetype=obj %s -o - | \ +;; RUN: elf-dump --dump-section-data | FileCheck %s + +;; FIXME: this file should be in .s form, change when asm parser is available. 
+ +define void @f() { +entry: + ret void +} + +;; CHECK: ('sh_name', 0x{{.*}}) # '.eh_frame' +;; CHECK-NEXT: ('sh_type', 0x00000001) +;; CHECK-NEXT: ('sh_flags', 0x0000000000000002) +;; CHECK-NEXT: ('sh_addr', 0x{{.*}}) +;; CHECK-NEXT: ('sh_offset', 0x{{.*}}) +;; CHECK-NEXT: ('sh_size', 0x0000000000000030) +;; CHECK-NEXT: ('sh_link', 0x00000000) +;; CHECK-NEXT: ('sh_info', 0x00000000) +;; CHECK-NEXT: ('sh_addralign', 0x0000000000000008) +;; CHECK-NEXT: ('sh_entsize', 0x0000000000000000) +;; CHECK-NEXT: ('_section_data', '00000010 00000000 017a5200 01784101 000c0100 00000018 00000018 00000000 00000000 00000000 00000010 00000000') + +;; CHECK: ('sh_name', 0x{{.*}}) # '.rela.eh_frame' +;; CHECK-NEXT: ('sh_type', 0x00000004) +;; CHECK-NEXT: ('sh_flags', 0x0000000000000000) +;; CHECK-NEXT: ('sh_addr', 0x{{.*}}) +;; CHECK-NEXT: ('sh_offset', 0x{{.*}}) +;; CHECK-NEXT: ('sh_size', 0x0000000000000018) +;; CHECK-NEXT: ('sh_link', 0x{{.*}}) +;; CHECK-NEXT: ('sh_info', 0x{{.*}}) +;; CHECK-NEXT: ('sh_addralign', 0x0000000000000008) +;; CHECK-NEXT: ('sh_entsize', 0x0000000000000018) +;; CHECK-NEXT: ('_relocations', [ +;; CHECK-NEXT: # Relocation 0 +;; CHECK-NEXT: (('r_offset', 0x000000000000001c) +;; CHECK-NEXT: ('r_sym', 0x{{.*}}) +;; CHECK-NEXT: ('r_type', 0x00000026) +;; CHECK-NEXT: ('r_addend', 0x0000000000000000) +;; CHECK-NEXT: ), +;; CHECK-NEXT: ]) + diff --git a/test/MC/PowerPC/ppc64-relocs-01.ll b/test/MC/PowerPC/ppc64-relocs-01.ll new file mode 100644 index 0000000..5996af8 --- /dev/null +++ b/test/MC/PowerPC/ppc64-relocs-01.ll @@ -0,0 +1,66 @@ +;; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -O3 \ +;; RUN: -filetype=obj %s -o - | \ +;; RUN: elf-dump --dump-section-data | FileCheck %s + +;; FIXME: this file need to be in .s form, change when asm parse is done. 
+ +@number64 = global i64 10, align 8 + +define i64 @access_int64(i64 %a) nounwind readonly { +entry: + %0 = load i64* @number64, align 8 + %cmp = icmp eq i64 %0, %a + %conv1 = zext i1 %cmp to i64 + ret i64 %conv1 +} + +declare double @sin(double) nounwind + +define double @test_branch24 (double %x) nounwind readonly { +entry: + %add = call double @sin(double %x) nounwind + ret double %add +} + +;; The relocations in .rela.text are the 'number64' load using a +;; R_PPC64_TOC16_DS against the .toc and the 'sin' external function +;; address using a R_PPC64_REL24 +;; CHECK: '.rela.text' +;; CHECK: Relocation 0 +;; CHECK-NEXT: 'r_offset', +;; CHECK-NEXT: 'r_sym', 0x00000006 +;; CHECK-NEXT: 'r_type', 0x0000003f +;; CHECK: Relocation 1 +;; CHECK-NEXT: 'r_offset', +;; CHECK-NEXT: 'r_sym', 0x0000000a +;; CHECK-NEXT: 'r_type', 0x0000000a + +;; The .opd entry for the 'access_int64' function creates 2 relocations: +;; 1. A R_PPC64_ADDR64 against the .text segment plus addend (the function +; address itself); +;; 2. And a R_PPC64_TOC against no symbol (the linker will replace for the +;; module's TOC base). +;; CHECK: '.rela.opd' +;; CHECK: Relocation 0 +;; CHECK-NEXT: 'r_offset', +;; CHECK-NEXT: 'r_sym', 0x00000002 +;; CHECK-NEXT: 'r_type', 0x00000026 +;; CHECK: Relocation 1 +;; CHECK-NEXT: 'r_offset', +;; CHECK-NEXT: 'r_sym', 0x00000000 +;; CHECK-NEXT: 'r_type', 0x00000033 + +;; Finally the TOC creates the relocation for the 'number64'. +;; CHECK: '.rela.toc' +;; CHECK: Relocation 0 +;; CHECK-NEXT: 'r_offset', +;; CHECK-NEXT: 'r_sym', 0x00000008 +;; CHECK-NEXT: 'r_type', 0x00000026 + +;; Check if the relocation references are for correct symbols. 
+;; CHECK: Symbol 7 +;; CHECK-NEXT: 'access_int64' +;; CHECK: Symbol 8 +;; CHECK-NEXT: 'number64' +;; CHECK: Symbol 10 +;; CHECK-NEXT: 'sin' diff --git a/test/MC/PowerPC/ppc64-tls-relocs-01.ll b/test/MC/PowerPC/ppc64-tls-relocs-01.ll new file mode 100644 index 0000000..5e37311 --- /dev/null +++ b/test/MC/PowerPC/ppc64-tls-relocs-01.ll @@ -0,0 +1,28 @@ +;; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -filetype=obj %s -o - | \ +;; RUN: elf-dump --dump-section-data | FileCheck %s + +;; FIXME: this file should be in .s form, change when asm parser is available. + +@t = thread_local global i32 0, align 4 + +define i32* @f() nounwind { +entry: + ret i32* @t +} + +;; Check for a pair of R_PPC64_TPREL16_HA / R_PPC64_TPREL16_LO relocs +;; against the thread-local symbol 't'. +;; CHECK: '.rela.text' +;; CHECK: Relocation 0 +;; CHECK-NEXT: 'r_offset', +;; CHECK-NEXT: 'r_sym', 0x00000008 +;; CHECK-NEXT: 'r_type', 0x00000048 +;; CHECK: Relocation 1 +;; CHECK-NEXT: 'r_offset', +;; CHECK-NEXT: 'r_sym', 0x00000008 +;; CHECK-NEXT: 'r_type', 0x00000046 + +;; Check that we got the correct symbol. 
+;; CHECK: Symbol 8 +;; CHECK-NEXT: 't' + diff --git a/test/MC/X86/intel-syntax-2.s b/test/MC/X86/intel-syntax-2.s index ca4afc3..d6dbe15 100644 --- a/test/MC/X86/intel-syntax-2.s +++ b/test/MC/X86/intel-syntax-2.s @@ -1,7 +1,9 @@ -// RUN: llvm-mc -triple x86_64-unknown-unknown %s | FileCheck %s +// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=att %s | FileCheck %s .intel_syntax _test: // CHECK: movl $257, -4(%rsp) mov DWORD PTR [RSP - 4], 257 - + .att_syntax +// CHECK: movl $257, -4(%rsp) + movl $257, -4(%rsp) diff --git a/test/MC/X86/x86-32-ms-inline-asm.s b/test/MC/X86/x86-32-ms-inline-asm.s new file mode 100644 index 0000000..73d5878 --- /dev/null +++ b/test/MC/X86/x86-32-ms-inline-asm.s @@ -0,0 +1,60 @@ +// RUN: llvm-mc -x86-asm-syntax=intel -triple i386-unknown-unknown --show-encoding %s | FileCheck %s + +mov eax, [ebx].0 +mov [ebx].4, ecx + +// CHECK: movl (%ebx), %eax +// CHECK: encoding: [0x8b,0x03] +// CHECK: movl %ecx, 4(%ebx) +// CHECK: encoding: [0x89,0x4b,0x04] + +_t21: ## @t21 +// CHECK: t21 + mov eax, [4*eax + 4] +// CHECK: movl 4(,%eax,4), %eax +// CHECK: # encoding: [0x8b,0x04,0x85,0x04,0x00,0x00,0x00] + mov eax, [4*eax][4] +// CHECK: movl 4(,%eax,4), %eax +// CHECK: # encoding: [0x8b,0x04,0x85,0x04,0x00,0x00,0x00] + + mov eax, [esi + eax] +// CHECK: movl (%esi,%eax), %eax +// CHECK: # encoding: [0x8b,0x04,0x06] + mov eax, [esi][eax] +// CHECK: movl (%esi,%eax), %eax +// CHECK: # encoding: [0x8b,0x04,0x06] + + mov eax, [esi + 4*eax] +// CHECK: movl (%esi,%eax,4), %eax +// CHECK: # encoding: [0x8b,0x04,0x86] + mov eax, [esi][4*eax] +// CHECK: movl (%esi,%eax,4), %eax +// CHECK: # encoding: [0x8b,0x04,0x86] + + mov eax, [esi + eax + 4] +// CHECK: movl 4(%esi,%eax), %eax +// CHECK: # encoding: [0x8b,0x44,0x06,0x04] + mov eax, [esi][eax + 4] +// CHECK: movl 4(%esi,%eax), %eax +// CHECK: # encoding: [0x8b,0x44,0x06,0x04] + mov eax, [esi + eax][4] +// CHECK: movl 4(%esi,%eax), %eax +// CHECK: # encoding: [0x8b,0x44,0x06,0x04] + mov eax, 
[esi][eax][4] +// CHECK: movl 4(%esi,%eax), %eax +// CHECK: # encoding: [0x8b,0x44,0x06,0x04] + + mov eax, [esi + 2*eax + 4] +// CHECK: movl 4(%esi,%eax,2), %eax +// CHECK: # encoding: [0x8b,0x44,0x46,0x04] + mov eax, [esi][2*eax + 4] +// CHECK: movl 4(%esi,%eax,2), %eax +// CHECK: # encoding: [0x8b,0x44,0x46,0x04] + mov eax, [esi + 2*eax][4] +// CHECK: movl 4(%esi,%eax,2), %eax +// CHECK: # encoding: [0x8b,0x44,0x46,0x04] + mov eax, [esi][2*eax][4] +// CHECK: movl 4(%esi,%eax,2), %eax +// CHECK: # encoding: [0x8b,0x44,0x46,0x04] + + ret diff --git a/test/MC/X86/x86-64.s b/test/MC/X86/x86-64.s index 6a2d5bb..03cb62e 100644 --- a/test/MC/X86/x86-64.s +++ b/test/MC/X86/x86-64.s @@ -1164,6 +1164,10 @@ xsetbv // CHECK: xsetbv # encoding: [0x0f,0x01,0xd1] // CHECK: encoding: [0x66,0x48,0x0f,0x6e,0xc7] movd %rdi,%xmm0 +// CHECK: movd %xmm0, %rax +// CHECK: encoding: [0x66,0x48,0x0f,0x7e,0xc0] + movd %xmm0, %rax + // CHECK: movntil %eax, (%rdi) // CHECK: encoding: [0x0f,0xc3,0x07] // CHECK: movntil diff --git a/test/MC/X86/x86_64-rtm-encoding.s b/test/MC/X86/x86_64-rtm-encoding.s new file mode 100644 index 0000000..44d6bac --- /dev/null +++ b/test/MC/X86/x86_64-rtm-encoding.s @@ -0,0 +1,13 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s + +// CHECK: xbegin .L0 +// CHECK: encoding: [0xc7,0xf8,A,A,A,A] + xbegin .L0 + +// CHECK: xend +// CHECK: encoding: [0x0f,0x01,0xd5] + xend + +// CHECK: xabort +// CHECK: encoding: [0xc6,0xf8,0x0d] + xabort $13 diff --git a/test/MC/X86/x86_nop.s b/test/MC/X86/x86_nop.s new file mode 100644 index 0000000..396e302 --- /dev/null +++ b/test/MC/X86/x86_nop.s @@ -0,0 +1,13 @@ +# RUN: llvm-mc -filetype=obj -arch=x86 -mcpu=generic %s | llvm-objdump -d - | FileCheck %s +# RUN: llvm-mc -filetype=obj -arch=x86 -mcpu=i386 %s | llvm-objdump -d - | FileCheck %s +# RUN: llvm-mc -filetype=obj -arch=x86 -mcpu=i486 %s | llvm-objdump -d - | FileCheck %s +# RUN: llvm-mc -filetype=obj -arch=x86 -mcpu=i586 %s | llvm-objdump 
-d - | FileCheck %s +# RUN: llvm-mc -filetype=obj -arch=x86 -mcpu=pentium %s | llvm-objdump -d - | FileCheck %s +# RUN: llvm-mc -filetype=obj -arch=x86 -mcpu=pentium-mmx %s | llvm-objdump -d - | FileCheck %s +# RUN: llvm-mc -filetype=obj -arch=x86 -mcpu=geode %s | llvm-objdump -d - | FileCheck %s +# RUN: llvm-mc -filetype=obj -arch=x86 -mcpu=i686 %s | llvm-objdump -d - | not FileCheck %s + +# CHECK-NOT: nop{{[lw]}} +inc %eax +.align 8 +inc %eax diff --git a/test/Makefile b/test/Makefile index 9ddfabf..810fdde 100644 --- a/test/Makefile +++ b/test/Makefile @@ -29,11 +29,6 @@ else LIT_ARGS := -s -v endif -# -jN causes crash on Cygwin's python. -ifneq (,$(filter $(HOST_OS),Cygwin)) - LIT_ARGS += -j1 -endif - ifdef TESTSUITE LIT_TESTSUITE := $(TESTSUITE) CLEANED_TESTSUITE := $(patsubst %/,%,$(TESTSUITE)) @@ -122,6 +117,16 @@ else ENABLE_ASSERTIONS=1 endif +# Derive whether or not LTO is enabled by checking the extra options. +LTO_IS_ENABLED := 0 +ifneq ($(findstring -flto,$(CompileCommonOpts)),) +LTO_IS_ENABLED := 1 +else +ifneq ($(findstring -O4,$(CompileCommonOpts)),) +LTO_IS_ENABLED := 1 +endif +endif + lit.site.cfg: FORCE @echo "Making LLVM 'lit.site.cfg' file..." 
@$(ECHOPATH) s=@TARGET_TRIPLE@=$(TARGET_TRIPLE)=g > lit.tmp @@ -131,9 +136,10 @@ lit.site.cfg: FORCE @$(ECHOPATH) s=@SHLIBDIR@=$(SharedLibDir)=g >> lit.tmp @$(ECHOPATH) s=@SHLIBEXT@=$(SHLIBEXT)=g >> lit.tmp @$(ECHOPATH) s=@PYTHON_EXECUTABLE@=python=g >> lit.tmp - @$(ECHOPATH) s,@OCAMLOPT@,$(OCAMLOPT) -cc \\\\\"$(CXX_FOR_OCAMLOPT)\\\\\" -I $(LibDir)/ocaml,g >> lit.tmp + @$(ECHOPATH) s=@OCAMLOPT@=$(OCAMLOPT) -cc $(subst *,'\\\"',*$(subst =,"\\=",$(CXX_FOR_OCAMLOPT))*) -I $(LibDir)/ocaml=g >> lit.tmp @$(ECHOPATH) s=@ENABLE_SHARED@=$(ENABLE_SHARED)=g >> lit.tmp @$(ECHOPATH) s=@ENABLE_ASSERTIONS@=$(ENABLE_ASSERTIONS)=g >> lit.tmp + @$(ECHOPATH) s=@LTO_IS_ENABLED@=$(LTO_IS_ENABLED)=g >> lit.tmp @$(ECHOPATH) s=@TARGETS_TO_BUILD@=$(TARGETS_TO_BUILD)=g >> lit.tmp @$(ECHOPATH) s=@LLVM_BINDINGS@=$(BINDINGS_TO_BUILD)=g >> lit.tmp @$(ECHOPATH) s=@HOST_OS@=$(HOST_OS)=g >> lit.tmp diff --git a/test/Object/Inputs/dext-test.elf-mips64r2 b/test/Object/Inputs/dext-test.elf-mips64r2 Binary files differnew file mode 100644 index 0000000..59dbaef --- /dev/null +++ b/test/Object/Inputs/dext-test.elf-mips64r2 diff --git a/test/Object/Inputs/relocations.elf-x86-64 b/test/Object/Inputs/relocations.elf-x86-64 Binary files differnew file mode 100644 index 0000000..6e340c7 --- /dev/null +++ b/test/Object/Inputs/relocations.elf-x86-64 diff --git a/test/Object/Mips/feature.test b/test/Object/Mips/feature.test new file mode 100644 index 0000000..e8da609 --- /dev/null +++ b/test/Object/Mips/feature.test @@ -0,0 +1,11 @@ +RUN: llvm-objdump -disassemble -triple mips64el -mattr +mips64r2 %p/../Inputs/dext-test.elf-mips64r2 \ +RUN: | FileCheck %s + +CHECK: Disassembly of section .text: +CHECK: .text: +CHECK: 0: 08 00 e0 03 jr $ra +CHECK: 4: 43 49 82 7c dext $2, $4, 5, 10 +CHECK: 8: 08 00 e0 03 jr $ra +CHECK: c: 83 28 82 7c dext $2, $4, 2, 6 +CHECK: 10: 08 00 e0 03 jr $ra +CHECK: 14: 43 09 82 7c dext $2, $4, 5, 2 diff --git a/test/Object/Mips/lit.local.cfg b/test/Object/Mips/lit.local.cfg new file mode 
100644 index 0000000..1499317 --- /dev/null +++ b/test/Object/Mips/lit.local.cfg @@ -0,0 +1,5 @@ +config.suffixes = ['.test'] + +targets = set(config.root.targets_to_build.split()) +if not 'Mips' in targets: + config.unsupported = True diff --git a/test/Object/nm-shared-object.test b/test/Object/nm-shared-object.test index b361df5..a57b940 100644 --- a/test/Object/nm-shared-object.test +++ b/test/Object/nm-shared-object.test @@ -1,15 +1,23 @@ RUN: llvm-nm -D %p/Inputs/shared-object-test.elf-i386 \ -RUN: | FileCheck %s -check-prefix ELF +RUN: | FileCheck %s -check-prefix ELF-32 RUN: llvm-nm -D %p/Inputs/shared-object-test.elf-x86-64 \ -RUN: | FileCheck %s -check-prefix ELF +RUN: | FileCheck %s -check-prefix ELF-64 ; Note: tls_sym should be 'D' (not '?'), but TLS is not ; yet recognized by ObjectFile. -ELF: {{[0-9a-f]+}} A __bss_start -ELF: {{[0-9a-f]+}} A _edata -ELF: {{[0-9a-f]+}} A _end -ELF: {{[0-9a-f]+}} B common_sym -ELF: {{[0-9a-f]+}} D defined_sym -ELF: {{[0-9a-f]+}} T global_func -ELF: ? tls_sym +ELF-32: 0012c8 A __bss_start +ELF-32: 0012c8 A _edata +ELF-32: 0012cc A _end +ELF-32: 0012c8 B common_sym +ELF-32: 0012c4 D defined_sym +ELF-32: 0001f0 T global_func +ELF-32: ? tls_sym + +ELF-64: 200454 A __bss_start +ELF-64: 200454 A _edata +ELF-64: 200458 A _end +ELF-64: 200454 B common_sym +ELF-64: 200450 D defined_sym +ELF-64: 0002f0 T global_func +ELF-64: ? 
tls_sym diff --git a/test/Object/objdump-relocations.test b/test/Object/objdump-relocations.test index a394a23..6d35a26 100644 --- a/test/Object/objdump-relocations.test +++ b/test/Object/objdump-relocations.test @@ -9,6 +9,9 @@ RUN: | FileCheck %s -check-prefix ELF-x86-64 RUN: llvm-objdump -r %p/Inputs/trivial-object-test.elf-hexagon \ RUN: | FileCheck %s -check-prefix ELF-hexagon +RUN: llvm-objdump -r %p/Inputs/relocations.elf-x86-64 \ +RUN: | FileCheck %s -check-prefix ELF-complex-x86-64 + COFF-i386: .text COFF-i386: IMAGE_REL_I386_DIR32 L_.str COFF-i386: IMAGE_REL_I386_REL32 _puts @@ -36,3 +39,13 @@ ELF-hexagon: R_HEX_HI16 puts ELF-hexagon: R_HEX_LO16 puts ELF-hexagon: R_HEX_B15_PCREL testf ELF-hexagon: R_HEX_B22_PCREL puts + +ELF-complex-x86-64: .text +ELF-complex-x86-64-NEXT: R_X86_64_8 .data-4 +ELF-complex-x86-64-NEXT: R_X86_64_16 .data-4 +ELF-complex-x86-64-NEXT: R_X86_64_32 .data-4 +ELF-complex-x86-64-NEXT: R_X86_64_32S .data-4 +ELF-complex-x86-64-NEXT: R_X86_64_64 .data-4 +ELF-complex-x86-64-NEXT: R_X86_64_PC32 .data-4-P +ELF-complex-x86-64-NEXT: R_X86_64_32 .data+0 +ELF-complex-x86-64-NEXT: R_X86_64_32 .data+4 diff --git a/test/Object/objdump-symbol-table.test b/test/Object/objdump-symbol-table.test index 989ec04..c94b077 100644 --- a/test/Object/objdump-symbol-table.test +++ b/test/Object/objdump-symbol-table.test @@ -4,6 +4,8 @@ RUN: llvm-objdump -t %p/Inputs/trivial-object-test.elf-i386 \ RUN: | FileCheck %s -check-prefix ELF-i386 RUN: llvm-objdump -t %p/Inputs/trivial-object-test.macho-i386 \ RUN: | FileCheck %s -check-prefix macho-i386 +RUN: llvm-objdump -t %p/Inputs/shared-object-test.elf-i386 \ +RUN: | FileCheck %s -check-prefix ELF-shared COFF-i386: file format COFF-i386: SYMBOL TABLE: @@ -31,3 +33,9 @@ macho-i386: SYMBOL TABLE: macho-i386: 00000000 g F __TEXT,__text 00000024 _main macho-i386: 00000000 *UND* 00000000 _SomeOtherFunction macho-i386: 00000000 *UND* 00000000 _puts + +ELF-shared: shared-object-test.elf-i386: file format +ELF-shared: 
SYMBOL TABLE: +ELF-shared: 00000200 l F .text 00000003 local_func +ELF-shared: 000012c4 g .data 00000004 defined_sym +ELF-shared: 000001f0 g F .text 00000003 global_func diff --git a/test/Other/FileCheck-space.txt b/test/Other/FileCheck-space.txt new file mode 100644 index 0000000..6bbe5bc --- /dev/null +++ b/test/Other/FileCheck-space.txt @@ -0,0 +1,9 @@ +RUN: printf "a\nb" | FileCheck %s -check-prefix=TEST1 +RUN: echo oo | FileCheck %s -check-prefix=TEST2 + +Check that CHECK-NEXT without a space after the colon works. +TEST1:a +TEST1-NEXT:b + +Check that CHECK-NOT without a space after the colon works. +TEST2-NOT:foo diff --git a/test/Other/Inputs/llvm-cov.gcda b/test/Other/Inputs/llvm-cov.gcda Binary files differnew file mode 100644 index 0000000..9ae2286 --- /dev/null +++ b/test/Other/Inputs/llvm-cov.gcda diff --git a/test/Other/Inputs/llvm-cov.gcno b/test/Other/Inputs/llvm-cov.gcno Binary files differnew file mode 100644 index 0000000..25e2023 --- /dev/null +++ b/test/Other/Inputs/llvm-cov.gcno diff --git a/test/Other/ResponseFile.ll b/test/Other/ResponseFile.ll new file mode 100644 index 0000000..b8b3d0a --- /dev/null +++ b/test/Other/ResponseFile.ll @@ -0,0 +1,9 @@ +; RUN: echo %s > %t.list +; RUN: llvm-as @%t.list -o %t.bc +; RUN: llvm-nm %t.bc 2>&1 | FileCheck %s + +; CHECK: T foobar + +define void @foobar() { + ret void +} diff --git a/test/Other/extract-alias.ll b/test/Other/extract-alias.ll new file mode 100644 index 0000000..d5bab4b --- /dev/null +++ b/test/Other/extract-alias.ll @@ -0,0 +1,49 @@ +; RUN: llvm-extract -func foo -S < %s | FileCheck %s +; RUN: llvm-extract -delete -func foo -S < %s | FileCheck --check-prefix=DELETE %s +; RUN: llvm-extract -alias zeda0 -S < %s | FileCheck --check-prefix=ALIAS %s +; RUN: llvm-extract -ralias .*bar -S < %s | FileCheck --check-prefix=ALIASRE %s + +; Both aliases should be converted to declarations +; CHECK: @zeda0 = external global i32 +; CHECK: define i32* @foo() { +; CHECK-NEXT: call void @a0bar() +; 
CHECK-NEXT: ret i32* @zeda0 +; CHECK-NEXT: } +; CHECK: declare void @a0bar() + +; DELETE: @zed = global i32 0 +; DELETE: @zeda0 = alias i32* @zed +; DELETE-NEXT: @a0foo = alias i32* ()* @foo +; DELETE-NEXT: @a0a0bar = alias void ()* @a0bar +; DELETE-NEXT: @a0bar = alias void ()* @bar +; DELETE: declare i32* @foo() +; DELETE: define void @bar() { +; DELETE-NEXT: %c = call i32* @foo() +; DELETE-NEXT: ret void +; DELETE-NEXT: } + +; ALIAS: @zed = external global i32 +; ALIAS: @zeda0 = alias i32* @zed + +; ALIASRE: @a0a0bar = alias void ()* @a0bar +; ALIASRE: @a0bar = alias void ()* @bar +; ALIASRE: declare void @bar() + +@zed = global i32 0 +@zeda0 = alias i32* @zed + +@a0foo = alias i32* ()* @foo + +define i32* @foo() { + call void @a0bar() + ret i32* @zeda0 +} + +@a0a0bar = alias void ()* @a0bar + +@a0bar = alias void ()* @bar + +define void @bar() { + %c = call i32* @foo() + ret void +} diff --git a/test/Other/extract-weak-odr.ll b/test/Other/extract-weak-odr.ll new file mode 100644 index 0000000..6618f58 --- /dev/null +++ b/test/Other/extract-weak-odr.ll @@ -0,0 +1,23 @@ +; RUN: llvm-extract -func foo -S < %s | FileCheck %s +; RUN: llvm-extract -delete -func foo -S < %s | FileCheck --check-prefix=DELETE %s + +; Test that we don't convert weak_odr to external definitions. + +; CHECK: @bar = external global i32 +; CHECK: define weak_odr i32* @foo() { +; CHECK-NEXT: ret i32* @bar +; CHECK-NEXT: } + +; DELETE: @bar = weak_odr global i32 42 +; DELETE: declare i32* @foo() + +@bar = weak_odr global i32 42 + +define weak_odr i32* @foo() { + ret i32* @bar +} + +define void @g() { + %c = call i32* @foo() + ret void +} diff --git a/test/Other/extract.ll b/test/Other/extract.ll index 57573ed..8b0c835 100644 --- a/test/Other/extract.ll +++ b/test/Other/extract.ll @@ -7,18 +7,19 @@ ; llvm-extract uses lazy bitcode loading, so make sure it correctly reads ; from bitcode files in addition to assembly files. 
-; CHECK: define void @foo() { +; CHECK: define hidden void @foo() { ; CHECK: ret void ; CHECK: } -; The linkonce_odr linkage for foo() should be changed to external linkage. -; DELETE: declare void @foo() +; The private linkage for foo() should be changed to external linkage and +; hidden visibility added. +; DELETE: declare hidden void @foo() ; DELETE: define void @bar() { ; DELETE: call void @foo() ; DELETE: ret void ; DELETE: } -define linkonce_odr void @foo() { +define private void @foo() { ret void } define void @bar() { diff --git a/test/Other/link-opts.ll b/test/Other/link-opts.ll new file mode 100644 index 0000000..8e58ac8 --- /dev/null +++ b/test/Other/link-opts.ll @@ -0,0 +1,13 @@ +;RUN: opt -S -std-link-opts < %s | FileCheck %s +; Simple test to check that -std-link-opts keeps only the main function. + +; CHECK-NOT: define +; CHECK: define void @main +; CHECK-NOT: define +define void @main() { + ret void +} + +define void @foo() { + ret void +} diff --git a/test/Other/lint.ll b/test/Other/lint.ll index c84f56f..78bbbe9 100644 --- a/test/Other/lint.ll +++ b/test/Other/lint.ll @@ -9,8 +9,11 @@ declare void @has_noaliases(i32* noalias %p, i32* %q) declare void @one_arg(i32) @CG = constant i32 7 +@E = external global i8 define i32 @foo() noreturn { + %buf = alloca i8 + %buf2 = alloca {i8, i8}, align 2 ; CHECK: Caller and callee calling convention differ call void @bar() ; CHECK: Null pointer dereference @@ -26,8 +29,10 @@ define i32 @foo() noreturn { ; CHECK: Address one pointer dereference store i32 0, i32* inttoptr (i64 1 to i32*) ; CHECK: Memory reference address is misaligned - %x = inttoptr i32 1 to i32* - load i32* %x, align 4 + store i8 0, i8* %buf, align 2 +; CHECK: Memory reference address is misaligned + %gep = getelementptr {i8, i8}* %buf2, i32 0, i32 1 + store i8 0, i8* %gep, align 2 ; CHECK: Division by zero %sd = sdiv i32 2, 0 ; CHECK: Division by zero @@ -75,6 +80,18 @@ define i32 @foo() noreturn { ; CHECK: Write to read-only memory call void 
@llvm.memcpy.p0i8.p0i8.i64(i8* bitcast (i32* @CG to i8*), i8* bitcast (i32* @CG to i8*), i64 1, i32 1, i1 0) +; CHECK: Undefined behavior: Buffer overflow + %wider = bitcast i8* %buf to i16* + store i16 0, i16* %wider +; CHECK: Undefined behavior: Buffer overflow + %inner = getelementptr {i8, i8}* %buf2, i32 0, i32 1 + %wider2 = bitcast i8* %inner to i16* + store i16 0, i16* %wider2 +; CHECK: Undefined behavior: Buffer overflow + %before = getelementptr i8* %buf, i32 -1 + %wider3 = bitcast i8* %before to i16* + store i16 0, i16* %wider3 + br label %next next: @@ -84,6 +101,10 @@ next: ret i32 0 foo: +; CHECK-NOT: Undefined behavior: Buffer overflow +; CHECK-NOT: Memory reference address is misaligned + %e = bitcast i8* @E to i64* + store i64 0, i64* %e %z = add i32 0, 0 ; CHECK: unreachable immediately preceded by instruction without side effects unreachable diff --git a/test/Other/lit.local.cfg b/test/Other/lit.local.cfg index 19eebc0..2693077 100644 --- a/test/Other/lit.local.cfg +++ b/test/Other/lit.local.cfg @@ -1 +1 @@ -config.suffixes = ['.ll', '.c', '.cpp'] +config.suffixes = ['.ll', '.c', '.cpp', '.txt'] diff --git a/test/Other/llvm-cov.test b/test/Other/llvm-cov.test new file mode 100644 index 0000000..c0aa203 --- /dev/null +++ b/test/Other/llvm-cov.test @@ -0,0 +1,3 @@ +PR11760 +RUN: llvm-cov -gcda=%S/Inputs/llvm-cov.gcda -gcno=%S/Inputs/llvm-cov.gcno + diff --git a/test/Other/llvm-nm-without-aliases.ll b/test/Other/llvm-nm-without-aliases.ll new file mode 100644 index 0000000..9d9408c --- /dev/null +++ b/test/Other/llvm-nm-without-aliases.ll @@ -0,0 +1,25 @@ +; RUN: llvm-as < %s > %t +; RUN: llvm-nm -without-aliases < %t | FileCheck %s +; RUN: llvm-nm < %t | FileCheck --check-prefix=WITH %s + +; CHECK-NOT: T a0bar +; CHECK-NOT: T a0foo +; CHECK: T bar +; CHECK: T foo + +; WITH: T a0bar +; WITH: T a0foo +; WITH: T bar +; WITH: T foo + +@a0foo = alias void ()* @foo + +define void @foo() { + ret void +} + +@a0bar = alias void ()* @bar + +define void @bar() 
{ + ret void +} diff --git a/test/Other/spir_cc.ll b/test/Other/spir_cc.ll new file mode 100644 index 0000000..ffc0294 --- /dev/null +++ b/test/Other/spir_cc.ll @@ -0,0 +1,13 @@ +; RUN: llvm-as < %s | llvm-dis > %t1.ll +; RUN: llvm-as %t1.ll -o - | llvm-dis > %t2.ll +; RUN: diff %t1.ll %t2.ll + +define spir_func void @foo() { + ret void +} + +define spir_kernel void @bar() { + call spir_func void @foo( ) + call spir_kernel void @bar( ) + ret void +} diff --git a/test/TableGen/if.td b/test/TableGen/if.td index 18de368..1d8d623 100644 --- a/test/TableGen/if.td +++ b/test/TableGen/if.td @@ -3,15 +3,59 @@ // Support for an `!if' operator as part of a `let' statement. // CHECK: class C -// CHECK-NEXT: bits<16> n = { ?, ?, ?, ?, ?, ?, ?, !if({ C:x{2} }, 0, 1), !if({ C:x{2} }, 1, 1), !if({ C:x{2} }, 0, 0), !if({ C:x{1} }, C:y{3}, 0), !if({ C:x{1} }, C:y{2}, 1), !if({ C:x{0} }, C:y{3}, C:z), !if({ C:x{0} }, C:y{2}, C:y{2}), !if({ C:x{0} }, C:y{1}, C:y{1}), !if({ C:x{0} }, C:y{0}, C:y{0}) }; +// CHECK-NEXT: bits<16> n = { ?, ?, ?, ?, !if({ C:y{3} }, 1, !if({ C:y{2} }, { C:x{0} }, !if({ C:y{1} }, { C:x{1} }, !if({ C:y{0} }, { C:x{2} }, ?)))){0}, !if({ C:x{2} }, { C:y{3}, C:y{2} }, !if({ C:x{1} }, { C:y{2}, C:y{1} }, !if({ C:x{0} }, { C:y{1}, C:y{0} }, ?))){1}, !if({ C:x{2} }, { C:y{3}, C:y{2} }, !if({ C:x{1} }, { C:y{2}, C:y{1} }, !if({ C:x{0} }, { C:y{1}, C:y{0} }, ?))){0}, !if({ C:x{2} }, 2, 6){2}, !if({ C:x{2} }, 2, 6){1}, !if({ C:x{2} }, 2, 6){0}, !if({ C:x{1} }, { C:y{3}, C:y{2} }, { 0, 1 }){1}, !if({ C:x{1} }, { C:y{3}, C:y{2} }, { 0, 1 }){0}, !if({ C:x{0} }, { C:y{3}, C:y{2}, C:y{1}, C:y{0} }, { C:z, C:y{2}, C:y{1}, C:y{0} }){3}, !if({ C:x{0} }, { C:y{3}, C:y{2}, C:y{1}, C:y{0} }, { C:z, C:y{2}, C:y{1}, C:y{0} }){2}, !if({ C:x{0} }, { C:y{3}, C:y{2}, C:y{1}, C:y{0} }, { C:z, C:y{2}, C:y{1}, C:y{0} }){1}, !if({ C:x{0} }, { C:y{3}, C:y{2}, C:y{1}, C:y{0} }, { C:z, C:y{2}, C:y{1}, C:y{0} }){0} }; class C<bits<3> x, bits<4> y, bit z> { bits<16> n; + let n{11} = !if(y{3}, 
1, + !if(y{2}, x{0}, + !if(y{1}, x{1}, + !if(y{0}, x{2}, ?)))); + let n{10-9}= !if(x{2}, y{3-2}, + !if(x{1}, y{2-1}, + !if(x{0}, y{1-0}, ?))); let n{8-6} = !if(x{2}, 0b010, 0b110); let n{5-4} = !if(x{1}, y{3-2}, {0, 1}); let n{3-0} = !if(x{0}, y{3-0}, {z, y{2}, y{1}, y{0}}); } +def C1 : C<{1, 0, 1}, {0, 1, 0, 1}, 0>; +def C2 : C<{0, 1, 0}, {1, 0, 1, 0}, 1>; +def C3 : C<{0, 0, 0}, {1, 0, 1, 0}, 0>; +def C4 : C<{0, 0, 0}, {0, 0, 0, 0}, 0>; + +// CHECK: def C1 +// CHECK-NEXT: bits<16> n = { ?, ?, ?, ?, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1 }; +// CHECK: def C2 +// CHECK-NEXT: bits<16> n = { ?, ?, ?, ?, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0 }; +// CHECK: def C3 +// CHECK-NEXT: bits<16> n = { ?, ?, ?, ?, 1, ?, ?, 1, 1, 0, 0, 1, 0, 0, 1, 0 }; +// CHECK: def C4 +// CHECK-NEXT: bits<16> n = { ?, ?, ?, ?, ?, ?, ?, 1, 1, 0, 0, 1, 0, 0, 0, 0 }; + +class S<int s> { + bits<2> val = !if(!eq(s, 8), {0, 0}, + !if(!eq(s, 16), 0b01, + !if(!eq(s, 32), 2, + !if(!eq(s, 64), {1, 1}, ?)))); +} + +def D8 : S<8>; +def D16 : S<16>; +def D32 : S<32>; +def D64 : S<64>; +def D128: S<128>; +// CHECK: def D128 +// CHECK-NEXT: bits<2> val = { ?, ? 
}; +// CHECK: def D16 +// CHECK-NEXT: bits<2> val = { 0, 1 }; +// CHECK: def D32 +// CHECK-NEXT: bits<2> val = { 1, 0 }; +// CHECK: def D64 +// CHECK-NEXT: bits<2> val = { 1, 1 }; +// CHECK: def D8 +// CHECK-NEXT: bits<2> val = { 0, 0 }; + // CHECK: def One // CHECK-NEXT: list<int> first = [1, 2, 3]; // CHECK-NEXT: list<int> rest = [1, 2, 3]; diff --git a/test/TableGen/list-element-bitref.td b/test/TableGen/list-element-bitref.td new file mode 100644 index 0000000..5f3e3da --- /dev/null +++ b/test/TableGen/list-element-bitref.td @@ -0,0 +1,15 @@ +// RUN: llvm-tblgen %s | FileCheck %s +// XFAIL: vg_leak + +class C<list<bits<8>> L> { + bits<2> V0 = L[0]{1-0}; + bits<2> V1 = L[1]{3-2}; + string V2 = !if(L[0]{0}, "Odd", "Even"); +} + +def c0 : C<[0b0101, 0b1010]>; + +// CHECK: def c0 +// CHECk-NEXT: bits<2> V0 = { 0, 1 }; +// CHECk-NEXT: bits<2> V1 = { 1, 0 }; +// CHECk-NEXT: string V2 = "Odd"; diff --git a/test/TableGen/pr8330.td b/test/TableGen/pr8330.td new file mode 100644 index 0000000..7779b63 --- /dev/null +++ b/test/TableGen/pr8330.td @@ -0,0 +1,29 @@ +// RUN: llvm-tblgen %s | FileCheck %s +// XFAIL: vg_leak + +class Or4<bits<8> Val> { + bits<8> V = {Val{7}, Val{6}, Val{5}, Val{4}, Val{3}, 1, Val{1}, Val{0} }; +} + +class Whatev<bits<8> x>; + +class Whatever<bits<8> x> { + bits<8> W = {x{0}, x{1}, x{2}, x{3}, x{4}, x{5}, x{6}, x{7} }; +} + +multiclass X<bits<8> BaseOpc> { + def bar : Whatev<Or4<BaseOpc>.V >; +} + +multiclass Y<bits<8> BaseOpc> { + def foo : Whatever<Or4<BaseOpc>.V >; +} + +defm a : X<4>; + +// CHECK: def abar + +defm b : Y<8>; + +// CHECK: def bfoo +// CHECK-NEXT: bits<8> W = { 0, 0, 1, 1, 0, 0, 0, 0 }; diff --git a/test/Transforms/BBVectorize/X86/cmp-types.ll b/test/Transforms/BBVectorize/X86/cmp-types.ll new file mode 100644 index 0000000..a4fcbb6 --- /dev/null +++ b/test/Transforms/BBVectorize/X86/cmp-types.ll @@ -0,0 +1,16 @@ +target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s + +%"struct.btSoftBody" = type { float, float, float*, i8 } + +define void @test1(%"struct.btSoftBody"* %n1, %"struct.btSoftBody"* %n2) uwtable align 2 { +entry: + %tobool15 = icmp ne %"struct.btSoftBody"* %n1, null + %cond16 = zext i1 %tobool15 to i32 + %tobool21 = icmp ne %"struct.btSoftBody"* %n2, null + %cond22 = zext i1 %tobool21 to i32 + ret void +; CHECK: @test1 +} + diff --git a/test/Transforms/BBVectorize/X86/loop1.ll b/test/Transforms/BBVectorize/X86/loop1.ll new file mode 100644 index 0000000..493f23b --- /dev/null +++ b/test/Transforms/BBVectorize/X86/loop1.ll @@ -0,0 +1,53 @@ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -basicaa -loop-unroll -unroll-threshold=45 -unroll-allow-partial -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-UNRL +; The second check covers the use of alias analysis (with loop unrolling). 
+ +define void @test1(double* noalias %out, double* noalias %in1, double* noalias %in2) nounwind uwtable { +entry: + br label %for.body +; CHECK: @test1 +; CHECK-UNRL: @test1 + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds double* %in1, i64 %indvars.iv + %0 = load double* %arrayidx, align 8 + %arrayidx2 = getelementptr inbounds double* %in2, i64 %indvars.iv + %1 = load double* %arrayidx2, align 8 + %mul = fmul double %0, %0 + %mul3 = fmul double %0, %1 + %add = fadd double %mul, %mul3 + %add4 = fadd double %1, %1 + %add5 = fadd double %add4, %0 + %mul6 = fmul double %0, %add5 + %add7 = fadd double %add, %mul6 + %mul8 = fmul double %1, %1 + %add9 = fadd double %0, %0 + %add10 = fadd double %add9, %0 + %mul11 = fmul double %mul8, %add10 + %add12 = fadd double %add7, %mul11 + %arrayidx14 = getelementptr inbounds double* %out, i64 %indvars.iv + store double %add12, double* %arrayidx14, align 8 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 10 + br i1 %exitcond, label %for.end, label %for.body +; CHECK-NOT: <2 x double> +; CHECK-UNRL: %mul = fmul <2 x double> %2, %2 +; CHECK-UNRL: %mul3 = fmul <2 x double> %2, %3 +; CHECK-UNRL: %add = fadd <2 x double> %mul, %mul3 +; CHECK-UNRL: %add4 = fadd <2 x double> %3, %3 +; CHECK-UNRL: %add5 = fadd <2 x double> %add4, %2 +; CHECK-UNRL: %mul6 = fmul <2 x double> %2, %add5 +; CHECK-UNRL: %add7 = fadd <2 x double> %add, %mul6 +; CHECK-UNRL: %mul8 = fmul <2 x double> %3, %3 +; CHECK-UNRL: %add9 = fadd <2 x double> %2, %2 +; CHECK-UNRL: %add10 = fadd <2 x double> %add9, %2 +; CHECK-UNRL: %mul11 = fmul <2 x double> %mul8, %add10 +; CHECK-UNRL: %add12 = fadd <2 x double> %add7, %mul11 + +for.end: ; preds = %for.body + ret void +} diff --git a/test/Transforms/BBVectorize/X86/sh-rec.ll b/test/Transforms/BBVectorize/X86/sh-rec.ll new file mode 100644 
index 0000000..1e0492c --- /dev/null +++ b/test/Transforms/BBVectorize/X86/sh-rec.ll @@ -0,0 +1,54 @@ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s + +define void @ptoa() nounwind uwtable { +entry: + %call = call i8* @malloc() nounwind + br i1 undef, label %return, label %if.end10 + +if.end10: ; preds = %entry + %incdec.ptr = getelementptr inbounds i8* %call, i64 undef + %call17 = call i32 @ptou() nounwind + %incdec.ptr26.1 = getelementptr inbounds i8* %incdec.ptr, i64 -2 + store i8 undef, i8* %incdec.ptr26.1, align 1 + %div27.1 = udiv i32 %call17, 100 + %rem.2 = urem i32 %div27.1, 10 + %add2230.2 = or i32 %rem.2, 48 + %conv25.2 = trunc i32 %add2230.2 to i8 + %incdec.ptr26.2 = getelementptr inbounds i8* %incdec.ptr, i64 -3 + store i8 %conv25.2, i8* %incdec.ptr26.2, align 1 + %incdec.ptr26.3 = getelementptr inbounds i8* %incdec.ptr, i64 -4 + store i8 undef, i8* %incdec.ptr26.3, align 1 + %div27.3 = udiv i32 %call17, 10000 + %rem.4 = urem i32 %div27.3, 10 + %add2230.4 = or i32 %rem.4, 48 + %conv25.4 = trunc i32 %add2230.4 to i8 + %incdec.ptr26.4 = getelementptr inbounds i8* %incdec.ptr, i64 -5 + store i8 %conv25.4, i8* %incdec.ptr26.4, align 1 + %div27.4 = udiv i32 %call17, 100000 + %rem.5 = urem i32 %div27.4, 10 + %add2230.5 = or i32 %rem.5, 48 + %conv25.5 = trunc i32 %add2230.5 to i8 + %incdec.ptr26.5 = getelementptr inbounds i8* %incdec.ptr, i64 -6 + store i8 %conv25.5, i8* %incdec.ptr26.5, align 1 + %incdec.ptr26.6 = getelementptr inbounds i8* %incdec.ptr, i64 -7 + store i8 0, i8* %incdec.ptr26.6, align 1 + %incdec.ptr26.7 = getelementptr inbounds i8* %incdec.ptr, i64 -8 + store i8 undef, i8* %incdec.ptr26.7, align 1 + %div27.7 = udiv i32 %call17, 100000000 + %rem.8 = urem i32 %div27.7, 10 + %add2230.8 = 
or i32 %rem.8, 48 + %conv25.8 = trunc i32 %add2230.8 to i8 + %incdec.ptr26.8 = getelementptr inbounds i8* %incdec.ptr, i64 -9 + store i8 %conv25.8, i8* %incdec.ptr26.8, align 1 + unreachable + +return: ; preds = %entry + ret void +; CHECK: @ptoa +} + +declare noalias i8* @malloc() nounwind + +declare i32 @ptou() diff --git a/test/Transforms/BBVectorize/X86/sh-rec2.ll b/test/Transforms/BBVectorize/X86/sh-rec2.ll new file mode 100644 index 0000000..ef22399 --- /dev/null +++ b/test/Transforms/BBVectorize/X86/sh-rec2.ll @@ -0,0 +1,85 @@ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" +; RUN: opt < %s -basicaa -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s + +%struct.gsm_state.2.8.14.15.16.17.19.22.23.25.26.28.29.31.32.33.35.36.37.38.40.41.42.44.45.47.48.50.52.53.54.56.57.58.59.60.61.62.63.66.73.83.84.89.90.91.92.93.94.95.96.99.100.101.102.103.104.106.107.114.116.121.122.129.130.135.136.137.138.139.140.141.142.143.144.147.148.149.158.159.160.161.164.165.166.167.168.169.172.179.181.182.183.188.195.200.201.202.203.204.205.208.209.210.212.213.214.215.222.223.225.226.230.231.232.233.234.235.236.237.238.239.240.241.242.243.244.352 = type { [280 x i16], i16, i64, i32, [8 x i16], [2 x [8 x i16]], i16, i16, [9 x i16], i16, i8, i8 } + +define void @gsm_encode(%struct.gsm_state.2.8.14.15.16.17.19.22.23.25.26.28.29.31.32.33.35.36.37.38.40.41.42.44.45.47.48.50.52.53.54.56.57.58.59.60.61.62.63.66.73.83.84.89.90.91.92.93.94.95.96.99.100.101.102.103.104.106.107.114.116.121.122.129.130.135.136.137.138.139.140.141.142.143.144.147.148.149.158.159.160.161.164.165.166.167.168.169.172.179.181.182.183.188.195.200.201.202.203.204.205.208.209.210.212.213.214.215.222.223.225.226.230.231.232.233.234.235.236.237.238.239.240.241.242.243.244.352* %s, i16* %source, i8* %c) nounwind uwtable { +entry: + %xmc 
= alloca [52 x i16], align 16 + %arraydecay5 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 0 + call void @Gsm_Coder(%struct.gsm_state.2.8.14.15.16.17.19.22.23.25.26.28.29.31.32.33.35.36.37.38.40.41.42.44.45.47.48.50.52.53.54.56.57.58.59.60.61.62.63.66.73.83.84.89.90.91.92.93.94.95.96.99.100.101.102.103.104.106.107.114.116.121.122.129.130.135.136.137.138.139.140.141.142.143.144.147.148.149.158.159.160.161.164.165.166.167.168.169.172.179.181.182.183.188.195.200.201.202.203.204.205.208.209.210.212.213.214.215.222.223.225.226.230.231.232.233.234.235.236.237.238.239.240.241.242.243.244.352* %s, i16* %source, i16* undef, i16* null, i16* undef, i16* undef, i16* undef, i16* %arraydecay5) nounwind + %incdec.ptr136 = getelementptr inbounds i8* %c, i64 10 + %incdec.ptr157 = getelementptr inbounds i8* %c, i64 11 + store i8 0, i8* %incdec.ptr136, align 1 + %arrayidx162 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 11 + %0 = load i16* %arrayidx162, align 2 + %conv1631 = trunc i16 %0 to i8 + %and164 = shl i8 %conv1631, 3 + %shl165 = and i8 %and164, 56 + %incdec.ptr172 = getelementptr inbounds i8* %c, i64 12 + store i8 %shl165, i8* %incdec.ptr157, align 1 + %1 = load i16* inttoptr (i64 2 to i16*), align 2 + %conv1742 = trunc i16 %1 to i8 + %and175 = shl i8 %conv1742, 1 + %incdec.ptr183 = getelementptr inbounds i8* %c, i64 13 + store i8 %and175, i8* %incdec.ptr172, align 1 + %incdec.ptr199 = getelementptr inbounds i8* %c, i64 14 + store i8 0, i8* %incdec.ptr183, align 1 + %arrayidx214 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 15 + %incdec.ptr220 = getelementptr inbounds i8* %c, i64 15 + store i8 0, i8* %incdec.ptr199, align 1 + %2 = load i16* %arrayidx214, align 2 + %conv2223 = trunc i16 %2 to i8 + %and223 = shl i8 %conv2223, 6 + %incdec.ptr235 = getelementptr inbounds i8* %c, i64 16 + store i8 %and223, i8* %incdec.ptr220, align 1 + %arrayidx240 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 19 + %3 = load i16* %arrayidx240, align 2 + %conv2414 = 
trunc i16 %3 to i8 + %and242 = shl i8 %conv2414, 2 + %shl243 = and i8 %and242, 28 + %incdec.ptr251 = getelementptr inbounds i8* %c, i64 17 + store i8 %shl243, i8* %incdec.ptr235, align 1 + %incdec.ptr272 = getelementptr inbounds i8* %c, i64 18 + store i8 0, i8* %incdec.ptr251, align 1 + %arrayidx282 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 25 + %4 = load i16* %arrayidx282, align 2 + %conv2835 = trunc i16 %4 to i8 + %and284 = and i8 %conv2835, 7 + %incdec.ptr287 = getelementptr inbounds i8* %c, i64 19 + store i8 %and284, i8* %incdec.ptr272, align 1 + %incdec.ptr298 = getelementptr inbounds i8* %c, i64 20 + store i8 0, i8* %incdec.ptr287, align 1 + %incdec.ptr314 = getelementptr inbounds i8* %c, i64 21 + store i8 0, i8* %incdec.ptr298, align 1 + %arrayidx319 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 26 + %5 = load i16* %arrayidx319, align 4 + %conv3206 = trunc i16 %5 to i8 + %and321 = shl i8 %conv3206, 4 + %shl322 = and i8 %and321, 112 + %incdec.ptr335 = getelementptr inbounds i8* %c, i64 22 + store i8 %shl322, i8* %incdec.ptr314, align 1 + %arrayidx340 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 29 + %6 = load i16* %arrayidx340, align 2 + %conv3417 = trunc i16 %6 to i8 + %and342 = shl i8 %conv3417, 3 + %shl343 = and i8 %and342, 56 + %incdec.ptr350 = getelementptr inbounds i8* %c, i64 23 + store i8 %shl343, i8* %incdec.ptr335, align 1 + %incdec.ptr366 = getelementptr inbounds i8* %c, i64 24 + store i8 0, i8* %incdec.ptr350, align 1 + %arrayidx381 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 36 + %incdec.ptr387 = getelementptr inbounds i8* %c, i64 25 + store i8 0, i8* %incdec.ptr366, align 1 + %7 = load i16* %arrayidx381, align 8 + %conv3898 = trunc i16 %7 to i8 + %and390 = shl i8 %conv3898, 6 + store i8 %and390, i8* %incdec.ptr387, align 1 + unreachable +; CHECK: @gsm_encode +} + +declare void 
@Gsm_Coder(%struct.gsm_state.2.8.14.15.16.17.19.22.23.25.26.28.29.31.32.33.35.36.37.38.40.41.42.44.45.47.48.50.52.53.54.56.57.58.59.60.61.62.63.66.73.83.84.89.90.91.92.93.94.95.96.99.100.101.102.103.104.106.107.114.116.121.122.129.130.135.136.137.138.139.140.141.142.143.144.147.148.149.158.159.160.161.164.165.166.167.168.169.172.179.181.182.183.188.195.200.201.202.203.204.205.208.209.210.212.213.214.215.222.223.225.226.230.231.232.233.234.235.236.237.238.239.240.241.242.243.244.352*, i16*, i16*, i16*, i16*, i16*, i16*, i16*) + +declare void @llvm.trap() noreturn nounwind diff --git a/test/Transforms/BBVectorize/X86/sh-rec3.ll b/test/Transforms/BBVectorize/X86/sh-rec3.ll new file mode 100644 index 0000000..fd2cc8b --- /dev/null +++ b/test/Transforms/BBVectorize/X86/sh-rec3.ll @@ -0,0 +1,170 @@ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" +; RUN: opt < %s -basicaa -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s + +%struct.gsm_state.2.8.39.44.45.55.56.57.58.59.62.63.64.65.74.75.76.77.80.87.92.93.94.95.96.97.110.111.112.113.114.128.130.135.136.137.138.139.140.141.142.143.144.145.148.149.150.151.152.169.170.177.178.179.184.185.186.187.188.201.208.209.219.220.221.223.224.225.230.231.232.233.235.236.237.238.245.246.248.249.272.274.279.280.281.282.283.286.293.298.299.314.315.316.317.318.319.320.321.322.323.324.325.326.327.328.329.330.331.332.333.334.335.336.337.338.339.340.341.342.343.344.345.346.347.348.349.350.351.352.353.565 = type { [280 x i16], i16, i64, i32, [8 x i16], [2 x [8 x i16]], i16, i16, [9 x i16], i16, i8, i8 } + +define void 
@gsm_encode(%struct.gsm_state.2.8.39.44.45.55.56.57.58.59.62.63.64.65.74.75.76.77.80.87.92.93.94.95.96.97.110.111.112.113.114.128.130.135.136.137.138.139.140.141.142.143.144.145.148.149.150.151.152.169.170.177.178.179.184.185.186.187.188.201.208.209.219.220.221.223.224.225.230.231.232.233.235.236.237.238.245.246.248.249.272.274.279.280.281.282.283.286.293.298.299.314.315.316.317.318.319.320.321.322.323.324.325.326.327.328.329.330.331.332.333.334.335.336.337.338.339.340.341.342.343.344.345.346.347.348.349.350.351.352.353.565* %s, i16* %source, i8* %c) nounwind uwtable { +entry: + %LARc28 = alloca [2 x i64], align 16 + %LARc28.sub = getelementptr inbounds [2 x i64]* %LARc28, i64 0, i64 0 + %tmpcast = bitcast [2 x i64]* %LARc28 to [8 x i16]* + %Nc = alloca [4 x i16], align 2 + %Mc = alloca [4 x i16], align 2 + %bc = alloca [4 x i16], align 2 + %xmc = alloca [52 x i16], align 16 + %arraydecay = bitcast [2 x i64]* %LARc28 to i16* + %arraydecay1 = getelementptr inbounds [4 x i16]* %Nc, i64 0, i64 0 + %arraydecay2 = getelementptr inbounds [4 x i16]* %bc, i64 0, i64 0 + %arraydecay3 = getelementptr inbounds [4 x i16]* %Mc, i64 0, i64 0 + %arraydecay5 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 0 + call void @Gsm_Coder(%struct.gsm_state.2.8.39.44.45.55.56.57.58.59.62.63.64.65.74.75.76.77.80.87.92.93.94.95.96.97.110.111.112.113.114.128.130.135.136.137.138.139.140.141.142.143.144.145.148.149.150.151.152.169.170.177.178.179.184.185.186.187.188.201.208.209.219.220.221.223.224.225.230.231.232.233.235.236.237.238.245.246.248.249.272.274.279.280.281.282.283.286.293.298.299.314.315.316.317.318.319.320.321.322.323.324.325.326.327.328.329.330.331.332.333.334.335.336.337.338.339.340.341.342.343.344.345.346.347.348.349.350.351.352.353.565* %s, i16* %source, i16* %arraydecay, i16* %arraydecay1, i16* %arraydecay2, i16* %arraydecay3, i16* undef, i16* %arraydecay5) nounwind + %0 = load i64* %LARc28.sub, align 16 + %1 = trunc i64 %0 to i32 + %conv1 = lshr i32 %1, 2 + %and = and 
i32 %conv1, 15 + %or = or i32 %and, 208 + %conv6 = trunc i32 %or to i8 + %incdec.ptr = getelementptr inbounds i8* %c, i64 1 + store i8 %conv6, i8* %c, align 1 + %conv84 = trunc i64 %0 to i8 + %and9 = shl i8 %conv84, 6 + %incdec.ptr15 = getelementptr inbounds i8* %c, i64 2 + store i8 %and9, i8* %incdec.ptr, align 1 + %2 = lshr i64 %0, 50 + %shr226.tr = trunc i64 %2 to i8 + %conv25 = and i8 %shr226.tr, 7 + %incdec.ptr26 = getelementptr inbounds i8* %c, i64 3 + store i8 %conv25, i8* %incdec.ptr15, align 1 + %incdec.ptr42 = getelementptr inbounds i8* %c, i64 4 + store i8 0, i8* %incdec.ptr26, align 1 + %arrayidx52 = getelementptr inbounds [8 x i16]* %tmpcast, i64 0, i64 7 + %3 = load i16* %arrayidx52, align 2 + %conv537 = trunc i16 %3 to i8 + %and54 = and i8 %conv537, 7 + %incdec.ptr57 = getelementptr inbounds i8* %c, i64 5 + store i8 %and54, i8* %incdec.ptr42, align 1 + %incdec.ptr68 = getelementptr inbounds i8* %c, i64 6 + store i8 0, i8* %incdec.ptr57, align 1 + %4 = load i16* %arraydecay3, align 2 + %conv748 = trunc i16 %4 to i8 + %and75 = shl i8 %conv748, 5 + %shl76 = and i8 %and75, 96 + %incdec.ptr84 = getelementptr inbounds i8* %c, i64 7 + store i8 %shl76, i8* %incdec.ptr68, align 1 + %arrayidx94 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 1 + %5 = load i16* %arrayidx94, align 2 + %conv959 = trunc i16 %5 to i8 + %and96 = shl i8 %conv959, 1 + %shl97 = and i8 %and96, 14 + %or103 = or i8 %shl97, 1 + %incdec.ptr105 = getelementptr inbounds i8* %c, i64 8 + store i8 %or103, i8* %incdec.ptr84, align 1 + %arrayidx115 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 4 + %6 = bitcast i16* %arrayidx115 to i32* + %7 = load i32* %6, align 8 + %conv11610 = trunc i32 %7 to i8 + %and117 = and i8 %conv11610, 7 + %incdec.ptr120 = getelementptr inbounds i8* %c, i64 9 + store i8 %and117, i8* %incdec.ptr105, align 1 + %8 = lshr i32 %7, 16 + %and12330 = shl nuw nsw i32 %8, 5 + %and123 = trunc i32 %and12330 to i8 + %incdec.ptr136 = getelementptr inbounds i8* %c, i64 10 
+ store i8 %and123, i8* %incdec.ptr120, align 1 + %incdec.ptr157 = getelementptr inbounds i8* %c, i64 11 + store i8 0, i8* %incdec.ptr136, align 1 + %incdec.ptr172 = getelementptr inbounds i8* %c, i64 12 + store i8 0, i8* %incdec.ptr157, align 1 + %arrayidx173 = getelementptr inbounds [4 x i16]* %Nc, i64 0, i64 1 + %9 = load i16* %arrayidx173, align 2 + %conv17412 = zext i16 %9 to i32 + %and175 = shl nuw nsw i32 %conv17412, 1 + %arrayidx177 = getelementptr inbounds [4 x i16]* %bc, i64 0, i64 1 + %10 = load i16* %arrayidx177, align 2 + %conv17826 = zext i16 %10 to i32 + %shr17913 = lshr i32 %conv17826, 1 + %and180 = and i32 %shr17913, 1 + %or181 = or i32 %and175, %and180 + %conv182 = trunc i32 %or181 to i8 + %incdec.ptr183 = getelementptr inbounds i8* %c, i64 13 + store i8 %conv182, i8* %incdec.ptr172, align 1 + %arrayidx188 = getelementptr inbounds [4 x i16]* %Mc, i64 0, i64 1 + %11 = load i16* %arrayidx188, align 2 + %conv18914 = trunc i16 %11 to i8 + %and190 = shl i8 %conv18914, 5 + %shl191 = and i8 %and190, 96 + %incdec.ptr199 = getelementptr inbounds i8* %c, i64 14 + store i8 %shl191, i8* %incdec.ptr183, align 1 + %arrayidx209 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 14 + %12 = load i16* %arrayidx209, align 4 + %conv21015 = trunc i16 %12 to i8 + %and211 = shl i8 %conv21015, 1 + %shl212 = and i8 %and211, 14 + %or218 = or i8 %shl212, 1 + %incdec.ptr220 = getelementptr inbounds i8* %c, i64 15 + store i8 %or218, i8* %incdec.ptr199, align 1 + %arrayidx225 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 16 + %13 = bitcast i16* %arrayidx225 to i64* + %14 = load i64* %13, align 16 + %conv22616 = trunc i64 %14 to i8 + %and227 = shl i8 %conv22616, 3 + %shl228 = and i8 %and227, 56 + %incdec.ptr235 = getelementptr inbounds i8* %c, i64 16 + store i8 %shl228, i8* %incdec.ptr220, align 1 + %15 = lshr i64 %14, 32 + %and23832 = shl nuw nsw i64 %15, 5 + %and238 = trunc i64 %and23832 to i8 + %incdec.ptr251 = getelementptr inbounds i8* %c, i64 17 + store i8 
%and238, i8* %incdec.ptr235, align 1 + %arrayidx266 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 23 + %incdec.ptr272 = getelementptr inbounds i8* %c, i64 18 + store i8 0, i8* %incdec.ptr251, align 1 + %16 = load i16* %arrayidx266, align 2 + %conv27418 = trunc i16 %16 to i8 + %and275 = shl i8 %conv27418, 6 + %incdec.ptr287 = getelementptr inbounds i8* %c, i64 19 + store i8 %and275, i8* %incdec.ptr272, align 1 + %arrayidx288 = getelementptr inbounds [4 x i16]* %Nc, i64 0, i64 2 + %17 = load i16* %arrayidx288, align 2 + %conv28919 = zext i16 %17 to i32 + %and290 = shl nuw nsw i32 %conv28919, 1 + %arrayidx292 = getelementptr inbounds [4 x i16]* %bc, i64 0, i64 2 + %18 = load i16* %arrayidx292, align 2 + %conv29327 = zext i16 %18 to i32 + %shr29420 = lshr i32 %conv29327, 1 + %and295 = and i32 %shr29420, 1 + %or296 = or i32 %and290, %and295 + %conv297 = trunc i32 %or296 to i8 + %incdec.ptr298 = getelementptr inbounds i8* %c, i64 20 + store i8 %conv297, i8* %incdec.ptr287, align 1 + %conv30021 = trunc i16 %18 to i8 + %and301 = shl i8 %conv30021, 7 + %incdec.ptr314 = getelementptr inbounds i8* %c, i64 21 + store i8 %and301, i8* %incdec.ptr298, align 1 + %incdec.ptr335 = getelementptr inbounds i8* %c, i64 22 + store i8 0, i8* %incdec.ptr314, align 1 + %arrayidx340 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 29 + %19 = load i16* %arrayidx340, align 2 + %conv34122 = trunc i16 %19 to i8 + %and342 = shl i8 %conv34122, 3 + %shl343 = and i8 %and342, 56 + %incdec.ptr350 = getelementptr inbounds i8* %c, i64 23 + store i8 %shl343, i8* %incdec.ptr335, align 1 + %arrayidx355 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 32 + %20 = bitcast i16* %arrayidx355 to i32* + %21 = load i32* %20, align 16 + %conv35623 = shl i32 %21, 2 + %shl358 = and i32 %conv35623, 28 + %22 = lshr i32 %21, 17 + %and363 = and i32 %22, 3 + %or364 = or i32 %shl358, %and363 + %conv365 = trunc i32 %or364 to i8 + store i8 %conv365, i8* %incdec.ptr350, align 1 + unreachable +; CHECK: 
@gsm_encode +} + +declare void @Gsm_Coder(%struct.gsm_state.2.8.39.44.45.55.56.57.58.59.62.63.64.65.74.75.76.77.80.87.92.93.94.95.96.97.110.111.112.113.114.128.130.135.136.137.138.139.140.141.142.143.144.145.148.149.150.151.152.169.170.177.178.179.184.185.186.187.188.201.208.209.219.220.221.223.224.225.230.231.232.233.235.236.237.238.245.246.248.249.272.274.279.280.281.282.283.286.293.298.299.314.315.316.317.318.319.320.321.322.323.324.325.326.327.328.329.330.331.332.333.334.335.336.337.338.339.340.341.342.343.344.345.346.347.348.349.350.351.352.353.565*, i16*, i16*, i16*, i16*, i16*, i16*, i16*) + +declare void @llvm.trap() noreturn nounwind diff --git a/test/Transforms/BBVectorize/X86/sh-types.ll b/test/Transforms/BBVectorize/X86/sh-types.ll new file mode 100644 index 0000000..0bcb714 --- /dev/null +++ b/test/Transforms/BBVectorize/X86/sh-types.ll @@ -0,0 +1,25 @@ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s + +define <4 x float> @test7(<4 x float> %A1, <4 x float> %B1, double %C1, double %C2, double %D1, double %D2) { + %A2 = shufflevector <4 x float> %A1, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> + %B2 = shufflevector <4 x float> %B1, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> + %X1 = shufflevector <4 x float> %A2, <4 x float> undef, <2 x i32> <i32 0, i32 1> + %X2 = shufflevector <4 x float> %B2, <4 x float> undef, <2 x i32> <i32 2, i32 3> + %Y1 = shufflevector <2 x float> %X1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1> + %Y2 = shufflevector <2 x float> %X2, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1> + + %M1 = fsub double %C1, %D1 + %M2 = fsub double %C2, %D2 + %N1 = fmul double %M1, %C1 + %N2 = fmul double %M2, %C2 + %Z1 = fadd double %N1, %D1 + %Z2 = fadd double %N2, %D2 + + %R = fmul <4 x float> 
%Y1, %Y2 + ret <4 x float> %R +; CHECK: @test7 +; CHECK-NOT: <8 x float> +; CHECK: ret <4 x float> +} + diff --git a/test/Transforms/BBVectorize/X86/simple-ldstr.ll b/test/Transforms/BBVectorize/X86/simple-ldstr.ll new file mode 100644 index 0000000..0124399 --- /dev/null +++ b/test/Transforms/BBVectorize/X86/simple-ldstr.ll @@ -0,0 +1,29 @@ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s + +; Simple 3-pair chain with loads and stores +define void @test1(double* %a, double* %b, double* %c) nounwind uwtable readonly { +entry: + %i0 = load double* %a, align 8 + %i1 = load double* %b, align 8 + %mul = fmul double %i0, %i1 + %arrayidx3 = getelementptr inbounds double* %a, i64 1 + %i3 = load double* %arrayidx3, align 8 + %arrayidx4 = getelementptr inbounds double* %b, i64 1 + %i4 = load double* %arrayidx4, align 8 + %mul5 = fmul double %i3, %i4 + store double %mul, double* %c, align 8 + %arrayidx5 = getelementptr inbounds double* %c, i64 1 + store double %mul5, double* %arrayidx5, align 8 + ret void +; CHECK: @test1 +; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>* +; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>* +; CHECK: %i0 = load <2 x double>* %i0.v.i0, align 8 +; CHECK: %i1 = load <2 x double>* %i1.v.i0, align 8 +; CHECK: %mul = fmul <2 x double> %i0, %i1 +; CHECK: %0 = bitcast double* %c to <2 x double>* +; CHECK: store <2 x double> %mul, <2 x double>* %0, align 8 +; CHECK: ret void +} + diff --git a/test/Transforms/BBVectorize/X86/simple.ll b/test/Transforms/BBVectorize/X86/simple.ll new file mode 100644 index 0000000..0113e38 --- /dev/null +++ b/test/Transforms/BBVectorize/X86/simple.ll @@ -0,0 +1,103 @@ +target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s + +; Basic depth-3 chain +define double @test1(double %A1, double %A2, double %B1, double %B2) { + %X1 = fsub double %A1, %B1 + %X2 = fsub double %A2, %B2 + %Y1 = fmul double %X1, %A1 + %Y2 = fmul double %X2, %A2 + %Z1 = fadd double %Y1, %B1 + %Z2 = fadd double %Y2, %B2 + %R = fmul double %Z1, %Z2 + ret double %R +; CHECK: @test1 +; CHECK-NOT: fmul <2 x double> +; CHECK: ret double %R +} + +; Basic chain +define double @test1a(double %A1, double %A2, double %B1, double %B2) { + %X1 = fsub double %A1, %B1 + %X2 = fsub double %A2, %B2 + %Y1 = fmul double %X1, %A1 + %Y2 = fmul double %X2, %A2 + %Z1 = fadd double %Y1, %B1 + %Z2 = fadd double %Y2, %B2 + %W1 = fadd double %Y1, %Z1 + %W2 = fadd double %Y2, %Z2 + %V1 = fadd double %W1, %Z1 + %V2 = fadd double %W2, %Z2 + %Q1 = fadd double %W1, %V1 + %Q2 = fadd double %W2, %V2 + %S1 = fadd double %W1, %Q1 + %S2 = fadd double %W2, %Q2 + %R = fmul double %S1, %S2 + ret double %R +; CHECK: @test1a +; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0 +; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1 +; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 +; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1 +; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2 +; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2 +; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2 +; CHECK: %W1 = fadd <2 x double> %Y1, %Z1 +; CHECK: %V1 = fadd <2 x double> %W1, %Z1 +; CHECK: %Q1 = fadd <2 x double> %W1, %V1 +; CHECK: %S1 = fadd <2 x double> %W1, %Q1 +; CHECK: %S1.v.r1 = extractelement <2 x double> %S1, i32 0 +; CHECK: %S1.v.r2 = extractelement <2 x double> %S1, i32 1 +; CHECK: %R = 
fmul double %S1.v.r1, %S1.v.r2 +; CHECK: ret double %R +} + +; Basic depth-3 chain (last pair permuted) +define double @test2(double %A1, double %A2, double %B1, double %B2) { + %X1 = fsub double %A1, %B1 + %X2 = fsub double %A2, %B2 + %Y1 = fmul double %X1, %A1 + %Y2 = fmul double %X2, %A2 + %Z1 = fadd double %Y2, %B1 + %Z2 = fadd double %Y1, %B2 + %R = fmul double %Z1, %Z2 + ret double %R +; CHECK: @test2 +; CHECK-NOT: fmul <2 x double> +; CHECK: ret double %R +} + +; Basic depth-4 chain (internal permutation) +define double @test4(double %A1, double %A2, double %B1, double %B2) { + %X1 = fsub double %A1, %B1 + %X2 = fsub double %A2, %B2 + %Y1 = fmul double %X1, %A1 + %Y2 = fmul double %X2, %A2 + %Z1 = fadd double %Y2, %B1 + %Z2 = fadd double %Y1, %B2 + %W1 = fadd double %Y2, %Z1 + %W2 = fadd double %Y1, %Z2 + %R = fmul double %Z1, %Z2 + ret double %R +; CHECK: @test4 +; CHECK-NOT: fmul <2 x double> +; CHECK: ret double %R +} + +; Basic chain with shuffles +define <8 x i8> @test6(<8 x i8> %A1, <8 x i8> %A2, <8 x i8> %B1, <8 x i8> %B2) { + %X1 = sub <8 x i8> %A1, %B1 + %X2 = sub <8 x i8> %A2, %B2 + %Y1 = mul <8 x i8> %X1, %A1 + %Y2 = mul <8 x i8> %X2, %A2 + %Z1 = add <8 x i8> %Y1, %B1 + %Z2 = add <8 x i8> %Y2, %B2 + %Q1 = shufflevector <8 x i8> %Z1, <8 x i8> %Z2, <8 x i32> <i32 15, i32 8, i32 6, i32 1, i32 13, i32 10, i32 4, i32 3> + %Q2 = shufflevector <8 x i8> %Z2, <8 x i8> %Z2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 2, i32 4, i32 4, i32 1> + %R = mul <8 x i8> %Q1, %Q2 + ret <8 x i8> %R +; CHECK: @test6 +; CHECK-NOT: sub <16 x i8> +; CHECK: ret <8 x i8> +} + diff --git a/test/Transforms/BBVectorize/X86/vs-cast.ll b/test/Transforms/BBVectorize/X86/vs-cast.ll new file mode 100644 index 0000000..be3efca --- /dev/null +++ b/test/Transforms/BBVectorize/X86/vs-cast.ll @@ -0,0 +1,12 @@ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target 
triple = "x86_64-unknown-linux-gnu" +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s + +define void @main() nounwind uwtable { +entry: + %0 = bitcast <2 x i64> undef to i128 + %1 = bitcast <2 x i64> undef to i128 + ret void +; CHECK: @main +} + diff --git a/test/Transforms/BBVectorize/cycle.ll b/test/Transforms/BBVectorize/cycle.ll index 32a91ce..e8e82ce 100644 --- a/test/Transforms/BBVectorize/cycle.ll +++ b/test/Transforms/BBVectorize/cycle.ll @@ -107,6 +107,6 @@ done: ret void ; CHECK: @test1 ; CHECK: go: -; CHECK-NEXT: %conv.v.i0.1 = insertelement <2 x i32> undef, i32 %n.0, i32 0 +; CHECK: %conv.v.i0.1 = insertelement <2 x i32> undef, i32 %n.0, i32 0 ; FIXME: When tree pruning is deterministic, include the entire output. } diff --git a/test/Transforms/BBVectorize/lit.local.cfg b/test/Transforms/BBVectorize/lit.local.cfg index 19eebc0..a8ad0f1 100644 --- a/test/Transforms/BBVectorize/lit.local.cfg +++ b/test/Transforms/BBVectorize/lit.local.cfg @@ -1 +1,6 @@ config.suffixes = ['.ll', '.c', '.cpp'] + +targets = set(config.root.targets_to_build.split()) +if not 'X86' in targets: + config.unsupported = True + diff --git a/test/Transforms/BBVectorize/loop1.ll b/test/Transforms/BBVectorize/loop1.ll index bebc91a..c22ea58 100644 --- a/test/Transforms/BBVectorize/loop1.ll +++ b/test/Transforms/BBVectorize/loop1.ll @@ -42,8 +42,8 @@ for.body: ; preds = %for.body, %entry ; CHECK: %mul = fmul double %0, %0 ; CHECK: %mul3 = fmul double %0, %1 ; CHECK: %add = fadd double %mul, %mul3 -; CHECK: %add4.v.i1.1 = insertelement <2 x double> undef, double %1, i32 0 ; CHECK: %mul8 = fmul double %1, %1 +; CHECK: %add4.v.i1.1 = insertelement <2 x double> undef, double %1, i32 0 ; CHECK: %add4.v.i1.2 = insertelement <2 x double> %add4.v.i1.1, double %0, i32 1 ; CHECK: %add4 = fadd <2 x double> %add4.v.i1.2, %add4.v.i1.2 ; CHECK: %add5.v.i1.1 = insertelement <2 x double> undef, double %0, i32 0 diff --git 
a/test/Transforms/BBVectorize/search-limit.ll b/test/Transforms/BBVectorize/search-limit.ll index d9945b5..aeaf988 100644 --- a/test/Transforms/BBVectorize/search-limit.ll +++ b/test/Transforms/BBVectorize/search-limit.ll @@ -7,8 +7,8 @@ define double @test1(double %A1, double %A2, double %B1, double %B2) { ; CHECK-SL4: @test1 ; CHECK-SL4-NOT: <2 x double> ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0 -; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1 +; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1 %X1 = fsub double %A1, %B1 %X2 = fsub double %A2, %B2 diff --git a/test/Transforms/BBVectorize/simple-int.ll b/test/Transforms/BBVectorize/simple-int.ll index 6844977..ae1d63b 100644 --- a/test/Transforms/BBVectorize/simple-int.ll +++ b/test/Transforms/BBVectorize/simple-int.ll @@ -17,8 +17,8 @@ define double @test1(double %A1, double %A2, double %B1, double %B2, double %C1, ret double %R ; CHECK: @test1 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0 -; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1 +; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1 ; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2 ; CHECK: %Y1.v.i2.1 = insertelement <2 x double> undef, double %C1, i32 0 @@ -43,8 +43,8 @@ define double @test2(double %A1, double %A2, double %B1, double %B2) { ret double %R ; CHECK: @test2 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0 -; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double 
%B2, i32 1 +; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1 ; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2 ; CHECK: %Y1 = call <2 x double> @llvm.cos.v2f64(<2 x double> %X1) @@ -68,8 +68,8 @@ define double @test3(double %A1, double %A2, double %B1, double %B2, i32 %P) { ret double %R ; CHECK: @test3 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0 -; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1 +; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1 ; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2 ; CHECK: %Y1 = call <2 x double> @llvm.powi.v2f64(<2 x double> %X1, i32 %P) diff --git a/test/Transforms/BBVectorize/simple-ldstr-ptrs.ll b/test/Transforms/BBVectorize/simple-ldstr-ptrs.ll index f992d41..d46f769 100644 --- a/test/Transforms/BBVectorize/simple-ldstr-ptrs.ll +++ b/test/Transforms/BBVectorize/simple-ldstr-ptrs.ll @@ -2,6 +2,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s ; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-aligned-only -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-AO +; FIXME: re-enable this once pointer vectors work properly +; XFAIL: * + ; Simple 3-pair chain also with loads and stores (using ptrs and gep) define double @test1(i64* %a, i64* %b, i64* %c) nounwind uwtable readonly { entry: @@ -79,3 +82,53 @@ entry: ; CHECK-AO-NOT: <2 x } +; Simple 3-pair chain with loads and stores (using ptrs and gep) +; using pointer vectors. 
+define void @test3(<2 x i64*>* %a, <2 x i64*>* %b, <2 x i64*>* %c) nounwind uwtable readonly { +entry: + %i0 = load <2 x i64*>* %a, align 8 + %i1 = load <2 x i64*>* %b, align 8 + %arrayidx3 = getelementptr inbounds <2 x i64*>* %a, i64 1 + %i3 = load <2 x i64*>* %arrayidx3, align 8 + %arrayidx4 = getelementptr inbounds <2 x i64*>* %b, i64 1 + %i4 = load <2 x i64*>* %arrayidx4, align 8 + %j1 = extractelement <2 x i64*> %i1, i32 0 + %j4 = extractelement <2 x i64*> %i4, i32 0 + %o1 = load i64* %j1, align 8 + %o4 = load i64* %j4, align 8 + %j0 = extractelement <2 x i64*> %i0, i32 0 + %j3 = extractelement <2 x i64*> %i3, i32 0 + %ptr0 = getelementptr inbounds i64* %j0, i64 %o1 + %ptr3 = getelementptr inbounds i64* %j3, i64 %o4 + %qtr0 = insertelement <2 x i64*> undef, i64* %ptr0, i32 0 + %rtr0 = insertelement <2 x i64*> %qtr0, i64* %ptr0, i32 1 + %qtr3 = insertelement <2 x i64*> undef, i64* %ptr3, i32 0 + %rtr3 = insertelement <2 x i64*> %qtr3, i64* %ptr3, i32 1 + store <2 x i64*> %rtr0, <2 x i64*>* %c, align 8 + %arrayidx5 = getelementptr inbounds <2 x i64*>* %c, i64 1 + store <2 x i64*> %rtr3, <2 x i64*>* %arrayidx5, align 8 + ret void +; CHECK: @test3 +; CHECK: %i0.v.i0 = bitcast <2 x i64*>* %a to <4 x i64*>* +; CHECK: %i1 = load <2 x i64*>* %b, align 8 +; CHECK: %i0 = load <4 x i64*>* %i0.v.i0, align 8 +; CHECK: %arrayidx4 = getelementptr inbounds <2 x i64*>* %b, i64 1 +; CHECK: %i4 = load <2 x i64*>* %arrayidx4, align 8 +; CHECK: %j1 = extractelement <2 x i64*> %i1, i32 0 +; CHECK: %j4 = extractelement <2 x i64*> %i4, i32 0 +; CHECK: %o1 = load i64* %j1, align 8 +; CHECK: %o4 = load i64* %j4, align 8 +; CHECK: %ptr0.v.i1.1 = insertelement <2 x i64> undef, i64 %o1, i32 0 +; CHECK: %ptr0.v.i1.2 = insertelement <2 x i64> %ptr0.v.i1.1, i64 %o4, i32 1 +; CHECK: %ptr0.v.i0 = shufflevector <4 x i64*> %i0, <4 x i64*> undef, <2 x i32> <i32 0, i32 2> +; CHECK: %ptr0 = getelementptr inbounds <2 x i64*> %ptr0.v.i0, <2 x i64> %ptr0.v.i1.2 +; CHECK: %rtr0 = shufflevector <2 x 
i64*> %ptr0, <2 x i64*> undef, <2 x i32> zeroinitializer +; CHECK: %rtr3 = shufflevector <2 x i64*> %ptr0, <2 x i64*> undef, <2 x i32> <i32 1, i32 1> +; CHECK: %0 = bitcast <2 x i64*>* %c to <4 x i64*>* +; CHECK: %1 = shufflevector <2 x i64*> %rtr0, <2 x i64*> %rtr3, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK: store <4 x i64*> %1, <4 x i64*>* %0, align 8 +; CHECK: ret void +; CHECK-AO: @test3 +; CHECK-AO-NOT: <4 x +} + diff --git a/test/Transforms/BBVectorize/simple-ldstr.ll b/test/Transforms/BBVectorize/simple-ldstr.ll index a5397ee..7dd77c9 100644 --- a/test/Transforms/BBVectorize/simple-ldstr.ll +++ b/test/Transforms/BBVectorize/simple-ldstr.ll @@ -94,13 +94,13 @@ entry: ; CHECK-AO: @test3 ; CHECK-AO: %i0 = load double* %a, align 8 ; CHECK-AO: %i1 = load double* %b, align 8 -; CHECK-AO: %mul.v.i1.1 = insertelement <2 x double> undef, double %i1, i32 0 -; CHECK-AO: %mul.v.i0.1 = insertelement <2 x double> undef, double %i0, i32 0 ; CHECK-AO: %arrayidx3 = getelementptr inbounds double* %a, i64 1 ; CHECK-AO: %i3 = load double* %arrayidx3, align 8 ; CHECK-AO: %arrayidx4 = getelementptr inbounds double* %b, i64 1 ; CHECK-AO: %i4 = load double* %arrayidx4, align 8 +; CHECK-AO: %mul.v.i1.1 = insertelement <2 x double> undef, double %i1, i32 0 ; CHECK-AO: %mul.v.i1.2 = insertelement <2 x double> %mul.v.i1.1, double %i4, i32 1 +; CHECK-AO: %mul.v.i0.1 = insertelement <2 x double> undef, double %i0, i32 0 ; CHECK-AO: %mul.v.i0.2 = insertelement <2 x double> %mul.v.i0.1, double %i3, i32 1 ; CHECK-AO: %mul = fmul <2 x double> %mul.v.i0.2, %mul.v.i1.2 ; CHECK-AO: %mulf = fptrunc <2 x double> %mul to <2 x float> @@ -108,3 +108,63 @@ entry: ; CHECK-AO: store <2 x float> %mulf, <2 x float>* %0, align 8 ; CHECK-AO: ret void } + +; Simple 3-pair chain with loads and stores (unreachable) +define void @test4(i1 %bool, double* %a, double* %b, double* %c) nounwind uwtable readonly { +entry: + br i1 %bool, label %if.then1, label %if.end + +if.then1: + unreachable + br label 
%if.then + +if.then: + %i0 = load double* %a, align 8 + %i1 = load double* %b, align 8 + %mul = fmul double %i0, %i1 + %arrayidx3 = getelementptr inbounds double* %a, i64 1 + %i3 = load double* %arrayidx3, align 8 + %arrayidx4 = getelementptr inbounds double* %b, i64 1 + %i4 = load double* %arrayidx4, align 8 + %mul5 = fmul double %i3, %i4 + store double %mul, double* %c, align 8 + %arrayidx5 = getelementptr inbounds double* %c, i64 1 + store double %mul5, double* %arrayidx5, align 8 + br label %if.end + +if.end: + ret void +; CHECK: @test4 +; CHECK-NOT: <2 x double> +; CHECK-AO: @test4 +; CHECK-AO-NOT: <2 x double> +} + +; Simple 3-pair chain with loads and stores +define void @test5(double* %a, double* %b, double* %c) nounwind uwtable readonly { +entry: + %i0 = load double* %a, align 8 + %i1 = load double* %b, align 8 + %mul = fmul double %i0, %i1 + %arrayidx3 = getelementptr inbounds double* %a, i64 1 + %i3 = load double* %arrayidx3, align 8 + %arrayidx4 = getelementptr inbounds double* %b, i64 1 + %i4 = load double* %arrayidx4, align 8 + %mul5 = fmul double %i3, %i4 + %arrayidx5 = getelementptr inbounds double* %c, i64 1 + store double %mul5, double* %arrayidx5, align 8 + store double %mul, double* %c, align 4 + ret void +; CHECK: @test5 +; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>* +; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>* +; CHECK: %i0 = load <2 x double>* %i0.v.i0, align 8 +; CHECK: %i1 = load <2 x double>* %i1.v.i0, align 8 +; CHECK: %mul = fmul <2 x double> %i0, %i1 +; CHECK: %0 = bitcast double* %c to <2 x double>* +; CHECK: store <2 x double> %mul, <2 x double>* %0, align 4 +; CHECK: ret void +; CHECK-AO: @test5 +; CHECK-AO-NOT: <2 x double> +} + diff --git a/test/Transforms/BBVectorize/simple-sel.ll b/test/Transforms/BBVectorize/simple-sel.ll index 325792a..15ecb59 100644 --- a/test/Transforms/BBVectorize/simple-sel.ll +++ b/test/Transforms/BBVectorize/simple-sel.ll @@ -6,8 +6,8 @@ target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 define double @test1(double %A1, double %A2, double %B1, double %B2, i1 %C1, i1 %C2) { ; CHECK: @test1 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0 -; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1 +; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1 %X1 = fsub double %A1, %B1 %X2 = fsub double %A2, %B2 @@ -33,8 +33,8 @@ define double @test2(double %A1, double %A2, double %B1, double %B2) { ; CHECK: @test2 ; CHECK-NB: @test2 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0 -; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1 +; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1 %X1 = fsub double %A1, %B1 %X2 = fsub double %A2, %B2 diff --git a/test/Transforms/BBVectorize/simple.ll b/test/Transforms/BBVectorize/simple.ll index 88eb9c9..3527ae7 100644 --- a/test/Transforms/BBVectorize/simple.ll +++ b/test/Transforms/BBVectorize/simple.ll @@ -5,8 +5,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 define double @test1(double %A1, double %A2, double %B1, double %B2) { ; CHECK: @test1 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0 -; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1 +; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1 %X1 = fsub double %A1, %B1 %X2 = fsub double %A2, %B2 @@ -29,8 
+29,8 @@ define double @test1(double %A1, double %A2, double %B1, double %B2) { define double @test2(double %A1, double %A2, double %B1, double %B2) { ; CHECK: @test2 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0 -; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1 +; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1 %X1 = fsub double %A1, %B1 %X2 = fsub double %A2, %B2 @@ -40,12 +40,13 @@ define double @test2(double %A1, double %A2, double %B1, double %B2) { ; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2 %Z1 = fadd double %Y2, %B1 %Z2 = fadd double %Y1, %B2 -; CHECK: %Z1.v.i0 = shufflevector <2 x double> %Y1, <2 x double> undef, <2 x i32> <i32 1, i32 0> -; CHECK: %Z1 = fadd <2 x double> %Z1.v.i0, %X1.v.i1.2 +; CHECK: %Z1.v.i1.1 = insertelement <2 x double> undef, double %B2, i32 0 +; CHECK: %Z1.v.i1.2 = insertelement <2 x double> %Z1.v.i1.1, double %B1, i32 1 +; CHECK: %Z2 = fadd <2 x double> %Y1, %Z1.v.i1.2 %R = fmul double %Z1, %Z2 -; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0 -; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1 -; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2 +; CHECK: %Z2.v.r1 = extractelement <2 x double> %Z2, i32 0 +; CHECK: %Z2.v.r2 = extractelement <2 x double> %Z2, i32 1 +; CHECK: %R = fmul double %Z2.v.r2, %Z2.v.r1 ret double %R ; CHECK: ret double %R } @@ -54,8 +55,8 @@ define double @test2(double %A1, double %A2, double %B1, double %B2) { define double @test3(double %A1, double %A2, double %B1, double %B2) { ; CHECK: @test3 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0 -; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1 +; CHECK: %X1.v.i0.1 = insertelement <2 x 
double> undef, double %A1, i32 0 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1 %X1 = fsub double %A1, %B1 %X2 = fsub double %A2, %B2 @@ -79,8 +80,8 @@ define double @test3(double %A1, double %A2, double %B1, double %B2) { define double @test4(double %A1, double %A2, double %B1, double %B2) { ; CHECK: @test4 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0 -; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1 +; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1 %X1 = fsub double %A1, %B1 %X2 = fsub double %A2, %B2 @@ -148,4 +149,51 @@ define <8 x i8> @test6(<8 x i8> %A1, <8 x i8> %A2, <8 x i8> %B1, <8 x i8> %B2) { ; CHECK: ret <8 x i8> %R } +; Basic depth-3 chain (flipped order) +define double @test7(double %A1, double %A2, double %B1, double %B2) { +; CHECK: @test7 +; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0 +; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1 +; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 +; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1 + %X1 = fsub double %A1, %B1 + %X2 = fsub double %A2, %B2 +; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2 + %Y1 = fmul double %X1, %A1 + %Y2 = fmul double %X2, %A2 +; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2 + %Z2 = fadd double %Y2, %B2 + %Z1 = fadd double %Y1, %B1 +; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2 + %R = fmul double %Z1, %Z2 +; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0 +; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1 +; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2 + ret double %R +; CHECK: ret double %R +} + +; Basic depth-3 chain (subclass data) +define i64 @test8(i64 %A1, i64 %A2, i64 %B1, 
i64 %B2) { +; CHECK: @test8 +; CHECK: %X1.v.i1.1 = insertelement <2 x i64> undef, i64 %B1, i32 0 +; CHECK: %X1.v.i1.2 = insertelement <2 x i64> %X1.v.i1.1, i64 %B2, i32 1 +; CHECK: %X1.v.i0.1 = insertelement <2 x i64> undef, i64 %A1, i32 0 +; CHECK: %X1.v.i0.2 = insertelement <2 x i64> %X1.v.i0.1, i64 %A2, i32 1 + %X1 = sub nsw i64 %A1, %B1 + %X2 = sub i64 %A2, %B2 +; CHECK: %X1 = sub <2 x i64> %X1.v.i0.2, %X1.v.i1.2 + %Y1 = mul i64 %X1, %A1 + %Y2 = mul i64 %X2, %A2 +; CHECK: %Y1 = mul <2 x i64> %X1, %X1.v.i0.2 + %Z1 = add i64 %Y1, %B1 + %Z2 = add i64 %Y2, %B2 +; CHECK: %Z1 = add <2 x i64> %Y1, %X1.v.i1.2 + %R = mul i64 %Z1, %Z2 +; CHECK: %Z1.v.r1 = extractelement <2 x i64> %Z1, i32 0 +; CHECK: %Z1.v.r2 = extractelement <2 x i64> %Z1, i32 1 +; CHECK: %R = mul i64 %Z1.v.r1, %Z1.v.r2 + ret i64 %R +; CHECK: ret i64 %R +} diff --git a/test/Transforms/ConstProp/loads.ll b/test/Transforms/ConstProp/loads.ll index 74d80aa..6794288 100644 --- a/test/Transforms/ConstProp/loads.ll +++ b/test/Transforms/ConstProp/loads.ll @@ -1,17 +1,24 @@ -; RUN: opt < %s -instcombine -S | FileCheck %s - -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" +; RUN: opt < %s -default-data-layout="e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64" -instcombine -S | FileCheck %s --check-prefix=LE +; RUN: opt < %s -default-data-layout="E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64" -instcombine -S | FileCheck %s --check-prefix=BE +; {{ 0xDEADBEEF, 0xBA }, 0xCAFEBABE} @g1 = constant {{i32,i8},i32} {{i32,i8} { i32 -559038737, i8 186 }, i32 -889275714 } @g2 = constant double 1.0 +; { 0x7B, 0x06B1BFF8 } @g3 = constant {i64, i64} { i64 123, i64 112312312 } ; Simple load define i32 @test1() { %r = load i32* getelementptr ({{i32,i8},i32}* @g1, i32 0, i32 0, i32 0) ret 
i32 %r -; CHECK: @test1 -; CHECK: ret i32 -559038737 + +; 0xDEADBEEF +; LE: @test1 +; LE: ret i32 -559038737 + +; 0xDEADBEEF +; BE: @test1 +; BE: ret i32 -559038737 } ; PR3152 @@ -20,8 +27,13 @@ define i16 @test2() { %r = load i16* bitcast(i32* getelementptr ({{i32,i8},i32}* @g1, i32 0, i32 0, i32 0) to i16*) ret i16 %r -; CHECK: @test2 -; CHECK: ret i16 -16657 +; 0xBEEF +; LE: @test2 +; LE: ret i16 -16657 + +; 0xDEAD +; BE: @test2 +; BE: ret i16 -8531 } ; Load of second 16 bits of 32-bit value. @@ -29,16 +41,27 @@ define i16 @test3() { %r = load i16* getelementptr(i16* bitcast(i32* getelementptr ({{i32,i8},i32}* @g1, i32 0, i32 0, i32 0) to i16*), i32 1) ret i16 %r -; CHECK: @test3 -; CHECK: ret i16 -8531 +; 0xDEAD +; LE: @test3 +; LE: ret i16 -8531 + +; 0xBEEF +; BE: @test3 +; BE: ret i16 -16657 } ; Load of 8 bit field + tail padding. define i16 @test4() { %r = load i16* getelementptr(i16* bitcast(i32* getelementptr ({{i32,i8},i32}* @g1, i32 0, i32 0, i32 0) to i16*), i32 2) ret i16 %r -; CHECK: @test4 -; CHECK: ret i16 186 + +; 0x00BA +; LE: @test4 +; LE: ret i16 186 + +; 0xBA00 +; BE: @test4 +; BE: ret i16 -17920 } ; Load of double bits. @@ -46,8 +69,13 @@ define i64 @test6() { %r = load i64* bitcast(double* @g2 to i64*) ret i64 %r -; CHECK: @test6 -; CHECK: ret i64 4607182418800017408 +; 0x3FF_0000000000000 +; LE: @test6 +; LE: ret i64 4607182418800017408 + +; 0x3FF_0000000000000 +; BE: @test6 +; BE: ret i64 4607182418800017408 } ; Load of double bits. @@ -55,8 +83,13 @@ define i16 @test7() { %r = load i16* bitcast(double* @g2 to i16*) ret i16 %r -; CHECK: @test7 -; CHECK: ret i16 0 +; 0x0000 +; LE: @test7 +; LE: ret i16 0 + +; 0x3FF0 +; BE: @test7 +; BE: ret i16 16368 } ; Double load. 
@@ -64,8 +97,11 @@ define double @test8() { %r = load double* bitcast({{i32,i8},i32}* @g1 to double*) ret double %r -; CHECK: @test8 -; CHECK: ret double 0xBADEADBEEF +; LE: @test8 +; LE: ret double 0xBADEADBEEF + +; BE: @test8 +; BE: ret double 0xDEADBEEFBA000000 } @@ -74,8 +110,13 @@ define i128 @test9() { %r = load i128* bitcast({i64, i64}* @g3 to i128*) ret i128 %r -; CHECK: @test9 -; CHECK: ret i128 2071796475790618158476296315 +; 0x00000000_06B1BFF8_00000000_0000007B +; LE: @test9 +; LE: ret i128 2071796475790618158476296315 + +; 0x00000000_0000007B_00000000_06B1BFF8 +; BE: @test9 +; BE: ret i128 2268949521066387161080 } ; vector load. @@ -83,21 +124,30 @@ define <2 x i64> @test10() { %r = load <2 x i64>* bitcast({i64, i64}* @g3 to <2 x i64>*) ret <2 x i64> %r -; CHECK: @test10 -; CHECK: ret <2 x i64> <i64 123, i64 112312312> +; LE: @test10 +; LE: ret <2 x i64> <i64 123, i64 112312312> + +; BE: @test10 +; BE: ret <2 x i64> <i64 123, i64 112312312> } ; PR5287 +; { 0xA1, 0x08 } @g4 = internal constant { i8, i8 } { i8 -95, i8 8 } define i16 @test11() nounwind { entry: %a = load i16* bitcast ({ i8, i8 }* @g4 to i16*) ret i16 %a - -; CHECK: @test11 -; CHECK: ret i16 2209 + +; 0x08A1 +; LE: @test11 +; LE: ret i16 2209 + +; 0xA108 +; BE: @test11 +; BE: ret i16 -24312 } @@ -107,8 +157,14 @@ entry: define i16 @test12() { %a = load i16* getelementptr inbounds ([3 x i16]* bitcast ([6 x i8]* @test12g to [3 x i16]*), i32 0, i64 1) ret i16 %a -; CHECK: @test12 -; CHECK: ret i16 98 + +; 0x0062 +; LE: @test12 +; LE: ret i16 98 + +; 0x6200 +; BE: @test12 +; BE: ret i16 25088 } @@ -117,8 +173,12 @@ define i16 @test12() { define i1 @test13() { %A = load i1* bitcast (i8* @g5 to i1*) ret i1 %A -; CHECK: @test13 -; CHECK: ret i1 false + +; LE: @test13 +; LE: ret i1 false + +; BE: @test13 +; BE: ret i1 false } @g6 = constant [2 x i8*] [i8* inttoptr (i64 1 to i8*), i8* inttoptr (i64 2 to i8*)] @@ -126,14 +186,22 @@ define i64 @test14() nounwind { entry: %tmp = load i64* bitcast ([2 
x i8*]* @g6 to i64*) ret i64 %tmp -; CHECK: @test14 -; CHECK: ret i64 1 + +; LE: @test14 +; LE: ret i64 1 + +; BE: @test14 +; BE: ret i64 1 } define i64 @test15() nounwind { entry: %tmp = load i64* bitcast (i8** getelementptr inbounds ([2 x i8*]* @g6, i32 0, i64 1) to i64*) ret i64 %tmp -; CHECK: @test15 -; CHECK: ret i64 2 + +; LE: @test15 +; LE: ret i64 2 + +; BE: @test15 +; BE: ret i64 2 } diff --git a/test/Transforms/CorrelatedValuePropagation/crash.ll b/test/Transforms/CorrelatedValuePropagation/crash.ll index 80c43d0..9723d18 100644 --- a/test/Transforms/CorrelatedValuePropagation/crash.ll +++ b/test/Transforms/CorrelatedValuePropagation/crash.ll @@ -35,3 +35,28 @@ srf.exit.i: func_29.exit: ret void } + +; PR13972 +define void @test3() nounwind { +for.body: + br label %return + +for.cond.i: ; preds = %if.else.i, %for.body.i + %e.2.i = phi i32 [ %e.2.i, %if.else.i ], [ -8, %for.body.i ] + br i1 undef, label %return, label %for.body.i + +for.body.i: ; preds = %for.cond.i + switch i32 %e.2.i, label %for.cond3.i [ + i32 -3, label %if.else.i + i32 0, label %for.cond.i + ] + +for.cond3.i: ; preds = %for.cond3.i, %for.body.i + br label %for.cond3.i + +if.else.i: ; preds = %for.body.i + br label %for.cond.i + +return: ; preds = %for.cond.i, %for.body + ret void +} diff --git a/test/Transforms/DeadArgElim/dbginfo.ll b/test/Transforms/DeadArgElim/dbginfo.ll new file mode 100644 index 0000000..dcbfaaa --- /dev/null +++ b/test/Transforms/DeadArgElim/dbginfo.ll @@ -0,0 +1,64 @@ +; RUN: opt %s -deadargelim -S | FileCheck %s +; PR14016 + +; Check that debug info metadata for subprograms stores pointers to +; updated LLVM functions. 
+ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@x = global i32 0, align 4 + +define void @_Z3runv() uwtable { +entry: + call void @_ZN12_GLOBAL__N_18dead_argEPv(i8* null), !dbg !10 + call void (...)* @_ZN12_GLOBAL__N_111dead_varargEz(), !dbg !12 + ret void, !dbg !13 +} + +; Argument will be deleted +define internal void @_ZN12_GLOBAL__N_18dead_argEPv(i8* %foo) nounwind uwtable { +entry: + %0 = load i32* @x, align 4, !dbg !14 + %inc = add nsw i32 %0, 1, !dbg !14 + store i32 %inc, i32* @x, align 4, !dbg !14 + ret void, !dbg !16 +} + +; Vararg will be deleted +define internal void @_ZN12_GLOBAL__N_111dead_varargEz(...) nounwind uwtable { +entry: + %0 = load i32* @x, align 4, !dbg !17 + %inc = add nsw i32 %0, 1, !dbg !17 + store i32 %inc, i32* @x, align 4, !dbg !17 + ret void, !dbg !19 +} + +!llvm.dbg.cu = !{!0} + +!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"test.cc", metadata !"/home/samsonov/tmp/clang-di", metadata !"clang version 3.2 (trunk 165305)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [/home/samsonov/tmp/clang-di/test.cc] [DW_LANG_C_plus_plus] +!1 = metadata !{metadata !2} +!2 = metadata !{i32 0} +!3 = metadata !{metadata !4} +!4 = metadata !{metadata !5, metadata !8, metadata !9} +!5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"run", metadata !"run", metadata !"", metadata !6, i32 8, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z3runv, null, null, metadata !1, i32 8} ; [ DW_TAG_subprogram ] [line 8] [def] [run] +!6 = metadata !{i32 786473, metadata !"test.cc", metadata !"/home/samsonov/tmp/clang-di", null} ; [ DW_TAG_file_type ] +!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, i32 0} ; [ 
DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!8 = metadata !{i32 786478, i32 0, metadata !6, metadata !"dead_vararg", metadata !"dead_vararg", metadata !"", metadata !6, i32 5, metadata !7, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (...)* @_ZN12_GLOBAL__N_111dead_varargEz, null, null, metadata !1, i32 5} ; [ DW_TAG_subprogram ] [line 5] [local] [def] [dead_vararg] + +; CHECK: metadata !"dead_vararg"{{.*}}void ()* @_ZN12_GLOBAL__N_111dead_varargEz + +!9 = metadata !{i32 786478, i32 0, metadata !6, metadata !"dead_arg", metadata !"dead_arg", metadata !"", metadata !6, i32 4, metadata !7, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i8*)* @_ZN12_GLOBAL__N_18dead_argEPv, null, null, metadata !1, i32 4} ; [ DW_TAG_subprogram ] [line 4] [local] [def] [dead_arg] + +; CHECK: metadata !"dead_arg"{{.*}}void ()* @_ZN12_GLOBAL__N_18dead_argEPv + +!10 = metadata !{i32 8, i32 14, metadata !11, null} +!11 = metadata !{i32 786443, metadata !5, i32 8, i32 12, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] [/home/samsonov/tmp/clang-di/test.cc] +!12 = metadata !{i32 8, i32 27, metadata !11, null} +!13 = metadata !{i32 8, i32 42, metadata !11, null} +!14 = metadata !{i32 4, i32 28, metadata !15, null} +!15 = metadata !{i32 786443, metadata !9, i32 4, i32 26, metadata !6, i32 2} ; [ DW_TAG_lexical_block ] [/home/samsonov/tmp/clang-di/test.cc] +!16 = metadata !{i32 4, i32 33, metadata !15, null} +!17 = metadata !{i32 5, i32 25, metadata !18, null} +!18 = metadata !{i32 786443, metadata !8, i32 5, i32 23, metadata !6, i32 1} ; [ DW_TAG_lexical_block ] [/home/samsonov/tmp/clang-di/test.cc] +!19 = metadata !{i32 5, i32 30, metadata !18, null} diff --git a/test/Transforms/DeadStoreElimination/libcalls.ll b/test/Transforms/DeadStoreElimination/libcalls.ll new file mode 100644 index 0000000..4639c0b --- /dev/null +++ b/test/Transforms/DeadStoreElimination/libcalls.ll @@ -0,0 +1,70 @@ +; RUN: opt -S -basicaa -dse < %s | FileCheck 
%s + +declare i8* @strcpy(i8* %dest, i8* %src) nounwind +define void @test1(i8* %src) { +; CHECK: @test1 + %B = alloca [16 x i8] + %dest = getelementptr inbounds [16 x i8]* %B, i64 0, i64 0 +; CHECK-NOT: @strcpy + %call = call i8* @strcpy(i8* %dest, i8* %src) +; CHECK: ret void + ret void +} + +declare i8* @strncpy(i8* %dest, i8* %src, i32 %n) nounwind +define void @test2(i8* %src) { +; CHECK: @test2 + %B = alloca [16 x i8] + %dest = getelementptr inbounds [16 x i8]* %B, i64 0, i64 0 +; CHECK-NOT: @strncpy + %call = call i8* @strncpy(i8* %dest, i8* %src, i32 12) +; CHECK: ret void + ret void +} + +declare i8* @strcat(i8* %dest, i8* %src) nounwind +define void @test3(i8* %src) { +; CHECK: @test3 + %B = alloca [16 x i8] + %dest = getelementptr inbounds [16 x i8]* %B, i64 0, i64 0 +; CHECK-NOT: @strcat + %call = call i8* @strcat(i8* %dest, i8* %src) +; CHECK: ret void + ret void +} + +declare i8* @strncat(i8* %dest, i8* %src, i32 %n) nounwind +define void @test4(i8* %src) { +; CHECK: @test4 + %B = alloca [16 x i8] + %dest = getelementptr inbounds [16 x i8]* %B, i64 0, i64 0 +; CHECK-NOT: @strncat + %call = call i8* @strncat(i8* %dest, i8* %src, i32 12) +; CHECK: ret void + ret void +} + +define void @test5(i8* nocapture %src) { +; CHECK: @test5 + %dest = alloca [100 x i8], align 16 + %arraydecay = getelementptr inbounds [100 x i8]* %dest, i64 0, i64 0 + %call = call i8* @strcpy(i8* %arraydecay, i8* %src) +; CHECK: %call = call i8* @strcpy + %arrayidx = getelementptr inbounds i8* %call, i64 10 + store i8 97, i8* %arrayidx, align 1 + ret void +} + +declare void @user(i8* %p) +define void @test6(i8* %src) { +; CHECK: @test6 + %B = alloca [16 x i8] + %dest = getelementptr inbounds [16 x i8]* %B, i64 0, i64 0 +; CHECK: @strcpy + %call = call i8* @strcpy(i8* %dest, i8* %src) +; CHECK: @user + call void @user(i8* %dest) +; CHECK: ret void + ret void +} + diff --git a/test/Transforms/DeadStoreElimination/simple.ll b/test/Transforms/DeadStoreElimination/simple.ll index 
7a8cdd5..e0eb90a 100644 --- a/test/Transforms/DeadStoreElimination/simple.ll +++ b/test/Transforms/DeadStoreElimination/simple.ll @@ -310,3 +310,17 @@ define void @test24([2 x i32]* %a, i32 %b, i32 %c) nounwind { store i32 %c, i32* %4, align 4 ret void } + +; Check another case like PR13547 where strdup is not like malloc. +; CHECK: @test25 +; CHECK: load i8 +; CHECK: store i8 0 +; CHECK: store i8 %tmp +define i8* @test25(i8* %p) nounwind { + %p.4 = getelementptr i8* %p, i64 4 + %tmp = load i8* %p.4, align 1 + store i8 0, i8* %p.4, align 1 + %q = call i8* @strdup(i8* %p) nounwind optsize + store i8 %tmp, i8* %p.4, align 1 + ret i8* %q +} diff --git a/test/Transforms/EarlyCSE/commute.ll b/test/Transforms/EarlyCSE/commute.ll new file mode 100644 index 0000000..f84a7dd --- /dev/null +++ b/test/Transforms/EarlyCSE/commute.ll @@ -0,0 +1,66 @@ +; RUN: opt < %s -S -early-cse | FileCheck %s + +; CHECK: @test1 +define void @test1(float %A, float %B, float* %PA, float* %PB) { + ; CHECK-NEXT: fadd + ; CHECK-NEXT: store + ; CHECK-NEXT: store + ; CHECK-NEXT: ret + %C = fadd float %A, %B + store float %C, float* %PA + %D = fadd float %B, %A + store float %D, float* %PB + ret void +} + +; CHECK: @test2 +define void @test2(float %A, float %B, i1* %PA, i1* %PB) { + ; CHECK-NEXT: fcmp + ; CHECK-NEXT: store + ; CHECK-NEXT: store + ; CHECK-NEXT: ret + %C = fcmp eq float %A, %B + store i1 %C, i1* %PA + %D = fcmp eq float %B, %A + store i1 %D, i1* %PB + ret void +} + +; CHECK: @test3 +define void @test3(float %A, float %B, i1* %PA, i1* %PB) { + ; CHECK-NEXT: fcmp + ; CHECK-NEXT: store + ; CHECK-NEXT: store + ; CHECK-NEXT: ret + %C = fcmp uge float %A, %B + store i1 %C, i1* %PA + %D = fcmp ule float %B, %A + store i1 %D, i1* %PB + ret void +} + +; CHECK: @test4 +define void @test4(i32 %A, i32 %B, i1* %PA, i1* %PB) { + ; CHECK-NEXT: icmp + ; CHECK-NEXT: store + ; CHECK-NEXT: store + ; CHECK-NEXT: ret + %C = icmp eq i32 %A, %B + store i1 %C, i1* %PA + %D = icmp eq i32 %B, %A + store i1 %D, 
i1* %PB + ret void +} + +; CHECK: @test5 +define void @test5(i32 %A, i32 %B, i1* %PA, i1* %PB) { + ; CHECK-NEXT: icmp + ; CHECK-NEXT: store + ; CHECK-NEXT: store + ; CHECK-NEXT: ret + %C = icmp sgt i32 %A, %B + store i1 %C, i1* %PA + %D = icmp slt i32 %B, %A + store i1 %D, i1* %PB + ret void +} diff --git a/test/Transforms/GVN/crash.ll b/test/Transforms/GVN/crash.ll index 31eae25..4a8c8e4 100644 --- a/test/Transforms/GVN/crash.ll +++ b/test/Transforms/GVN/crash.ll @@ -163,3 +163,39 @@ entry: ret i8 %1 } + +; Test that a GEP in an unreachable block with the following form doesn't crash +; GVN: +; +; %x = gep %some.type %x, ... + +%struct.type = type { i64, i32, i32 } + +define fastcc void @func() nounwind uwtable ssp align 2 { +entry: + br label %reachable.bb + +;; Unreachable code. + +unreachable.bb: + %gep.val = getelementptr inbounds %struct.type* %gep.val, i64 1 + br i1 undef, label %u2.bb, label %u1.bb + +u1.bb: + %tmp1 = getelementptr inbounds %struct.type* %gep.val, i64 0, i32 0 + store i64 -1, i64* %tmp1, align 8 + br label %unreachable.bb + +u2.bb: + %0 = load i32* undef, align 4 + %conv.i.i.i.i.i = zext i32 %0 to i64 + br label %u2.bb + +;; Reachable code. 
+ +reachable.bb: + br label %r1.bb + +r1.bb: + br label %u2.bb +} diff --git a/test/Transforms/GVN/malloc-load-removal.ll b/test/Transforms/GVN/malloc-load-removal.ll new file mode 100644 index 0000000..66b6929 --- /dev/null +++ b/test/Transforms/GVN/malloc-load-removal.ll @@ -0,0 +1,31 @@ +; RUN: opt -S -basicaa -gvn < %s | FileCheck %s +; RUN: opt -S -basicaa -gvn -disable-simplify-libcalls < %s | FileCheck %s -check-prefix=CHECK_NO_LIBCALLS +; PR13694 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +declare i8* @malloc(i64) nounwind + +define noalias i8* @test() nounwind uwtable ssp { +entry: + %call = tail call i8* @malloc(i64 100) nounwind + %0 = load i8* %call, align 1 + %tobool = icmp eq i8 %0, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + store i8 0, i8* %call, align 1 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret i8* %call + +; CHECK: @test +; CHECK-NOT: load +; CHECK-NOT: icmp + +; CHECK_NO_LIBCALLS: @test +; CHECK_NO_LIBCALLS: load +; CHECK_NO_LIBCALLS: icmp +} diff --git a/test/Transforms/GVN/pr14166.ll b/test/Transforms/GVN/pr14166.ll new file mode 100644 index 0000000..9f47e46 --- /dev/null +++ b/test/Transforms/GVN/pr14166.ll @@ -0,0 +1,27 @@ +; RUN: opt -gvn -S < %s | FileCheck %s +target datalayout = "e-p:32:32:32" +target triple = "i386-pc-linux-gnu" +define <2 x i32> @test1() { + %v1 = alloca <2 x i32> + call void @anything(<2 x i32>* %v1) + %v2 = load <2 x i32>* %v1 + %v3 = inttoptr <2 x i32> %v2 to <2 x i8*> + %v4 = bitcast <2 x i32>* %v1 to <2 x i8*>* + store <2 x i8*> %v3, <2 x i8*>* %v4 + %v5 = load <2 x i32>* %v1 + ret <2 x i32> %v5 +; CHECK: @test1 +; CHECK: %v1 = alloca <2 x i32> +; CHECK: call void @anything(<2 x i32>* %v1) +; CHECK: %v2 = load <2 x i32>* %v1 +; CHECK: %v3 = inttoptr <2 x i32> %v2 to <2 x i8*> +; CHECK: 
%v4 = bitcast <2 x i32>* %v1 to <2 x i8*>* +; CHECK: store <2 x i8*> %v3, <2 x i8*>* %v4 +; CHECK: %1 = ptrtoint <2 x i8*> %v3 to <2 x i32> +; CHECK: %2 = bitcast <2 x i32> %1 to i64 +; CHECK: %3 = bitcast i64 %2 to <2 x i32> +; CHECK: ret <2 x i32> %3 +} + +declare void @anything(<2 x i32>*) + diff --git a/test/Transforms/GVN/rle.ll b/test/Transforms/GVN/rle.ll index e764169..72fa819 100644 --- a/test/Transforms/GVN/rle.ll +++ b/test/Transforms/GVN/rle.ll @@ -1,7 +1,5 @@ -; RUN: opt < %s -basicaa -gvn -S -die | FileCheck %s - -; 32-bit little endian target. -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32" +; RUN: opt < %s -default-data-layout="e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-n8:16:32" -basicaa -gvn -S -die | FileCheck %s +; RUN: opt < %s -default-data-layout="E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-n32" -basicaa -gvn -S -die | FileCheck %s ;; Trivial RLE test. define i32 @test0(i32 %V, i32* %P) { @@ -318,7 +316,7 @@ define i8 @coerce_offset_nonlocal0(i32* %P, i1 %cond) { %P4 = getelementptr i8* %P3, i32 2 br i1 %cond, label %T, label %F T: - store i32 42, i32* %P + store i32 57005, i32* %P br label %Cont F: diff --git a/test/Transforms/GlobalOpt/blockaddress.ll b/test/Transforms/GlobalOpt/blockaddress.ll new file mode 100644 index 0000000..13da762 --- /dev/null +++ b/test/Transforms/GlobalOpt/blockaddress.ll @@ -0,0 +1,20 @@ +; RUN: opt < %s -globalopt -S | FileCheck %s + +@x = internal global i8* zeroinitializer + +define void @f() { +; CHECK: @f + +; Check that we don't hit an assert in Constant::IsThreadDependent() +; when storing this blockaddress into a global. 
+ + store i8* blockaddress(@g, %here), i8** @x, align 8 + ret void +} + +define void @g() { +; CHECK: @g + +here: + ret void +} diff --git a/test/Transforms/GlobalOpt/load-store-global.ll b/test/Transforms/GlobalOpt/load-store-global.ll index f824b2c..25a5337 100644 --- a/test/Transforms/GlobalOpt/load-store-global.ll +++ b/test/Transforms/GlobalOpt/load-store-global.ll @@ -1,15 +1,38 @@ -; RUN: opt < %s -globalopt -S | not grep G +; RUN: opt < %s -globalopt -S | FileCheck %s @G = internal global i32 17 ; <i32*> [#uses=3] +; CHECK-NOT: @G define void @foo() { %V = load i32* @G ; <i32> [#uses=1] store i32 %V, i32* @G ret void +; CHECK: @foo +; CHECK-NEXT: ret void } define i32 @bar() { %X = load i32* @G ; <i32> [#uses=1] ret i32 %X +; CHECK: @bar +; CHECK-NEXT: ret i32 17 +} + +@a = internal global i64* null, align 8 +; CHECK-NOT: @a + +; PR13968 +define void @qux() nounwind { + %b = bitcast i64** @a to i8* + %g = getelementptr i64** @a, i32 1 + %cmp = icmp ne i8* null, %b + %cmp2 = icmp eq i8* null, %b + %cmp3 = icmp eq i64** null, %g + store i64* inttoptr (i64 1 to i64*), i64** @a, align 8 + %l = load i64** @a, align 8 + ret void +; CHECK: @qux +; CHECK-NOT: store +; CHECK-NOT: load } diff --git a/test/Transforms/GlobalOpt/tls.ll b/test/Transforms/GlobalOpt/tls.ll new file mode 100644 index 0000000..7a410e5 --- /dev/null +++ b/test/Transforms/GlobalOpt/tls.ll @@ -0,0 +1,53 @@ +; RUN: opt < %s -globalopt -S | FileCheck %s + +declare void @wait() +declare void @signal() +declare void @start_thread(void ()*) + +@x = internal thread_local global [100 x i32] zeroinitializer, align 16 +@ip = internal global i32* null, align 8 + +; PR14309: GlobalOpt would think that the value of @ip is always the address of +; x[1]. However, that address is different for different threads so @ip cannot +; be replaced with a constant. + +define i32 @f() { +entry: + ; Set @ip to point to x[1] for thread 1. 
+ store i32* getelementptr inbounds ([100 x i32]* @x, i64 0, i64 1), i32** @ip, align 8 + + ; Run g on a new thread. + tail call void @start_thread(void ()* @g) nounwind + tail call void @wait() nounwind + + ; Reset x[1] for thread 1. + store i32 0, i32* getelementptr inbounds ([100 x i32]* @x, i64 0, i64 1), align 4 + + ; Read the value of @ip, which now points at x[1] for thread 2. + %0 = load i32** @ip, align 8 + + %1 = load i32* %0, align 4 + ret i32 %1 + +; CHECK: @f +; Make sure that the load from @ip hasn't been removed. +; CHECK: load i32** @ip +; CHECK: ret +} + +define internal void @g() nounwind uwtable { +entry: + ; Set @ip to point to x[1] for thread 2. + store i32* getelementptr inbounds ([100 x i32]* @x, i64 0, i64 1), i32** @ip, align 8 + + ; Store 50 in x[1] for thread 2. + store i32 50, i32* getelementptr inbounds ([100 x i32]* @x, i64 0, i64 1), align 4 + + tail call void @signal() nounwind + ret void + +; CHECK: @g +; Make sure that the store to @ip hasn't been removed. 
+; CHECK: store {{.*}} @ip +; CHECK: ret +} diff --git a/test/Transforms/IndVarSimplify/2004-04-05-InvokeCastCrash.ll b/test/Transforms/IndVarSimplify/2004-04-05-InvokeCastCrash.ll index 708a961..0c88e83 100644 --- a/test/Transforms/IndVarSimplify/2004-04-05-InvokeCastCrash.ll +++ b/test/Transforms/IndVarSimplify/2004-04-05-InvokeCastCrash.ll @@ -39,11 +39,11 @@ %"struct.llvm::SymbolTable" = type opaque %"struct.llvm::SymbolTableListTraits<llvm::Argument,llvm::Function,llvm::Function,llvm::ilist_traits<llvm::Argument> >" = type { %"struct.llvm::Function"*, %"struct.llvm::Function"* } %"struct.llvm::SymbolTableListTraits<llvm::Instruction,llvm::BasicBlock,llvm::Function,llvm::ilist_traits<llvm::Instruction> >" = type { %"struct.llvm::Function"*, %"struct.llvm::BasicBlock"* } - %"struct.llvm::TargetData" = type { %"struct.llvm::FunctionPass", i1, i8, i8, i8, i8, i8, i8, i8, i8 } + %"struct.llvm::DataLayout" = type { %"struct.llvm::FunctionPass", i1, i8, i8, i8, i8, i8, i8, i8, i8 } %"struct.llvm::TargetFrameInfo" = type { i32 (...)**, i32, i32, i32 } %"struct.llvm::TargetInstrDescriptor" = type { i8*, i32, i32, i32, i1, i32, i32, i32, i32, i32, i32*, i32* } %"struct.llvm::TargetInstrInfo" = type { i32 (...)**, %"struct.llvm::TargetInstrDescriptor"*, i32, i32 } - %"struct.llvm::TargetMachine" = type { i32 (...)**, %"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >", %"struct.llvm::TargetData", %"struct.llvm::IntrinsicLowering"* } + %"struct.llvm::TargetMachine" = type { i32 (...)**, %"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >", %"struct.llvm::DataLayout", %"struct.llvm::IntrinsicLowering"* } %"struct.llvm::TargetRegClassInfo" = type { i32 (...)**, i32, i32, i32 } %"struct.llvm::TargetRegInfo" = type { i32 (...)**, %"struct.std::vector<const llvm::TargetRegClassInfo*,std::allocator<const llvm::TargetRegClassInfo*> >", %"struct.llvm::TargetMachine"* } %"struct.llvm::Type" = type { %"struct.llvm::Value", i32, 
i32, i1, i32, %"struct.llvm::Type"*, %"struct.std::vector<llvm::PATypeHandle,std::allocator<llvm::PATypeHandle> >" } diff --git a/test/Transforms/IndVarSimplify/2012-10-19-congruent-constant.ll b/test/Transforms/IndVarSimplify/2012-10-19-congruent-constant.ll new file mode 100644 index 0000000..5c47866 --- /dev/null +++ b/test/Transforms/IndVarSimplify/2012-10-19-congruent-constant.ll @@ -0,0 +1,27 @@ +; RUN: opt -S -indvars < %s | FileCheck %s + +; PR12627 +define void @test1(i32 %x) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %phi1 = phi i1 [ false, %entry ], [ %cmpa, %for.body ] + %phi2 = phi i1 [ false, %entry ], [ %cmpb, %for.body ] + %i.07 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + tail call void @aux(i1 %phi1, i1 %phi2) nounwind + %cmpa = icmp sgt i32 %i.07, 200 + %cmpb = icmp sgt i32 %i.07, 100 + %inc = add nsw i32 %i.07, 1 + %exitcond = icmp eq i32 %inc, 100 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void + +; CHECK: @test1 +; CHECK-NOT: phi i1 +; CHECK: call void @aux(i1 false, i1 false) +} + +declare void @aux(i1, i1) diff --git a/test/Transforms/IndVarSimplify/crash.ll b/test/Transforms/IndVarSimplify/crash.ll index 3335be7..1b702a3 100644 --- a/test/Transforms/IndVarSimplify/crash.ll +++ b/test/Transforms/IndVarSimplify/crash.ll @@ -87,3 +87,47 @@ entry: main.f.exit: ; preds = %"3.i" unreachable } + + +; PR13967 + +define void @f() nounwind ssp { +bb: + br label %bb4 + +bb4: + %tmp = phi i64 [ %tmp5, %bb7 ], [ undef, %bb ] + %tmp5 = add nsw i64 %tmp, 1 + %extract.t1 = trunc i64 %tmp5 to i32 + br i1 false, label %bb6, label %bb7 + +bb6: + br label %bb7 + +bb7: + %.off0 = phi i32 [ undef, %bb6 ], [ %extract.t1, %bb4 ] + %tmp8 = icmp eq i32 %.off0, 0 + br i1 %tmp8, label %bb9, label %bb4 + +bb9: + ret void +} + +; PR12536 +define void @fn1() noreturn nounwind { +entry: + br label %for.cond + +for.cond: ; preds = %for.end, %entry + %b.0 = phi i32 [ undef, 
%entry ], [ %conv, %for.end ] + br label %for.cond1 + +for.cond1: ; preds = %for.cond1, %for.cond + %c.0 = phi i32 [ %b.0, %for.cond1 ], [ 0, %for.cond ] + br i1 undef, label %for.cond1, label %for.end + +for.end: ; preds = %for.cond1 + %cmp2 = icmp slt i32 %c.0, 1 + %conv = zext i1 %cmp2 to i32 + br label %for.cond +} diff --git a/test/Transforms/IndVarSimplify/eliminate-comparison.ll b/test/Transforms/IndVarSimplify/eliminate-comparison.ll index 953bbdf..5dca712 100644 --- a/test/Transforms/IndVarSimplify/eliminate-comparison.ll +++ b/test/Transforms/IndVarSimplify/eliminate-comparison.ll @@ -106,3 +106,106 @@ loop: return: ret void } + +; PR14432 +; Indvars should not turn the second loop into an infinite one. + +; CHECK: @func_11 +; CHECK: %tmp5 = icmp slt i32 %__key6.0, 10 +; CHECK-NOT: br i1 true, label %noassert68, label %unrolledend + +define i32 @func_11() nounwind uwtable { +entry: + br label %forcond + +forcond: ; preds = %noassert, %entry + %__key6.0 = phi i32 [ 2, %entry ], [ %tmp37, %noassert ] + %tmp5 = icmp slt i32 %__key6.0, 10 + br i1 %tmp5, label %noassert, label %forcond38.preheader + +forcond38.preheader: ; preds = %forcond + br label %forcond38 + +noassert: ; preds = %forbody + %tmp13 = sdiv i32 -32768, %__key6.0 + %tmp2936 = shl i32 %tmp13, 24 + %sext23 = shl i32 %tmp13, 24 + %tmp32 = icmp eq i32 %tmp2936, %sext23 + %tmp37 = add i32 %__key6.0, 1 + br i1 %tmp32, label %forcond, label %assert33 + +assert33: ; preds = %noassert + tail call void @llvm.trap() + unreachable + +forcond38: ; preds = %noassert68, %forcond38.preheader + %__key8.0 = phi i32 [ %tmp81, %noassert68 ], [ 2, %forcond38.preheader ] + %tmp46 = icmp slt i32 %__key8.0, 10 + br i1 %tmp46, label %noassert68, label %unrolledend + +noassert68: ; preds = %forbody39 + %tmp57 = sdiv i32 -32768, %__key8.0 + %sext34 = shl i32 %tmp57, 16 + %sext21 = shl i32 %tmp57, 16 + %tmp76 = icmp eq i32 %sext34, %sext21 + %tmp81 = add i32 %__key8.0, 1 + br i1 %tmp76, label %forcond38, label %assert77 
+ +assert77: ; preds = %noassert68 + tail call void @llvm.trap() + unreachable + +unrolledend: ; preds = %forcond38 + ret i32 0 +} + +declare void @llvm.trap() noreturn nounwind + +; In this case the second loop only has a single iteration, fold the header away +; CHECK: @func_12 +; CHECK: %tmp5 = icmp slt i32 %__key6.0, 10 +; CHECK: br i1 true, label %noassert68, label %unrolledend +define i32 @func_12() nounwind uwtable { +entry: + br label %forcond + +forcond: ; preds = %noassert, %entry + %__key6.0 = phi i32 [ 2, %entry ], [ %tmp37, %noassert ] + %tmp5 = icmp slt i32 %__key6.0, 10 + br i1 %tmp5, label %noassert, label %forcond38.preheader + +forcond38.preheader: ; preds = %forcond + br label %forcond38 + +noassert: ; preds = %forbody + %tmp13 = sdiv i32 -32768, %__key6.0 + %tmp2936 = shl i32 %tmp13, 24 + %sext23 = shl i32 %tmp13, 24 + %tmp32 = icmp eq i32 %tmp2936, %sext23 + %tmp37 = add i32 %__key6.0, 1 + br i1 %tmp32, label %forcond, label %assert33 + +assert33: ; preds = %noassert + tail call void @llvm.trap() + unreachable + +forcond38: ; preds = %noassert68, %forcond38.preheader + %__key8.0 = phi i32 [ %tmp81, %noassert68 ], [ 2, %forcond38.preheader ] + %tmp46 = icmp slt i32 %__key8.0, 10 + br i1 %tmp46, label %noassert68, label %unrolledend + +noassert68: ; preds = %forbody39 + %tmp57 = sdiv i32 -32768, %__key8.0 + %sext34 = shl i32 %tmp57, 16 + %sext21 = shl i32 %tmp57, 16 + %tmp76 = icmp ne i32 %sext34, %sext21 + %tmp81 = add i32 %__key8.0, 1 + br i1 %tmp76, label %forcond38, label %assert77 + +assert77: ; preds = %noassert68 + tail call void @llvm.trap() + unreachable + +unrolledend: ; preds = %forcond38 + ret i32 0 +} diff --git a/test/Transforms/IndVarSimplify/no-iv-rewrite.ll b/test/Transforms/IndVarSimplify/no-iv-rewrite.ll index bfdd000..507f695 100644 --- a/test/Transforms/IndVarSimplify/no-iv-rewrite.ll +++ b/test/Transforms/IndVarSimplify/no-iv-rewrite.ll @@ -199,7 +199,6 @@ entry: ; back to the loop iv. 
; ; CHECK: loop: -; CHECK: phi i32 ; CHECK-NOT: phi ; CHECK: exit: loop: diff --git a/test/Transforms/IndVarSimplify/verify-scev.ll b/test/Transforms/IndVarSimplify/verify-scev.ll new file mode 100644 index 0000000..019f583 --- /dev/null +++ b/test/Transforms/IndVarSimplify/verify-scev.ll @@ -0,0 +1,421 @@ +; RUN: opt < %s -S -indvars -verify-scev +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +define void @test1() nounwind uwtable ssp { +entry: + br i1 undef, label %for.end, label %for.body + +for.body: ; preds = %for.body, %entry + br i1 false, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + br i1 undef, label %for.end11, label %for.body3 + +for.body3: ; preds = %for.end + unreachable + +for.end11: ; preds = %for.end + br i1 undef, label %while.body, label %while.end + +while.body: ; preds = %for.end11 + unreachable + +while.end: ; preds = %for.end11 + br i1 undef, label %if.end115, label %for.cond109 + +for.cond109: ; preds = %while.end + unreachable + +if.end115: ; preds = %while.end + br i1 undef, label %while.body119.lr.ph.lr.ph, label %for.cond612 + +while.body119.lr.ph.lr.ph: ; preds = %if.end115 + br i1 undef, label %for.cond612, label %if.end123.us + +if.end123.us: ; preds = %while.body119.lr.ph.lr.ph + br label %for.cond132.us + +for.cond132.us: ; preds = %for.cond132.us, %if.end123.us + br i1 undef, label %if.then136.us, label %for.cond132.us + +if.then136.us: ; preds = %for.cond132.us + br i1 undef, label %while.end220, label %while.body211 + +while.body211: ; preds = %while.body211, %if.then136.us + br i1 undef, label %while.end220, label %while.body211 + +while.end220: ; preds = %while.body211, %if.then136.us + br label %for.cond246.outer + +for.cond246.outer: ; preds = %for.inc558, %for.cond394.preheader, %if.then274, %for.cond404.preheader, %while.end220 + 
br label %for.cond246 + +for.cond246: ; preds = %for.cond372.loopexit, %for.cond246.outer + br i1 undef, label %for.end562, label %if.end250 + +if.end250: ; preds = %for.cond246 + br i1 undef, label %if.end256, label %for.end562 + +if.end256: ; preds = %if.end250 + %cmp272 = icmp eq i32 undef, undef + br i1 %cmp272, label %if.then274, label %for.cond404.preheader + +for.cond404.preheader: ; preds = %if.end256 + br i1 undef, label %for.cond246.outer, label %for.body409.lr.ph + +for.body409.lr.ph: ; preds = %for.cond404.preheader + br label %for.body409 + +if.then274: ; preds = %if.end256 + br i1 undef, label %for.cond246.outer, label %if.end309 + +if.end309: ; preds = %if.then274 + br i1 undef, label %for.cond372.loopexit, label %for.body361 + +for.body361: ; preds = %for.body361, %if.end309 + br i1 undef, label %for.cond372.loopexit, label %for.body361 + +for.cond372.loopexit: ; preds = %for.body361, %if.end309 + br i1 undef, label %for.cond394.preheader, label %for.cond246 + +for.cond394.preheader: ; preds = %for.cond372.loopexit + br i1 undef, label %for.cond246.outer, label %for.body397 + +for.body397: ; preds = %for.cond394.preheader + unreachable + +for.body409: ; preds = %for.inc558, %for.body409.lr.ph + %k.029 = phi i32 [ 1, %for.body409.lr.ph ], [ %inc559, %for.inc558 ] + br i1 undef, label %if.then412, label %if.else433 + +if.then412: ; preds = %for.body409 + br label %if.end440 + +if.else433: ; preds = %for.body409 + br label %if.end440 + +if.end440: ; preds = %if.else433, %if.then412 + br i1 undef, label %for.inc558, label %if.end461 + +if.end461: ; preds = %if.end440 + br i1 undef, label %for.cond528.loopexit, label %for.body517 + +for.body517: ; preds = %for.body517, %if.end461 + br i1 undef, label %for.cond528.loopexit, label %for.body517 + +for.cond528.loopexit: ; preds = %for.body517, %if.end461 + br label %for.inc558 + +for.inc558: ; preds = %for.cond528.loopexit, %if.end440 + %inc559 = add nsw i32 %k.029, 1 + %cmp407 = icmp sgt i32 %inc559, undef 
+ br i1 %cmp407, label %for.cond246.outer, label %for.body409 + +for.end562: ; preds = %if.end250, %for.cond246 + unreachable + +for.cond612: ; preds = %while.body119.lr.ph.lr.ph, %if.end115 + unreachable +} + +define void @test2() nounwind uwtable ssp { +entry: + br i1 undef, label %for.end, label %for.body + +for.body: ; preds = %for.body, %entry + br i1 undef, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + br i1 undef, label %for.end11, label %for.body3 + +for.body3: ; preds = %for.end + unreachable + +for.end11: ; preds = %for.end + br i1 undef, label %while.body, label %while.end + +while.body: ; preds = %for.end11 + unreachable + +while.end: ; preds = %for.end11 + br i1 undef, label %if.end115, label %for.cond109 + +for.cond109: ; preds = %while.end + unreachable + +if.end115: ; preds = %while.end + br i1 undef, label %while.body119.lr.ph.lr.ph, label %for.cond612 + +while.body119.lr.ph.lr.ph: ; preds = %if.end115 + br i1 undef, label %for.cond612, label %if.end123.us + +if.end123.us: ; preds = %while.body119.lr.ph.lr.ph + br label %for.cond132.us + +for.cond132.us: ; preds = %for.cond132.us, %if.end123.us + br i1 undef, label %if.then136.us, label %for.cond132.us + +if.then136.us: ; preds = %for.cond132.us + br i1 undef, label %while.end220, label %while.body211 + +while.body211: ; preds = %while.body211, %if.then136.us + br i1 undef, label %while.end220, label %while.body211 + +while.end220: ; preds = %while.body211, %if.then136.us + br label %for.cond246.outer + +for.cond246.outer: ; preds = %for.inc558, %for.cond394.preheader, %if.then274, %for.cond404.preheader, %while.end220 + br label %for.cond246 + +for.cond246: ; preds = %for.cond372.loopexit, %for.cond246.outer + br i1 undef, label %for.end562, label %if.end250 + +if.end250: ; preds = %for.cond246 + br i1 undef, label %if.end256, label %for.end562 + +if.end256: ; preds = %if.end250 + %0 = load i32* undef, align 4 + br i1 undef, label %if.then274, label 
%for.cond404.preheader + +for.cond404.preheader: ; preds = %if.end256 + %add406 = add i32 0, %0 + br i1 undef, label %for.cond246.outer, label %for.body409.lr.ph + +for.body409.lr.ph: ; preds = %for.cond404.preheader + br label %for.body409 + +if.then274: ; preds = %if.end256 + br i1 undef, label %for.cond246.outer, label %if.end309 + +if.end309: ; preds = %if.then274 + br i1 undef, label %for.cond372.loopexit, label %for.body361 + +for.body361: ; preds = %for.body361, %if.end309 + br i1 undef, label %for.cond372.loopexit, label %for.body361 + +for.cond372.loopexit: ; preds = %for.body361, %if.end309 + br i1 undef, label %for.cond394.preheader, label %for.cond246 + +for.cond394.preheader: ; preds = %for.cond372.loopexit + br i1 undef, label %for.cond246.outer, label %for.body397 + +for.body397: ; preds = %for.cond394.preheader + unreachable + +for.body409: ; preds = %for.inc558, %for.body409.lr.ph + %k.029 = phi i32 [ 1, %for.body409.lr.ph ], [ %inc559, %for.inc558 ] + br i1 undef, label %if.then412, label %if.else433 + +if.then412: ; preds = %for.body409 + br label %if.end440 + +if.else433: ; preds = %for.body409 + br label %if.end440 + +if.end440: ; preds = %if.else433, %if.then412 + br i1 undef, label %for.inc558, label %if.end461 + +if.end461: ; preds = %if.end440 + br i1 undef, label %for.cond528.loopexit, label %for.body517 + +for.body517: ; preds = %for.body517, %if.end461 + br i1 undef, label %for.cond528.loopexit, label %for.body517 + +for.cond528.loopexit: ; preds = %for.body517, %if.end461 + br label %for.inc558 + +for.inc558: ; preds = %for.cond528.loopexit, %if.end440 + %inc559 = add nsw i32 %k.029, 1 + %cmp407 = icmp sgt i32 %inc559, %add406 + br i1 %cmp407, label %for.cond246.outer, label %for.body409 + +for.end562: ; preds = %if.end250, %for.cond246 + unreachable + +for.cond612: ; preds = %while.body119.lr.ph.lr.ph, %if.end115 + unreachable +} + +define void @test3() nounwind uwtable ssp { +entry: + br i1 undef, label %for.end, label %for.body + 
+for.body: ; preds = %for.body, %entry + br i1 undef, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + br i1 undef, label %for.end11, label %for.body3 + +for.body3: ; preds = %for.end + unreachable + +for.end11: ; preds = %for.end + br i1 undef, label %while.body, label %while.end + +while.body: ; preds = %for.end11 + unreachable + +while.end: ; preds = %for.end11 + br i1 undef, label %if.end115, label %for.cond109 + +for.cond109: ; preds = %while.end + unreachable + +if.end115: ; preds = %while.end + br i1 undef, label %while.body119.lr.ph.lr.ph, label %for.cond612 + +while.body119.lr.ph.lr.ph: ; preds = %if.end115 + br i1 undef, label %for.cond612, label %if.end123.us + +if.end123.us: ; preds = %while.body119.lr.ph.lr.ph + br label %for.cond132.us + +for.cond132.us: ; preds = %for.cond132.us, %if.end123.us + br i1 undef, label %if.then136.us, label %for.cond132.us + +if.then136.us: ; preds = %for.cond132.us + br i1 undef, label %while.end220, label %while.body211 + +while.body211: ; preds = %while.body211, %if.then136.us + br i1 undef, label %while.end220, label %while.body211 + +while.end220: ; preds = %while.body211, %if.then136.us + br label %for.cond246.outer + +for.cond246.outer: ; preds = %for.inc558, %for.cond394.preheader, %if.then274, %for.cond404.preheader, %while.end220 + br label %for.cond246 + +for.cond246: ; preds = %for.cond372.loopexit, %for.cond246.outer + br i1 undef, label %for.end562, label %if.end250 + +if.end250: ; preds = %for.cond246 + br i1 undef, label %if.end256, label %for.end562 + +if.end256: ; preds = %if.end250 + br i1 undef, label %if.then274, label %for.cond404.preheader + +for.cond404.preheader: ; preds = %if.end256 + br i1 undef, label %for.cond246.outer, label %for.body409.lr.ph + +for.body409.lr.ph: ; preds = %for.cond404.preheader + br label %for.body409 + +if.then274: ; preds = %if.end256 + br i1 undef, label %for.cond246.outer, label %if.end309 + +if.end309: ; preds = %if.then274 + br i1 undef, 
label %for.cond372.loopexit, label %for.body361 + +for.body361: ; preds = %for.body361, %if.end309 + br i1 undef, label %for.cond372.loopexit, label %for.body361 + +for.cond372.loopexit: ; preds = %for.body361, %if.end309 + br i1 undef, label %for.cond394.preheader, label %for.cond246 + +for.cond394.preheader: ; preds = %for.cond372.loopexit + br i1 undef, label %for.cond246.outer, label %for.body397 + +for.body397: ; preds = %for.cond394.preheader + unreachable + +for.body409: ; preds = %for.inc558, %for.body409.lr.ph + br i1 undef, label %if.then412, label %if.else433 + +if.then412: ; preds = %for.body409 + br label %if.end440 + +if.else433: ; preds = %for.body409 + br label %if.end440 + +if.end440: ; preds = %if.else433, %if.then412 + br i1 undef, label %for.inc558, label %if.end461 + +if.end461: ; preds = %if.end440 + br i1 undef, label %for.cond528.loopexit, label %for.body517 + +for.body517: ; preds = %for.body517, %if.end461 + br i1 undef, label %for.cond528.loopexit, label %for.body517 + +for.cond528.loopexit: ; preds = %for.body517, %if.end461 + br label %for.inc558 + +for.inc558: ; preds = %for.cond528.loopexit, %if.end440 + br i1 undef, label %for.cond246.outer, label %for.body409 + +for.end562: ; preds = %if.end250, %for.cond246 + unreachable + +for.cond612: ; preds = %while.body119.lr.ph.lr.ph, %if.end115 + unreachable +} + +define void @test4() nounwind uwtable ssp { +entry: + br i1 undef, label %if.end8, label %if.else + +if.else: ; preds = %entry + br label %if.end8 + +if.end8: ; preds = %if.else, %entry + br i1 undef, label %if.end26, label %if.else22 + +if.else22: ; preds = %if.end8 + br label %if.end26 + +if.end26: ; preds = %if.else22, %if.end8 + br i1 undef, label %if.end35, label %if.else31 + +if.else31: ; preds = %if.end26 + br label %if.end35 + +if.end35: ; preds = %if.else31, %if.end26 + br i1 undef, label %for.end226, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %if.end35 + br label %for.body48 + +for.body48: ; preds = %for.inc221, 
%for.body.lr.ph + br i1 undef, label %for.inc221, label %for.body65.lr.ph + +for.body65.lr.ph: ; preds = %for.body48 + %0 = load i32* undef, align 4 + br label %for.body65.us + +for.body65.us: ; preds = %for.inc219.us, %for.body65.lr.ph + %k.09.us = phi i32 [ %inc.us, %for.inc219.us ], [ 1, %for.body65.lr.ph ] + %idxprom66.us = sext i32 %k.09.us to i64 + br i1 undef, label %for.inc219.us, label %if.end72.us + +if.end72.us: ; preds = %for.body65.us + br i1 undef, label %if.end93.us, label %if.then76.us + +if.then76.us: ; preds = %if.end72.us + br label %if.end93.us + +if.end93.us: ; preds = %if.then76.us, %if.end72.us + br i1 undef, label %if.end110.us, label %for.inc219.us + +if.end110.us: ; preds = %if.end93.us + br i1 undef, label %for.inc219.us, label %for.body142.us + +for.body142.us: ; preds = %for.cond139.loopexit.us, %if.end110.us + br label %for.cond152.us + +for.cond152.us: ; preds = %for.cond152.us, %for.body142.us + br i1 undef, label %for.cond139.loopexit.us, label %for.cond152.us + +for.inc219.us: ; preds = %for.cond139.loopexit.us, %if.end110.us, %if.end93.us, %for.body65.us + %inc.us = add nsw i32 %k.09.us, 1 + %cmp64.us = icmp sgt i32 %inc.us, %0 + br i1 %cmp64.us, label %for.inc221, label %for.body65.us + +for.cond139.loopexit.us: ; preds = %for.cond152.us + br i1 undef, label %for.inc219.us, label %for.body142.us + +for.inc221: ; preds = %for.inc219.us, %for.body48 + br label %for.body48 + +for.end226: ; preds = %if.end35 + ret void +} diff --git a/test/Transforms/Inline/recursive.ll b/test/Transforms/Inline/recursive.ll new file mode 100644 index 0000000..5fe8d16 --- /dev/null +++ b/test/Transforms/Inline/recursive.ll @@ -0,0 +1,38 @@ +; RUN: opt %s -inline -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" +target triple = "i386-apple-darwin10.0" + +; rdar://10853263 + +; Make sure that the callee is still here. 
+; CHECK: define i32 @callee +define i32 @callee(i32 %param) { + %yyy = alloca [100000 x i8] + %r = bitcast [100000 x i8]* %yyy to i8* + call void @foo2(i8* %r) + ret i32 4 +} + +; CHECK: define i32 @caller +; CHECK-NEXT: entry: +; CHECK-NOT: alloca +; CHECK: ret +define i32 @caller(i32 %param) { +entry: + %t = call i32 @foo(i32 %param) + %cmp = icmp eq i32 %t, -1 + br i1 %cmp, label %exit, label %cont + +cont: + %r = call i32 @caller(i32 %t) + %f = call i32 @callee(i32 %r) + br label %cont +exit: + ret i32 4 +} + +declare void @foo2(i8* %in) + +declare i32 @foo(i32 %param) + diff --git a/test/Transforms/InstCombine/2012-07-25-LoadPart.ll b/test/Transforms/InstCombine/2012-07-25-LoadPart.ll index 73e5a66..18aab7f 100644 --- a/test/Transforms/InstCombine/2012-07-25-LoadPart.ll +++ b/test/Transforms/InstCombine/2012-07-25-LoadPart.ll @@ -1,12 +1,14 @@ -; RUN: opt < %s -instcombine -S | FileCheck %s +; RUN: opt < %s -default-data-layout="e-p:32:32:32" -instcombine -S | FileCheck %s --check-prefix=LE +; RUN: opt < %s -default-data-layout="E-p:32:32:32" -instcombine -S | FileCheck %s --check-prefix=BE ; PR13442 -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128" - @test = constant [4 x i32] [i32 1, i32 2, i32 3, i32 4] define i64 @foo() { %ret = load i64* bitcast (i8* getelementptr (i8* bitcast ([4 x i32]* @test to i8*), i64 2) to i64*), align 1 ret i64 %ret - ; CHECK: ret i64 844424930263040 + ; 0x00030000_00020000 in [01 00/00 00 02 00 00 00 03 00/00 00 04 00 00 00] + ; LE: ret i64 844424930263040 + ; 0x00000200_00000300 in [00 00/00 01 00 00 00 02 00 00/00 03 00 00 00 04] + ; BE: ret i64 281474976841728 } diff --git a/test/Transforms/InstCombine/2012-08-28-udiv_ashl.ll b/test/Transforms/InstCombine/2012-08-28-udiv_ashl.ll new file mode 100644 index 0000000..4efaf8c --- /dev/null +++ b/test/Transforms/InstCombine/2012-08-28-udiv_ashl.ll @@ -0,0 +1,57 @@ +; RUN: opt 
-S -instcombine < %s | FileCheck %s + +; rdar://12182093 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; CHECK: @udiv400 +; CHECK: udiv i32 %x, 400 +; CHECK: ret +define i32 @udiv400(i32 %x) { +entry: + %div = lshr i32 %x, 2 + %div1 = udiv i32 %div, 100 + ret i32 %div1 +} + + +; CHECK: @udiv400_no +; CHECK: ashr +; CHECK: div +; CHECK: ret +define i32 @udiv400_no(i32 %x) { +entry: + %div = ashr i32 %x, 2 + %div1 = udiv i32 %div, 100 + ret i32 %div1 +} + +; CHECK: @sdiv400_yes +; CHECK: udiv i32 %x, 400 +; CHECK: ret +define i32 @sdiv400_yes(i32 %x) { +entry: + %div = lshr i32 %x, 2 + ; The sign bits of both operands are zero (i.e. we can prove they are + ; unsigned inputs), turn this into a udiv. + ; Next, optimize this just like sdiv. + %div1 = sdiv i32 %div, 100 + ret i32 %div1 +} + + +; CHECK: @udiv_i80 +; CHECK: udiv i80 %x, 400 +; CHECK: ret +define i80 @udiv_i80(i80 %x) { + %div = lshr i80 %x, 2 + %div1 = udiv i80 %div, 100 + ret i80 %div1 +} + +define i32 @no_crash_notconst_udiv(i32 %x, i32 %notconst) { + %div = lshr i32 %x, %notconst + %div1 = udiv i32 %div, 100 + ret i32 %div1 +} diff --git a/test/Transforms/InstCombine/2012-09-17-ZeroSizedAlloca.ll b/test/Transforms/InstCombine/2012-09-17-ZeroSizedAlloca.ll new file mode 100644 index 0000000..ba025e9 --- /dev/null +++ b/test/Transforms/InstCombine/2012-09-17-ZeroSizedAlloca.ll @@ -0,0 +1,24 @@ +; RUN: opt -S -instcombine < %s | FileCheck %s + +; When merging zero sized alloca check that requested alignments of the allocas +; are obeyed. 
+ +@x = global i8* null, align 8 +@y = global i8* null, align 8 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; CHECK: @f +; CHECK-NEXT: alloca [0 x i8], align 1024 +; CHECK-NOT: alloca +; CHECK: ret void +define void @f() { + %1 = alloca [0 x i8], align 1 + %2 = alloca [0 x i8], align 1024 + %3 = getelementptr inbounds [0 x i8]* %1, i64 0, i64 0 + %4 = getelementptr inbounds [0 x i8]* %2, i64 0, i64 0 + store i8* %3, i8** @x, align 8 + store i8* %4, i8** @y, align 8 + ret void +} diff --git a/test/Transforms/InstCombine/2012-09-24-MemcpyFromGlobalCrash.ll b/test/Transforms/InstCombine/2012-09-24-MemcpyFromGlobalCrash.ll new file mode 100644 index 0000000..4cd60b4 --- /dev/null +++ b/test/Transforms/InstCombine/2012-09-24-MemcpyFromGlobalCrash.ll @@ -0,0 +1,19 @@ +; RUN: opt < %s -instcombine -S | FileCheck %s + +; Check we don't crash due to lack of target data. + +@G = constant [100 x i8] zeroinitializer + +declare void @bar(i8*) +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind + +define void @test() { +; CHECK: @test +; CHECK: llvm.memcpy +; CHECK: ret void + %A = alloca [100 x i8] + %a = getelementptr inbounds [100 x i8]* %A, i64 0, i64 0 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* getelementptr inbounds ([100 x i8]* @G, i64 0, i32 0), i64 100, i32 4, i1 false) + call void @bar(i8* %a) readonly + ret void +} diff --git a/test/Transforms/InstCombine/2012-10-25-vector-of-pointers.ll b/test/Transforms/InstCombine/2012-10-25-vector-of-pointers.ll new file mode 100644 index 0000000..20ea282 --- /dev/null +++ b/test/Transforms/InstCombine/2012-10-25-vector-of-pointers.ll @@ -0,0 +1,51 @@ +; RUN: opt < %s -instcombine -S + +; Make sure that we don't crash when optimizing the vectors of pointers. 
+ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +%struct.hoge = type { double*, double*, double*, double** } + +define void @widget(%struct.hoge* nocapture %arg) nounwind uwtable ssp { +bb: + %tmp = getelementptr inbounds %struct.hoge* %arg, i64 0, i32 0 + br i1 undef, label %bb1, label %bb17 + +bb1: ; preds = %bb + br i1 undef, label %bb2, label %bb3 + +bb2: ; preds = %bb1 + br label %bb17 + +bb3: ; preds = %bb1 + %tmp4 = bitcast double** %tmp to <2 x double*>* + %tmp5 = load <2 x double*>* %tmp4, align 8 + %tmp6 = ptrtoint <2 x double*> %tmp5 to <2 x i64> + %tmp7 = sub <2 x i64> zeroinitializer, %tmp6 + %tmp8 = ashr exact <2 x i64> %tmp7, <i64 3, i64 3> + %tmp9 = extractelement <2 x i64> %tmp8, i32 0 + %tmp10 = add nsw i64 undef, %tmp9 + br i1 undef, label %bb11, label %bb12 + +bb11: ; preds = %bb3 + br label %bb13 + +bb12: ; preds = %bb3 + br label %bb13 + +bb13: ; preds = %bb12, %bb11 + br i1 undef, label %bb16, label %bb14 + +bb14: ; preds = %bb13 + br i1 undef, label %bb16, label %bb15 + +bb15: ; preds = %bb14 + br label %bb16 + +bb16: ; preds = %bb15, %bb14, %bb13 + unreachable + +bb17: ; preds = %bb2, %bb + ret void +} diff --git a/test/Transforms/InstCombine/align-addr.ll b/test/Transforms/InstCombine/align-addr.ll index 27916b9..4ea1bd9 100644 --- a/test/Transforms/InstCombine/align-addr.ll +++ b/test/Transforms/InstCombine/align-addr.ll @@ -58,3 +58,19 @@ define double @test2(double* %p, double %n) nounwind { store double %n, double* %p ret double %t } + +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind + +declare void @use(i8*) + +%struct.s = type { i32, i32, i32, i32 } + +define void @test3(%struct.s* sret %a4) { +; Check that the alignment is bumped up the alignment of the sret type. 
+; CHECK: @test3 + %a4.cast = bitcast %struct.s* %a4 to i8* + call void @llvm.memset.p0i8.i64(i8* %a4.cast, i8 0, i64 16, i32 1, i1 false) +; CHECK: call void @llvm.memset.p0i8.i64(i8* %a4.cast, i8 0, i64 16, i32 4, i1 false) + call void @use(i8* %a4.cast) + ret void +} diff --git a/test/Transforms/InstCombine/alloca.ll b/test/Transforms/InstCombine/alloca.ll index 50e0347..68a671c 100644 --- a/test/Transforms/InstCombine/alloca.ll +++ b/test/Transforms/InstCombine/alloca.ll @@ -94,3 +94,19 @@ entry: tail call void @f(i32* %b) ret void } + +; PR14371 +%opaque_type = type opaque +%real_type = type { { i32, i32* } } + +@opaque_global = external constant %opaque_type, align 4 + +define void @test7() { +entry: + %0 = alloca %real_type, align 4 + %1 = bitcast %real_type* %0 to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* bitcast (%opaque_type* @opaque_global to i8*), i32 8, i32 1, i1 false) + ret void +} + +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind diff --git a/test/Transforms/InstCombine/and-fcmp.ll b/test/Transforms/InstCombine/and-fcmp.ll index 838c2f7..40c44c0 100644 --- a/test/Transforms/InstCombine/and-fcmp.ll +++ b/test/Transforms/InstCombine/and-fcmp.ll @@ -10,7 +10,7 @@ define zeroext i8 @t1(float %x, float %y) nounwind { ; CHECK: fcmp oeq float %x, %y ; CHECK-NOT: fcmp ueq float %x, %y ; CHECK-NOT: fcmp ord float %x, %y -; CHECK-NOW: and +; CHECK-NOT: and } define zeroext i8 @t2(float %x, float %y) nounwind { diff --git a/test/Transforms/InstCombine/cast.ll b/test/Transforms/InstCombine/cast.ll index 56e5ca3..b4eb69d 100644 --- a/test/Transforms/InstCombine/cast.ll +++ b/test/Transforms/InstCombine/cast.ll @@ -694,3 +694,209 @@ define i1 @test67(i1 %a, i32 %b) { ; CHECK: @test67 ; CHECK: ret i1 false } + +%s = type { i32, i32, i32 } + +define %s @test68(%s *%p, i64 %i) { +; CHECK: @test68 + %o = mul i64 %i, 12 + %q = bitcast %s* %p to i8* + %pp = getelementptr inbounds i8* %q, i64 %o +; CHECK-NEXT: 
getelementptr %s* + %r = bitcast i8* %pp to %s* + %l = load %s* %r +; CHECK-NEXT: load %s* + ret %s %l +; CHECK-NEXT: ret %s +} + +define double @test69(double *%p, i64 %i) { +; CHECK: @test69 + %o = shl nsw i64 %i, 3 + %q = bitcast double* %p to i8* + %pp = getelementptr inbounds i8* %q, i64 %o +; CHECK-NEXT: getelementptr inbounds double* + %r = bitcast i8* %pp to double* + %l = load double* %r +; CHECK-NEXT: load double* + ret double %l +; CHECK-NEXT: ret double +} + +define %s @test70(%s *%p, i64 %i) { +; CHECK: @test70 + %o = mul nsw i64 %i, 36 +; CHECK-NEXT: mul nsw i64 %i, 3 + %q = bitcast %s* %p to i8* + %pp = getelementptr inbounds i8* %q, i64 %o +; CHECK-NEXT: getelementptr inbounds %s* + %r = bitcast i8* %pp to %s* + %l = load %s* %r +; CHECK-NEXT: load %s* + ret %s %l +; CHECK-NEXT: ret %s +} + +define double @test71(double *%p, i64 %i) { +; CHECK: @test71 + %o = shl i64 %i, 5 +; CHECK-NEXT: shl i64 %i, 2 + %q = bitcast double* %p to i8* + %pp = getelementptr i8* %q, i64 %o +; CHECK-NEXT: getelementptr double* + %r = bitcast i8* %pp to double* + %l = load double* %r +; CHECK-NEXT: load double* + ret double %l +; CHECK-NEXT: ret double +} + +define double @test72(double *%p, i32 %i) { +; CHECK: @test72 + %so = mul nsw i32 %i, 8 + %o = sext i32 %so to i64 +; CHECK-NEXT: sext i32 %i to i64 + %q = bitcast double* %p to i8* + %pp = getelementptr inbounds i8* %q, i64 %o +; CHECK-NEXT: getelementptr inbounds double* + %r = bitcast i8* %pp to double* + %l = load double* %r +; CHECK-NEXT: load double* + ret double %l +; CHECK-NEXT: ret double +} + +define double @test73(double *%p, i128 %i) { +; CHECK: @test73 + %lo = mul nsw i128 %i, 8 + %o = trunc i128 %lo to i64 +; CHECK-NEXT: trunc i128 %i to i64 + %q = bitcast double* %p to i8* + %pp = getelementptr inbounds i8* %q, i64 %o +; CHECK-NEXT: getelementptr double* + %r = bitcast i8* %pp to double* + %l = load double* %r +; CHECK-NEXT: load double* + ret double %l +; CHECK-NEXT: ret double +} + +define double 
@test74(double *%p, i64 %i) { +; CHECK: @test74 + %q = bitcast double* %p to i64* + %pp = getelementptr inbounds i64* %q, i64 %i +; CHECK-NEXT: getelementptr inbounds double* + %r = bitcast i64* %pp to double* + %l = load double* %r +; CHECK-NEXT: load double* + ret double %l +; CHECK-NEXT: ret double +} + +define i32* @test75(i32* %p, i32 %x) { +; CHECK: @test75 + %y = shl i32 %x, 3 +; CHECK-NEXT: shl i32 %x, 3 + %z = sext i32 %y to i64 +; CHECK-NEXT: sext i32 %y to i64 + %q = bitcast i32* %p to i8* + %r = getelementptr i8* %q, i64 %z + %s = bitcast i8* %r to i32* + ret i32* %s +} + +define %s @test76(%s *%p, i64 %i, i64 %j) { +; CHECK: @test76 + %o = mul i64 %i, 12 + %o2 = mul nsw i64 %o, %j +; CHECK-NEXT: %o2 = mul i64 %i, %j + %q = bitcast %s* %p to i8* + %pp = getelementptr inbounds i8* %q, i64 %o2 +; CHECK-NEXT: getelementptr %s* %p, i64 %o2 + %r = bitcast i8* %pp to %s* + %l = load %s* %r +; CHECK-NEXT: load %s* + ret %s %l +; CHECK-NEXT: ret %s +} + +define %s @test77(%s *%p, i64 %i, i64 %j) { +; CHECK: @test77 + %o = mul nsw i64 %i, 36 + %o2 = mul nsw i64 %o, %j +; CHECK-NEXT: %o = mul nsw i64 %i, 3 +; CHECK-NEXT: %o2 = mul nsw i64 %o, %j + %q = bitcast %s* %p to i8* + %pp = getelementptr inbounds i8* %q, i64 %o2 +; CHECK-NEXT: getelementptr inbounds %s* %p, i64 %o2 + %r = bitcast i8* %pp to %s* + %l = load %s* %r +; CHECK-NEXT: load %s* + ret %s %l +; CHECK-NEXT: ret %s +} + +define %s @test78(%s *%p, i64 %i, i64 %j, i32 %k, i32 %l, i128 %m, i128 %n) { +; CHECK: @test78 + %a = mul nsw i32 %k, 36 +; CHECK-NEXT: mul nsw i32 %k, 3 + %b = mul nsw i32 %a, %l +; CHECK-NEXT: mul nsw i32 %a, %l + %c = sext i32 %b to i128 +; CHECK-NEXT: sext i32 %b to i128 + %d = mul nsw i128 %c, %m +; CHECK-NEXT: mul nsw i128 %c, %m + %e = mul i128 %d, %n +; CHECK-NEXT: mul i128 %d, %n + %f = trunc i128 %e to i64 +; CHECK-NEXT: trunc i128 %e to i64 + %g = mul nsw i64 %f, %i +; CHECK-NEXT: mul i64 %f, %i + %h = mul nsw i64 %g, %j +; CHECK-NEXT: mul i64 %g, %j + %q = bitcast %s* %p 
to i8* + %pp = getelementptr inbounds i8* %q, i64 %h +; CHECK-NEXT: getelementptr %s* %p, i64 %h + %r = bitcast i8* %pp to %s* + %load = load %s* %r +; CHECK-NEXT: load %s* + ret %s %load +; CHECK-NEXT: ret %s +} + +define %s @test79(%s *%p, i64 %i, i32 %j) { +; CHECK: @test79 + %a = mul nsw i64 %i, 36 +; CHECK: mul nsw i64 %i, 36 + %b = trunc i64 %a to i32 + %c = mul i32 %b, %j + %q = bitcast %s* %p to i8* +; CHECK: bitcast + %pp = getelementptr inbounds i8* %q, i32 %c + %r = bitcast i8* %pp to %s* + %l = load %s* %r + ret %s %l +} + +define double @test80([100 x double]* %p, i32 %i) { +; CHECK: @test80 + %tmp = mul nsw i32 %i, 8 +; CHECK-NEXT: sext i32 %i to i64 + %q = bitcast [100 x double]* %p to i8* + %pp = getelementptr i8* %q, i32 %tmp +; CHECK-NEXT: getelementptr [100 x double]* + %r = bitcast i8* %pp to double* + %l = load double* %r +; CHECK-NEXT: load double* + ret double %l +; CHECK-NEXT: ret double +} + +define double @test81(double *%p, float %f) { + %i = fptosi float %f to i64 + %q = bitcast double* %p to i8* + %pp = getelementptr i8* %q, i64 %i + %r = bitcast i8* %pp to double* + %l = load double* %r + ret double %l +} diff --git a/test/Transforms/InstCombine/disable-simplify-libcalls.ll b/test/Transforms/InstCombine/disable-simplify-libcalls.ll new file mode 100644 index 0000000..d81e9ae --- /dev/null +++ b/test/Transforms/InstCombine/disable-simplify-libcalls.ll @@ -0,0 +1,236 @@ +; Test that -disable-simplify-libcalls is wired up correctly. 
+; +; RUN: opt < %s -instcombine -disable-simplify-libcalls -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +@.str = constant [1 x i8] zeroinitializer, align 1 +@.str1 = constant [13 x i8] c"hello, world\00", align 1 +@.str2 = constant [4 x i8] c"foo\00", align 1 +@.str3 = constant [4 x i8] c"bar\00", align 1 +@.str4 = constant [6 x i8] c"123.4\00", align 1 +@.str5 = constant [5 x i8] c"1234\00", align 1 +@empty = constant [1 x i8] c"\00", align 1 + +declare double @ceil(double) +declare double @copysign(double, double) +declare double @cos(double) +declare double @fabs(double) +declare double @floor(double) +declare i8* @strcat(i8*, i8*) +declare i8* @strncat(i8*, i8*, i32) +declare i8* @strchr(i8*, i32) +declare i8* @strrchr(i8*, i32) +declare i32 @strcmp(i8*, i8*) +declare i32 @strncmp(i8*, i8*, i64) +declare i8* @strcpy(i8*, i8*) +declare i8* @stpcpy(i8*, i8*) +declare i8* @strncpy(i8*, i8*, i64) +declare i64 @strlen(i8*) +declare i8* @strpbrk(i8*, i8*) +declare i64 @strspn(i8*, i8*) +declare double @strtod(i8*, i8**) +declare float @strtof(i8*, i8**) +declare x86_fp80 @strtold(i8*, i8**) +declare i64 @strtol(i8*, i8**, i32) +declare i64 @strtoll(i8*, i8**, i32) +declare i64 @strtoul(i8*, i8**, i32) +declare i64 @strtoull(i8*, i8**, i32) +declare i64 @strcspn(i8*, i8*) + +define double @t1(double %x) { +; CHECK: @t1 + %ret = call double @ceil(double %x) + ret double %ret +; CHECK: call double @ceil +} + +define double @t2(double %x, double %y) { +; CHECK: @t2 + %ret = call double @copysign(double %x, double %y) + ret double %ret +; CHECK: call double @copysign +} + +define double @t3(double %x) { +; CHECK: @t3 + %call = call double @cos(double %x) + ret double %call +; CHECK: call double @cos +} + +define double @t4(double %x) { +; CHECK: @t4 + %ret = call double @fabs(double %x) + ret double %ret +; CHECK: call double 
@fabs +} + +define double @t5(double %x) { +; CHECK: @t5 + %ret = call double @floor(double %x) + ret double %ret +; CHECK: call double @floor +} + +define i8* @t6(i8* %x) { +; CHECK: @t6 + %empty = getelementptr [1 x i8]* @empty, i32 0, i32 0 + %ret = call i8* @strcat(i8* %x, i8* %empty) + ret i8* %ret +; CHECK: call i8* @strcat +} + +define i8* @t7(i8* %x) { +; CHECK: @t7 + %empty = getelementptr [1 x i8]* @empty, i32 0, i32 0 + %ret = call i8* @strncat(i8* %x, i8* %empty, i32 1) + ret i8* %ret +; CHECK: call i8* @strncat +} + +define i8* @t8() { +; CHECK: @t8 + %x = getelementptr inbounds [13 x i8]* @.str1, i32 0, i32 0 + %ret = call i8* @strchr(i8* %x, i32 119) + ret i8* %ret +; CHECK: call i8* @strchr +} + +define i8* @t9() { +; CHECK: @t9 + %x = getelementptr inbounds [13 x i8]* @.str1, i32 0, i32 0 + %ret = call i8* @strrchr(i8* %x, i32 119) + ret i8* %ret +; CHECK: call i8* @strrchr +} + +define i32 @t10() { +; CHECK: @t10 + %x = getelementptr inbounds [4 x i8]* @.str2, i32 0, i32 0 + %y = getelementptr inbounds [4 x i8]* @.str3, i32 0, i32 0 + %ret = call i32 @strcmp(i8* %x, i8* %y) + ret i32 %ret +; CHECK: call i32 @strcmp +} + +define i32 @t11() { +; CHECK: @t11 + %x = getelementptr inbounds [4 x i8]* @.str2, i32 0, i32 0 + %y = getelementptr inbounds [4 x i8]* @.str3, i32 0, i32 0 + %ret = call i32 @strncmp(i8* %x, i8* %y, i64 3) + ret i32 %ret +; CHECK: call i32 @strncmp +} + +define i8* @t12(i8* %x) { +; CHECK: @t12 + %y = getelementptr inbounds [4 x i8]* @.str2, i32 0, i32 0 + %ret = call i8* @strcpy(i8* %x, i8* %y) + ret i8* %ret +; CHECK: call i8* @strcpy +} + +define i8* @t13(i8* %x) { +; CHECK: @t13 + %y = getelementptr inbounds [4 x i8]* @.str2, i32 0, i32 0 + %ret = call i8* @stpcpy(i8* %x, i8* %y) + ret i8* %ret +; CHECK: call i8* @stpcpy +} + +define i8* @t14(i8* %x) { +; CHECK: @t14 + %y = getelementptr inbounds [4 x i8]* @.str2, i32 0, i32 0 + %ret = call i8* @strncpy(i8* %x, i8* %y, i64 3) + ret i8* %ret +; CHECK: call i8* @strncpy +} + 
+define i64 @t15() { +; CHECK: @t15 + %x = getelementptr inbounds [4 x i8]* @.str2, i32 0, i32 0 + %ret = call i64 @strlen(i8* %x) + ret i64 %ret +; CHECK: call i64 @strlen +} + +define i8* @t16(i8* %x) { +; CHECK: @t16 + %y = getelementptr inbounds [1 x i8]* @.str, i32 0, i32 0 + %ret = call i8* @strpbrk(i8* %x, i8* %y) + ret i8* %ret +; CHECK: call i8* @strpbrk +} + +define i64 @t17(i8* %x) { +; CHECK: @t17 + %y = getelementptr inbounds [1 x i8]* @.str, i32 0, i32 0 + %ret = call i64 @strspn(i8* %x, i8* %y) + ret i64 %ret +; CHECK: call i64 @strspn +} + +define double @t18(i8** %y) { +; CHECK: @t18 + %x = getelementptr inbounds [6 x i8]* @.str4, i64 0, i64 0 + %ret = call double @strtod(i8* %x, i8** %y) + ret double %ret +; CHECK: call double @strtod +} + +define float @t19(i8** %y) { +; CHECK: @t19 + %x = getelementptr inbounds [6 x i8]* @.str4, i64 0, i64 0 + %ret = call float @strtof(i8* %x, i8** %y) + ret float %ret +; CHECK: call float @strtof +} + +define x86_fp80 @t20(i8** %y) { +; CHECK: @t20 + %x = getelementptr inbounds [6 x i8]* @.str4, i64 0, i64 0 + %ret = call x86_fp80 @strtold(i8* %x, i8** %y) + ret x86_fp80 %ret +; CHECK: call x86_fp80 @strtold +} + +define i64 @t21(i8** %y) { +; CHECK: @t21 + %x = getelementptr inbounds [5 x i8]* @.str5, i64 0, i64 0 + %ret = call i64 @strtol(i8* %x, i8** %y, i32 10) + ret i64 %ret +; CHECK: call i64 @strtol +} + +define i64 @t22(i8** %y) { +; CHECK: @t22 + %x = getelementptr inbounds [5 x i8]* @.str5, i64 0, i64 0 + %ret = call i64 @strtoll(i8* %x, i8** %y, i32 10) + ret i64 %ret +; CHECK: call i64 @strtoll +} + +define i64 @t23(i8** %y) { +; CHECK: @t23 + %x = getelementptr inbounds [5 x i8]* @.str5, i64 0, i64 0 + %ret = call i64 @strtoul(i8* %x, i8** %y, i32 10) + ret i64 %ret +; CHECK: call i64 @strtoul +} + +define i64 @t24(i8** %y) { +; CHECK: @t24 + %x = getelementptr inbounds [5 x i8]* @.str5, i64 0, i64 0 + %ret = call i64 @strtoull(i8* %x, i8** %y, i32 10) + ret i64 %ret +; CHECK: call i64 @strtoull +} 
+ +define i64 @t25(i8* %y) { +; CHECK: @t25 + %x = getelementptr [1 x i8]* @empty, i32 0, i32 0 + %ret = call i64 @strcspn(i8* %x, i8* %y) + ret i64 %ret +; CHECK: call i64 @strcspn +} diff --git a/test/Transforms/InstCombine/div-shift.ll b/test/Transforms/InstCombine/div-shift.ll index a07f3ea..e0372eb 100644 --- a/test/Transforms/InstCombine/div-shift.ll +++ b/test/Transforms/InstCombine/div-shift.ll @@ -21,3 +21,17 @@ define i64 @t2(i64 %x, i32 %y) nounwind { %3 = udiv i64 %x, %2 ret i64 %3 } + +; PR13250 +define i64 @t3(i64 %x, i32 %y) nounwind { +; CHECK: t3 +; CHECK-NOT: udiv +; CHECK-NEXT: %1 = add i32 %y, 2 +; CHECK-NEXT: %2 = zext i32 %1 to i64 +; CHECK-NEXT: %3 = lshr i64 %x, %2 +; CHECK-NEXT: ret i64 %3 + %1 = shl i32 4, %y + %2 = zext i32 %1 to i64 + %3 = udiv i64 %x, %2 + ret i64 %3 +} diff --git a/test/Transforms/InstCombine/fcmp.ll b/test/Transforms/InstCombine/fcmp.ll index d08cbf5..376fa07 100644 --- a/test/Transforms/InstCombine/fcmp.ll +++ b/test/Transforms/InstCombine/fcmp.ll @@ -54,9 +54,8 @@ define i1 @test7(float %x) nounwind readnone ssp noredzone { %ext = fpext float %x to ppc_fp128 %cmp = fcmp ogt ppc_fp128 %ext, 0xM00000000000000000000000000000000 ret i1 %cmp -; Can't convert ppc_fp128 ; CHECK: @test7 -; CHECK-NEXT: fpext float %x to ppc_fp128 +; CHECK-NEXT: fcmp ogt float %x, 0.000000e+00 } define float @test8(float %x) nounwind readnone optsize ssp { @@ -69,3 +68,93 @@ define float @test8(float %x) nounwind readnone optsize ssp { ; CHECK: @test8 ; CHECK-NEXT: fcmp olt float %x, 0.000000e+00 } + +declare double @fabs(double) nounwind readnone + +define i32 @test9(double %a) nounwind { + %call = tail call double @fabs(double %a) nounwind + %cmp = fcmp olt double %call, 0.000000e+00 + %conv = zext i1 %cmp to i32 + ret i32 %conv +; CHECK: @test9 +; CHECK-NOT: fabs +; CHECK: ret i32 0 +} + +define i32 @test10(double %a) nounwind { + %call = tail call double @fabs(double %a) nounwind + %cmp = fcmp ole double %call, 0.000000e+00 + %conv = zext 
i1 %cmp to i32 + ret i32 %conv +; CHECK: @test10 +; CHECK-NOT: fabs +; CHECK: fcmp oeq double %a, 0.000000e+00 +} + +define i32 @test11(double %a) nounwind { + %call = tail call double @fabs(double %a) nounwind + %cmp = fcmp ogt double %call, 0.000000e+00 + %conv = zext i1 %cmp to i32 + ret i32 %conv +; CHECK: @test11 +; CHECK-NOT: fabs +; CHECK: fcmp one double %a, 0.000000e+00 +} + +define i32 @test12(double %a) nounwind { + %call = tail call double @fabs(double %a) nounwind + %cmp = fcmp oge double %call, 0.000000e+00 + %conv = zext i1 %cmp to i32 + ret i32 %conv +; CHECK: @test12 +; CHECK-NOT: fabs +; CHECK: fcmp ord double %a, 0.000000e+00 +} + +define i32 @test13(double %a) nounwind { + %call = tail call double @fabs(double %a) nounwind + %cmp = fcmp une double %call, 0.000000e+00 + %conv = zext i1 %cmp to i32 + ret i32 %conv +; CHECK: @test13 +; CHECK-NOT: fabs +; CHECK: fcmp une double %a, 0.000000e+00 +} + +define i32 @test14(double %a) nounwind { + %call = tail call double @fabs(double %a) nounwind + %cmp = fcmp oeq double %call, 0.000000e+00 + %conv = zext i1 %cmp to i32 + ret i32 %conv +; CHECK: @test14 +; CHECK-NOT: fabs +; CHECK: fcmp oeq double %a, 0.000000e+00 +} + +define i32 @test15(double %a) nounwind { + %call = tail call double @fabs(double %a) nounwind + %cmp = fcmp one double %call, 0.000000e+00 + %conv = zext i1 %cmp to i32 + ret i32 %conv +; CHECK: @test15 +; CHECK-NOT: fabs +; CHECK: fcmp one double %a, 0.000000e+00 +} + +define i32 @test16(double %a) nounwind { + %call = tail call double @fabs(double %a) nounwind + %cmp = fcmp ueq double %call, 0.000000e+00 + %conv = zext i1 %cmp to i32 + ret i32 %conv +; CHECK: @test16 +; CHECK-NOT: fabs +; CHECK: fcmp ueq double %a, 0.000000e+00 +} + +; Don't crash. 
+define i32 @test17(double %a, double (double)* %p) nounwind { + %call = tail call double %p(double %a) nounwind + %cmp = fcmp ueq double %call, 0.000000e+00 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} diff --git a/test/Transforms/InstCombine/fold-vector-select.ll b/test/Transforms/InstCombine/fold-vector-select.ll index 3f22522..2cb970b 100644 --- a/test/Transforms/InstCombine/fold-vector-select.ll +++ b/test/Transforms/InstCombine/fold-vector-select.ll @@ -1,13 +1,148 @@ ; RUN: opt < %s -instcombine -S | not grep select -define void @foo(<4 x i32> *%A, <4 x i32> *%B, <4 x i32> *%C, <4 x i32> *%D) { - %r = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> zeroinitializer - %g = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 3, i32 6, i32 9, i32 1> - %b = select <4 x i1> <i1 false, i1 true, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 7, i32 1, i32 4, i32 9> - %a = select <4 x i1> zeroinitializer, <4 x i32> zeroinitializer, <4 x i32> <i32 3, i32 2, i32 8, i32 5> - store <4 x i32> %r, <4 x i32>* %A - store <4 x i32> %g, <4 x i32>* %B - store <4 x i32> %b, <4 x i32>* %C - store <4 x i32> %a, <4 x i32>* %D +define void @foo(<4 x i32> *%A, <4 x i32> *%B, <4 x i32> *%C, <4 x i32> *%D, + <4 x i32> *%E, <4 x i32> *%F, <4 x i32> *%G, <4 x i32> *%H, + <4 x i32> *%I, <4 x i32> *%J, <4 x i32> *%K, <4 x i32> *%L, + <4 x i32> *%M, <4 x i32> *%N, <4 x i32> *%O, <4 x i32> *%P, + <4 x i32> *%Q, <4 x i32> *%R, <4 x i32> *%S, <4 x i32> *%T, + <4 x i32> *%U, <4 x i32> *%V, <4 x i32> *%W, <4 x i32> *%X, + <4 x i32> *%Y, <4 x i32> *%Z, <4 x i32> *%BA, <4 x i32> *%BB, + <4 x i32> *%BC, <4 x i32> *%BD, <4 x i32> *%BE, <4 x i32> *%BF, + <4 x i32> *%BG, <4 x i32> *%BH, <4 x i32> *%BI, <4 x i32> *%BJ, + <4 x i32> *%BK, <4 x i32> *%BL, <4 x i32> *%BM, <4 x i32> *%BN, + <4 x i32> *%BO, <4 x i32> *%BP, <4 x i32> *%BQ, <4 x i32> *%BR, + <4 x i32> *%BS, <4 x i32> *%BT, 
<4 x i32> *%BU, <4 x i32> *%BV, + <4 x i32> *%BW, <4 x i32> *%BX, <4 x i32> *%BY, <4 x i32> *%BZ, + <4 x i32> *%CA, <4 x i32> *%CB, <4 x i32> *%CC, <4 x i32> *%CD, + <4 x i32> *%CE, <4 x i32> *%CF, <4 x i32> *%CG, <4 x i32> *%CH, + <4 x i32> *%CI, <4 x i32> *%CJ, <4 x i32> *%CK, <4 x i32> *%CL) { + %a = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 9, i32 87, i32 57, i32 8> + %b = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 44, i32 99, i32 49, i32 29> + %c = select <4 x i1> <i1 false, i1 true, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 15, i32 18, i32 53, i32 84> + %d = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 29, i32 82, i32 45, i32 16> + %e = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 11, i32 15, i32 32, i32 99> + %f = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 19, i32 86, i32 29, i32 33> + %g = select <4 x i1> <i1 false, i1 true, i1 true, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 44, i32 10, i32 26, i32 45> + %h = select <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 88, i32 70, i32 90, i32 48> + %i = select <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 30, i32 53, i32 42, i32 12> + %j = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 46, i32 24, i32 93, i32 26> + %k = select <4 x i1> <i1 false, i1 true, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 33, i32 99, i32 15, i32 57> + %l = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 51, i32 60, i32 60, i32 50> + %m = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x i32> zeroinitializer, <4 x i32> 
<i32 50, i32 12, i32 7, i32 45> + %n = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 15, i32 65, i32 36, i32 36> + %o = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 54, i32 0, i32 17, i32 78> + %p = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 56, i32 13, i32 64, i32 48> + %q = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>, <4 x i32> <i32 52, i32 69, i32 88, i32 11>, <4 x i32> zeroinitializer + %r = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> <i32 5, i32 87, i32 68, i32 14>, <4 x i32> zeroinitializer + %s = select <4 x i1> <i1 false, i1 true, i1 false, i1 false>, <4 x i32> <i32 47, i32 17, i32 66, i32 63>, <4 x i32> zeroinitializer + %t = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i32> <i32 64, i32 25, i32 73, i32 81>, <4 x i32> zeroinitializer + %u = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x i32> <i32 51, i32 41, i32 61, i32 63>, <4 x i32> zeroinitializer + %v = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> <i32 39, i32 59, i32 17, i32 0>, <4 x i32> zeroinitializer + %w = select <4 x i1> <i1 false, i1 true, i1 true, i1 false>, <4 x i32> <i32 91, i32 99, i32 97, i32 29>, <4 x i32> zeroinitializer + %x = select <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x i32> <i32 89, i32 45, i32 89, i32 10>, <4 x i32> zeroinitializer + %y = select <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32> <i32 25, i32 70, i32 21, i32 27>, <4 x i32> zeroinitializer + %z = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> <i32 40, i32 12, i32 27, i32 88>, <4 x i32> zeroinitializer + %ba = select <4 x i1> <i1 false, i1 true, i1 false, i1 true>, <4 x i32> <i32 36, i32 35, i32 90, i32 23>, <4 x i32> zeroinitializer + %bb = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> <i32 83, i32 3, i32 64, i32 82>, 
<4 x i32> zeroinitializer + %bc = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x i32> <i32 15, i32 72, i32 2, i32 54>, <4 x i32> zeroinitializer + %bd = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i32> <i32 32, i32 47, i32 100, i32 84>, <4 x i32> zeroinitializer + %be = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x i32> <i32 92, i32 57, i32 82, i32 1>, <4 x i32> zeroinitializer + %bf = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> <i32 42, i32 14, i32 22, i32 89>, <4 x i32> zeroinitializer + %bg = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>, <4 x i32> <i32 33, i32 10, i32 67, i32 66>, <4 x i32> <i32 42, i32 91, i32 47, i32 40> + %bh = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> <i32 8, i32 13, i32 48, i32 0>, <4 x i32> <i32 84, i32 66, i32 87, i32 84> + %bi = select <4 x i1> <i1 false, i1 true, i1 false, i1 false>, <4 x i32> <i32 85, i32 96, i32 1, i32 94>, <4 x i32> <i32 54, i32 57, i32 7, i32 92> + %bj = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i32> <i32 55, i32 21, i32 92, i32 68>, <4 x i32> <i32 51, i32 61, i32 62, i32 39> + %bk = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x i32> <i32 42, i32 18, i32 77, i32 74>, <4 x i32> <i32 82, i32 33, i32 30, i32 7> + %bl = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> <i32 80, i32 92, i32 61, i32 84>, <4 x i32> <i32 43, i32 89, i32 92, i32 6> + %bm = select <4 x i1> <i1 false, i1 true, i1 true, i1 false>, <4 x i32> <i32 49, i32 14, i32 62, i32 62>, <4 x i32> <i32 35, i32 33, i32 92, i32 59> + %bn = select <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x i32> <i32 3, i32 97, i32 49, i32 18>, <4 x i32> <i32 56, i32 64, i32 19, i32 75> + %bo = select <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32> <i32 91, i32 57, i32 0, i32 1>, <4 x i32> <i32 43, i32 63, i32 64, i32 11> + %bp = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> <i32 41, i32 
65, i32 18, i32 11>, <4 x i32> <i32 86, i32 26, i32 31, i32 3> + %bq = select <4 x i1> <i1 false, i1 true, i1 false, i1 true>, <4 x i32> <i32 31, i32 46, i32 32, i32 68>, <4 x i32> <i32 100, i32 59, i32 62, i32 6> + %br = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> <i32 76, i32 67, i32 87, i32 7>, <4 x i32> <i32 63, i32 48, i32 97, i32 24> + %bs = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x i32> <i32 83, i32 89, i32 19, i32 4>, <4 x i32> <i32 21, i32 2, i32 40, i32 21> + %bt = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i32> <i32 45, i32 76, i32 81, i32 100>, <4 x i32> <i32 65, i32 26, i32 100, i32 46> + %bu = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x i32> <i32 16, i32 75, i32 31, i32 17>, <4 x i32> <i32 37, i32 66, i32 86, i32 65> + %bv = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> <i32 13, i32 25, i32 43, i32 59>, <4 x i32> <i32 82, i32 78, i32 60, i32 52> + %bw = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer + %bx = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer + %by = select <4 x i1> <i1 false, i1 true, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer + %bz = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer + %ca = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer + %cb = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer + %cc = select <4 x i1> <i1 false, i1 true, i1 true, i1 false>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer + %cd = select <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer + %ce = select <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x 
i32> zeroinitializer + %cf = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer + %cg = select <4 x i1> <i1 false, i1 true, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer + %ch = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer + %ci = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer + %cj = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer + %ck = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer + %cl = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer + store <4 x i32> %a, <4 x i32>* %A + store <4 x i32> %b, <4 x i32>* %B + store <4 x i32> %c, <4 x i32>* %C + store <4 x i32> %d, <4 x i32>* %D + store <4 x i32> %e, <4 x i32>* %E + store <4 x i32> %f, <4 x i32>* %F + store <4 x i32> %g, <4 x i32>* %G + store <4 x i32> %h, <4 x i32>* %H + store <4 x i32> %i, <4 x i32>* %I + store <4 x i32> %j, <4 x i32>* %J + store <4 x i32> %k, <4 x i32>* %K + store <4 x i32> %l, <4 x i32>* %L + store <4 x i32> %m, <4 x i32>* %M + store <4 x i32> %n, <4 x i32>* %N + store <4 x i32> %o, <4 x i32>* %O + store <4 x i32> %p, <4 x i32>* %P + store <4 x i32> %q, <4 x i32>* %Q + store <4 x i32> %r, <4 x i32>* %R + store <4 x i32> %s, <4 x i32>* %S + store <4 x i32> %t, <4 x i32>* %T + store <4 x i32> %u, <4 x i32>* %U + store <4 x i32> %v, <4 x i32>* %V + store <4 x i32> %w, <4 x i32>* %W + store <4 x i32> %x, <4 x i32>* %X + store <4 x i32> %y, <4 x i32>* %Y + store <4 x i32> %z, <4 x i32>* %Z + store <4 x i32> %ba, <4 x i32>* %BA + store <4 x i32> %bb, <4 x i32>* %BB + store <4 x i32> %bc, <4 x i32>* %BC + store <4 x i32> %bd, <4 x i32>* %BD + store <4 x i32> %be, <4 x i32>* %BE + store <4 x i32> %bf, <4 x i32>* %BF 
+ store <4 x i32> %bg, <4 x i32>* %BG + store <4 x i32> %bh, <4 x i32>* %BH + store <4 x i32> %bi, <4 x i32>* %BI + store <4 x i32> %bj, <4 x i32>* %BJ + store <4 x i32> %bk, <4 x i32>* %BK + store <4 x i32> %bl, <4 x i32>* %BL + store <4 x i32> %bm, <4 x i32>* %BM + store <4 x i32> %bn, <4 x i32>* %BN + store <4 x i32> %bo, <4 x i32>* %BO + store <4 x i32> %bp, <4 x i32>* %BP + store <4 x i32> %bq, <4 x i32>* %BQ + store <4 x i32> %br, <4 x i32>* %BR + store <4 x i32> %bs, <4 x i32>* %BS + store <4 x i32> %bt, <4 x i32>* %BT + store <4 x i32> %bu, <4 x i32>* %BU + store <4 x i32> %bv, <4 x i32>* %BV + store <4 x i32> %bw, <4 x i32>* %BW + store <4 x i32> %bx, <4 x i32>* %BX + store <4 x i32> %by, <4 x i32>* %BY + store <4 x i32> %bz, <4 x i32>* %BZ + store <4 x i32> %ca, <4 x i32>* %CA + store <4 x i32> %cb, <4 x i32>* %CB + store <4 x i32> %cc, <4 x i32>* %CC + store <4 x i32> %cd, <4 x i32>* %CD + store <4 x i32> %ce, <4 x i32>* %CE + store <4 x i32> %cf, <4 x i32>* %CF + store <4 x i32> %cg, <4 x i32>* %CG + store <4 x i32> %ch, <4 x i32>* %CH + store <4 x i32> %ci, <4 x i32>* %CI + store <4 x i32> %cj, <4 x i32>* %CJ + store <4 x i32> %ck, <4 x i32>* %CK + store <4 x i32> %cl, <4 x i32>* %CL ret void } diff --git a/test/Transforms/InstCombine/icmp.ll b/test/Transforms/InstCombine/icmp.ll index eaff87d..8e064a4 100644 --- a/test/Transforms/InstCombine/icmp.ll +++ b/test/Transforms/InstCombine/icmp.ll @@ -659,3 +659,21 @@ define i1 @test64(i8 %a, i32 %b) nounwind { ; CHECK-NEXT: %c = icmp eq i8 %1, %a ; CHECK-NEXT: ret i1 %c } + +define i1 @test65(i64 %A, i64 %B) { + %s1 = add i64 %A, %B + %s2 = add i64 %A, %B + %cmp = icmp eq i64 %s1, %s2 +; CHECK: @test65 +; CHECK-NEXT: ret i1 true + ret i1 %cmp +} + +define i1 @test66(i64 %A, i64 %B) { + %s1 = add i64 %A, %B + %s2 = add i64 %B, %A + %cmp = icmp eq i64 %s1, %s2 +; CHECK: @test66 +; CHECK-NEXT: ret i1 true + ret i1 %cmp +} diff --git a/test/Transforms/InstCombine/memcmp-1.ll 
b/test/Transforms/InstCombine/memcmp-1.ll new file mode 100644 index 0000000..4238c5f --- /dev/null +++ b/test/Transforms/InstCombine/memcmp-1.ll @@ -0,0 +1,72 @@ +; Test that the memcmp library call simplifier works correctly. +; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +@foo = constant [4 x i8] c"foo\00" +@hel = constant [4 x i8] c"hel\00" +@hello_u = constant [8 x i8] c"hello_u\00" + +declare i32 @memcmp(i8*, i8*, i32) + +; Check memcmp(mem, mem, size) -> 0. + +define i32 @test_simplify1(i8* %mem, i32 %size) { +; CHECK: @test_simplify1 + %ret = call i32 @memcmp(i8* %mem, i8* %mem, i32 %size) + ret i32 %ret +; CHECK: ret i32 0 +} + +; Check memcmp(mem1, mem2, 0) -> 0. + +define i32 @test_simplify2(i8* %mem1, i8* %mem2) { +; CHECK: @test_simplify2 + %ret = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 0) + ret i32 %ret +; CHECK: ret i32 0 +} + +;; Check memcmp(mem1, mem2, 1) -> *(unsigned char*)mem1 - *(unsigned char*)mem2. + +define i32 @test_simplify3(i8* %mem1, i8* %mem2) { +; CHECK: @test_simplify3 + %ret = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 1) +; CHECK: [[LOAD1:%[a-z]+]] = load i8* %mem1, align 1 +; CHECK: [[ZEXT1:%[a-z]+]] = zext i8 [[LOAD1]] to i32 +; CHECK: [[LOAD2:%[a-z]+]] = load i8* %mem2, align 1 +; CHECK: [[ZEXT2:%[a-z]+]] = zext i8 [[LOAD2]] to i32 +; CHECK: [[RET:%[a-z]+]] = sub i32 [[ZEXT1]], [[ZEXT2]] + ret i32 %ret +; CHECK: ret i32 [[RET]] +} + +; Check memcmp(mem1, mem2, size) -> cnst, where all arguments are constants. 
+ +define i32 @test_simplify4() { +; CHECK: @test_simplify4 + %mem1 = getelementptr [4 x i8]* @hel, i32 0, i32 0 + %mem2 = getelementptr [8 x i8]* @hello_u, i32 0, i32 0 + %ret = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 3) + ret i32 %ret +; CHECK: ret i32 0 +} + +define i32 @test_simplify5() { +; CHECK: @test_simplify5 + %mem1 = getelementptr [4 x i8]* @hel, i32 0, i32 0 + %mem2 = getelementptr [4 x i8]* @foo, i32 0, i32 0 + %ret = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 3) + ret i32 %ret +; CHECK: ret i32 {{[0-9]+}} +} + +define i32 @test_simplify6() { +; CHECK: @test_simplify6 + %mem1 = getelementptr [4 x i8]* @foo, i32 0, i32 0 + %mem2 = getelementptr [4 x i8]* @hel, i32 0, i32 0 + %ret = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 3) + ret i32 %ret +; CHECK: ret i32 {{-[0-9]+}} +} diff --git a/test/Transforms/InstCombine/memcmp-2.ll b/test/Transforms/InstCombine/memcmp-2.ll new file mode 100644 index 0000000..3796117 --- /dev/null +++ b/test/Transforms/InstCombine/memcmp-2.ll @@ -0,0 +1,17 @@ +; Test that the memcmp library call simplifier works correctly. +; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +declare i32* @memcmp(i8*, i8*, i32) + +; Check that memcmp functions with the wrong prototype aren't simplified. + +define i32* @test_no_simplify1(i8* %mem, i32 %size) { +; CHECK: @test_no_simplify1 + %ret = call i32* @memcmp(i8* %mem, i8* %mem, i32 %size) +; CHECK-NEXT: call i32* @memcmp + ret i32* %ret +; CHECK-NEXT: ret i32* %ret +} diff --git a/test/Transforms/InstCombine/memcpy-1.ll b/test/Transforms/InstCombine/memcpy-1.ll new file mode 100644 index 0000000..65b79ad --- /dev/null +++ b/test/Transforms/InstCombine/memcpy-1.ll @@ -0,0 +1,17 @@ +; Test that the memcpy library call simplifier works correctly. 
+; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +declare i8* @memcpy(i8*, i8*, i32) + +; Check memcpy(mem1, mem2, size) -> llvm.memcpy(mem1, mem2, size, 1). + +define i8* @test_simplify1(i8* %mem1, i8* %mem2, i32 %size) { +; CHECK: @test_simplify1 + %ret = call i8* @memcpy(i8* %mem1, i8* %mem2, i32 %size) +; CHECK: call void @llvm.memcpy + ret i8* %ret +; CHECK: ret i8* %mem1 +} diff --git a/test/Transforms/InstCombine/memcpy-2.ll b/test/Transforms/InstCombine/memcpy-2.ll new file mode 100644 index 0000000..4a8a020 --- /dev/null +++ b/test/Transforms/InstCombine/memcpy-2.ll @@ -0,0 +1,17 @@ +; Test that the memcpy library call simplifier works correctly. +; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +declare i8 @memcpy(i8*, i8*, i32) + +; Check that memcpy functions with the wrong prototype aren't simplified. 
+ +define i8 @test_no_simplify1(i8* %mem1, i8* %mem2, i32 %size) { +; CHECK: @test_no_simplify1 + %ret = call i8 @memcpy(i8* %mem1, i8* %mem2, i32 %size) +; CHECK: call i8 @memcpy + ret i8 %ret +; CHECK: ret i8 %ret +} diff --git a/test/Transforms/ScalarRepl/memcpy-from-global.ll b/test/Transforms/InstCombine/memcpy-from-global.ll index 5557a8f..83c893e 100644 --- a/test/Transforms/ScalarRepl/memcpy-from-global.ll +++ b/test/Transforms/InstCombine/memcpy-from-global.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -scalarrepl -S | FileCheck %s +; RUN: opt < %s -instcombine -S | FileCheck %s target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64" @C.0.1248 = internal constant [128 x float] [ float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float -1.000000e+00, float -1.000000e+00, float 1.000000e+00, float 0.000000e+00, float -1.000000e+00, float 0.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float -1.000000e+00, float 1.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float -1.000000e+00, float 1.000000e+00, float -1.000000e+00, float 0.000000e+00, float -1.000000e+00, float 1.000000e+00, float 0.000000e+00, float -1.000000e+00, float -1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 1.000000e+00, float -1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float -1.000000e+00, float -1.000000e+00, float 1.000000e+00, float 0.000000e+00, float -1.000000e+00, float 1.000000e+00, float -1.000000e+00, float 
0.000000e+00, float -1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float -1.000000e+00, float 0.000000e+00, float -1.000000e+00, float 1.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float -1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float -1.000000e+00, float -1.000000e+00, float 1.000000e+00, float 0.000000e+00, float -1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 1.000000e+00, float -1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float 0.000000e+00, float -1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float -1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 ], align 32 ; <[128 x float]*> [#uses=1] @@ -6,13 +6,11 @@ define float @test1(i32 %hash, float %x, float %y, float %z, float %w) { entry: %lookupTable = alloca [128 x float], align 16 ; <[128 x float]*> [#uses=5] %lookupTable1 = bitcast [128 x float]* %lookupTable to i8* ; <i8*> [#uses=1] - call void @llvm.memcpy.i32( i8* %lookupTable1, i8* bitcast ([128 x float]* @C.0.1248 to i8*), i32 512, i32 16 ) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %lookupTable1, i8* bitcast ([128 x float]* @C.0.1248 to i8*), i64 512, i32 16, i1 false) ; CHECK: @test1 ; CHECK-NOT: alloca ; CHECK-NOT: call{{.*}}@llvm.memcpy -; CHECK: 
%lookupTable1 = bitcast [128 x float]* @C.0.1248 to i8* -; CHECK-NOT: call{{.*}}@llvm.memcpy %tmp3 = shl i32 %hash, 2 ; <i32> [#uses=1] %tmp5 = and i32 %tmp3, 124 ; <i32> [#uses=4] @@ -38,10 +36,6 @@ entry: ret float %tmp43 } -declare void @llvm.memcpy.i32(i8*, i8*, i32, i32) - - - declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind %T = type { i8, [123 x i8] } @@ -59,10 +53,11 @@ define void @test2() { ; CHECK: @test2 ; %A alloca is deleted -; CHECK-NEXT: %B = alloca %T +; CHECK-NEXT: alloca [124 x i8] +; CHECK-NEXT: getelementptr inbounds [124 x i8]* ; use @G instead of %A -; CHECK-NEXT: %a = bitcast %T* @G to i8* +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.*}}, i8* getelementptr inbounds (%T* @G, i64 0, i32 0) call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* bitcast (%T* @G to i8*), i64 124, i32 4, i1 false) call void @llvm.memcpy.p0i8.p0i8.i64(i8* %b, i8* %a, i64 124, i32 4, i1 false) call void @bar(i8* %b) @@ -79,8 +74,7 @@ define void @test3() { call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* bitcast (%T* @G to i8*), i64 124, i32 4, i1 false) call void @bar(i8* %a) readonly ; CHECK: @test3 -; CHECK-NEXT: %a = bitcast %T* @G to i8* -; CHECK-NEXT: call void @bar(i8* %a) +; CHECK-NEXT: call void @bar(i8* getelementptr inbounds (%T* @G, i64 0, i32 0)) ret void } @@ -90,8 +84,7 @@ define void @test4() { call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* bitcast (%T* @G to i8*), i64 124, i32 4, i1 false) call void @baz(i8* byval %a) ; CHECK: @test4 -; CHECK-NEXT: %a = bitcast %T* @G to i8* -; CHECK-NEXT: call void @baz(i8* byval %a) +; CHECK-NEXT: call void @baz(i8* byval getelementptr inbounds (%T* @G, i64 0, i32 0)) ret void } @@ -103,8 +96,7 @@ define void @test5() { call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* bitcast (%T* @G to i8*), i64 124, i32 4, i1 false) call void @baz(i8* byval %a) ; CHECK: @test5 -; CHECK-NEXT: %a = bitcast %T* @G to i8* -; CHECK-NEXT: call void @baz(i8* byval %a) +; 
CHECK-NEXT: call void @baz(i8* byval getelementptr inbounds (%T* @G, i64 0, i32 0)) ret void } @@ -118,8 +110,7 @@ define void @test6() { call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* bitcast ([2 x %U]* @H to i8*), i64 20, i32 16, i1 false) call void @bar(i8* %a) readonly ; CHECK: @test6 -; CHECK-NEXT: %a = bitcast -; CHECK-NEXT: call void @bar(i8* %a) +; CHECK-NEXT: call void @bar(i8* bitcast ([2 x %U]* @H to i8*)) ret void } @@ -129,8 +120,7 @@ define void @test7() { call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* bitcast (%U* getelementptr ([2 x %U]* @H, i64 0, i32 0) to i8*), i64 20, i32 4, i1 false) call void @bar(i8* %a) readonly ; CHECK: @test7 -; CHECK-NEXT: %a = bitcast -; CHECK-NEXT: call void @bar(i8* %a) +; CHECK-NEXT: call void @bar(i8* bitcast ([2 x %U]* @H to i8*)) ret void } diff --git a/test/Transforms/InstCombine/memcpy_chk-1.ll b/test/Transforms/InstCombine/memcpy_chk-1.ll new file mode 100644 index 0000000..7c7d918 --- /dev/null +++ b/test/Transforms/InstCombine/memcpy_chk-1.ll @@ -0,0 +1,60 @@ +; Test lib call simplification of __memcpy_chk calls with various values +; for dstlen and len. +; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" + +%struct.T1 = type { [100 x i32], [100 x i32], [1024 x i8] } +%struct.T2 = type { [100 x i32], [100 x i32], [1024 x i8] } +%struct.T3 = type { [100 x i32], [100 x i32], [2048 x i8] } + +@t1 = common global %struct.T1 zeroinitializer +@t2 = common global %struct.T2 zeroinitializer +@t3 = common global %struct.T3 zeroinitializer + +; Check cases where dstlen >= len. 
+ +define void @test_simplify1() { +; CHECK: @test_simplify1 + %dst = bitcast %struct.T1* @t1 to i8* + %src = bitcast %struct.T2* @t2 to i8* + +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64 + call i8* @__memcpy_chk(i8* %dst, i8* %src, i64 1824, i64 1824) + ret void +} + +define void @test_simplify2() { +; CHECK: @test_simplify2 + %dst = bitcast %struct.T1* @t1 to i8* + %src = bitcast %struct.T3* @t3 to i8* + +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64 + call i8* @__memcpy_chk(i8* %dst, i8* %src, i64 1824, i64 2848) + ret void +} + +; Check cases where dstlen < len. + +define void @test_no_simplify1() { +; CHECK: @test_no_simplify1 + %dst = bitcast %struct.T3* @t3 to i8* + %src = bitcast %struct.T1* @t1 to i8* + +; CHECK-NEXT: call i8* @__memcpy_chk + call i8* @__memcpy_chk(i8* %dst, i8* %src, i64 2848, i64 1824) + ret void +} + +define void @test_no_simplify2() { +; CHECK: @test_no_simplify2 + %dst = bitcast %struct.T1* @t1 to i8* + %src = bitcast %struct.T2* @t2 to i8* + +; CHECK-NEXT: call i8* @__memcpy_chk + call i8* @__memcpy_chk(i8* %dst, i8* %src, i64 1024, i64 0) + ret void +} + +declare i8* @__memcpy_chk(i8*, i8*, i64, i64) diff --git a/test/Transforms/InstCombine/memcpy_chk-2.ll b/test/Transforms/InstCombine/memcpy_chk-2.ll new file mode 100644 index 0000000..aa43029 --- /dev/null +++ b/test/Transforms/InstCombine/memcpy_chk-2.ll @@ -0,0 +1,24 @@ +; Test that lib call simplification doesn't simplify __memcpy_chk calls +; with the wrong prototype. 
+; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" + +%struct.T1 = type { [100 x i32], [100 x i32], [1024 x i8] } +%struct.T2 = type { [100 x i32], [100 x i32], [1024 x i8] } + +@t1 = common global %struct.T1 zeroinitializer +@t2 = common global %struct.T2 zeroinitializer + +define void @test_no_simplify() { +; CHECK: @test_no_simplify + %dst = bitcast %struct.T1* @t1 to i8* + %src = bitcast %struct.T2* @t2 to i8* + +; CHECK-NEXT: call i8* @__memcpy_chk + call i8* @__memcpy_chk(i8* %dst, i8* %src, i64 1824) + ret void +} + +declare i8* @__memcpy_chk(i8*, i8*, i64) diff --git a/test/Transforms/InstCombine/memmove-1.ll b/test/Transforms/InstCombine/memmove-1.ll new file mode 100644 index 0000000..53f2f11 --- /dev/null +++ b/test/Transforms/InstCombine/memmove-1.ll @@ -0,0 +1,17 @@ +; Test that the memmove library call simplifier works correctly. +; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +declare i8* @memmove(i8*, i8*, i32) + +; Check memmove(mem1, mem2, size) -> llvm.memmove(mem1, mem2, size, 1). + +define i8* @test_simplify1(i8* %mem1, i8* %mem2, i32 %size) { +; CHECK: @test_simplify1 + %ret = call i8* @memmove(i8* %mem1, i8* %mem2, i32 %size) +; CHECK: call void @llvm.memmove + ret i8* %ret +; CHECK: ret i8* %mem1 +} diff --git a/test/Transforms/InstCombine/memmove-2.ll b/test/Transforms/InstCombine/memmove-2.ll new file mode 100644 index 0000000..23887bc --- /dev/null +++ b/test/Transforms/InstCombine/memmove-2.ll @@ -0,0 +1,17 @@ +; Test that the memmove library call simplifier works correctly. 
+; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +declare i8 @memmove(i8*, i8*, i32) + +; Check that memmove functions with the wrong prototype aren't simplified. + +define i8 @test_no_simplify1(i8* %mem1, i8* %mem2, i32 %size) { +; CHECK: @test_no_simplify1 + %ret = call i8 @memmove(i8* %mem1, i8* %mem2, i32 %size) +; CHECK: call i8 @memmove + ret i8 %ret +; CHECK: ret i8 %ret +} diff --git a/test/Transforms/InstCombine/memmove_chk-1.ll b/test/Transforms/InstCombine/memmove_chk-1.ll new file mode 100644 index 0000000..f9ff9a1 --- /dev/null +++ b/test/Transforms/InstCombine/memmove_chk-1.ll @@ -0,0 +1,60 @@ +; Test lib call simplification of __memmove_chk calls with various values +; for dstlen and len. +; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" + +%struct.T1 = type { [100 x i32], [100 x i32], [1024 x i8] } +%struct.T2 = type { [100 x i32], [100 x i32], [1024 x i8] } +%struct.T3 = type { [100 x i32], [100 x i32], [2048 x i8] } + +@t1 = common global %struct.T1 zeroinitializer +@t2 = common global %struct.T2 zeroinitializer +@t3 = common global %struct.T3 zeroinitializer + +; Check cases where dstlen >= len. 
+ +define void @test_simplify1() { +; CHECK: @test_simplify1 + %dst = bitcast %struct.T1* @t1 to i8* + %src = bitcast %struct.T2* @t2 to i8* + +; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i64 + call i8* @__memmove_chk(i8* %dst, i8* %src, i64 1824, i64 1824) + ret void +} + +define void @test_simplify2() { +; CHECK: @test_simplify2 + %dst = bitcast %struct.T1* @t1 to i8* + %src = bitcast %struct.T3* @t3 to i8* + +; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i64 + call i8* @__memmove_chk(i8* %dst, i8* %src, i64 1824, i64 2848) + ret void +} + +; Check cases where dstlen < len. + +define void @test_no_simplify1() { +; CHECK: @test_no_simplify1 + %dst = bitcast %struct.T3* @t3 to i8* + %src = bitcast %struct.T1* @t1 to i8* + +; CHECK-NEXT: call i8* @__memmove_chk + call i8* @__memmove_chk(i8* %dst, i8* %src, i64 2848, i64 1824) + ret void +} + +define void @test_no_simplify2() { +; CHECK: @test_no_simplify2 + %dst = bitcast %struct.T1* @t1 to i8* + %src = bitcast %struct.T2* @t2 to i8* + +; CHECK-NEXT: call i8* @__memmove_chk + call i8* @__memmove_chk(i8* %dst, i8* %src, i64 1024, i64 0) + ret void +} + +declare i8* @__memmove_chk(i8*, i8*, i64, i64) diff --git a/test/Transforms/InstCombine/memmove_chk-2.ll b/test/Transforms/InstCombine/memmove_chk-2.ll new file mode 100644 index 0000000..f0a915f --- /dev/null +++ b/test/Transforms/InstCombine/memmove_chk-2.ll @@ -0,0 +1,24 @@ +; Test that lib call simplification doesn't simplify __memmove_chk calls +; with the wrong prototype. 
+; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" + +%struct.T1 = type { [100 x i32], [100 x i32], [1024 x i8] } +%struct.T2 = type { [100 x i32], [100 x i32], [1024 x i8] } + +@t1 = common global %struct.T1 zeroinitializer +@t2 = common global %struct.T2 zeroinitializer + +define void @test_no_simplify() { +; CHECK: @test_no_simplify + %dst = bitcast %struct.T1* @t1 to i8* + %src = bitcast %struct.T2* @t2 to i8* + +; CHECK-NEXT: call i8* @__memmove_chk + call i8* @__memmove_chk(i8* %dst, i8* %src, i64 1824) + ret void +} + +declare i8* @__memmove_chk(i8*, i8*, i64) diff --git a/test/Transforms/InstCombine/memset-1.ll b/test/Transforms/InstCombine/memset-1.ll new file mode 100644 index 0000000..48b433e --- /dev/null +++ b/test/Transforms/InstCombine/memset-1.ll @@ -0,0 +1,17 @@ +; Test that the memset library call simplifier works correctly. +; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +declare i8* @memset(i8*, i32, i32) + +; Check memset(mem1, val, size) -> llvm.memset(mem1, val, size, 1). + +define i8* @test_simplify1(i8* %mem, i32 %val, i32 %size) { +; CHECK: @test_simplify1 + %ret = call i8* @memset(i8* %mem, i32 %val, i32 %size) +; CHECK: call void @llvm.memset + ret i8* %ret +; CHECK: ret i8* %mem +} diff --git a/test/Transforms/InstCombine/memset-2.ll b/test/Transforms/InstCombine/memset-2.ll new file mode 100644 index 0000000..8a90333 --- /dev/null +++ b/test/Transforms/InstCombine/memset-2.ll @@ -0,0 +1,17 @@ +; Test that the memset library call simplifier works correctly. 
+; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +declare i8 @memset(i8*, i32, i32) + +; Check that memset functions with the wrong prototype aren't simplified. + +define i8 @test_no_simplify1(i8* %mem, i32 %val, i32 %size) { +; CHECK: @test_no_simplify1 + %ret = call i8 @memset(i8* %mem, i32 %val, i32 %size) +; CHECK: call i8 @memset + ret i8 %ret +; CHECK: ret i8 %ret +} diff --git a/test/Transforms/InstCombine/memset_chk-1.ll b/test/Transforms/InstCombine/memset_chk-1.ll new file mode 100644 index 0000000..be4c1cf --- /dev/null +++ b/test/Transforms/InstCombine/memset_chk-1.ll @@ -0,0 +1,61 @@ +; Test lib call simplification of __memset_chk calls with various values +; for dstlen and len. +; +; RUN: opt < %s -instcombine -S | FileCheck %s +; rdar://7719085 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" + +%struct.T = type { [100 x i32], [100 x i32], [1024 x i8] } +@t = common global %struct.T zeroinitializer + +; Check cases where dstlen >= len. + +define void @test_simplify1() { +; CHECK: @test_simplify1 + %dst = bitcast %struct.T* @t to i8* + +; CHECK-NEXT: call void @llvm.memset.p0i8.i64 + call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 1824) + ret void +} + +define void @test_simplify2() { +; CHECK: @test_simplify2 + %dst = bitcast %struct.T* @t to i8* + +; CHECK-NEXT: call void @llvm.memset.p0i8.i64 + call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 3648) + ret void +} + +define void @test_simplify3() { +; CHECK: @test_simplify3 + %dst = bitcast %struct.T* @t to i8* + +; CHECK-NEXT: call void @llvm.memset.p0i8.i64 + call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 -1) + ret void +} + +; Check cases where dstlen < len. 
+ +define void @test_no_simplify1() { +; CHECK: @test_no_simplify1 + %dst = bitcast %struct.T* @t to i8* + +; CHECK-NEXT: call i8* @__memset_chk + call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 400) + ret void +} + +define void @test_no_simplify2() { +; CHECK: @test_no_simplify2 + %dst = bitcast %struct.T* @t to i8* + +; CHECK-NEXT: call i8* @__memset_chk + call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 0) + ret void +} + +declare i8* @__memset_chk(i8*, i32, i64, i64) diff --git a/test/Transforms/InstCombine/memset_chk-2.ll b/test/Transforms/InstCombine/memset_chk-2.ll new file mode 100644 index 0000000..60fbf16 --- /dev/null +++ b/test/Transforms/InstCombine/memset_chk-2.ll @@ -0,0 +1,20 @@ +; Test that lib call simplification doesn't simplify __memset_chk calls +; with the wrong prototype. +; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" + +%struct.T = type { [100 x i32], [100 x i32], [1024 x i8] } +@t = common global %struct.T zeroinitializer + +define void @test_no_simplify() { +; CHECK: @test_no_simplify + %dst = bitcast %struct.T* @t to i8* + +; CHECK-NEXT: call i8* @__memset_chk + call i8* @__memset_chk(i8* %dst, i32 0, i64 1824) + ret void +} + +declare i8* @__memset_chk(i8*, i32, i64) diff --git a/test/Transforms/InstCombine/memset_chk.ll b/test/Transforms/InstCombine/memset_chk.ll deleted file mode 100644 index 58ecda5..0000000 --- a/test/Transforms/InstCombine/memset_chk.ll +++ /dev/null @@ -1,18 +0,0 @@ -; RUN: opt < %s -instcombine -S | FileCheck %s -; rdar://7719085 - -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" - -%struct.data = type { [100 x i32], [100 x i32], [1024 x i8] } - -define i32 @t() nounwind ssp { -; CHECK: @t -; CHECK: @llvm.memset.p0i8.i64 
-entry: - %0 = alloca %struct.data, align 8 ; <%struct.data*> [#uses=1] - %1 = bitcast %struct.data* %0 to i8* ; <i8*> [#uses=1] - %2 = call i8* @__memset_chk(i8* %1, i32 0, i64 1824, i64 1824) nounwind ; <i8*> [#uses=0] - ret i32 0 -} - -declare i8* @__memset_chk(i8*, i32, i64, i64) nounwind diff --git a/test/Transforms/InstCombine/obfuscated_splat.ll b/test/Transforms/InstCombine/obfuscated_splat.ll new file mode 100644 index 0000000..c25dade --- /dev/null +++ b/test/Transforms/InstCombine/obfuscated_splat.ll @@ -0,0 +1,11 @@ +; RUN: opt -instcombine -S %s | FileCheck %s + +define void @test(<4 x float> *%in_ptr, <4 x float> *%out_ptr) { + %A = load <4 x float>* %in_ptr, align 16 + %B = shufflevector <4 x float> %A, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef> + %C = shufflevector <4 x float> %B, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 4, i32 undef> + %D = shufflevector <4 x float> %C, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 2, i32 4> +; CHECK: %D = shufflevector <4 x float> %A, <4 x float> undef, <4 x i32> zeroinitializer + store <4 x float> %D, <4 x float> *%out_ptr + ret void +} diff --git a/test/Transforms/InstCombine/objsize.ll b/test/Transforms/InstCombine/objsize.ll index d7e2921..31a3cb4 100644 --- a/test/Transforms/InstCombine/objsize.ll +++ b/test/Transforms/InstCombine/objsize.ll @@ -247,7 +247,8 @@ entry: ; technically reachable, but this malformed IR may appear as a result of constant propagation xpto: - %gep = getelementptr i8* %gep, i32 1 + %gep2 = getelementptr i8* %gep, i32 1 + %gep = getelementptr i8* %gep2, i32 1 %o = call i32 @llvm.objectsize.i32(i8* %gep, i1 true) ; CHECK: ret i32 undef ret i32 %o diff --git a/test/Transforms/InstCombine/select.ll b/test/Transforms/InstCombine/select.ll index 4baae26..cc3aacd 100644 --- a/test/Transforms/InstCombine/select.ll +++ b/test/Transforms/InstCombine/select.ll @@ -829,3 +829,37 @@ define i1 @test63(i1 %A, i1 %B) { ; CHECK: %C = or i1 %B, %not ; CHECK: ret i1 %C } + +; 
PR14131 +define void @test64(i32 %p, i16 %b) noreturn nounwind { +entry: + %p.addr.0.insert.mask = and i32 %p, -65536 + %conv2 = and i32 %p, 65535 + br i1 undef, label %lor.rhs, label %lor.end + +lor.rhs: + %p.addr.0.extract.trunc = trunc i32 %p.addr.0.insert.mask to i16 + %phitmp = zext i16 %p.addr.0.extract.trunc to i32 + br label %lor.end + +lor.end: + %t.1 = phi i32 [ 0, %entry ], [ %phitmp, %lor.rhs ] + %conv6 = zext i16 %b to i32 + %div = udiv i32 %conv6, %t.1 + %tobool8 = icmp eq i32 %div, 0 + %cmp = icmp eq i32 %t.1, 0 + %cmp12 = icmp ult i32 %conv2, 2 + %cmp.sink = select i1 %tobool8, i1 %cmp12, i1 %cmp + br i1 %cmp.sink, label %cond.end17, label %cond.false16 + +cond.false16: + br label %cond.end17 + +cond.end17: + br label %while.body + +while.body: + br label %while.body +; CHECK: @test64 +; CHECK-NOT: select +} diff --git a/test/Transforms/InstCombine/stpcpy-1.ll b/test/Transforms/InstCombine/stpcpy-1.ll new file mode 100644 index 0000000..8b6bb0e --- /dev/null +++ b/test/Transforms/InstCombine/stpcpy-1.ll @@ -0,0 +1,46 @@ +; Test that the stpcpy library call simplifier works correctly. +; RUN: opt < %s -instcombine -S | FileCheck %s +; +; This transformation requires the pointer size, as it assumes that size_t is +; the size of a pointer. 
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32" + +@hello = constant [6 x i8] c"hello\00" +@a = common global [32 x i8] zeroinitializer, align 1 +@b = common global [32 x i8] zeroinitializer, align 1 + +declare i8* @stpcpy(i8*, i8*) + +define i8* @test_simplify1() { +; CHECK: @test_simplify1 + + %dst = getelementptr [32 x i8]* @a, i32 0, i32 0 + %src = getelementptr [6 x i8]* @hello, i32 0, i32 0 + + %ret = call i8* @stpcpy(i8* %dst, i8* %src) +; CHECK: @llvm.memcpy.p0i8.p0i8.i32 +; CHECK-NEXT: getelementptr inbounds ([32 x i8]* @a, i32 0, i32 5) + ret i8* %ret +} + +define i8* @test_simplify2() { +; CHECK: @test_simplify2 + + %dst = getelementptr [32 x i8]* @a, i32 0, i32 0 + + %ret = call i8* @stpcpy(i8* %dst, i8* %dst) +; CHECK: [[LEN:%[a-z]+]] = call i32 @strlen +; CHECK-NEXT: getelementptr inbounds [32 x i8]* @a, i32 0, i32 [[LEN]] + ret i8* %ret +} + +define i8* @test_no_simplify1() { +; CHECK: @test_no_simplify1 + + %dst = getelementptr [32 x i8]* @a, i32 0, i32 0 + %src = getelementptr [32 x i8]* @b, i32 0, i32 0 + + %ret = call i8* @stpcpy(i8* %dst, i8* %src) +; CHECK: call i8* @stpcpy + ret i8* %ret +} diff --git a/test/Transforms/InstCombine/stpcpy-2.ll b/test/Transforms/InstCombine/stpcpy-2.ll new file mode 100644 index 0000000..2e92c08 --- /dev/null +++ b/test/Transforms/InstCombine/stpcpy-2.ll @@ -0,0 +1,22 @@ +; Test that the stpcpy library call simplifier works correctly. +; RUN: opt < %s -instcombine -S | FileCheck %s +; +; This transformation requires the pointer size, as it assumes that size_t is +; the size of a pointer. 
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32" + +@hello = constant [6 x i8] c"hello\00" +@a = common global [32 x i8] zeroinitializer, align 1 + +declare i16* @stpcpy(i8*, i8*) + +define void @test_no_simplify1() { +; CHECK: @test_no_simplify1 + + %dst = getelementptr [32 x i8]* @a, i32 0, i32 0 + %src = getelementptr [6 x i8]* @hello, i32 0, i32 0 + + call i16* @stpcpy(i8* %dst, i8* %src) +; CHECK: call i16* @stpcpy + ret void +} diff --git a/test/Transforms/InstCombine/stpcpy_chk-1.ll b/test/Transforms/InstCombine/stpcpy_chk-1.ll new file mode 100644 index 0000000..0560391 --- /dev/null +++ b/test/Transforms/InstCombine/stpcpy_chk-1.ll @@ -0,0 +1,96 @@ +; Test lib call simplification of __stpcpy_chk calls with various values +; for src, dst, and slen. +; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +@a = common global [60 x i8] zeroinitializer, align 1 +@b = common global [60 x i8] zeroinitializer, align 1 +@.str = private constant [12 x i8] c"abcdefghijk\00" + +; Check cases where slen >= strlen (src). 
+ +define void @test_simplify1() { +; CHECK: @test_simplify1 + %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0 + %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0 + +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32 + call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 60) + ret void +} + +define void @test_simplify2() { +; CHECK: @test_simplify2 + %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0 + %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0 + +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32 + call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 12) + ret void +} + +define void @test_simplify3() { +; CHECK: @test_simplify3 + %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0 + %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0 + +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32 + call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 -1) + ret void +} + +; Check cases where there are no string constants. + +define void @test_simplify4() { +; CHECK: @test_simplify4 + %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0 + %src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0 + +; CHECK-NEXT: call i8* @stpcpy + call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 -1) + ret void +} + +; Check case where the string length is not constant. + +define i8* @test_simplify5() { +; CHECK: @test_simplify5 + %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0 + %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0 + +; CHECK: @__memcpy_chk + %len = call i32 @llvm.objectsize.i32(i8* %dst, i1 false) + %ret = call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 %len) +; CHECK: ret i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 11) + ret i8* %ret +} + +; Check case where the source and destination are the same. 
+ +define i8* @test_simplify6() { +; CHECK: @test_simplify6 + %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0 + +; CHECK: [[LEN:%[a-z]+]] = call i32 @strlen +; CHECK-NEXT: getelementptr inbounds [60 x i8]* @a, i32 0, i32 [[LEN]] + %len = call i32 @llvm.objectsize.i32(i8* %dst, i1 false) + %ret = call i8* @__stpcpy_chk(i8* %dst, i8* %dst, i32 %len) + ret i8* %ret +} + +; Check case where slen < strlen (src). + +define void @test_no_simplify1() { +; CHECK: @test_no_simplify1 + %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0 + %src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0 + +; CHECK-NEXT: call i8* @__stpcpy_chk + call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 8) + ret void +} + +declare i8* @__stpcpy_chk(i8*, i8*, i32) nounwind +declare i32 @llvm.objectsize.i32(i8*, i1) nounwind readonly diff --git a/test/Transforms/InstCombine/stpcpy_chk-2.ll b/test/Transforms/InstCombine/stpcpy_chk-2.ll new file mode 100644 index 0000000..46c2139 --- /dev/null +++ b/test/Transforms/InstCombine/stpcpy_chk-2.ll @@ -0,0 +1,21 @@ +; Test that lib call simplification doesn't simplify __stpcpy_chk calls +; with the wrong prototype. 
+; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +@a = common global [60 x i16] zeroinitializer, align 1 +@.str = private constant [8 x i8] c"abcdefg\00" + +define void @test_no_simplify() { +; CHECK: @test_no_simplify + %dst = getelementptr inbounds [60 x i16]* @a, i32 0, i32 0 + %src = getelementptr inbounds [8 x i8]* @.str, i32 0, i32 0 + +; CHECK-NEXT: call i16* @__strcpy_chk + call i16* @__strcpy_chk(i16* %dst, i8* %src, i32 8) + ret void +} + +declare i16* @__strcpy_chk(i16*, i8*, i32) diff --git a/test/Transforms/InstCombine/strcat-1.ll b/test/Transforms/InstCombine/strcat-1.ll new file mode 100644 index 0000000..3c05d6b --- /dev/null +++ b/test/Transforms/InstCombine/strcat-1.ll @@ -0,0 +1,38 @@ +; Test that the strcat libcall simplifier works correctly per the +; bug found in PR3661. +; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +@hello = constant [6 x i8] c"hello\00" +@null = constant [1 x i8] zeroinitializer +@null_hello = constant [7 x i8] c"\00hello\00" + +declare i8* @strcat(i8*, i8*) +declare i32 @puts(i8*) + +define i32 @main() { +; CHECK: @main +; CHECK-NOT: call i8* @strcat +; CHECK: call i32 @puts + + %target = alloca [1024 x i8] + %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0 + store i8 0, i8* %arg1 + + ; rslt1 = strcat(target, "hello\00") + %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0 + %rslt1 = call i8* @strcat(i8* %arg1, i8* %arg2) + + ; rslt2 = strcat(rslt1, "\00") + %arg3 = getelementptr [1 x i8]* @null, i32 0, i32 0 + %rslt2 = call i8* @strcat(i8* %rslt1, i8* %arg3) + + ; rslt3 = strcat(rslt2, "\00hello\00") + %arg4 = getelementptr [7 x i8]* @null_hello, i32 0, i32 0 + %rslt3 = call i8* @strcat(i8* %rslt2, i8* %arg4) + + call 
i32 @puts( i8* %rslt3 ) + ret i32 0 +} diff --git a/test/Transforms/InstCombine/strcat-2.ll b/test/Transforms/InstCombine/strcat-2.ll new file mode 100644 index 0000000..379ee74 --- /dev/null +++ b/test/Transforms/InstCombine/strcat-2.ll @@ -0,0 +1,32 @@ +; Test that the strcat libcall simplifier works correctly. +; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +@hello = constant [6 x i8] c"hello\00" +@empty = constant [1 x i8] c"\00" +@a = common global [32 x i8] zeroinitializer, align 1 + +declare i8* @strcat(i8*, i8*) + +define void @test_simplify1() { +; CHECK: @test_simplify1 +; CHECK-NOT: call i8* @strcat +; CHECK: ret void + + %dst = getelementptr [32 x i8]* @a, i32 0, i32 0 + %src = getelementptr [6 x i8]* @hello, i32 0, i32 0 + call i8* @strcat(i8* %dst, i8* %src) + ret void +} + +define void @test_simplify2() { +; CHECK: @test_simplify2 +; CHECK-NEXT: ret void + + %dst = getelementptr [32 x i8]* @a, i32 0, i32 0 + %src = getelementptr [1 x i8]* @empty, i32 0, i32 0 + call i8* @strcat(i8* %dst, i8* %src) + ret void +} diff --git a/test/Transforms/InstCombine/strcat-3.ll b/test/Transforms/InstCombine/strcat-3.ll new file mode 100644 index 0000000..15aff2f --- /dev/null +++ b/test/Transforms/InstCombine/strcat-3.ll @@ -0,0 +1,22 @@ +; Test that the strcat libcall simplifier works correctly. 
+; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +@hello = constant [6 x i8] c"hello\00" +@empty = constant [1 x i8] c"\00" +@a = common global [32 x i8] zeroinitializer, align 1 + +declare i16* @strcat(i8*, i8*) + +define void @test_nosimplify1() { +; CHECK: @test_nosimplify1 +; CHECK: call i16* @strcat +; CHECK: ret void + + %dst = getelementptr [32 x i8]* @a, i32 0, i32 0 + %src = getelementptr [6 x i8]* @hello, i32 0, i32 0 + call i16* @strcat(i8* %dst, i8* %src) + ret void +} diff --git a/test/Transforms/InstCombine/strchr-1.ll b/test/Transforms/InstCombine/strchr-1.ll new file mode 100644 index 0000000..5efab9e --- /dev/null +++ b/test/Transforms/InstCombine/strchr-1.ll @@ -0,0 +1,54 @@ +; Test that the strchr library call simplifier works correctly. +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +@hello = constant [14 x i8] c"hello world\5Cn\00" +@null = constant [1 x i8] zeroinitializer +@chp = global i8* zeroinitializer + +declare i8* @strchr(i8*, i32) + +define void @test_simplify1() { +; CHECK: store i8* getelementptr inbounds ([14 x i8]* @hello, i32 0, i32 6) +; CHECK-NOT: call i8* @strchr +; CHECK: ret void + + %str = getelementptr [14 x i8]* @hello, i32 0, i32 0 + %dst = call i8* @strchr(i8* %str, i32 119) + store i8* %dst, i8** @chp + ret void +} + +define void @test_simplify2() { +; CHECK: store i8* null, i8** @chp, align 4 +; CHECK-NOT: call i8* @strchr +; CHECK: ret void + + %str = getelementptr [1 x i8]* @null, i32 0, i32 0 + %dst = call i8* @strchr(i8* %str, i32 119) + store i8* %dst, i8** @chp + ret void +} + +define void @test_simplify3() { +; CHECK: store i8* getelementptr inbounds ([14 x i8]* @hello, i32 0, i32 13) +; CHECK-NOT: call i8* @strchr +; 
CHECK: ret void + + %src = getelementptr [14 x i8]* @hello, i32 0, i32 0 + %dst = call i8* @strchr(i8* %src, i32 0) + store i8* %dst, i8** @chp + ret void +} + +define void @test_simplify4(i32 %chr) { +; CHECK: call i8* @memchr +; CHECK-NOT: call i8* @strchr +; CHECK: ret void + + %src = getelementptr [14 x i8]* @hello, i32 0, i32 0 + %dst = call i8* @strchr(i8* %src, i32 %chr) + store i8* %dst, i8** @chp + ret void +} diff --git a/test/Transforms/InstCombine/strchr-2.ll b/test/Transforms/InstCombine/strchr-2.ll new file mode 100644 index 0000000..35bbd23 --- /dev/null +++ b/test/Transforms/InstCombine/strchr-2.ll @@ -0,0 +1,21 @@ +; Test that the strchr libcall simplifier works correctly. +; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +@hello = constant [14 x i8] c"hello world\5Cn\00" +@chr = global i8 zeroinitializer + +declare i8 @strchr(i8*, i32) + +define void @test_nosimplify1() { +; CHECK: test_nosimplify1 +; CHECK: call i8 @strchr +; CHECK: ret void + + %str = getelementptr [14 x i8]* @hello, i32 0, i32 0 + %dst = call i8 @strchr(i8* %str, i32 119) + store i8 %dst, i8* @chr + ret void +} diff --git a/test/Transforms/InstCombine/strcmp-1.ll b/test/Transforms/InstCombine/strcmp-1.ll new file mode 100644 index 0000000..0679246 --- /dev/null +++ b/test/Transforms/InstCombine/strcmp-1.ll @@ -0,0 +1,82 @@ +; Test that the strcmp library call simplifier works correctly. 
+; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +@hello = constant [6 x i8] c"hello\00" +@hell = constant [5 x i8] c"hell\00" +@bell = constant [5 x i8] c"bell\00" +@null = constant [1 x i8] zeroinitializer + +declare i32 @strcmp(i8*, i8*) + +; strcmp("", x) -> -*x +define i32 @test1(i8* %str2) { +; CHECK: @test1 +; CHECK: %strcmpload = load i8* %str +; CHECK: %1 = zext i8 %strcmpload to i32 +; CHECK: %2 = sub i32 0, %1 +; CHECK: ret i32 %2 + + %str1 = getelementptr inbounds [1 x i8]* @null, i32 0, i32 0 + %temp1 = call i32 @strcmp(i8* %str1, i8* %str2) + ret i32 %temp1 + +} + +; strcmp(x, "") -> *x +define i32 @test2(i8* %str1) { +; CHECK: @test2 +; CHECK: %strcmpload = load i8* %str +; CHECK: %1 = zext i8 %strcmpload to i32 +; CHECK: ret i32 %1 + + %str2 = getelementptr inbounds [1 x i8]* @null, i32 0, i32 0 + %temp1 = call i32 @strcmp(i8* %str1, i8* %str2) + ret i32 %temp1 +} + +; strcmp(x, y) -> cnst +define i32 @test3() { +; CHECK: @test3 +; CHECK: ret i32 -1 + + %str1 = getelementptr inbounds [5 x i8]* @hell, i32 0, i32 0 + %str2 = getelementptr inbounds [6 x i8]* @hello, i32 0, i32 0 + %temp1 = call i32 @strcmp(i8* %str1, i8* %str2) + ret i32 %temp1 +} + +define i32 @test4() { +; CHECK: @test4 +; CHECK: ret i32 1 + + %str1 = getelementptr inbounds [5 x i8]* @hell, i32 0, i32 0 + %str2 = getelementptr inbounds [1 x i8]* @null, i32 0, i32 0 + %temp1 = call i32 @strcmp(i8* %str1, i8* %str2) + ret i32 %temp1 +} + +; strcmp(x, y) -> memcmp(x, y, <known length>) +; (This transform is rather difficult to trigger in a useful manner) +define i32 @test5(i1 %b) { +; CHECK: @test5 +; CHECK: %memcmp = call i32 @memcmp(i8* getelementptr inbounds ([6 x i8]* @hello, i32 0, i32 0), i8* %str2, i32 5) +; CHECK: ret i32 %memcmp + + %str1 = getelementptr inbounds [6 x i8]* @hello, i32 0, i32 0 + %temp1 = getelementptr inbounds [5 x 
i8]* @hell, i32 0, i32 0 + %temp2 = getelementptr inbounds [5 x i8]* @bell, i32 0, i32 0 + %str2 = select i1 %b, i8* %temp1, i8* %temp2 + %temp3 = call i32 @strcmp(i8* %str1, i8* %str2) + ret i32 %temp3 +} + +; strcmp(x,x) -> 0 +define i32 @test6(i8* %str) { +; CHECK: @test6 +; CHECK: ret i32 0 + + %temp1 = call i32 @strcmp(i8* %str, i8* %str) + ret i32 %temp1 +} diff --git a/test/Transforms/InstCombine/strcmp-2.ll b/test/Transforms/InstCombine/strcmp-2.ll new file mode 100644 index 0000000..2051896 --- /dev/null +++ b/test/Transforms/InstCombine/strcmp-2.ll @@ -0,0 +1,20 @@ +; Test that the strcmp library call simplifier works correctly. +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +@hello = constant [6 x i8] c"hello\00" +@hell = constant [5 x i8] c"hell\00" + +declare i16 @strcmp(i8*, i8*) + +define i16 @test_nosimplify() { +; CHECK: @test_nosimplify +; CHECK: call i16 @strcmp +; CHECK: ret i16 %temp1 + + %str1 = getelementptr inbounds [5 x i8]* @hell, i32 0, i32 0 + %str2 = getelementptr inbounds [6 x i8]* @hello, i32 0, i32 0 + %temp1 = call i16 @strcmp(i8* %str1, i8* %str2) + ret i16 %temp1 +} diff --git a/test/Transforms/InstCombine/strcpy-1.ll b/test/Transforms/InstCombine/strcpy-1.ll new file mode 100644 index 0000000..b6cf048 --- /dev/null +++ b/test/Transforms/InstCombine/strcpy-1.ll @@ -0,0 +1,45 @@ +; Test that the strcpy library call simplifier works correctly. +; rdar://6839935 +; RUN: opt < %s -instcombine -S | FileCheck %s +; +; This transformation requires the pointer size, as it assumes that size_t is +; the size of a pointer. 
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32" + +@hello = constant [6 x i8] c"hello\00" +@a = common global [32 x i8] zeroinitializer, align 1 +@b = common global [32 x i8] zeroinitializer, align 1 + +declare i8* @strcpy(i8*, i8*) + +define void @test_simplify1() { +; CHECK: @test_simplify1 + + %dst = getelementptr [32 x i8]* @a, i32 0, i32 0 + %src = getelementptr [6 x i8]* @hello, i32 0, i32 0 + + call i8* @strcpy(i8* %dst, i8* %src) +; CHECK: @llvm.memcpy.p0i8.p0i8.i32 + ret void +} + +define i8* @test_simplify2() { +; CHECK: @test_simplify2 + + %dst = getelementptr [32 x i8]* @a, i32 0, i32 0 + + %ret = call i8* @strcpy(i8* %dst, i8* %dst) +; CHECK: ret i8* getelementptr inbounds ([32 x i8]* @a, i32 0, i32 0) + ret i8* %ret +} + +define i8* @test_no_simplify1() { +; CHECK: @test_no_simplify1 + + %dst = getelementptr [32 x i8]* @a, i32 0, i32 0 + %src = getelementptr [32 x i8]* @b, i32 0, i32 0 + + %ret = call i8* @strcpy(i8* %dst, i8* %src) +; CHECK: call i8* @strcpy + ret i8* %ret +} diff --git a/test/Transforms/InstCombine/strcpy-2.ll b/test/Transforms/InstCombine/strcpy-2.ll new file mode 100644 index 0000000..779e9fd --- /dev/null +++ b/test/Transforms/InstCombine/strcpy-2.ll @@ -0,0 +1,22 @@ +; Test that the strcpy library call simplifier works correctly. +; RUN: opt < %s -instcombine -S | FileCheck %s +; +; This transformation requires the pointer size, as it assumes that size_t is +; the size of a pointer. 
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32" + +@hello = constant [6 x i8] c"hello\00" +@a = common global [32 x i8] zeroinitializer, align 1 + +declare i16* @strcpy(i8*, i8*) + +define void @test_no_simplify1() { +; CHECK: @test_no_simplify1 + + %dst = getelementptr [32 x i8]* @a, i32 0, i32 0 + %src = getelementptr [6 x i8]* @hello, i32 0, i32 0 + + call i16* @strcpy(i8* %dst, i8* %src) +; CHECK: call i16* @strcpy + ret void +} diff --git a/test/Transforms/InstCombine/strcpy_chk-1.ll b/test/Transforms/InstCombine/strcpy_chk-1.ll new file mode 100644 index 0000000..3e48f4f --- /dev/null +++ b/test/Transforms/InstCombine/strcpy_chk-1.ll @@ -0,0 +1,94 @@ +; Test lib call simplification of __strcpy_chk calls with various values +; for src, dst, and slen. +; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +@a = common global [60 x i8] zeroinitializer, align 1 +@b = common global [60 x i8] zeroinitializer, align 1 +@.str = private constant [12 x i8] c"abcdefghijk\00" + +; Check cases where slen >= strlen (src). 
+ +define void @test_simplify1() { +; CHECK: @test_simplify1 + %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0 + %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0 + +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32 + call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 60) + ret void +} + +define void @test_simplify2() { +; CHECK: @test_simplify2 + %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0 + %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0 + +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32 + call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 12) + ret void +} + +define void @test_simplify3() { +; CHECK: @test_simplify3 + %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0 + %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0 + +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32 + call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 -1) + ret void +} + +; Check cases where there are no string constants. + +define void @test_simplify4() { +; CHECK: @test_simplify4 + %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0 + %src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0 + +; CHECK-NEXT: call i8* @strcpy + call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 -1) + ret void +} + +; Check case where the string length is not constant. + +define void @test_simplify5() { +; CHECK: @test_simplify5 + %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0 + %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0 + +; CHECK: @__memcpy_chk + %len = call i32 @llvm.objectsize.i32(i8* %dst, i1 false) + call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 %len) + ret void +} + +; Check case where the source and destination are the same. 
+ +define i8* @test_simplify6() { +; CHECK: @test_simplify6 + %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0 + +; CHECK: getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0) + %len = call i32 @llvm.objectsize.i32(i8* %dst, i1 false) + %ret = call i8* @__strcpy_chk(i8* %dst, i8* %dst, i32 %len) + ret i8* %ret +} + +; Check case where slen < strlen (src). + +define void @test_no_simplify1() { +; CHECK: @test_no_simplify1 + %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0 + %src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0 + +; CHECK-NEXT: call i8* @__strcpy_chk + call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 8) + ret void +} + +declare i8* @__strcpy_chk(i8*, i8*, i32) nounwind +declare i32 @llvm.objectsize.i32(i8*, i1) nounwind readonly diff --git a/test/Transforms/InstCombine/strcpy_chk-2.ll b/test/Transforms/InstCombine/strcpy_chk-2.ll new file mode 100644 index 0000000..d76ea5d --- /dev/null +++ b/test/Transforms/InstCombine/strcpy_chk-2.ll @@ -0,0 +1,21 @@ +; Test that lib call simplification doesn't simplify __strcpy_chk calls +; with the wrong prototype. 
+; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +@a = common global [60 x i16] zeroinitializer, align 1 +@.str = private constant [8 x i8] c"abcdefg\00" + +define void @test_no_simplify() { +; CHECK: @test_no_simplify + %dst = getelementptr inbounds [60 x i16]* @a, i32 0, i32 0 + %src = getelementptr inbounds [8 x i8]* @.str, i32 0, i32 0 + +; CHECK-NEXT: call i16* @__strcpy_chk + call i16* @__strcpy_chk(i16* %dst, i8* %src, i32 8) + ret void +} + +declare i16* @__strcpy_chk(i16*, i8*, i32) diff --git a/test/Transforms/InstCombine/strcpy_chk.ll b/test/Transforms/InstCombine/strcpy_chk.ll deleted file mode 100644 index 8835a0b..0000000 --- a/test/Transforms/InstCombine/strcpy_chk.ll +++ /dev/null @@ -1,13 +0,0 @@ -; RUN: opt < %s -instcombine -S | FileCheck %s -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" -@a = common global [60 x i8] zeroinitializer, align 1 ; <[60 x i8]*> [#uses=1] -@.str = private constant [8 x i8] c"abcdefg\00" ; <[8 x i8]*> [#uses=1] - -define i8* @foo() nounwind { -; CHECK: @foo -; CHECK-NEXT: call i8* @strcpy - %call = call i8* @__strcpy_chk(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([8 x i8]* @.str, i32 0, i32 0), i32 60) ; <i8*> [#uses=1] - ret i8* %call -} - -declare i8* @__strcpy_chk(i8*, i8*, i32) nounwind diff --git a/test/Transforms/InstCombine/strcspn-1.ll b/test/Transforms/InstCombine/strcspn-1.ll new file mode 100644 index 0000000..60fad89 --- /dev/null +++ b/test/Transforms/InstCombine/strcspn-1.ll @@ -0,0 +1,57 @@ +; Test that the strcspn library call simplifier works correctly. 
+; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +@abcba = constant [6 x i8] c"abcba\00" +@abc = constant [4 x i8] c"abc\00" +@null = constant [1 x i8] zeroinitializer + +declare i64 @strcspn(i8*, i8*) + +; Check strcspn(s, "") -> strlen(s). + +define i64 @test_simplify1(i8* %str) { +; CHECK: @test_simplify1 + %pat = getelementptr [1 x i8]* @null, i32 0, i32 0 + + %ret = call i64 @strcspn(i8* %str, i8* %pat) +; CHECK-NEXT: [[VAR:%[a-z]+]] = call i64 @strlen(i8* %str) + ret i64 %ret +; CHECK-NEXT: ret i64 [[VAR]] +} + +; Check strcspn("", s) -> 0. + +define i64 @test_simplify2(i8* %pat) { +; CHECK: @test_simplify2 + %str = getelementptr [1 x i8]* @null, i32 0, i32 0 + + %ret = call i64 @strcspn(i8* %str, i8* %pat) + ret i64 %ret +; CHECK-NEXT: ret i64 0 +} + +; Check strcspn(s1, s2), where s1 and s2 are constants. + +define i64 @test_simplify3() { +; CHECK: @test_simplify3 + %str = getelementptr [6 x i8]* @abcba, i32 0, i32 0 + %pat = getelementptr [4 x i8]* @abc, i32 0, i32 0 + + %ret = call i64 @strcspn(i8* %str, i8* %pat) + ret i64 %ret +; CHECK-NEXT: ret i64 0 +} + +; Check cases that shouldn't be simplified. + +define i64 @test_no_simplify1(i8* %str, i8* %pat) { +; CHECK: @test_no_simplify1 + + %ret = call i64 @strcspn(i8* %str, i8* %pat) +; CHECK-NEXT: %ret = call i64 @strcspn(i8* %str, i8* %pat) + ret i64 %ret +; CHECK-NEXT: ret i64 %ret +} diff --git a/test/Transforms/InstCombine/strcspn-2.ll b/test/Transforms/InstCombine/strcspn-2.ll new file mode 100644 index 0000000..4e23936 --- /dev/null +++ b/test/Transforms/InstCombine/strcspn-2.ll @@ -0,0 +1,21 @@ +; Test that the strcspn library call simplifier works correctly. 
+; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +@null = constant [1 x i8] zeroinitializer + +declare double @strcspn(i8*, i8*) + +; Check that strcspn functions with the wrong prototype aren't simplified. + +define double @test_no_simplify1(i8* %pat) { +; CHECK: @test_no_simplify1 + %str = getelementptr [1 x i8]* @null, i32 0, i32 0 + + %ret = call double @strcspn(i8* %str, i8* %pat) +; CHECK-NEXT: call double @strcspn + ret double %ret +; CHECK-NEXT: ret double %ret +} diff --git a/test/Transforms/InstCombine/strlen-1.ll b/test/Transforms/InstCombine/strlen-1.ll new file mode 100644 index 0000000..6d7464a --- /dev/null +++ b/test/Transforms/InstCombine/strlen-1.ll @@ -0,0 +1,97 @@ +; Test that the strlen library call simplifier works correctly. +; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +@hello = constant [6 x i8] c"hello\00" +@null = constant [1 x i8] zeroinitializer +@null_hello = constant [7 x i8] c"\00hello\00" +@nullstring = constant i8 0 +@a = common global [32 x i8] zeroinitializer, align 1 + +declare i32 @strlen(i8*) + +; Check strlen(string constant) -> integer constant. 
+ +define i32 @test_simplify1() { +; CHECK: @test_simplify1 + %hello_p = getelementptr [6 x i8]* @hello, i32 0, i32 0 + %hello_l = call i32 @strlen(i8* %hello_p) + ret i32 %hello_l +; CHECK-NEXT: ret i32 5 +} + +define i32 @test_simplify2() { +; CHECK: @test_simplify2 + %null_p = getelementptr [1 x i8]* @null, i32 0, i32 0 + %null_l = call i32 @strlen(i8* %null_p) + ret i32 %null_l +; CHECK-NEXT: ret i32 0 +} + +define i32 @test_simplify3() { +; CHECK: @test_simplify3 + %null_hello_p = getelementptr [7 x i8]* @null_hello, i32 0, i32 0 + %null_hello_l = call i32 @strlen(i8* %null_hello_p) + ret i32 %null_hello_l +; CHECK-NEXT: ret i32 0 +} + +define i32 @test_simplify4() { +; CHECK: @test_simplify4 + %len = tail call i32 @strlen(i8* @nullstring) nounwind + ret i32 %len +; CHECK-NEXT: ret i32 0 +} + +; Check strlen(x) == 0 --> *x == 0. + +define i1 @test_simplify5() { +; CHECK: @test_simplify5 + %hello_p = getelementptr [6 x i8]* @hello, i32 0, i32 0 + %hello_l = call i32 @strlen(i8* %hello_p) + %eq_hello = icmp eq i32 %hello_l, 0 + ret i1 %eq_hello +; CHECK-NEXT: ret i1 false +} + +define i1 @test_simplify6() { +; CHECK: @test_simplify6 + %null_p = getelementptr [1 x i8]* @null, i32 0, i32 0 + %null_l = call i32 @strlen(i8* %null_p) + %eq_null = icmp eq i32 %null_l, 0 + ret i1 %eq_null +; CHECK-NEXT: ret i1 true +} + +; Check strlen(x) != 0 --> *x != 0. + +define i1 @test_simplify7() { +; CHECK: @test_simplify7 + %hello_p = getelementptr [6 x i8]* @hello, i32 0, i32 0 + %hello_l = call i32 @strlen(i8* %hello_p) + %ne_hello = icmp ne i32 %hello_l, 0 + ret i1 %ne_hello +; CHECK-NEXT: ret i1 true +} + +define i1 @test_simplify8() { +; CHECK: @test_simplify8 + %null_p = getelementptr [1 x i8]* @null, i32 0, i32 0 + %null_l = call i32 @strlen(i8* %null_p) + %ne_null = icmp ne i32 %null_l, 0 + ret i1 %ne_null +; CHECK-NEXT: ret i1 false +} + +; Check cases that shouldn't be simplified. 
+ +define i32 @test_no_simplify1() { +; CHECK: @test_no_simplify1 + %a_p = getelementptr [32 x i8]* @a, i32 0, i32 0 + %a_l = call i32 @strlen(i8* %a_p) +; CHECK-NEXT: %a_l = call i32 @strlen + ret i32 %a_l +; CHECK-NEXT: ret i32 %a_l +} diff --git a/test/Transforms/InstCombine/strlen-2.ll b/test/Transforms/InstCombine/strlen-2.ll new file mode 100644 index 0000000..c4fd54c --- /dev/null +++ b/test/Transforms/InstCombine/strlen-2.ll @@ -0,0 +1,18 @@ +; Test that the strlen library call simplifier works correctly. +; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +@hello = constant [6 x i8] c"hello\00" + +declare i32 @strlen(i8*, i32) + +define i32 @test_no_simplify1() { +; CHECK: @test_no_simplify1 + %hello_p = getelementptr [6 x i8]* @hello, i32 0, i32 0 + %hello_l = call i32 @strlen(i8* %hello_p, i32 187) +; CHECK-NEXT: %hello_l = call i32 @strlen + ret i32 %hello_l +; CHECK-NEXT: ret i32 %hello_l +} diff --git a/test/Transforms/InstCombine/strncat-1.ll b/test/Transforms/InstCombine/strncat-1.ll new file mode 100644 index 0000000..ad2a18b --- /dev/null +++ b/test/Transforms/InstCombine/strncat-1.ll @@ -0,0 +1,37 @@ +; Test that the strncat libcall simplifier works correctly. 
+; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +@hello = constant [6 x i8] c"hello\00" +@null = constant [1 x i8] zeroinitializer +@null_hello = constant [7 x i8] c"\00hello\00" + +declare i8* @strncat(i8*, i8*, i32) +declare i32 @puts(i8*) + +define i32 @main() { +; CHECK: @main +; CHECK-NOT: call i8* @strncat +; CHECK: call i32 @puts + + %target = alloca [1024 x i8] + %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0 + store i8 0, i8* %arg1 + + ; rslt1 = strncat(target, "hello\00") + %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0 + %rslt1 = call i8* @strncat(i8* %arg1, i8* %arg2, i32 6) + + ; rslt2 = strncat(rslt1, "\00") + %arg3 = getelementptr [1 x i8]* @null, i32 0, i32 0 + %rslt2 = call i8* @strncat(i8* %rslt1, i8* %arg3, i32 42) + + ; rslt3 = strncat(rslt2, "\00hello\00") + %arg4 = getelementptr [7 x i8]* @null_hello, i32 0, i32 0 + %rslt3 = call i8* @strncat(i8* %rslt2, i8* %arg4, i32 42) + + call i32 @puts(i8* %rslt3) + ret i32 0 +} diff --git a/test/Transforms/InstCombine/strncat-2.ll b/test/Transforms/InstCombine/strncat-2.ll new file mode 100644 index 0000000..c56deac --- /dev/null +++ b/test/Transforms/InstCombine/strncat-2.ll @@ -0,0 +1,53 @@ +; Test that the strncat libcall simplifier works correctly. 
+; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +@hello = constant [6 x i8] c"hello\00" +@empty = constant [1 x i8] c"\00" +@a = common global [32 x i8] zeroinitializer, align 1 + +declare i8* @strncat(i8*, i8*, i32) + +define void @test_simplify1() { +; CHECK: @test_simplify1 +; CHECK-NOT: call i8* @strncat +; CHECK: ret void + + %dst = getelementptr [32 x i8]* @a, i32 0, i32 0 + %src = getelementptr [6 x i8]* @hello, i32 0, i32 0 + call i8* @strncat(i8* %dst, i8* %src, i32 13) + ret void +} + +define void @test_simplify2() { +; CHECK: @test_simplify2 +; CHECK-NEXT: ret void + + %dst = getelementptr [32 x i8]* @a, i32 0, i32 0 + %src = getelementptr [1 x i8]* @empty, i32 0, i32 0 + call i8* @strncat(i8* %dst, i8* %src, i32 13) + ret void +} + +define void @test_simplify3() { +; CHECK: @test_simplify3 +; CHECK-NEXT: ret void + + %dst = getelementptr [32 x i8]* @a, i32 0, i32 0 + %src = getelementptr [6 x i8]* @hello, i32 0, i32 0 + call i8* @strncat(i8* %dst, i8* %src, i32 0) + ret void +} + +define void @test_nosimplify1() { +; CHECK: @test_nosimplify1 +; CHECK: call i8* @strncat +; CHECK: ret void + + %dst = getelementptr [32 x i8]* @a, i32 0, i32 0 + %src = getelementptr [6 x i8]* @hello, i32 0, i32 0 + call i8* @strncat(i8* %dst, i8* %src, i32 1) + ret void +} diff --git a/test/Transforms/InstCombine/strncat-3.ll b/test/Transforms/InstCombine/strncat-3.ll new file mode 100644 index 0000000..3cd7971 --- /dev/null +++ b/test/Transforms/InstCombine/strncat-3.ll @@ -0,0 +1,22 @@ +; Test that the strncat libcall simplifier works correctly. 
+; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +@hello = constant [6 x i8] c"hello\00" +@empty = constant [1 x i8] c"\00" +@a = common global [32 x i8] zeroinitializer, align 1 + +declare i16* @strncat(i8*, i8*, i32) + +define void @test_nosimplify1() { +; CHECK: @test_nosimplify1 +; CHECK: call i16* @strncat +; CHECK: ret void + + %dst = getelementptr [32 x i8]* @a, i32 0, i32 0 + %src = getelementptr [6 x i8]* @hello, i32 0, i32 0 + call i16* @strncat(i8* %dst, i8* %src, i32 13) + ret void +} diff --git a/test/Transforms/InstCombine/strncmp-1.ll b/test/Transforms/InstCombine/strncmp-1.ll new file mode 100644 index 0000000..187c2fa --- /dev/null +++ b/test/Transforms/InstCombine/strncmp-1.ll @@ -0,0 +1,99 @@ +; Test that the strncmp library call simplifier works correctly. +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +@hello = constant [6 x i8] c"hello\00" +@hell = constant [5 x i8] c"hell\00" +@bell = constant [5 x i8] c"bell\00" +@null = constant [1 x i8] zeroinitializer + +declare i32 @strncmp(i8*, i8*, i32) + +; strncmp("", x, n) -> -*x +define i32 @test1(i8* %str2) { +; CHECK: @test1 +; CHECK: %strcmpload = load i8* %str +; CHECK: %1 = zext i8 %strcmpload to i32 +; CHECK: %2 = sub i32 0, %1 +; CHECK: ret i32 %2 + + %str1 = getelementptr inbounds [1 x i8]* @null, i32 0, i32 0 + %temp1 = call i32 @strncmp(i8* %str1, i8* %str2, i32 10) + ret i32 %temp1 +} + +; strncmp(x, "", n) -> *x +define i32 @test2(i8* %str1) { +; CHECK: @test2 +; CHECK: %strcmpload = load i8* %str1 +; CHECK: %1 = zext i8 %strcmpload to i32 +; CHECK: ret i32 %1 + + %str2 = getelementptr inbounds [1 x i8]* @null, i32 0, i32 0 + %temp1 = call i32 @strncmp(i8* %str1, i8* %str2, i32 10) + ret i32 
%temp1 +} + +; strncmp(x, y, n) -> cnst +define i32 @test3() { +; CHECK: @test3 +; CHECK: ret i32 -1 + + %str1 = getelementptr inbounds [5 x i8]* @hell, i32 0, i32 0 + %str2 = getelementptr inbounds [6 x i8]* @hello, i32 0, i32 0 + %temp1 = call i32 @strncmp(i8* %str1, i8* %str2, i32 10) + ret i32 %temp1 +} + +define i32 @test4() { +; CHECK: @test4 +; CHECK: ret i32 1 + + %str1 = getelementptr inbounds [5 x i8]* @hell, i32 0, i32 0 + %str2 = getelementptr inbounds [1 x i8]* @null, i32 0, i32 0 + %temp1 = call i32 @strncmp(i8* %str1, i8* %str2, i32 10) + ret i32 %temp1 +} + +define i32 @test5() { +; CHECK: @test5 +; CHECK: ret i32 0 + + %str1 = getelementptr inbounds [5 x i8]* @hell, i32 0, i32 0 + %str2 = getelementptr inbounds [6 x i8]* @hello, i32 0, i32 0 + %temp1 = call i32 @strncmp(i8* %str1, i8* %str2, i32 4) + ret i32 %temp1 +} + +; strncmp(x,y,1) -> memcmp(x,y,1) +define i32 @test6(i8* %str1, i8* %str2) { +; CHECK: @test6 +; CHECK: [[LOAD1:%[a-z]+]] = load i8* %str1, align 1 +; CHECK: [[ZEXT1:%[a-z]+]] = zext i8 [[LOAD1]] to i32 +; CHECK: [[LOAD2:%[a-z]+]] = load i8* %str2, align 1 +; CHECK: [[ZEXT2:%[a-z]+]] = zext i8 [[LOAD2]] to i32 +; CHECK: [[RET:%[a-z]+]] = sub i32 [[ZEXT1]], [[ZEXT2]] +; CHECK: ret i32 [[RET]] + + %temp1 = call i32 @strncmp(i8* %str1, i8* %str2, i32 1) + ret i32 %temp1 +} + +; strncmp(x,y,0) -> 0 +define i32 @test7(i8* %str1, i8* %str2) { +; CHECK: @test7 +; CHECK: ret i32 0 + + %temp1 = call i32 @strncmp(i8* %str1, i8* %str2, i32 0) + ret i32 %temp1 +} + +; strncmp(x,x,n) -> 0 +define i32 @test8(i8* %str, i32 %n) { +; CHECK: @test8 +; CHECK: ret i32 0 + + %temp1 = call i32 @strncmp(i8* %str, i8* %str, i32 %n) + ret i32 %temp1 +} diff --git a/test/Transforms/InstCombine/strncmp-2.ll b/test/Transforms/InstCombine/strncmp-2.ll new file mode 100644 index 0000000..3fc43a6 --- /dev/null +++ b/test/Transforms/InstCombine/strncmp-2.ll @@ -0,0 +1,20 @@ +; Test that the strncmp library call simplifier works correctly. 
+; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +@hello = constant [6 x i8] c"hello\00" +@hell = constant [5 x i8] c"hell\00" + +declare i16 @strncmp(i8*, i8*, i32) + +define i16 @test_nosimplify() { +; CHECK: @test_nosimplify +; CHECK: call i16 @strncmp +; CHECK: ret i16 %temp1 + + %str1 = getelementptr inbounds [5 x i8]* @hell, i32 0, i32 0 + %str2 = getelementptr inbounds [6 x i8]* @hello, i32 0, i32 0 + %temp1 = call i16 @strncmp(i8* %str1, i8* %str2, i32 10) + ret i16 %temp1 +} diff --git a/test/Transforms/InstCombine/strncpy-1.ll b/test/Transforms/InstCombine/strncpy-1.ll new file mode 100644 index 0000000..3ce2b9b --- /dev/null +++ b/test/Transforms/InstCombine/strncpy-1.ll @@ -0,0 +1,95 @@ +; Test that the strncpy library call simplifier works correctly. +; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +@hello = constant [6 x i8] c"hello\00" +@null = constant [1 x i8] zeroinitializer +@null_hello = constant [7 x i8] c"\00hello\00" +@a = common global [32 x i8] zeroinitializer, align 1 +@b = common global [32 x i8] zeroinitializer, align 1 + +declare i8* @strncpy(i8*, i8*, i32) +declare i32 @puts(i8*) + +; Check a bunch of strncpy invocations together. 
+ +define i32 @test_simplify1() { +; CHECK: @test_simplify1 +; CHECK-NOT: call i8* @strncpy +; CHECK: call i32 @puts + %target = alloca [1024 x i8] + %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0 + store i8 0, i8* %arg1 + + %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0 + %rslt1 = call i8* @strncpy(i8* %arg1, i8* %arg2, i32 6) + + %arg3 = getelementptr [1 x i8]* @null, i32 0, i32 0 + %rslt2 = call i8* @strncpy(i8* %rslt1, i8* %arg3, i32 42) + + %arg4 = getelementptr [7 x i8]* @null_hello, i32 0, i32 0 + %rslt3 = call i8* @strncpy(i8* %rslt2, i8* %arg4, i32 42) + + call i32 @puts( i8* %rslt3 ) + ret i32 0 +} + +; Check strncpy(x, "", y) -> memset(x, '\0', y, 1). + +define void @test_simplify2() { +; CHECK: @test_simplify2 + %dst = getelementptr [32 x i8]* @a, i32 0, i32 0 + %src = getelementptr [1 x i8]* @null, i32 0, i32 0 + + call i8* @strncpy(i8* %dst, i8* %src, i32 32) +; CHECK: call void @llvm.memset.p0i8.i32 + ret void +} + +; Check strncpy(x, y, 0) -> x. + +define i8* @test_simplify3() { +; CHECK: @test_simplify3 + %dst = getelementptr [32 x i8]* @a, i32 0, i32 0 + %src = getelementptr [6 x i8]* @hello, i32 0, i32 0 + + %ret = call i8* @strncpy(i8* %dst, i8* %src, i32 0) + ret i8* %ret +; CHECK: ret i8* getelementptr inbounds ([32 x i8]* @a, i32 0, i32 0) +} + +; Check strncpy(x, s, c) -> memcpy(x, s, c, 1) [s and c are constant]. + +define void @test_simplify4() { +; CHECK: @test_simplify4 + %dst = getelementptr [32 x i8]* @a, i32 0, i32 0 + %src = getelementptr [6 x i8]* @hello, i32 0, i32 0 + + call i8* @strncpy(i8* %dst, i8* %src, i32 6) +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i32 + ret void +} + +; Check cases that shouldn't be simplified. 
+ +define void @test_no_simplify1() { +; CHECK: @test_no_simplify1 + %dst = getelementptr [32 x i8]* @a, i32 0, i32 0 + %src = getelementptr [32 x i8]* @b, i32 0, i32 0 + + call i8* @strncpy(i8* %dst, i8* %src, i32 32) +; CHECK: call i8* @strncpy + ret void +} + +define void @test_no_simplify2() { +; CHECK: @test_no_simplify2 + %dst = getelementptr [32 x i8]* @a, i32 0, i32 0 + %src = getelementptr [6 x i8]* @hello, i32 0, i32 0 + + call i8* @strncpy(i8* %dst, i8* %src, i32 8) +; CHECK: call i8* @strncpy + ret void +} diff --git a/test/Transforms/InstCombine/strncpy-2.ll b/test/Transforms/InstCombine/strncpy-2.ll new file mode 100644 index 0000000..ac28ea6 --- /dev/null +++ b/test/Transforms/InstCombine/strncpy-2.ll @@ -0,0 +1,22 @@ +; Test that the strncpy library call simplifier works correctly. +; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +@hello = constant [6 x i8] c"hello\00" +@a = common global [32 x i8] zeroinitializer, align 1 + +declare i16* @strncpy(i8*, i8*, i32) + +; Check that 'strncpy' functions with the wrong prototype aren't simplified. + +define void @test_no_simplify1() { +; CHECK: @test_no_simplify1 + %dst = getelementptr [32 x i8]* @a, i32 0, i32 0 + %src = getelementptr [6 x i8]* @hello, i32 0, i32 0 + + call i16* @strncpy(i8* %dst, i8* %src, i32 6) +; CHECK: call i16* @strncpy + ret void +} diff --git a/test/Transforms/InstCombine/strncpy_chk-1.ll b/test/Transforms/InstCombine/strncpy_chk-1.ll new file mode 100644 index 0000000..aadff42 --- /dev/null +++ b/test/Transforms/InstCombine/strncpy_chk-1.ll @@ -0,0 +1,66 @@ +; Test lib call simplification of __strncpy_chk calls with various values +; for len and dstlen. 
+; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +@a = common global [60 x i8] zeroinitializer, align 1 +@b = common global [60 x i8] zeroinitializer, align 1 +@.str = private constant [12 x i8] c"abcdefghijk\00" + +; Check cases where dstlen >= len + +define void @test_simplify1() { +; CHECK: @test_simplify1 + %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0 + %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0 + +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32 + call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 12, i32 60) + ret void +} + +define void @test_simplify2() { +; CHECK: @test_simplify2 + %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0 + %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0 + +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32 + call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 12, i32 12) + ret void +} + +define void @test_simplify3() { +; CHECK: @test_simplify3 + %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0 + %src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0 + +; CHECK-NEXT: call i8* @strncpy + call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 12, i32 60) + ret void +} + +; Check cases where dstlen < len + +define void @test_no_simplify1() { +; CHECK: @test_no_simplify1 + %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0 + %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0 + +; CHECK-NEXT: call i8* @__strncpy_chk + call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 8, i32 4) + ret void +} + +define void @test_no_simplify2() { +; CHECK: @test_no_simplify2 + %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0 + %src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0 + +; CHECK-NEXT: call i8* @__strncpy_chk + call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 8, i32 0) + ret void +} + +declare i8* @__strncpy_chk(i8*, 
i8*, i32, i32) diff --git a/test/Transforms/InstCombine/strncpy_chk-2.ll b/test/Transforms/InstCombine/strncpy_chk-2.ll new file mode 100644 index 0000000..a0f132e --- /dev/null +++ b/test/Transforms/InstCombine/strncpy_chk-2.ll @@ -0,0 +1,21 @@ +; Test that lib call simplification doesn't simplify __strncpy_chk calls +; with the wrong prototype. +; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +@a = common global [60 x i16] zeroinitializer, align 1 +@b = common global [60 x i16] zeroinitializer, align 1 + +define void @test_no_simplify() { +; CHECK: @test_no_simplify + %dst = getelementptr inbounds [60 x i16]* @a, i32 0, i32 0 + %src = getelementptr inbounds [60 x i16]* @b, i32 0, i32 0 + +; CHECK-NEXT: call i16* @__strncpy_chk + call i16* @__strncpy_chk(i16* %dst, i16* %src, i32 60, i32 60) + ret void +} + +declare i16* @__strncpy_chk(i16*, i16*, i32, i32) diff --git a/test/Transforms/InstCombine/strpbrk-1.ll b/test/Transforms/InstCombine/strpbrk-1.ll new file mode 100644 index 0000000..a5d0d86 --- /dev/null +++ b/test/Transforms/InstCombine/strpbrk-1.ll @@ -0,0 +1,68 @@ +; Test that the strpbrk library call simplifier works correctly. +; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +@hello = constant [12 x i8] c"hello world\00" +@w = constant [2 x i8] c"w\00" +@null = constant [1 x i8] zeroinitializer + +declare i8* @strpbrk(i8*, i8*) + +; Check strpbrk(s, "") -> NULL. + +define i8* @test_simplify1(i8* %str) { +; CHECK: @test_simplify1 + %pat = getelementptr [1 x i8]* @null, i32 0, i32 0 + + %ret = call i8* @strpbrk(i8* %str, i8* %pat) + ret i8* %ret +; CHECK-NEXT: ret i8* null +} + +; Check strpbrk("", s) -> NULL. 
+ +define i8* @test_simplify2(i8* %pat) { +; CHECK: @test_simplify2 + %str = getelementptr [1 x i8]* @null, i32 0, i32 0 + + %ret = call i8* @strpbrk(i8* %str, i8* %pat) + ret i8* %ret +; CHECK-NEXT: ret i8* null +} + +; Check strpbrk(s1, s2), where s1 and s2 are constants. + +define i8* @test_simplify3() { +; CHECK: @test_simplify3 + %str = getelementptr [12 x i8]* @hello, i32 0, i32 0 + %pat = getelementptr [2 x i8]* @w, i32 0, i32 0 + + %ret = call i8* @strpbrk(i8* %str, i8* %pat) + ret i8* %ret +; CHECK-NEXT: ret i8* getelementptr inbounds ([12 x i8]* @hello, i32 0, i32 6) +} + +; Check strpbrk(s, "a") -> strchr(s, 'a'). + +define i8* @test_simplify4(i8* %str) { +; CHECK: @test_simplify4 + %pat = getelementptr [2 x i8]* @w, i32 0, i32 0 + + %ret = call i8* @strpbrk(i8* %str, i8* %pat) +; CHECK-NEXT: [[VAR:%[a-z]+]] = call i8* @strchr(i8* %str, i32 119) + ret i8* %ret +; CHECK-NEXT: ret i8* [[VAR]] +} + +; Check cases that shouldn't be simplified. + +define i8* @test_no_simplify1(i8* %str, i8* %pat) { +; CHECK: @test_no_simplify1 + + %ret = call i8* @strpbrk(i8* %str, i8* %pat) +; CHECK-NEXT: %ret = call i8* @strpbrk(i8* %str, i8* %pat) + ret i8* %ret +; CHECK-NEXT: ret i8* %ret +} diff --git a/test/Transforms/InstCombine/strpbrk-2.ll b/test/Transforms/InstCombine/strpbrk-2.ll new file mode 100644 index 0000000..31ac290 --- /dev/null +++ b/test/Transforms/InstCombine/strpbrk-2.ll @@ -0,0 +1,23 @@ +; Test that the strpbrk library call simplifier works correctly. +; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +@hello = constant [12 x i8] c"hello world\00" +@w = constant [2 x i8] c"w\00" + +declare i16* @strpbrk(i8*, i8*) + +; Check that 'strpbrk' functions with the wrong prototype aren't simplified. 
+ +define i16* @test_no_simplify1() { +; CHECK: @test_no_simplify1 + %str = getelementptr [12 x i8]* @hello, i32 0, i32 0 + %pat = getelementptr [2 x i8]* @w, i32 0, i32 0 + + %ret = call i16* @strpbrk(i8* %str, i8* %pat) +; CHECK-NEXT: %ret = call i16* @strpbrk + ret i16* %ret +; CHECK-NEXT: ret i16* %ret +} diff --git a/test/Transforms/InstCombine/strrchr-1.ll b/test/Transforms/InstCombine/strrchr-1.ll new file mode 100644 index 0000000..854ce45 --- /dev/null +++ b/test/Transforms/InstCombine/strrchr-1.ll @@ -0,0 +1,54 @@ +; Test that the strrchr library call simplifier works correctly. +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +@hello = constant [14 x i8] c"hello world\5Cn\00" +@null = constant [1 x i8] zeroinitializer +@chp = global i8* zeroinitializer + +declare i8* @strrchr(i8*, i32) + +define void @test_simplify1() { +; CHECK: store i8* getelementptr inbounds ([14 x i8]* @hello, i32 0, i32 6) +; CHECK-NOT: call i8* @strrchr +; CHECK: ret void + + %str = getelementptr [14 x i8]* @hello, i32 0, i32 0 + %dst = call i8* @strrchr(i8* %str, i32 119) + store i8* %dst, i8** @chp + ret void +} + +define void @test_simplify2() { +; CHECK: store i8* null, i8** @chp, align 4 +; CHECK-NOT: call i8* @strrchr +; CHECK: ret void + + %str = getelementptr [1 x i8]* @null, i32 0, i32 0 + %dst = call i8* @strrchr(i8* %str, i32 119) + store i8* %dst, i8** @chp + ret void +} + +define void @test_simplify3() { +; CHECK: store i8* getelementptr inbounds ([14 x i8]* @hello, i32 0, i32 13) +; CHECK-NOT: call i8* @strrchr +; CHECK: ret void + + %src = getelementptr [14 x i8]* @hello, i32 0, i32 0 + %dst = call i8* @strrchr(i8* %src, i32 0) + store i8* %dst, i8** @chp + ret void +} + +define void @test_nosimplify1(i32 %chr) { +; CHECK: @test_nosimplify1 +; CHECK: call i8* @strrchr +; CHECK: ret void + + %src = getelementptr [14 x i8]* 
@hello, i32 0, i32 0 + %dst = call i8* @strrchr(i8* %src, i32 %chr) + store i8* %dst, i8** @chp + ret void +} diff --git a/test/Transforms/InstCombine/strrchr-2.ll b/test/Transforms/InstCombine/strrchr-2.ll new file mode 100644 index 0000000..1974f6c --- /dev/null +++ b/test/Transforms/InstCombine/strrchr-2.ll @@ -0,0 +1,21 @@ +; Test that the strrchr libcall simplifier works correctly. +; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +@hello = constant [14 x i8] c"hello world\5Cn\00" +@chr = global i8 zeroinitializer + +declare i8 @strrchr(i8*, i32) + +define void @test_nosimplify1() { +; CHECK: test_nosimplify1 +; CHECK: call i8 @strrchr +; CHECK: ret void + + %str = getelementptr [14 x i8]* @hello, i32 0, i32 0 + %dst = call i8 @strrchr(i8* %str, i32 119) + store i8 %dst, i8* @chr + ret void +} diff --git a/test/Transforms/InstCombine/strspn-1.ll b/test/Transforms/InstCombine/strspn-1.ll new file mode 100644 index 0000000..393f887 --- /dev/null +++ b/test/Transforms/InstCombine/strspn-1.ll @@ -0,0 +1,56 @@ +; Test that the strspn library call simplifier works correctly. +; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +@abcba = constant [6 x i8] c"abcba\00" +@abc = constant [4 x i8] c"abc\00" +@null = constant [1 x i8] zeroinitializer + +declare i64 @strspn(i8*, i8*) + +; Check strspn(s, "") -> 0. + +define i64 @test_simplify1(i8* %str) { +; CHECK: @test_simplify1 + %pat = getelementptr [1 x i8]* @null, i32 0, i32 0 + + %ret = call i64 @strspn(i8* %str, i8* %pat) + ret i64 %ret +; CHECK-NEXT: ret i64 0 +} + +; Check strspn("", s) -> 0. 
+ +define i64 @test_simplify2(i8* %pat) { +; CHECK: @test_simplify2 + %str = getelementptr [1 x i8]* @null, i32 0, i32 0 + + %ret = call i64 @strspn(i8* %str, i8* %pat) + ret i64 %ret +; CHECK-NEXT: ret i64 0 +} + +; Check strspn(s1, s2), where s1 and s2 are constants. + +define i64 @test_simplify3() { +; CHECK: @test_simplify3 + %str = getelementptr [6 x i8]* @abcba, i32 0, i32 0 + %pat = getelementptr [4 x i8]* @abc, i32 0, i32 0 + + %ret = call i64 @strspn(i8* %str, i8* %pat) + ret i64 %ret +; CHECK-NEXT: ret i64 5 +} + +; Check cases that shouldn't be simplified. + +define i64 @test_no_simplify1(i8* %str, i8* %pat) { +; CHECK: @test_no_simplify1 + + %ret = call i64 @strspn(i8* %str, i8* %pat) +; CHECK-NEXT: %ret = call i64 @strspn(i8* %str, i8* %pat) + ret i64 %ret +; CHECK-NEXT: ret i64 %ret +} diff --git a/test/Transforms/InstCombine/strstr-1.ll b/test/Transforms/InstCombine/strstr-1.ll new file mode 100644 index 0000000..81f5271 --- /dev/null +++ b/test/Transforms/InstCombine/strstr-1.ll @@ -0,0 +1,65 @@ +; Test that the strstr library call simplifier works correctly. +; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +@.str = private constant [1 x i8] zeroinitializer +@.str1 = private constant [2 x i8] c"a\00" +@.str2 = private constant [6 x i8] c"abcde\00" +@.str3 = private constant [4 x i8] c"bcd\00" + +declare i8* @strstr(i8*, i8*) + +; Check strstr(str, "") -> str. + +define i8* @test_simplify1(i8* %str) { +; CHECK: @test_simplify1 + %pat = getelementptr inbounds [1 x i8]* @.str, i32 0, i32 0 + %ret = call i8* @strstr(i8* %str, i8* %pat) + ret i8* %ret +; CHECK-NEXT: ret i8* %str +} + +; Check strstr(str, "a") -> strchr(str, 'a'). 
+ +define i8* @test_simplify2(i8* %str) { +; CHECK: @test_simplify2 + %pat = getelementptr inbounds [2 x i8]* @.str1, i32 0, i32 0 + %ret = call i8* @strstr(i8* %str, i8* %pat) + ret i8* %ret +; CHECK-NEXT: @strchr(i8* %str, i32 97) +} + +; Check strstr("abcde", "bcd") -> "abcde" + 1. + +define i8* @test_simplify3() { +; CHECK: @test_simplify3 + %str = getelementptr inbounds [6 x i8]* @.str2, i32 0, i32 0 + %pat = getelementptr inbounds [4 x i8]* @.str3, i32 0, i32 0 + %ret = call i8* @strstr(i8* %str, i8* %pat) + ret i8* %ret +; CHECK-NEXT: getelementptr inbounds ([6 x i8]* @.str2, i64 0, i64 1) +} + +; Check strstr(str, str) -> str. + +define i8* @test_simplify4(i8* %str) { +; CHECK: @test_simplify4 + %ret = call i8* @strstr(i8* %str, i8* %str) + ret i8* %ret +; CHECK-NEXT: ret i8* %str +} + +; Check strstr(str, pat) == str -> strncmp(str, pat, strlen(str)) == 0. + +define i1 @test_simplify5(i8* %str, i8* %pat) { +; CHECK: @test_simplify5 + %ret = call i8* @strstr(i8* %str, i8* %pat) + %cmp = icmp eq i8* %ret, %str + ret i1 %cmp +; CHECK: [[LEN:%[a-z]+]] = call {{i[0-9]+}} @strlen(i8* %pat) +; CHECK: [[NCMP:%[a-z]+]] = call {{i[0-9]+}} @strncmp(i8* %str, i8* %pat, {{i[0-9]+}} [[LEN]]) +; CHECK: icmp eq {{i[0-9]+}} [[NCMP]], 0 +; CHECK: ret i1 +} diff --git a/test/Transforms/InstCombine/strstr-2.ll b/test/Transforms/InstCombine/strstr-2.ll new file mode 100644 index 0000000..5092f9b --- /dev/null +++ b/test/Transforms/InstCombine/strstr-2.ll @@ -0,0 +1,18 @@ +; Test that the strstr library call simplifier works correctly. 
+; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +@null = private constant [1 x i8] zeroinitializer + +declare i8 @strstr(i8*, i8*) + +define i8 @test_no_simplify1(i8* %str) { +; CHECK: @test_no_simplify1 + %pat = getelementptr inbounds [1 x i8]* @null, i32 0, i32 0 + %ret = call i8 @strstr(i8* %str, i8* %pat) +; CHECK-NEXT: call i8 @strstr + ret i8 %ret +; CHECK-NEXT: ret i8 %ret +} diff --git a/test/Transforms/InstCombine/strto-1.ll b/test/Transforms/InstCombine/strto-1.ll new file mode 100644 index 0000000..16c0c67 --- /dev/null +++ b/test/Transforms/InstCombine/strto-1.ll @@ -0,0 +1,82 @@ +; Test that the strto* library call simplifiers works correctly. +; +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +declare i64 @strtol(i8* %s, i8** %endptr, i32 %base) +; CHECK: declare i64 @strtol(i8*, i8**, i32) + +declare double @strtod(i8* %s, i8** %endptr, i32 %base) +; CHECK: declare double @strtod(i8*, i8**, i32) + +declare float @strtof(i8* %s, i8** %endptr, i32 %base) +; CHECK: declare float @strtof(i8*, i8**, i32) + +declare i64 @strtoul(i8* %s, i8** %endptr, i32 %base) +; CHECK: declare i64 @strtoul(i8*, i8**, i32) + +declare i64 @strtoll(i8* %s, i8** %endptr, i32 %base) +; CHECK: declare i64 @strtoll(i8*, i8**, i32) + +declare double @strtold(i8* %s, i8** %endptr) +; CHECK: declare double @strtold(i8*, i8**) + +declare i64 @strtoull(i8* %s, i8** %endptr, i32 %base) +; CHECK: declare i64 @strtoull(i8*, i8**, i32) + +define void @test_simplify1(i8* %x, i8** %endptr) { +; CHECK: @test_simplify1 + call i64 @strtol(i8* %x, i8** null, i32 10) +; CHECK-NEXT: call i64 @strtol(i8* nocapture %x, i8** null, i32 10) + ret void +} + +define void 
@test_simplify2(i8* %x, i8** %endptr) { +; CHECK: @test_simplify2 + call double @strtod(i8* %x, i8** null, i32 10) +; CHECK-NEXT: call double @strtod(i8* nocapture %x, i8** null, i32 10) + ret void +} + +define void @test_simplify3(i8* %x, i8** %endptr) { +; CHECK: @test_simplify3 + call float @strtof(i8* %x, i8** null, i32 10) +; CHECK-NEXT: call float @strtof(i8* nocapture %x, i8** null, i32 10) + ret void +} + +define void @test_simplify4(i8* %x, i8** %endptr) { +; CHECK: @test_simplify4 + call i64 @strtoul(i8* %x, i8** null, i32 10) +; CHECK-NEXT: call i64 @strtoul(i8* nocapture %x, i8** null, i32 10) + ret void +} + +define void @test_simplify5(i8* %x, i8** %endptr) { +; CHECK: @test_simplify5 + call i64 @strtoll(i8* %x, i8** null, i32 10) +; CHECK-NEXT: call i64 @strtoll(i8* nocapture %x, i8** null, i32 10) + ret void +} + +define void @test_simplify6(i8* %x, i8** %endptr) { +; CHECK: @test_simplify6 + call double @strtold(i8* %x, i8** null) +; CHECK-NEXT: call double @strtold(i8* nocapture %x, i8** null) + ret void +} + +define void @test_simplify7(i8* %x, i8** %endptr) { +; CHECK: @test_simplify7 + call i64 @strtoull(i8* %x, i8** null, i32 10) +; CHECK-NEXT: call i64 @strtoull(i8* nocapture %x, i8** null, i32 10) + ret void +} + +define void @test_no_simplify1(i8* %x, i8** %endptr) { +; CHECK: @test_no_simplify1 + call i64 @strtol(i8* %x, i8** %endptr, i32 10) +; CHECK-NEXT: call i64 @strtol(i8* %x, i8** %endptr, i32 10) + ret void +} diff --git a/test/Transforms/InstCombine/struct-assign-tbaa.ll b/test/Transforms/InstCombine/struct-assign-tbaa.ll new file mode 100644 index 0000000..33a771e --- /dev/null +++ b/test/Transforms/InstCombine/struct-assign-tbaa.ll @@ -0,0 +1,44 @@ +; RUN: opt -instcombine -S < %s | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* 
nocapture, i64, i32, i1) nounwind + +; Verify that instcombine preserves TBAA tags when converting a memcpy into +; a scalar load and store. + +%struct.test1 = type { float } + +; CHECK: @test +; CHECK: %2 = load float* %0, align 4, !tbaa !0 +; CHECK: store float %2, float* %1, align 4, !tbaa !0 +; CHECK: ret +define void @test1(%struct.test1* nocapture %a, %struct.test1* nocapture %b) { +entry: + %0 = bitcast %struct.test1* %a to i8* + %1 = bitcast %struct.test1* %b to i8* + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 4, i32 4, i1 false), !tbaa.struct !3 + ret void +} + +%struct.test2 = type { i32 (i8*, i32*, double*)** } + +define i32 (i8*, i32*, double*)*** @test2() { +; CHECK: @test2 +; CHECK-NOT: memcpy +; CHECK: ret + %tmp = alloca %struct.test2, align 8 + %tmp1 = bitcast %struct.test2* %tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp1, i8* undef, i64 8, i32 8, i1 false), !tbaa.struct !4 + %tmp2 = getelementptr %struct.test2* %tmp, i32 0, i32 0 + %tmp3 = load i32 (i8*, i32*, double*)*** %tmp2 + ret i32 (i8*, i32*, double*)*** %tmp2 +} + +; CHECK: !0 = metadata !{metadata !"float", metadata !1} + +!0 = metadata !{metadata !"Simple C/C++ TBAA"} +!1 = metadata !{metadata !"omnipotent char", metadata !0} +!2 = metadata !{metadata !"float", metadata !0} +!3 = metadata !{i64 0, i64 4, metadata !2} +!4 = metadata !{i64 0, i64 8, null} diff --git a/test/Transforms/InstCombine/udiv-simplify-bug-1.ll b/test/Transforms/InstCombine/udiv-simplify-bug-1.ll index d95e8f8..74f2fdd 100644 --- a/test/Transforms/InstCombine/udiv-simplify-bug-1.ll +++ b/test/Transforms/InstCombine/udiv-simplify-bug-1.ll @@ -6,9 +6,9 @@ ; The udiv instructions shouldn't be optimized away, and the ; sext instructions should be optimized to zext. 
-define i64 @bar(i32 %x) nounwind { +define i64 @bar(i32 %x, i32 %g) nounwind { %y = lshr i32 %x, 30 - %r = udiv i32 %y, 3 + %r = udiv i32 %y, %g %z = sext i32 %r to i64 ret i64 %z } diff --git a/test/Transforms/InstCombine/vec_demanded_elts.ll b/test/Transforms/InstCombine/vec_demanded_elts.ll index 0019a57..2d90750 100644 --- a/test/Transforms/InstCombine/vec_demanded_elts.ll +++ b/test/Transforms/InstCombine/vec_demanded_elts.ll @@ -196,7 +196,7 @@ define <4 x float> @test_select(float %f, float %g) { ; CHECK-NOT: insertelement ; CHECK: %a3 = insertelement <4 x float> %a0, float 3.000000e+00, i32 3 ; CHECK-NOT: insertelement -; CHECK: %ret = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %a3, <4 x float> <float undef, float 4.000000e+00, float 5.000000e+00, float undef> +; CHECK: shufflevector <4 x float> %a3, <4 x float> <float undef, float 4.000000e+00, float 5.000000e+00, float undef>, <4 x i32> <i32 0, i32 5, i32 6, i32 3> %a0 = insertelement <4 x float> undef, float %f, i32 0 %a1 = insertelement <4 x float> %a0, float 1.000000e+00, i32 1 %a2 = insertelement <4 x float> %a1, float 2.000000e+00, i32 2 diff --git a/test/Transforms/InstCombine/vec_shuffle.ll b/test/Transforms/InstCombine/vec_shuffle.ll index 8f78c2e..14f5321 100644 --- a/test/Transforms/InstCombine/vec_shuffle.ll +++ b/test/Transforms/InstCombine/vec_shuffle.ll @@ -153,3 +153,46 @@ define <8 x i8> @test12a(<8 x i8> %tmp6, <8 x i8> %tmp2) nounwind { ret <8 x i8> %tmp3 } +; We should form a shuffle out of a select with constant condition. 
+define <4 x i16> @test13a(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: @test13a +; CHECK-NEXT: shufflevector <4 x i16> %lhs, <4 x i16> %rhs, <4 x i32> <i32 0, i32 5, i32 2, i32 7> +; CHECK-NEXT: ret + %A = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, + <4 x i16> %lhs, <4 x i16> %rhs + ret <4 x i16> %A +} + +define <4 x i16> @test13b(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: @test13b +; CHECK-NEXT: ret <4 x i16> %lhs + %A = select <4 x i1> <i1 true, i1 undef, i1 true, i1 true>, + <4 x i16> %lhs, <4 x i16> %rhs + ret <4 x i16> %A +} + +define <4 x i16> @test13c(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: @test13c +; CHECK-NEXT: shufflevector <4 x i16> %lhs, <4 x i16> %rhs, <4 x i32> <i32 0, i32 undef, i32 2, i32 7> +; CHECK-NEXT: ret + %A = select <4 x i1> <i1 true, i1 undef, i1 true, i1 false>, + <4 x i16> %lhs, <4 x i16> %rhs + ret <4 x i16> %A +} + +define <4 x i16> @test13d(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: @test13d +; CHECK: select +; CHECK-NEXT: ret + %A = select <4 x i1> <i1 true, i1 icmp ugt (<4 x i16>(<4 x i16>, <4 x i16>)* @test13a, <4 x i16>(<4 x i16>, <4 x i16>)* @test13b), i1 true, i1 false>, + <4 x i16> %lhs, <4 x i16> %rhs + ret <4 x i16> %A +} + +define <4 x i16> @test13e(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: @test13e +; CHECK-NEXT: ret <4 x i16> %rhs + %A = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>, + <4 x i16> %lhs, <4 x i16> %rhs + ret <4 x i16> %A +} diff --git a/test/Transforms/InstCombine/vector_gep2.ll b/test/Transforms/InstCombine/vector_gep2.ll new file mode 100644 index 0000000..20165b1 --- /dev/null +++ b/test/Transforms/InstCombine/vector_gep2.ll @@ -0,0 +1,11 @@ +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define <2 x i8*> @testa(<2 x i8*> %a) { +; CHECK: @testa + %g = 
getelementptr <2 x i8*> %a, <2 x i32> <i32 0, i32 1> +; CHECK: getelementptr <2 x i8*> %a, <2 x i64> <i64 0, i64 1> + ret <2 x i8*> %g +} diff --git a/test/Transforms/InstCombine/weak-symbols.ll b/test/Transforms/InstCombine/weak-symbols.ll new file mode 100644 index 0000000..0039b59 --- /dev/null +++ b/test/Transforms/InstCombine/weak-symbols.ll @@ -0,0 +1,33 @@ +; PR4738 - Test that the library call simplifier doesn't assume anything about +; weak symbols. +; +; RUN: opt < %s -instcombine -S | FileCheck %s + +@real_init = weak_odr constant [2 x i8] c"y\00" +@fake_init = weak constant [2 x i8] c"y\00" +@.str = private constant [2 x i8] c"y\00" + +define i32 @foo() nounwind { +; CHECK: define i32 @foo +; CHECK: call i32 @strcmp +; CHECK: ret i32 %temp1 + +entry: + %str1 = getelementptr inbounds [2 x i8]* @fake_init, i64 0, i64 0 + %str2 = getelementptr inbounds [2 x i8]* @.str, i64 0, i64 0 + %temp1 = call i32 @strcmp(i8* %str1, i8* %str2) nounwind readonly + ret i32 %temp1 +} + +define i32 @bar() nounwind { +; CHECK: define i32 @bar +; CHECK: ret i32 0 + +entry: + %str1 = getelementptr inbounds [2 x i8]* @real_init, i64 0, i64 0 + %str2 = getelementptr inbounds [2 x i8]* @.str, i64 0, i64 0 + %temp1 = call i32 @strcmp(i8* %str1, i8* %str2) nounwind readonly + ret i32 %temp1 +} + +declare i32 @strcmp(i8*, i8*) nounwind readonly diff --git a/test/Transforms/InstSimplify/compare.ll b/test/Transforms/InstSimplify/compare.ll index ced74bd..ce2bb79 100644 --- a/test/Transforms/InstSimplify/compare.ll +++ b/test/Transforms/InstSimplify/compare.ll @@ -266,6 +266,15 @@ define i1 @add5(i32 %x, i32 %y) { ; CHECK: ret i1 true } +define i1 @add6(i64 %A, i64 %B) { +; CHECK: @add6 + %s1 = add i64 %A, %B + %s2 = add i64 %B, %A + %cmp = icmp eq i64 %s1, %s2 + ret i1 %cmp +; CHECK: ret i1 true +} + define i1 @addpowtwo(i32 %x, i32 %y) { ; CHECK: @addpowtwo %l = lshr i32 %x, 1 diff --git a/test/Transforms/Internalize/2008-05-09-AllButMain.ll 
b/test/Transforms/Internalize/2008-05-09-AllButMain.ll index a85e834..c07abb0 100644 --- a/test/Transforms/Internalize/2008-05-09-AllButMain.ll +++ b/test/Transforms/Internalize/2008-05-09-AllButMain.ll @@ -1,27 +1,55 @@ -; No arguments means internalize all but main -; RUN: opt < %s -internalize -S | grep internal | count 4 +; No arguments means internalize everything +; RUN: opt < %s -internalize -S | FileCheck --check-prefix=NOARGS %s + ; Internalize all but foo and j -; RUN: opt < %s -internalize -internalize-public-api-list foo -internalize-public-api-list j -S | grep internal | count 3 -; Non existent files should be treated as if they were empty (so internalize all but main) -; RUN: opt < %s -internalize -internalize-public-api-file /nonexistent/file 2> /dev/null -S | grep internal | count 4 -; RUN: opt < %s -internalize -internalize-public-api-list bar -internalize-public-api-list foo -internalize-public-api-file /nonexistent/file 2> /dev/null -S | grep internal | count 3 +; RUN: opt < %s -internalize -internalize-public-api-list foo -internalize-public-api-list j -S | FileCheck --check-prefix=LIST %s + +; Non existent files should be treated as if they were empty (so internalize +; everything) +; RUN: opt < %s -internalize -internalize-public-api-file /nonexistent/file 2> /dev/null -S | FileCheck --check-prefix=EMPTYFILE %s + +; RUN: opt < %s -S -internalize -internalize-public-api-list bar -internalize-public-api-list foo -internalize-public-api-file /nonexistent/file 2> /dev/null | FileCheck --check-prefix=LIST2 %s + ; -file and -list options should be merged, the .apifile contains foo and j -; RUN: opt < %s -internalize -internalize-public-api-list bar -internalize-public-api-file %s.apifile -S | grep internal | count 2 +; RUN: opt < %s -internalize -internalize-public-api-list bar -internalize-public-api-file %s.apifile -S | FileCheck --check-prefix=MERGE %s + +; NOARGS: @i = internal global +; LIST: @i = internal global +; EMPTYFILE: @i = internal 
global +; LIST2: @i = internal global +; MERGE: @i = internal global +@i = global i32 0 -@i = weak global i32 0 ; <i32*> [#uses=0] -@j = weak global i32 0 ; <i32*> [#uses=0] +; NOARGS: @j = internal global +; LIST: @j = global +; EMPTYFILE: @j = internal global +; LIST2: @j = internal global +; MERGE: @j = global +@j = global i32 0 -define void @main(...) { -entry: +; NOARGS: define internal void @main +; LIST: define internal void @main +; EMPTYFILE: define internal void @main +; LIST2: define internal void @main +; MERGE: define internal void @main +define void @main() { ret void } -define void @foo(...) { -entry: +; NOARGS: define internal void @foo +; LIST: define void @foo +; EMPTYFILE: define internal void @foo +; LIST2: define void @foo +; MERGE: define void @foo +define void @foo() { ret void } -define void @bar(...) { -entry: +; NOARGS: define internal void @bar +; LIST: define internal void @bar +; EMPTYFILE: define internal void @bar +; LIST2: define void @bar +; MERGE: define void @bar +define void @bar() { ret void } diff --git a/test/Transforms/Internalize/2009-01-05-InternalizeAliases.ll b/test/Transforms/Internalize/2009-01-05-InternalizeAliases.ll index 7b18a04..47cf3f0 100644 --- a/test/Transforms/Internalize/2009-01-05-InternalizeAliases.ll +++ b/test/Transforms/Internalize/2009-01-05-InternalizeAliases.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -internalize -S | grep internal | count 3 +; RUN: opt < %s -internalize -internalize-public-api-list main -S | grep internal | count 3 @A = global i32 0 @B = alias i32* @A diff --git a/test/Transforms/JumpThreading/crash.ll b/test/Transforms/JumpThreading/crash.ll index b9c0354..2fe8746 100644 --- a/test/Transforms/JumpThreading/crash.ll +++ b/test/Transforms/JumpThreading/crash.ll @@ -511,3 +511,56 @@ lbl_260: ; preds = %for.cond, %entry if.end: ; preds = %for.cond ret void } + +define void @PR14233(i1 %cmp, i1 %cmp2, i1 %cmp3, i1 %cmp4) { +entry: + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: 
+ br label %if.end + +cond.false: + br label %if.end + +if.end: + %A = phi i64 [ 0, %cond.true ], [ 1, %cond.false ] + br i1 %cmp2, label %bb, label %if.end2 + +bb: + br label %if.end2 + +if.end2: + %B = phi i64 [ ptrtoint (i8* ()* @PR14233.f1 to i64), %bb ], [ %A, %if.end ] + %cmp.ptr = icmp eq i64 %B, ptrtoint (i8* ()* @PR14233.f2 to i64) + br i1 %cmp.ptr, label %cond.true2, label %if.end3 + +cond.true2: + br i1 %cmp3, label %bb2, label %ur + +bb2: + br i1 %cmp4, label %if.end4, label %if.end3 + +if.end4: + unreachable + +if.end3: + %cmp.ptr2 = icmp eq i64 %B, ptrtoint (i8* ()* @PR14233.f2 to i64) + br i1 %cmp.ptr2, label %ur, label %if.then601 + +if.then601: + %C = icmp eq i64 %B, 0 + br i1 %C, label %bb3, label %bb4 + +bb3: + unreachable + +bb4: + unreachable + +ur: + unreachable +} + +declare i8* @PR14233.f1() + +declare i8* @PR14233.f2() diff --git a/test/Transforms/JumpThreading/select.ll b/test/Transforms/JumpThreading/select.ll index 8a81857..9676efe 100644 --- a/test/Transforms/JumpThreading/select.ll +++ b/test/Transforms/JumpThreading/select.ll @@ -121,3 +121,39 @@ L4: call void @quux() br label %L0 } + +; Make sure the edge value of %0 from entry to L2 includes 0 and L3 is +; reachable. 
+; CHECK: test_switch_default +; CHECK: entry: +; CHECK: load +; CHECK: switch +; CHECK: [[THREADED:[A-Za-z.0-9]+]]: +; CHECK: store +; CHECK: br +; CHECK: L2: +; CHECK: icmp +define void @test_switch_default(i32* nocapture %status) nounwind { +entry: + %0 = load i32* %status, align 4 + switch i32 %0, label %L2 [ + i32 5061, label %L1 + i32 0, label %L2 + ] + +L1: + store i32 10025, i32* %status, align 4 + br label %L2 + +L2: + %1 = load i32* %status, align 4 + %cmp57.i = icmp eq i32 %1, 0 + br i1 %cmp57.i, label %L3, label %L4 + +L3: + store i32 10000, i32* %status, align 4 + br label %L4 + +L4: + ret void +} diff --git a/test/Transforms/LICM/2003-12-11-SinkingToPHI.ll b/test/Transforms/LICM/2003-12-11-SinkingToPHI.ll index 67c3951..fe8d445 100644 --- a/test/Transforms/LICM/2003-12-11-SinkingToPHI.ll +++ b/test/Transforms/LICM/2003-12-11-SinkingToPHI.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -licm | lli +; RUN: opt < %s -licm | lli %defaultjit define i32 @main() { entry: diff --git a/test/Transforms/LICM/hoisting.ll b/test/Transforms/LICM/hoisting.ll index 6f28d53..98f9334 100644 --- a/test/Transforms/LICM/hoisting.ll +++ b/test/Transforms/LICM/hoisting.ll @@ -29,7 +29,7 @@ Out: ; preds = %LoopTail } -declare void @foo2(i32) +declare void @foo2(i32) nounwind ;; It is ok and desirable to hoist this potentially trapping instruction. 
@@ -64,3 +64,29 @@ Out: ; preds = %Loop %C = sub i32 %A, %B ; <i32> [#uses=1] ret i32 %C } + +; CHECK: @test4 +; CHECK: call +; CHECK: sdiv +; CHECK: ret +define i32 @test4(i32 %x, i32 %y) nounwind uwtable ssp { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %i.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %n.01 = phi i32 [ 0, %entry ], [ %add, %for.body ] + call void @foo_may_call_exit(i32 0) + %div = sdiv i32 %x, %y + %add = add nsw i32 %n.01, %div + %inc = add nsw i32 %i.02, 1 + %cmp = icmp slt i32 %inc, 10000 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + %n.0.lcssa = phi i32 [ %add, %for.body ] + ret i32 %n.0.lcssa +} + +declare void @foo_may_call_exit(i32) + diff --git a/test/Transforms/LoopIdiom/basic.ll b/test/Transforms/LoopIdiom/basic.ll index 46ab7e5..06a5bd9 100644 --- a/test/Transforms/LoopIdiom/basic.ll +++ b/test/Transforms/LoopIdiom/basic.ll @@ -383,4 +383,37 @@ for.end: ; preds = %for.inc } +define void @PR14241(i32* %s, i64 %size) { +; Ensure that we don't form a memcpy for strided loops. Briefly, when we taught +; LoopIdiom about memmove and strided loops, this got miscompiled into a memcpy +; instead of a memmove. If we get the memmove transform back, this will catch +; regressions. +; +; CHECK: @PR14241 +entry: + %end.idx = add i64 %size, -1 + %end.ptr = getelementptr inbounds i32* %s, i64 %end.idx + br label %while.body +; CHECK-NOT: memcpy +; +; FIXME: When we regain the ability to form a memmove here, this test should be +; reversed and turned into a positive assertion. 
+; CHECK-NOT: memmove + +while.body: + %phi.ptr = phi i32* [ %s, %entry ], [ %next.ptr, %while.body ] + %src.ptr = getelementptr inbounds i32* %phi.ptr, i64 1 + %val = load i32* %src.ptr, align 4 +; CHECK: load + %dst.ptr = getelementptr inbounds i32* %phi.ptr, i64 0 + store i32 %val, i32* %dst.ptr, align 4 +; CHECK: store + %next.ptr = getelementptr inbounds i32* %phi.ptr, i64 1 + %cmp = icmp eq i32* %next.ptr, %end.ptr + br i1 %cmp, label %exit, label %while.body + +exit: + ret void +; CHECK: ret void +} diff --git a/test/Transforms/LoopIdiom/crash.ll b/test/Transforms/LoopIdiom/crash.ll new file mode 100644 index 0000000..969adbc --- /dev/null +++ b/test/Transforms/LoopIdiom/crash.ll @@ -0,0 +1,25 @@ +; RUN: opt -basicaa -loop-idiom -S < %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; Don't crash inside DependenceAnalysis +; PR14219 +define void @test1(i64* %iwork, i64 %x) { +bb0: + %mul116 = mul nsw i64 %x, %x + %incdec.ptr6.sum175 = add i64 42, %x + %arrayidx135 = getelementptr inbounds i64* %iwork, i64 %incdec.ptr6.sum175 + br label %bb1 +bb1: + %storemerge4226 = phi i64 [ 0, %bb0 ], [ %inc139, %bb1 ] + store i64 1, i64* %arrayidx135, align 8 + %incdec.ptr6.sum176 = add i64 %mul116, %storemerge4226 + %arrayidx137 = getelementptr inbounds i64* %iwork, i64 %incdec.ptr6.sum176 + store i64 1, i64* %arrayidx137, align 8 + %inc139 = add nsw i64 %storemerge4226, 1 + %cmp131 = icmp sgt i64 %storemerge4226, 42 + br i1 %cmp131, label %bb2, label %bb1 +bb2: + ret void +} + diff --git a/test/Transforms/LoopIdiom/non-canonical-loop.ll b/test/Transforms/LoopIdiom/non-canonical-loop.ll new file mode 100644 index 0000000..a6a4f92 --- /dev/null +++ b/test/Transforms/LoopIdiom/non-canonical-loop.ll @@ -0,0 +1,34 @@ +; RUN: opt -S -loop-idiom < %s +; Don't crash +; PR13892 + +target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @test(i32* %currMB) nounwind uwtable { +entry: + br i1 undef, label %start.exit, label %if.then.i + +if.then.i: ; preds = %entry + unreachable + +start.exit: ; preds = %entry + indirectbr i8* undef, [label %0, label %for.bodyprime] + +; <label>:0 ; preds = %start.exit + unreachable + +for.bodyprime: ; preds = %for.bodyprime, %start.exit + %i.057375 = phi i32 [ 0, %start.exit ], [ %1, %for.bodyprime ] + %arrayidx8prime = getelementptr inbounds i32* %currMB, i32 %i.057375 + store i32 0, i32* %arrayidx8prime, align 4 + %1 = add i32 %i.057375, 1 + %cmp5prime = icmp slt i32 %1, 4 + br i1 %cmp5prime, label %for.bodyprime, label %for.endprime + +for.endprime: ; preds = %for.bodyprime + br label %for.body23prime + +for.body23prime: ; preds = %for.body23prime, %for.endprime + br label %for.body23prime +} diff --git a/test/Transforms/LoopIdiom/scev-invalidation.ll b/test/Transforms/LoopIdiom/scev-invalidation.ll new file mode 100644 index 0000000..a244d9a --- /dev/null +++ b/test/Transforms/LoopIdiom/scev-invalidation.ll @@ -0,0 +1,74 @@ +; RUN: opt -S -indvars -loop-idiom < %s +; PR14214 +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @quote_arg() nounwind { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %backslashes.0 = phi i32 [ undef, %entry ], [ %backslashes.2, %for.inc ] + %p.0 = phi i8* [ undef, %entry ], [ %incdec.ptr3, %for.inc ] + %q.0 = phi i8* [ undef, %entry ], [ %q.2, %for.inc ] + %0 = load i8* %p.0, align 1 + switch i8 %0, label %while.cond.preheader [ + i8 0, label %for.cond4.preheader + i8 92, label %for.inc + ] + +while.cond.preheader: ; preds = %for.cond 
+ %tobool210 = icmp eq i32 %backslashes.0, 0 + br i1 %tobool210, label %for.inc.loopexit, label %while.body.lr.ph + +while.body.lr.ph: ; preds = %while.cond.preheader + %1 = add i32 %backslashes.0, -1 + %2 = zext i32 %1 to i64 + br label %while.body + +for.cond4.preheader: ; preds = %for.cond + %tobool57 = icmp eq i32 %backslashes.0, 0 + br i1 %tobool57, label %for.end10, label %for.body6.lr.ph + +for.body6.lr.ph: ; preds = %for.cond4.preheader + br label %for.body6 + +while.body: ; preds = %while.body.lr.ph, %while.body + %q.112 = phi i8* [ %q.0, %while.body.lr.ph ], [ %incdec.ptr, %while.body ] + %backslashes.111 = phi i32 [ %backslashes.0, %while.body.lr.ph ], [ %dec, %while.body ] + %incdec.ptr = getelementptr inbounds i8* %q.112, i64 1 + store i8 92, i8* %incdec.ptr, align 1 + %dec = add nsw i32 %backslashes.111, -1 + %tobool2 = icmp eq i32 %dec, 0 + br i1 %tobool2, label %while.cond.for.inc.loopexit_crit_edge, label %while.body + +while.cond.for.inc.loopexit_crit_edge: ; preds = %while.body + %scevgep.sum = add i64 %2, 1 + %scevgep13 = getelementptr i8* %q.0, i64 %scevgep.sum + br label %for.inc.loopexit + +for.inc.loopexit: ; preds = %while.cond.for.inc.loopexit_crit_edge, %while.cond.preheader + %q.1.lcssa = phi i8* [ %scevgep13, %while.cond.for.inc.loopexit_crit_edge ], [ %q.0, %while.cond.preheader ] + br label %for.inc + +for.inc: ; preds = %for.inc.loopexit, %for.cond + %backslashes.2 = phi i32 [ %backslashes.0, %for.cond ], [ 0, %for.inc.loopexit ] + %q.2 = phi i8* [ %q.0, %for.cond ], [ %q.1.lcssa, %for.inc.loopexit ] + %incdec.ptr3 = getelementptr inbounds i8* %p.0, i64 1 + br label %for.cond + +for.body6: ; preds = %for.body6.lr.ph, %for.body6 + %q.39 = phi i8* [ %q.0, %for.body6.lr.ph ], [ %incdec.ptr7, %for.body6 ] + %backslashes.38 = phi i32 [ %backslashes.0, %for.body6.lr.ph ], [ %dec9, %for.body6 ] + %incdec.ptr7 = getelementptr inbounds i8* %q.39, i64 1 + store i8 92, i8* %incdec.ptr7, align 1 + %dec9 = add nsw i32 %backslashes.38, -1 + 
%tobool5 = icmp eq i32 %dec9, 0 + br i1 %tobool5, label %for.cond4.for.end10_crit_edge, label %for.body6 + +for.cond4.for.end10_crit_edge: ; preds = %for.body6 + br label %for.end10 + +for.end10: ; preds = %for.cond4.for.end10_crit_edge, %for.cond4.preheader + ret i32 undef +} diff --git a/test/Transforms/LoopRotate/multiple-exits.ll b/test/Transforms/LoopRotate/multiple-exits.ll new file mode 100644 index 0000000..675d71f --- /dev/null +++ b/test/Transforms/LoopRotate/multiple-exits.ll @@ -0,0 +1,236 @@ +; RUN: opt -S -loop-rotate < %s -verify-loop-info -verify-dom-info | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; PR7447 +define i32 @test1([100 x i32]* nocapture %a) nounwind readonly { +entry: + br label %for.cond + +for.cond: ; preds = %for.cond1, %entry + %sum.0 = phi i32 [ 0, %entry ], [ %sum.1, %for.cond1 ] + %i.0 = phi i1 [ true, %entry ], [ false, %for.cond1 ] + br i1 %i.0, label %for.cond1, label %return + +for.cond1: ; preds = %for.cond, %land.rhs + %sum.1 = phi i32 [ %add, %land.rhs ], [ %sum.0, %for.cond ] + %i.1 = phi i32 [ %inc, %land.rhs ], [ 0, %for.cond ] + %cmp2 = icmp ult i32 %i.1, 100 + br i1 %cmp2, label %land.rhs, label %for.cond + +land.rhs: ; preds = %for.cond1 + %conv = zext i32 %i.1 to i64 + %arrayidx = getelementptr inbounds [100 x i32]* %a, i64 0, i64 %conv + %0 = load i32* %arrayidx, align 4 + %add = add i32 %0, %sum.1 + %cmp4 = icmp ugt i32 %add, 1000 + %inc = add i32 %i.1, 1 + br i1 %cmp4, label %return, label %for.cond1 + +return: ; preds = %for.cond, %land.rhs + %retval.0 = phi i32 [ 1000, %land.rhs ], [ %sum.0, %for.cond ] + ret i32 %retval.0 + +; CHECK: @test1 +; CHECK: for.cond1.preheader: +; CHECK: %sum.04 = phi i32 [ 0, %entry ], [ %sum.1.lcssa, %for.cond.loopexit ] +; CHECK: br label %for.cond1 + +; CHECK: for.cond1: +; CHECK: %sum.1 = phi 
i32 [ %add, %land.rhs ], [ %sum.04, %for.cond1.preheader ] +; CHECK: %i.1 = phi i32 [ %inc, %land.rhs ], [ 0, %for.cond1.preheader ] +; CHECK: %cmp2 = icmp ult i32 %i.1, 100 +; CHECK: br i1 %cmp2, label %land.rhs, label %for.cond.loopexit +} + +define void @test2(i32 %x) nounwind { +entry: + br label %for.cond + +for.cond: ; preds = %if.end, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end ] + %cmp = icmp eq i32 %i.0, %x + br i1 %cmp, label %return.loopexit, label %for.body + +for.body: ; preds = %for.cond + %call = tail call i32 @foo(i32 %i.0) nounwind + %tobool = icmp eq i32 %call, 0 + br i1 %tobool, label %if.end, label %a + +if.end: ; preds = %for.body + %call1 = tail call i32 @foo(i32 42) nounwind + %inc = add i32 %i.0, 1 + br label %for.cond + +a: ; preds = %for.body + %call2 = tail call i32 @bar(i32 1) nounwind + br label %return + +return.loopexit: ; preds = %for.cond + br label %return + +return: ; preds = %return.loopexit, %a + ret void + +; CHECK: @test2 +; CHECK: if.end: +; CHECK: %inc = add i32 %i.02, 1 +; CHECK: %cmp = icmp eq i32 %inc, %x +; CHECK: br i1 %cmp, label %for.cond.return.loopexit_crit_edge, label %for.body +} + +declare i32 @foo(i32) + +declare i32 @bar(i32) + +@_ZTIi = external constant i8* + +; Verify dominators. 
+define void @test3(i32 %x) { +entry: + %cmp2 = icmp eq i32 0, %x + br i1 %cmp2, label %try.cont.loopexit, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.inc + %i.03 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ] + invoke void @_Z3fooi(i32 %i.03) + to label %for.inc unwind label %lpad + +for.inc: ; preds = %for.body + %inc = add i32 %i.03, 1 + %cmp = icmp eq i32 %inc, %x + br i1 %cmp, label %for.cond.try.cont.loopexit_crit_edge, label %for.body + +lpad: ; preds = %for.body + %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + catch i8* bitcast (i8** @_ZTIi to i8*) + %1 = extractvalue { i8*, i32 } %0, 0 + %2 = extractvalue { i8*, i32 } %0, 1 + %3 = tail call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*)) nounwind + %matches = icmp eq i32 %2, %3 + br i1 %matches, label %catch, label %eh.resume + +catch: ; preds = %lpad + %4 = tail call i8* @__cxa_begin_catch(i8* %1) nounwind + br i1 true, label %invoke.cont2.loopexit, label %for.body.i.lr.ph + +for.body.i.lr.ph: ; preds = %catch + br label %for.body.i + +for.body.i: ; preds = %for.body.i.lr.ph, %for.inc.i + %i.0.i1 = phi i32 [ 0, %for.body.i.lr.ph ], [ %inc.i, %for.inc.i ] + invoke void @_Z3fooi(i32 %i.0.i1) + to label %for.inc.i unwind label %lpad.i + +for.inc.i: ; preds = %for.body.i + %inc.i = add i32 %i.0.i1, 1 + %cmp.i = icmp eq i32 %inc.i, 0 + br i1 %cmp.i, label %for.cond.i.invoke.cont2.loopexit_crit_edge, label %for.body.i + +lpad.i: ; preds = %for.body.i + %5 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + catch i8* bitcast (i8** @_ZTIi to i8*) + %6 = extractvalue { i8*, i32 } %5, 0 + %7 = extractvalue { i8*, i32 } %5, 1 + %matches.i = icmp eq i32 %7, %3 + br i1 %matches.i, label %catch.i, label %lpad1.body + +catch.i: ; preds = %lpad.i + %8 = tail call i8* @__cxa_begin_catch(i8* %6) nounwind + invoke void @test3(i32 0) + to 
label %invoke.cont2.i unwind label %lpad1.i + +invoke.cont2.i: ; preds = %catch.i + tail call void @__cxa_end_catch() nounwind + br label %invoke.cont2 + +lpad1.i: ; preds = %catch.i + %9 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + cleanup + %10 = extractvalue { i8*, i32 } %9, 0 + %11 = extractvalue { i8*, i32 } %9, 1 + tail call void @__cxa_end_catch() nounwind + br label %lpad1.body + +for.cond.i.invoke.cont2.loopexit_crit_edge: ; preds = %for.inc.i + br label %invoke.cont2.loopexit + +invoke.cont2.loopexit: ; preds = %for.cond.i.invoke.cont2.loopexit_crit_edge, %catch + br label %invoke.cont2 + +invoke.cont2: ; preds = %invoke.cont2.loopexit, %invoke.cont2.i + tail call void @__cxa_end_catch() nounwind + br label %try.cont + +for.cond.try.cont.loopexit_crit_edge: ; preds = %for.inc + br label %try.cont.loopexit + +try.cont.loopexit: ; preds = %for.cond.try.cont.loopexit_crit_edge, %entry + br label %try.cont + +try.cont: ; preds = %try.cont.loopexit, %invoke.cont2 + ret void + +lpad1.body: ; preds = %lpad1.i, %lpad.i + %exn.slot.0.i = phi i8* [ %10, %lpad1.i ], [ %6, %lpad.i ] + %ehselector.slot.0.i = phi i32 [ %11, %lpad1.i ], [ %7, %lpad.i ] + tail call void @__cxa_end_catch() nounwind + br label %eh.resume + +eh.resume: ; preds = %lpad1.body, %lpad + %exn.slot.0 = phi i8* [ %exn.slot.0.i, %lpad1.body ], [ %1, %lpad ] + %ehselector.slot.0 = phi i32 [ %ehselector.slot.0.i, %lpad1.body ], [ %2, %lpad ] + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn.slot.0, 0 + %lpad.val5 = insertvalue { i8*, i32 } %lpad.val, i32 %ehselector.slot.0, 1 + resume { i8*, i32 } %lpad.val5 +} + +declare void @_Z3fooi(i32) + +declare i32 @__gxx_personality_v0(...) 
+ +declare i32 @llvm.eh.typeid.for(i8*) nounwind readnone + +declare i8* @__cxa_begin_catch(i8*) + +declare void @__cxa_end_catch() + +define void @test4() nounwind uwtable { +entry: + br label %"7" + +"3": ; preds = %"7" + br i1 undef, label %"31", label %"4" + +"4": ; preds = %"3" + %. = select i1 undef, float 0x3F50624DE0000000, float undef + %0 = add i32 %1, 1 + br label %"7" + +"7": ; preds = %"4", %entry + %1 = phi i32 [ %0, %"4" ], [ 0, %entry ] + %2 = icmp slt i32 %1, 100 + br i1 %2, label %"3", label %"8" + +"8": ; preds = %"7" + br i1 undef, label %"9", label %"31" + +"9": ; preds = %"8" + br label %"33" + +"27": ; preds = %"31" + unreachable + +"31": ; preds = %"8", %"3" + br i1 undef, label %"27", label %"32" + +"32": ; preds = %"31" + br label %"33" + +"33": ; preds = %"32", %"9" + ret void +} diff --git a/test/Transforms/LoopStrengthReduce/2011-10-03-CritEdgeMerge.ll b/test/Transforms/LoopStrengthReduce/2011-10-03-CritEdgeMerge.ll index a6996a8..af3a537 100644 --- a/test/Transforms/LoopStrengthReduce/2011-10-03-CritEdgeMerge.ll +++ b/test/Transforms/LoopStrengthReduce/2011-10-03-CritEdgeMerge.ll @@ -1,15 +1,15 @@ ; RUN: opt -loop-reduce -S < %s | FileCheck %s ; ; Test LSR's use of SplitCriticalEdge during phi rewriting. -; Verify that identical edges are merged. rdar://problem/6453893 target triple = "x86-apple-darwin" -; CHECK: @test +; Verify that identical edges are merged. 
rdar://problem/6453893 +; CHECK: @test1 ; CHECK: bb89: ; CHECK: phi i8* [ %lsr.iv.next1, %bbA.bb89_crit_edge ], [ %lsr.iv.next1, %bbB.bb89_crit_edge ]{{$}} -define i8* @test() { +define i8* @test1() { entry: br label %loop @@ -41,3 +41,41 @@ bb89: exit: ret i8* %tmp75phi } + +; Handle single-predecessor phis: PR13756 +; CHECK: @test2 +; CHECK: bb89: +; CHECK: phi i8* [ %lsr.iv.next1, %bbA ], [ %lsr.iv.next1, %bbA ], [ %lsr.iv.next1, %bbA ]{{$}} +define i8* @test2() { +entry: + br label %loop + +loop: + %rec = phi i32 [ %next, %loop ], [ 0, %entry ] + %next = add i32 %rec, 1 + %tmp75 = getelementptr i8* null, i32 %next + br i1 false, label %loop, label %loopexit + +loopexit: + br i1 false, label %bbA, label %bbB + +bbA: + switch i32 0, label %bb89 [ + i32 47, label %bb89 + i32 58, label %bb89 + ] + +bbB: + switch i8 0, label %exit [ + i8 47, label %exit + i8 58, label %exit + ] + +bb89: + %tmp75phi = phi i8* [ %tmp75, %bbA ], [ %tmp75, %bbA ], [ %tmp75, %bbA ] + br label %exit + +exit: + %result = phi i8* [ %tmp75phi, %bb89 ], [ %tmp75, %bbB ], [ %tmp75, %bbB ], [ %tmp75, %bbB ] + ret i8* %result +} diff --git a/test/Transforms/LoopUnroll/pr11361.ll b/test/Transforms/LoopUnroll/pr11361.ll index 7ce7f5f..62de2f7 100644 --- a/test/Transforms/LoopUnroll/pr11361.ll +++ b/test/Transforms/LoopUnroll/pr11361.ll @@ -1,4 +1,4 @@ -; RUN: opt -loop-unroll -disable-output +; RUN: opt -loop-unroll -disable-output < %s ; PR11361 ; This tests for an iterator invalidation issue. 
diff --git a/test/Transforms/LoopUnroll/pr14167.ll b/test/Transforms/LoopUnroll/pr14167.ll new file mode 100644 index 0000000..205ae44 --- /dev/null +++ b/test/Transforms/LoopUnroll/pr14167.ll @@ -0,0 +1,44 @@ +; RUN: opt < %s -S -loop-unroll -unroll-runtime | FileCheck %s +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" +target triple = "powerpc64-bgq-linux" + +define void @test1() nounwind { +; Ensure that we don't crash when the trip count == -1. +; CHECK: @test1 +entry: + br label %for.cond2.preheader + +for.cond2.preheader: ; preds = %for.end, %entry + br i1 false, label %middle.block, label %vector.ph + +vector.ph: ; preds = %for.cond2.preheader + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + br i1 undef, label %middle.block.loopexit, label %vector.body + +middle.block.loopexit: ; preds = %vector.body + br label %middle.block + +middle.block: ; preds = %middle.block.loopexit, %for.cond2.preheader + br i1 true, label %for.end, label %scalar.preheader + +scalar.preheader: ; preds = %middle.block + br label %for.body4 + +for.body4: ; preds = %for.body4, %scalar.preheader + %indvars.iv = phi i64 [ 16000, %scalar.preheader ], [ %indvars.iv.next, %for.body4 ] + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 %lftr.wideiv, 16000 + br i1 %exitcond, label %for.body4, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body4 + br label %for.end + +for.end: ; preds = %for.end.loopexit, %middle.block + br i1 undef, label %for.cond2.preheader, label %for.end15 + +for.end15: ; preds = %for.end + ret void +} diff --git a/test/Transforms/LoopUnswitch/2011-06-02-CritSwitch.ll b/test/Transforms/LoopUnswitch/2011-06-02-CritSwitch.ll index 61c54dd..6095200 100644 --- a/test/Transforms/LoopUnswitch/2011-06-02-CritSwitch.ll +++ b/test/Transforms/LoopUnswitch/2011-06-02-CritSwitch.ll @@ -1,4 +1,4 @@ -; 
RUN: opt -loop-unswitch -disable-output +; RUN: opt -loop-unswitch -disable-output < %s ; PR10031 define i32 @test(i32 %command) { diff --git a/test/Transforms/LoopVectorize/2012-10-20-infloop.ll b/test/Transforms/LoopVectorize/2012-10-20-infloop.ll new file mode 100644 index 0000000..0176c9a --- /dev/null +++ b/test/Transforms/LoopVectorize/2012-10-20-infloop.ll @@ -0,0 +1,27 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce + +; Check that we don't fall into an infinite loop. +define void @test() nounwind { +entry: + br label %for.body + +for.body: + %0 = phi i32 [ 1, %entry ], [ 0, %for.body ] + br label %for.body +} + + + +define void @test2() nounwind { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv47 = phi i64 [ 0, %entry ], [ %indvars.iv.next48, %for.body ] + %0 = phi i32 [ 1, %entry ], [ 0, %for.body ] + %indvars.iv.next48 = add i64 %indvars.iv47, 1 + br i1 undef, label %for.end, label %for.body + +for.end: ; preds = %for.body + unreachable +} diff --git a/test/Transforms/LoopVectorize/2012-10-22-isconsec.ll b/test/Transforms/LoopVectorize/2012-10-22-isconsec.ll new file mode 100644 index 0000000..2516e24 --- /dev/null +++ b/test/Transforms/LoopVectorize/2012-10-22-isconsec.ll @@ -0,0 +1,57 @@ +; RUN: opt < %s -loop-vectorize -dce -force-vector-width=4 + +; Check that we don't crash. 
+ +target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" + +module asm "\09.ident\09\22GCC: (GNU) 4.6.3 LLVM: 3.2svn\22" + +@b = common global [32000 x float] zeroinitializer, align 16 + +define i32 @set1ds(i32 %_n, float* nocapture %arr, float %value, i32 %stride) nounwind uwtable { +entry: + %0 = icmp sgt i32 %_n, 0 + br i1 %0, label %"3.lr.ph", label %"5" + +"3.lr.ph": ; preds = %entry + %1 = bitcast float* %arr to i8* + %2 = sext i32 %stride to i64 + br label %"3" + +"3": ; preds = %"3.lr.ph", %"3" + %indvars.iv = phi i64 [ 0, %"3.lr.ph" ], [ %indvars.iv.next, %"3" ] + %3 = shl nsw i64 %indvars.iv, 2 + %4 = getelementptr inbounds i8* %1, i64 %3 + %5 = bitcast i8* %4 to float* + store float %value, float* %5, align 4, !tbaa !0 + %indvars.iv.next = add i64 %indvars.iv, %2 + %6 = trunc i64 %indvars.iv.next to i32 + %7 = icmp slt i32 %6, %_n + br i1 %7, label %"3", label %"5" + +"5": ; preds = %"3", %entry + ret i32 0 +} + +define i32 @init(i8* nocapture %name) unnamed_addr nounwind uwtable { +entry: + br label %"3" + +"3": ; preds = %"3", %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %"3" ] + %0 = shl nsw i64 %indvars.iv, 2 + %1 = getelementptr inbounds i8* bitcast (float* getelementptr inbounds ([32000 x float]* @b, i64 0, i64 16000) to i8*), i64 %0 + %2 = bitcast i8* %1 to float* + store float -1.000000e+00, float* %2, align 4, !tbaa !0 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 16000 + br i1 %exitcond, label %"5", label %"3" + +"5": ; preds = %"3" + ret i32 0 +} + +!0 = metadata !{metadata !"alias set 7: float", metadata !1} +!1 = metadata !{metadata !1} diff --git a/test/Transforms/LoopVectorize/X86/avx1.ll b/test/Transforms/LoopVectorize/X86/avx1.ll new file mode 100644 
index 0000000..a2d176a --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/avx1.ll @@ -0,0 +1,49 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +;CHECK: @read_mod_write_single_ptr +;CHECK: load <8 x float> +;CHECK: ret i32 +define i32 @read_mod_write_single_ptr(float* nocapture %a, i32 %n) nounwind uwtable ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %2 = getelementptr inbounds float* %a, i64 %indvars.iv + %3 = load float* %2, align 4 + %4 = fmul float %3, 3.000000e+00 + store float %4, float* %2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + ret i32 undef +} + + +;CHECK: @read_mod_i64 +;CHECK: load <8 x i64> +;CHECK: ret i32 +define i32 @read_mod_i64(i64* nocapture %a, i32 %n) nounwind uwtable ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %2 = getelementptr inbounds i64* %a, i64 %indvars.iv + %3 = load i64* %2, align 4 + %4 = mul i64 %3, 3 + store i64 %4, i64* %2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + ret i32 undef +} diff --git a/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/test/Transforms/LoopVectorize/X86/conversion-cost.ll new 
file mode 100644 index 0000000..8f1bb54 --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/conversion-cost.ll @@ -0,0 +1,48 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +;CHECK: @conversion_cost1 +;CHECK: store <2 x i8> +;CHECK: ret +define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp { + %1 = icmp sgt i32 %n, 3 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 3, %0 ] + %2 = trunc i64 %indvars.iv to i8 + %3 = getelementptr inbounds i8* %A, i64 %indvars.iv + store i8 %2, i8* %3, align 1 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + ret i32 undef +} + +;CHECK: @conversion_cost2 +;CHECK: <2 x float> +;CHECK: ret +define i32 @conversion_cost2(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp { + %1 = icmp sgt i32 %n, 9 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 9, %0 ] + %2 = add nsw i64 %indvars.iv, 3 + %3 = trunc i64 %2 to i32 + %4 = sitofp i32 %3 to float + %5 = getelementptr inbounds float* %B, i64 %indvars.iv + store float %4, float* %5, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + ret i32 undef +} diff --git a/test/Transforms/LoopVectorize/X86/cost-model.ll 
b/test/Transforms/LoopVectorize/X86/cost-model.ll new file mode 100644 index 0000000..628f991 --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/cost-model.ll @@ -0,0 +1,38 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +@c = common global [2048 x i32] zeroinitializer, align 16 +@b = common global [2048 x i32] zeroinitializer, align 16 +@d = common global [2048 x i32] zeroinitializer, align 16 +@a = common global [2048 x i32] zeroinitializer, align 16 + +;CHECK: cost_model_1 +;CHECK: <4 x i32> +;CHECK: ret void +define void @cost_model_1() nounwind uwtable noinline ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 1 + %arrayidx = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %0 + %1 = load i32* %arrayidx, align 8 + %idxprom1 = sext i32 %1 to i64 + %arrayidx2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %idxprom1 + %2 = load i32* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds [2048 x i32]* @d, i64 0, i64 %indvars.iv + %3 = load i32* %arrayidx4, align 4 + %idxprom5 = sext i32 %3 to i64 + %arrayidx6 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %idxprom5 + store i32 %2, i32* %arrayidx6, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 256 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} diff --git a/test/Transforms/LoopVectorize/X86/gcc-examples.ll b/test/Transforms/LoopVectorize/X86/gcc-examples.ll new file mode 100644 index 0000000..574c529 --- /dev/null +++ 
b/test/Transforms/LoopVectorize/X86/gcc-examples.ll @@ -0,0 +1,62 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -dce -instcombine -licm -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +@b = common global [2048 x i32] zeroinitializer, align 16 +@c = common global [2048 x i32] zeroinitializer, align 16 +@a = common global [2048 x i32] zeroinitializer, align 16 + +; Select VF = 8; +;CHECK: @example1 +;CHECK: load <8 x i32> +;CHECK: add nsw <8 x i32> +;CHECK: store <8 x i32> +;CHECK: ret void +define void @example1() nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = add nsw i32 %5, %3 + %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %6, i32* %7, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 256 + br i1 %exitcond, label %8, label %1 + +; <label>:8 ; preds = %1 + ret void +} + + +; Select VF=4 because sext <8 x i1> to <8 x i32> is expensive. 
+;CHECK: @example10b +;CHECK: load <4 x i16> +;CHECK: sext <4 x i16> +;CHECK: store <4 x i32> +;CHECK: ret void +define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds i16* %sb, i64 %indvars.iv + %3 = load i16* %2, align 2 + %4 = sext i16 %3 to i32 + %5 = getelementptr inbounds i32* %ia, i64 %indvars.iv + store i32 %4, i32* %5, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %6, label %1 + +; <label>:6 ; preds = %1 + ret void +} + diff --git a/test/Transforms/LoopVectorize/X86/lit.local.cfg b/test/Transforms/LoopVectorize/X86/lit.local.cfg new file mode 100644 index 0000000..a8ad0f1 --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/lit.local.cfg @@ -0,0 +1,6 @@ +config.suffixes = ['.ll', '.c', '.cpp'] + +targets = set(config.root.targets_to_build.split()) +if not 'X86' in targets: + config.unsupported = True + diff --git a/test/Transforms/LoopVectorize/cpp-new-array.ll b/test/Transforms/LoopVectorize/cpp-new-array.ll new file mode 100644 index 0000000..26902eb --- /dev/null +++ b/test/Transforms/LoopVectorize/cpp-new-array.ll @@ -0,0 +1,46 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +;CHECK: @cpp_new_arrays +;CHECK: insertelement <4 x i32> +;CHECK: load <4 x float> +;CHECK: fadd <4 x float> +;CHECK: ret i32 +define i32 @cpp_new_arrays() uwtable ssp { +entry: + %call = call 
noalias i8* @_Znwm(i64 4) + %0 = bitcast i8* %call to float* + store float 1.000000e+03, float* %0, align 4 + %call1 = call noalias i8* @_Znwm(i64 4) + %1 = bitcast i8* %call1 to float* + store float 1.000000e+03, float* %1, align 4 + %call3 = call noalias i8* @_Znwm(i64 4) + %2 = bitcast i8* %call3 to float* + store float 1.000000e+03, float* %2, align 4 + br label %for.body + +for.body: ; preds = %entry, %for.body + %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %idxprom = sext i32 %i.01 to i64 + %arrayidx = getelementptr inbounds float* %0, i64 %idxprom + %3 = load float* %arrayidx, align 4 + %idxprom5 = sext i32 %i.01 to i64 + %arrayidx6 = getelementptr inbounds float* %1, i64 %idxprom5 + %4 = load float* %arrayidx6, align 4 + %add = fadd float %3, %4 + %idxprom7 = sext i32 %i.01 to i64 + %arrayidx8 = getelementptr inbounds float* %2, i64 %idxprom7 + store float %add, float* %arrayidx8, align 4 + %inc = add nsw i32 %i.01, 1 + %cmp = icmp slt i32 %inc, 1000 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + %5 = load float* %2, align 4 + %conv10 = fptosi float %5 to i32 + ret i32 %conv10 +} + +declare noalias i8* @_Znwm(i64) diff --git a/test/Transforms/LoopVectorize/flags.ll b/test/Transforms/LoopVectorize/flags.ll new file mode 100644 index 0000000..2f22a76 --- /dev/null +++ b/test/Transforms/LoopVectorize/flags.ll @@ -0,0 +1,53 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +;CHECK: @flags1 +;CHECK: load <4 x i32> +;CHECK: mul nsw <4 x i32> +;CHECK: store <4 x i32> +;CHECK: ret i32 +define i32 @flags1(i32 %n, i32* nocapture %A) nounwind uwtable ssp { + %1 = icmp sgt i32 %n, 9 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + 
%indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 9, %0 ] + %2 = getelementptr inbounds i32* %A, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = mul nsw i32 %3, 3 + store i32 %4, i32* %2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + ret i32 undef +} + + +;CHECK: @flags2 +;CHECK: load <4 x i32> +;CHECK: mul <4 x i32> +;CHECK: store <4 x i32> +;CHECK: ret i32 +define i32 @flags2(i32 %n, i32* nocapture %A) nounwind uwtable ssp { + %1 = icmp sgt i32 %n, 9 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 9, %0 ] + %2 = getelementptr inbounds i32* %A, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = mul i32 %3, 3 + store i32 %4, i32* %2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + ret i32 undef +} diff --git a/test/Transforms/LoopVectorize/gcc-examples.ll b/test/Transforms/LoopVectorize/gcc-examples.ll new file mode 100644 index 0000000..fce29d2 --- /dev/null +++ b/test/Transforms/LoopVectorize/gcc-examples.ll @@ -0,0 +1,650 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +@b = common global [2048 x i32] zeroinitializer, align 16 +@c = common global [2048 x i32] zeroinitializer, align 16 +@a = common global [2048 x i32] zeroinitializer, align 16 +@G = common global [32 x [1024 x i32]] zeroinitializer, align 16 +@ub = common 
global [1024 x i32] zeroinitializer, align 16 +@uc = common global [1024 x i32] zeroinitializer, align 16 +@d = common global [2048 x i32] zeroinitializer, align 16 +@fa = common global [1024 x float] zeroinitializer, align 16 +@fb = common global [1024 x float] zeroinitializer, align 16 +@ic = common global [1024 x i32] zeroinitializer, align 16 +@da = common global [1024 x float] zeroinitializer, align 16 +@db = common global [1024 x float] zeroinitializer, align 16 +@dc = common global [1024 x float] zeroinitializer, align 16 +@dd = common global [1024 x float] zeroinitializer, align 16 +@dj = common global [1024 x i32] zeroinitializer, align 16 + +;CHECK: @example1 +;CHECK: load <4 x i32> +;CHECK: add nsw <4 x i32> +;CHECK: store <4 x i32> +;CHECK: ret void +define void @example1() nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = add nsw i32 %5, %3 + %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %6, i32* %7, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 256 + br i1 %exitcond, label %8, label %1 + +; <label>:8 ; preds = %1 + ret void +} + +;CHECK: @example2 +;CHECK: store <4 x i32> +;CHECK: ret void +define void @example2(i32 %n, i32 %x) nounwind uwtable ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph5, label %.preheader + +..preheader_crit_edge: ; preds = %.lr.ph5 + %phitmp = sext i32 %n to i64 + br label %.preheader + +.preheader: ; preds = %..preheader_crit_edge, %0 + %i.0.lcssa = phi i64 [ %phitmp, %..preheader_crit_edge ], [ 0, %0 ] + %2 = icmp eq i32 %n, 0 + br i1 %2, label %._crit_edge, label %.lr.ph + +.lr.ph5: ; preds = %0, %.lr.ph5 
+ %indvars.iv6 = phi i64 [ %indvars.iv.next7, %.lr.ph5 ], [ 0, %0 ] + %3 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv6 + store i32 %x, i32* %3, align 4 + %indvars.iv.next7 = add i64 %indvars.iv6, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next7 to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %..preheader_crit_edge, label %.lr.ph5 + +.lr.ph: ; preds = %.preheader, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ %i.0.lcssa, %.preheader ] + %.02 = phi i32 [ %4, %.lr.ph ], [ %n, %.preheader ] + %4 = add nsw i32 %.02, -1 + %5 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv + %6 = load i32* %5, align 4 + %7 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv + %8 = load i32* %7, align 4 + %9 = and i32 %8, %6 + %10 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %9, i32* %10, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %11 = icmp eq i32 %4, 0 + br i1 %11, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %.preheader + ret void +} + +; We can't vectorize this loop because it has non constant loop bounds. 
+;CHECK: @example3 +;CHECK-NOT: <4 x i32> +;CHECK: ret void +define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) nounwind uwtable ssp { + %1 = icmp eq i32 %n, 0 + br i1 %1, label %._crit_edge, label %.lr.ph + +.lr.ph: ; preds = %0, %.lr.ph + %.05 = phi i32 [ %2, %.lr.ph ], [ %n, %0 ] + %.014 = phi i32* [ %5, %.lr.ph ], [ %p, %0 ] + %.023 = phi i32* [ %3, %.lr.ph ], [ %q, %0 ] + %2 = add nsw i32 %.05, -1 + %3 = getelementptr inbounds i32* %.023, i64 1 + %4 = load i32* %.023, align 16 + %5 = getelementptr inbounds i32* %.014, i64 1 + store i32 %4, i32* %.014, align 16 + %6 = icmp eq i32 %2, 0 + br i1 %6, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + ret void +} + +;CHECK: @example4 +;CHECK: load <4 x i32> +;CHECK: ret void +define void @example4(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) nounwind uwtable ssp { + %1 = add nsw i32 %n, -1 + %2 = icmp eq i32 %n, 0 + br i1 %2, label %.preheader4, label %.lr.ph10 + +.preheader4: ; preds = %0 + %3 = icmp sgt i32 %1, 0 + br i1 %3, label %.lr.ph6, label %._crit_edge + +.lr.ph10: ; preds = %0, %.lr.ph10 + %4 = phi i32 [ %9, %.lr.ph10 ], [ %1, %0 ] + %.018 = phi i32* [ %8, %.lr.ph10 ], [ %p, %0 ] + %.027 = phi i32* [ %5, %.lr.ph10 ], [ %q, %0 ] + %5 = getelementptr inbounds i32* %.027, i64 1 + %6 = load i32* %.027, align 16 + %7 = add nsw i32 %6, 5 + %8 = getelementptr inbounds i32* %.018, i64 1 + store i32 %7, i32* %.018, align 16 + %9 = add nsw i32 %4, -1 + %10 = icmp eq i32 %4, 0 + br i1 %10, label %._crit_edge, label %.lr.ph10 + +.preheader: ; preds = %.lr.ph6 + br i1 %3, label %.lr.ph, label %._crit_edge + +.lr.ph6: ; preds = %.preheader4, %.lr.ph6 + %indvars.iv11 = phi i64 [ %indvars.iv.next12, %.lr.ph6 ], [ 0, %.preheader4 ] + %indvars.iv.next12 = add i64 %indvars.iv11, 1 + %11 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv.next12 + %12 = load i32* %11, align 4 + %13 = add nsw i64 %indvars.iv11, 3 + %14 = getelementptr 
inbounds [2048 x i32]* @c, i64 0, i64 %13 + %15 = load i32* %14, align 4 + %16 = add nsw i32 %15, %12 + %17 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv11 + store i32 %16, i32* %17, align 4 + %lftr.wideiv13 = trunc i64 %indvars.iv.next12 to i32 + %exitcond14 = icmp eq i32 %lftr.wideiv13, %1 + br i1 %exitcond14, label %.preheader, label %.lr.ph6 + +.lr.ph: ; preds = %.preheader, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.preheader ] + %18 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv + %19 = load i32* %18, align 4 + %20 = icmp sgt i32 %19, 4 + %21 = select i1 %20, i32 4, i32 0 + %22 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv + store i32 %21, i32* %22, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %1 + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph10, %.preheader4, %.lr.ph, %.preheader + ret void +} + +;CHECK: @example8 +;CHECK: store <4 x i32> +;CHECK: ret void +define void @example8(i32 %x) nounwind uwtable ssp { + br label %.preheader + +.preheader: ; preds = %3, %0 + %indvars.iv3 = phi i64 [ 0, %0 ], [ %indvars.iv.next4, %3 ] + br label %1 + +; <label>:1 ; preds = %1, %.preheader + %indvars.iv = phi i64 [ 0, %.preheader ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds [32 x [1024 x i32]]* @G, i64 0, i64 %indvars.iv3, i64 %indvars.iv + store i32 %x, i32* %2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %3, label %1 + +; <label>:3 ; preds = %1 + %indvars.iv.next4 = add i64 %indvars.iv3, 1 + %lftr.wideiv5 = trunc i64 %indvars.iv.next4 to i32 + %exitcond6 = icmp eq i32 %lftr.wideiv5, 32 + br i1 %exitcond6, label %4, label %.preheader + +; <label>:4 ; preds = %3 + ret void +} + +;CHECK: @example9 +;CHECK: phi <4 
x i32> +;CHECK: ret i32 +define i32 @example9() nounwind uwtable readonly ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %diff.01 = phi i32 [ 0, %0 ], [ %7, %1 ] + %2 = getelementptr inbounds [1024 x i32]* @ub, i64 0, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds [1024 x i32]* @uc, i64 0, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = add i32 %3, %diff.01 + %7 = sub i32 %6, %5 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %8, label %1 + +; <label>:8 ; preds = %1 + ret i32 %7 +} + +;CHECK: @example10a +;CHECK: load <4 x i32> +;CHECK: add nsw <4 x i32> +;CHECK: load <4 x i16> +;CHECK: add <4 x i16> +;CHECK: store <4 x i16> +;CHECK: ret void +define void @example10a(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds i32* %ib, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds i32* %ic, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = add nsw i32 %5, %3 + %7 = getelementptr inbounds i32* %ia, i64 %indvars.iv + store i32 %6, i32* %7, align 4 + %8 = getelementptr inbounds i16* %sb, i64 %indvars.iv + %9 = load i16* %8, align 2 + %10 = getelementptr inbounds i16* %sc, i64 %indvars.iv + %11 = load i16* %10, align 2 + %12 = add i16 %11, %9 + %13 = getelementptr inbounds i16* %sa, i64 %indvars.iv + store i16 %12, i16* %13, align 2 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %14, label %1 + +; <label>:14 ; preds = %1 + ret void +} + +;CHECK: 
@example10b +;CHECK: load <4 x i16> +;CHECK: sext <4 x i16> +;CHECK: store <4 x i32> +;CHECK: ret void +define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds i16* %sb, i64 %indvars.iv + %3 = load i16* %2, align 2 + %4 = sext i16 %3 to i32 + %5 = getelementptr inbounds i32* %ia, i64 %indvars.iv + store i32 %4, i32* %5, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %6, label %1 + +; <label>:6 ; preds = %1 + ret void +} + +;CHECK: @example11 +;CHECK: load i32 +;CHECK: load i32 +;CHECK: load i32 +;CHECK: load i32 +;CHECK: insertelement +;CHECK: insertelement +;CHECK: insertelement +;CHECK: insertelement +;CHECK: ret void +define void @example11() nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = shl nsw i64 %indvars.iv, 1 + %3 = or i64 %2, 1 + %4 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %3 + %5 = load i32* %4, align 4 + %6 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %3 + %7 = load i32* %6, align 4 + %8 = mul nsw i32 %7, %5 + %9 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %2 + %10 = load i32* %9, align 8 + %11 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %2 + %12 = load i32* %11, align 8 + %13 = mul nsw i32 %12, %10 + %14 = sub nsw i32 %8, %13 + %15 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %14, i32* %15, align 4 + %16 = mul nsw i32 %7, %10 + %17 = mul nsw i32 %12, %5 + %18 = add nsw i32 %17, %16 + %19 = getelementptr inbounds [2048 x i32]* @d, i64 0, i64 %indvars.iv + store i32 %18, i32* %19, 
align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 512 + br i1 %exitcond, label %20, label %1 + +; <label>:20 ; preds = %1 + ret void +} + +;CHECK: @example12 +;CHECK: trunc <4 x i64> +;CHECK: store <4 x i32> +;CHECK: ret void +define void @example12() nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv + %3 = trunc i64 %indvars.iv to i32 + store i32 %3, i32* %2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %4, label %1 + +; <label>:4 ; preds = %1 + ret void +} + +; Can't vectorize because of reductions. +;CHECK: @example13 +;CHECK-NOT: <4 x i32> +;CHECK: ret void +define void @example13(i32** nocapture %A, i32** nocapture %B, i32* nocapture %out) nounwind uwtable ssp { + br label %.preheader + +.preheader: ; preds = %14, %0 + %indvars.iv4 = phi i64 [ 0, %0 ], [ %indvars.iv.next5, %14 ] + %1 = getelementptr inbounds i32** %A, i64 %indvars.iv4 + %2 = load i32** %1, align 8 + %3 = getelementptr inbounds i32** %B, i64 %indvars.iv4 + %4 = load i32** %3, align 8 + br label %5 + +; <label>:5 ; preds = %.preheader, %5 + %indvars.iv = phi i64 [ 0, %.preheader ], [ %indvars.iv.next, %5 ] + %diff.02 = phi i32 [ 0, %.preheader ], [ %11, %5 ] + %6 = getelementptr inbounds i32* %2, i64 %indvars.iv + %7 = load i32* %6, align 4 + %8 = getelementptr inbounds i32* %4, i64 %indvars.iv + %9 = load i32* %8, align 4 + %10 = add i32 %7, %diff.02 + %11 = sub i32 %10, %9 + %indvars.iv.next = add i64 %indvars.iv, 8 + %12 = trunc i64 %indvars.iv.next to i32 + %13 = icmp slt i32 %12, 1024 + br i1 %13, label %5, label %14 + +; <label>:14 ; preds = %5 + %15 = getelementptr inbounds i32* %out, i64 %indvars.iv4 + store i32 %11, i32* 
%15, align 4 + %indvars.iv.next5 = add i64 %indvars.iv4, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next5 to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 32 + br i1 %exitcond, label %16, label %.preheader + +; <label>:16 ; preds = %14 + ret void +} + +; Can't vectorize because of reductions. +;CHECK: @example14 +;CHECK-NOT: <4 x i32> +;CHECK: ret void +define void @example14(i32** nocapture %in, i32** nocapture %coeff, i32* nocapture %out) nounwind uwtable ssp { +.preheader3: + br label %.preheader + +.preheader: ; preds = %11, %.preheader3 + %indvars.iv7 = phi i64 [ 0, %.preheader3 ], [ %indvars.iv.next8, %11 ] + %sum.05 = phi i32 [ 0, %.preheader3 ], [ %10, %11 ] + br label %0 + +; <label>:0 ; preds = %0, %.preheader + %indvars.iv = phi i64 [ 0, %.preheader ], [ %indvars.iv.next, %0 ] + %sum.12 = phi i32 [ %sum.05, %.preheader ], [ %10, %0 ] + %1 = getelementptr inbounds i32** %in, i64 %indvars.iv + %2 = load i32** %1, align 8 + %3 = getelementptr inbounds i32* %2, i64 %indvars.iv7 + %4 = load i32* %3, align 4 + %5 = getelementptr inbounds i32** %coeff, i64 %indvars.iv + %6 = load i32** %5, align 8 + %7 = getelementptr inbounds i32* %6, i64 %indvars.iv7 + %8 = load i32* %7, align 4 + %9 = mul nsw i32 %8, %4 + %10 = add nsw i32 %9, %sum.12 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %11, label %0 + +; <label>:11 ; preds = %0 + %indvars.iv.next8 = add i64 %indvars.iv7, 1 + %lftr.wideiv9 = trunc i64 %indvars.iv.next8 to i32 + %exitcond10 = icmp eq i32 %lftr.wideiv9, 32 + br i1 %exitcond10, label %.preheader3.1, label %.preheader + +.preheader3.1: ; preds = %11 + store i32 %10, i32* %out, align 4 + br label %.preheader.1 + +.preheader.1: ; preds = %24, %.preheader3.1 + %indvars.iv7.1 = phi i64 [ 0, %.preheader3.1 ], [ %indvars.iv.next8.1, %24 ] + %sum.05.1 = phi i32 [ 0, %.preheader3.1 ], [ %23, %24 ] + br label %12 + +; <label>:12 ; preds = %12, 
%.preheader.1 + %indvars.iv.1 = phi i64 [ 0, %.preheader.1 ], [ %13, %12 ] + %sum.12.1 = phi i32 [ %sum.05.1, %.preheader.1 ], [ %23, %12 ] + %13 = add nsw i64 %indvars.iv.1, 1 + %14 = getelementptr inbounds i32** %in, i64 %13 + %15 = load i32** %14, align 8 + %16 = getelementptr inbounds i32* %15, i64 %indvars.iv7.1 + %17 = load i32* %16, align 4 + %18 = getelementptr inbounds i32** %coeff, i64 %indvars.iv.1 + %19 = load i32** %18, align 8 + %20 = getelementptr inbounds i32* %19, i64 %indvars.iv7.1 + %21 = load i32* %20, align 4 + %22 = mul nsw i32 %21, %17 + %23 = add nsw i32 %22, %sum.12.1 + %lftr.wideiv.1 = trunc i64 %13 to i32 + %exitcond.1 = icmp eq i32 %lftr.wideiv.1, 1024 + br i1 %exitcond.1, label %24, label %12 + +; <label>:24 ; preds = %12 + %indvars.iv.next8.1 = add i64 %indvars.iv7.1, 1 + %lftr.wideiv9.1 = trunc i64 %indvars.iv.next8.1 to i32 + %exitcond10.1 = icmp eq i32 %lftr.wideiv9.1, 32 + br i1 %exitcond10.1, label %.preheader3.2, label %.preheader.1 + +.preheader3.2: ; preds = %24 + %25 = getelementptr inbounds i32* %out, i64 1 + store i32 %23, i32* %25, align 4 + br label %.preheader.2 + +.preheader.2: ; preds = %38, %.preheader3.2 + %indvars.iv7.2 = phi i64 [ 0, %.preheader3.2 ], [ %indvars.iv.next8.2, %38 ] + %sum.05.2 = phi i32 [ 0, %.preheader3.2 ], [ %37, %38 ] + br label %26 + +; <label>:26 ; preds = %26, %.preheader.2 + %indvars.iv.2 = phi i64 [ 0, %.preheader.2 ], [ %indvars.iv.next.2, %26 ] + %sum.12.2 = phi i32 [ %sum.05.2, %.preheader.2 ], [ %37, %26 ] + %27 = add nsw i64 %indvars.iv.2, 2 + %28 = getelementptr inbounds i32** %in, i64 %27 + %29 = load i32** %28, align 8 + %30 = getelementptr inbounds i32* %29, i64 %indvars.iv7.2 + %31 = load i32* %30, align 4 + %32 = getelementptr inbounds i32** %coeff, i64 %indvars.iv.2 + %33 = load i32** %32, align 8 + %34 = getelementptr inbounds i32* %33, i64 %indvars.iv7.2 + %35 = load i32* %34, align 4 + %36 = mul nsw i32 %35, %31 + %37 = add nsw i32 %36, %sum.12.2 + %indvars.iv.next.2 = add i64 
%indvars.iv.2, 1 + %lftr.wideiv.2 = trunc i64 %indvars.iv.next.2 to i32 + %exitcond.2 = icmp eq i32 %lftr.wideiv.2, 1024 + br i1 %exitcond.2, label %38, label %26 + +; <label>:38 ; preds = %26 + %indvars.iv.next8.2 = add i64 %indvars.iv7.2, 1 + %lftr.wideiv9.2 = trunc i64 %indvars.iv.next8.2 to i32 + %exitcond10.2 = icmp eq i32 %lftr.wideiv9.2, 32 + br i1 %exitcond10.2, label %.preheader3.3, label %.preheader.2 + +.preheader3.3: ; preds = %38 + %39 = getelementptr inbounds i32* %out, i64 2 + store i32 %37, i32* %39, align 4 + br label %.preheader.3 + +.preheader.3: ; preds = %52, %.preheader3.3 + %indvars.iv7.3 = phi i64 [ 0, %.preheader3.3 ], [ %indvars.iv.next8.3, %52 ] + %sum.05.3 = phi i32 [ 0, %.preheader3.3 ], [ %51, %52 ] + br label %40 + +; <label>:40 ; preds = %40, %.preheader.3 + %indvars.iv.3 = phi i64 [ 0, %.preheader.3 ], [ %indvars.iv.next.3, %40 ] + %sum.12.3 = phi i32 [ %sum.05.3, %.preheader.3 ], [ %51, %40 ] + %41 = add nsw i64 %indvars.iv.3, 3 + %42 = getelementptr inbounds i32** %in, i64 %41 + %43 = load i32** %42, align 8 + %44 = getelementptr inbounds i32* %43, i64 %indvars.iv7.3 + %45 = load i32* %44, align 4 + %46 = getelementptr inbounds i32** %coeff, i64 %indvars.iv.3 + %47 = load i32** %46, align 8 + %48 = getelementptr inbounds i32* %47, i64 %indvars.iv7.3 + %49 = load i32* %48, align 4 + %50 = mul nsw i32 %49, %45 + %51 = add nsw i32 %50, %sum.12.3 + %indvars.iv.next.3 = add i64 %indvars.iv.3, 1 + %lftr.wideiv.3 = trunc i64 %indvars.iv.next.3 to i32 + %exitcond.3 = icmp eq i32 %lftr.wideiv.3, 1024 + br i1 %exitcond.3, label %52, label %40 + +; <label>:52 ; preds = %40 + %indvars.iv.next8.3 = add i64 %indvars.iv7.3, 1 + %lftr.wideiv9.3 = trunc i64 %indvars.iv.next8.3 to i32 + %exitcond10.3 = icmp eq i32 %lftr.wideiv9.3, 32 + br i1 %exitcond10.3, label %53, label %.preheader.3 + +; <label>:53 ; preds = %52 + %54 = getelementptr inbounds i32* %out, i64 3 + store i32 %51, i32* %54, align 4 + ret void +} + +; Can't vectorize because the src 
and dst pointers are not disjoint. +;CHECK: @example21 +;CHECK-NOT: <4 x i32> +;CHECK: ret i32 +define i32 @example21(i32* nocapture %b, i32 %n) nounwind uwtable readonly ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0 + %2 = sext i32 %n to i64 + br label %3 + +; <label>:3 ; preds = %.lr.ph, %3 + %indvars.iv = phi i64 [ %2, %.lr.ph ], [ %indvars.iv.next, %3 ] + %a.02 = phi i32 [ 0, %.lr.ph ], [ %6, %3 ] + %indvars.iv.next = add i64 %indvars.iv, -1 + %4 = getelementptr inbounds i32* %b, i64 %indvars.iv.next + %5 = load i32* %4, align 4 + %6 = add nsw i32 %5, %a.02 + %7 = trunc i64 %indvars.iv.next to i32 + %8 = icmp sgt i32 %7, 0 + br i1 %8, label %3, label %._crit_edge + +._crit_edge: ; preds = %3, %0 + %a.0.lcssa = phi i32 [ 0, %0 ], [ %6, %3 ] + ret i32 %a.0.lcssa +} + +; Can't vectorize because there are multiple PHIs. +;CHECK: @example23 +;CHECK-NOT: <4 x i32> +;CHECK: ret void +define void @example23(i16* nocapture %src, i32* nocapture %dst) nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %.04 = phi i16* [ %src, %0 ], [ %2, %1 ] + %.013 = phi i32* [ %dst, %0 ], [ %6, %1 ] + %i.02 = phi i32 [ 0, %0 ], [ %7, %1 ] + %2 = getelementptr inbounds i16* %.04, i64 1 + %3 = load i16* %.04, align 2 + %4 = zext i16 %3 to i32 + %5 = shl nuw nsw i32 %4, 7 + %6 = getelementptr inbounds i32* %.013, i64 1 + store i32 %5, i32* %.013, align 4 + %7 = add nsw i32 %i.02, 1 + %exitcond = icmp eq i32 %7, 256 + br i1 %exitcond, label %8, label %1 + +; <label>:8 ; preds = %1 + ret void +} + +;CHECK: @example24 +;CHECK: shufflevector <4 x i16> +;CHECK: ret void +define void @example24(i16 signext %x, i16 signext %y) nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds [1024 x float]* @fa, i64 0, i64 %indvars.iv + %3 = load float* %2, align 4 + %4 = getelementptr inbounds [1024 x float]* @fb, i64 0, i64 
%indvars.iv + %5 = load float* %4, align 4 + %6 = fcmp olt float %3, %5 + %x.y = select i1 %6, i16 %x, i16 %y + %7 = sext i16 %x.y to i32 + %8 = getelementptr inbounds [1024 x i32]* @ic, i64 0, i64 %indvars.iv + store i32 %7, i32* %8, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %9, label %1 + +; <label>:9 ; preds = %1 + ret void +} + +;CHECK: @example25 +;CHECK: and <4 x i1> +;CHECK: zext <4 x i1> +;CHECK: ret void +define void @example25() nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds [1024 x float]* @da, i64 0, i64 %indvars.iv + %3 = load float* %2, align 4 + %4 = getelementptr inbounds [1024 x float]* @db, i64 0, i64 %indvars.iv + %5 = load float* %4, align 4 + %6 = fcmp olt float %3, %5 + %7 = getelementptr inbounds [1024 x float]* @dc, i64 0, i64 %indvars.iv + %8 = load float* %7, align 4 + %9 = getelementptr inbounds [1024 x float]* @dd, i64 0, i64 %indvars.iv + %10 = load float* %9, align 4 + %11 = fcmp olt float %8, %10 + %12 = and i1 %6, %11 + %13 = zext i1 %12 to i32 + %14 = getelementptr inbounds [1024 x i32]* @dj, i64 0, i64 %indvars.iv + store i32 %13, i32* %14, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %15, label %1 + +; <label>:15 ; preds = %1 + ret void +} + diff --git a/test/Transforms/LoopVectorize/increment.ll b/test/Transforms/LoopVectorize/increment.ll new file mode 100644 index 0000000..71ea768 --- /dev/null +++ b/test/Transforms/LoopVectorize/increment.ll @@ -0,0 +1,66 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s + +target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +@a = common global [2048 x i32] zeroinitializer, align 16 + +; This is the loop. +; for (i=0; i<n; i++){ +; a[i] += i; +; } +;CHECK: @inc +;CHECK: load <4 x i32> +;CHECK: add nsw <4 x i32> +;CHECK: store <4 x i32> +;CHECK: ret void +define void @inc(i32 %n) nounwind uwtable noinline ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %2 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = trunc i64 %indvars.iv to i32 + %5 = add nsw i32 %3, %4 + store i32 %5, i32* %2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + ret void +} + +; Can't vectorize this loop because the access to A[X] is non linear. 
+; +; for (i = 0; i < n; ++i) { +; A[B[i]]++; +; +;CHECK: @histogram +;CHECK-NOT: <4 x i32> +;CHECK: ret i32 +define i32 @histogram(i32* nocapture noalias %A, i32* nocapture noalias %B, i32 %n) nounwind uwtable ssp { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32* %B, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4 + %idxprom1 = sext i32 %0 to i64 + %arrayidx2 = getelementptr inbounds i32* %A, i64 %idxprom1 + %1 = load i32* %arrayidx2, align 4 + %inc = add nsw i32 %1, 1 + store i32 %inc, i32* %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret i32 0 +} diff --git a/test/Transforms/LoopVectorize/induction_plus.ll b/test/Transforms/LoopVectorize/induction_plus.ll new file mode 100644 index 0000000..b31bceb --- /dev/null +++ b/test/Transforms/LoopVectorize/induction_plus.ll @@ -0,0 +1,30 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -instcombine -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +@array = common global [1024 x i32] zeroinitializer, align 16 + +;CHECK: @array_at_plus_one +;CHECK: add <4 x i64> +;CHECK: trunc <4 x i64> +;CHECK: add i64 %index, 12 +;CHECK: ret i32 +define i32 @array_at_plus_one(i32 %n) nounwind uwtable ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %2 = add nsw i64 %indvars.iv, 12 + %3 = getelementptr inbounds [1024 x i32]* 
@array, i64 0, i64 %2 + %4 = trunc i64 %indvars.iv to i32 + store i32 %4, i32* %3, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + ret i32 undef +} diff --git a/test/Transforms/LoopVectorize/lit.local.cfg b/test/Transforms/LoopVectorize/lit.local.cfg new file mode 100644 index 0000000..19eebc0 --- /dev/null +++ b/test/Transforms/LoopVectorize/lit.local.cfg @@ -0,0 +1 @@ +config.suffixes = ['.ll', '.c', '.cpp'] diff --git a/test/Transforms/LoopVectorize/non-const-n.ll b/test/Transforms/LoopVectorize/non-const-n.ll new file mode 100644 index 0000000..1a6c15e --- /dev/null +++ b/test/Transforms/LoopVectorize/non-const-n.ll @@ -0,0 +1,38 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +@b = common global [2048 x i32] zeroinitializer, align 16 +@c = common global [2048 x i32] zeroinitializer, align 16 +@a = common global [2048 x i32] zeroinitializer, align 16 + +;CHECK: @example1 +;CHECK: shl i32 +;CHECK: zext i32 +;CHECK: load <4 x i32> +;CHECK: add nsw <4 x i32> +;CHECK: store <4 x i32> +;CHECK: ret void +define void @example1(i32 %n) nounwind uwtable ssp { + %n4 = shl i32 %n, 2 + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = add nsw i32 %5, %3 + %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %6, i32* %7, align 4 + 
%indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n4 + br i1 %exitcond, label %8, label %1 + +; <label>:8 ; preds = %1 + ret void +} + diff --git a/test/Transforms/LoopVectorize/read-only.ll b/test/Transforms/LoopVectorize/read-only.ll new file mode 100644 index 0000000..b4d1bac --- /dev/null +++ b/test/Transforms/LoopVectorize/read-only.ll @@ -0,0 +1,32 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +;CHECK: @read_only_func +;CHECK: load <4 x i32> +;CHECK: ret i32 +define i32 @read_only_func(i32* nocapture %A, i32* nocapture %B, i32 %n) nounwind uwtable readonly ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ] + %2 = getelementptr inbounds i32* %A, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = add nsw i64 %indvars.iv, 13 + %5 = getelementptr inbounds i32* %B, i64 %4 + %6 = load i32* %5, align 4 + %7 = shl i32 %6, 1 + %8 = add i32 %3, %sum.02 + %9 = add i32 %8, %7 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ] + ret i32 %sum.0.lcssa +} diff --git a/test/Transforms/LoopVectorize/reduction.ll b/test/Transforms/LoopVectorize/reduction.ll new file mode 100644 index 0000000..c1848b3 --- /dev/null +++ b/test/Transforms/LoopVectorize/reduction.ll @@ -0,0 +1,232 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 
-dce -instcombine -licm -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +;CHECK: @reduction_sum +;CHECK: phi <4 x i32> +;CHECK: load <4 x i32> +;CHECK: add <4 x i32> +;CHECK: ret i32 +define i32 @reduction_sum(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ] + %2 = getelementptr inbounds i32* %A, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds i32* %B, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = trunc i64 %indvars.iv to i32 + %7 = add i32 %sum.02, %6 + %8 = add i32 %7, %3 + %9 = add i32 %8, %5 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ] + ret i32 %sum.0.lcssa +} + +;CHECK: @reduction_prod +;CHECK: phi <4 x i32> +;CHECK: load <4 x i32> +;CHECK: mul <4 x i32> +;CHECK: ret i32 +define i32 @reduction_prod(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %prod.02 = phi i32 [ %9, %.lr.ph ], [ 1, %0 ] + %2 = getelementptr inbounds i32* %A, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds i32* %B, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = trunc i64 %indvars.iv to i32 + %7 = mul i32 %prod.02, %6 + %8 
= mul i32 %7, %3 + %9 = mul i32 %8, %5 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + %prod.0.lcssa = phi i32 [ 1, %0 ], [ %9, %.lr.ph ] + ret i32 %prod.0.lcssa +} + +;CHECK: @reduction_mix +;CHECK: phi <4 x i32> +;CHECK: load <4 x i32> +;CHECK: mul nsw <4 x i32> +;CHECK: ret i32 +define i32 @reduction_mix(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ] + %2 = getelementptr inbounds i32* %A, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds i32* %B, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = mul nsw i32 %5, %3 + %7 = trunc i64 %indvars.iv to i32 + %8 = add i32 %sum.02, %7 + %9 = add i32 %8, %6 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ] + ret i32 %sum.0.lcssa +} + +;CHECK: @reduction_mul +;CHECK: mul <4 x i32> +;CHECK: ret i32 +define i32 @reduction_mul(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %sum.02 = phi i32 [ %9, %.lr.ph ], [ 19, %0 ] + %2 = getelementptr inbounds i32* %A, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds i32* %B, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = trunc i64 
%indvars.iv to i32 + %7 = add i32 %3, %6 + %8 = add i32 %7, %5 + %9 = mul i32 %8, %sum.02 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ] + ret i32 %sum.0.lcssa +} + +;CHECK: @start_at_non_zero +;CHECK: phi <4 x i32> +;CHECK: <i32 120, i32 0, i32 0, i32 0> +;CHECK: ret i32 +define i32 @start_at_non_zero(i32* nocapture %in, i32* nocapture %coeff, i32* nocapture %out, i32 %n) nounwind uwtable readonly ssp { +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %sum.09 = phi i32 [ %add, %for.body ], [ 120, %entry ] + %arrayidx = getelementptr inbounds i32* %in, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32* %coeff, i64 %indvars.iv + %1 = load i32* %arrayidx2, align 4 + %mul = mul nsw i32 %1, %0 + %add = add nsw i32 %mul, %sum.09 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %sum.0.lcssa = phi i32 [ 120, %entry ], [ %add, %for.body ] + ret i32 %sum.0.lcssa +} + +;CHECK: @reduction_and +;CHECK: and <4 x i32> +;CHECK: <i32 -1, i32 -1, i32 -1, i32 -1> +;CHECK: ret i32 +define i32 @reduction_and(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly { +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %result.08 = phi i32 [ %and, %for.body ], [ -1, %entry ] + %arrayidx = getelementptr inbounds i32* %A, i64 
%indvars.iv + %0 = load i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32* %B, i64 %indvars.iv + %1 = load i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %and = and i32 %add, %result.08 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ -1, %entry ], [ %and, %for.body ] + ret i32 %result.0.lcssa +} + +;CHECK: @reduction_or +;CHECK: or <4 x i32> +;CHECK: ret i32 +define i32 @reduction_or(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly { +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %result.08 = phi i32 [ %or, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32* %B, i64 %indvars.iv + %1 = load i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %or = or i32 %add, %result.08 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %or, %for.body ] + ret i32 %result.0.lcssa +} + +;CHECK: @reduction_xor +;CHECK: xor <4 x i32> +;CHECK: ret i32 +define i32 @reduction_xor(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly { +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %result.08 = phi i32 [ %xor, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32* %A, i64 
%indvars.iv + %0 = load i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32* %B, i64 %indvars.iv + %1 = load i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %xor = xor i32 %add, %result.08 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %xor, %for.body ] + ret i32 %result.0.lcssa +} diff --git a/test/Transforms/LoopVectorize/runtime-check.ll b/test/Transforms/LoopVectorize/runtime-check.ll new file mode 100644 index 0000000..23933cf --- /dev/null +++ b/test/Transforms/LoopVectorize/runtime-check.ll @@ -0,0 +1,36 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.9.0" + +; Make sure we vectorize this loop: +; int foo(float *a, float *b, int n) { +; for (int i=0; i<n; ++i) +; a[i] = b[i] * 3; +; } + +;CHECK: load <4 x float> +define i32 @foo(float* nocapture %a, float* nocapture %b, i32 %n) nounwind uwtable ssp { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float* %b, i64 %indvars.iv + %0 = load float* %arrayidx, align 4, !tbaa !0 + %mul = fmul float %0, 3.000000e+00 + %arrayidx2 = getelementptr inbounds float* %a, i64 %indvars.iv + store float %mul, float* %arrayidx2, align 4, !tbaa !0 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; 
preds = %for.body, %entry + ret i32 undef +} + +!0 = metadata !{metadata !"float", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} diff --git a/test/Transforms/LoopVectorize/scalar-select.ll b/test/Transforms/LoopVectorize/scalar-select.ll new file mode 100644 index 0000000..e537bde --- /dev/null +++ b/test/Transforms/LoopVectorize/scalar-select.ll @@ -0,0 +1,37 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +@a = common global [2048 x i32] zeroinitializer, align 16 +@b = common global [2048 x i32] zeroinitializer, align 16 +@c = common global [2048 x i32] zeroinitializer, align 16 + +;CHECK: @example1 +;CHECK: load <4 x i32> +; make sure that we have a scalar condition and a vector operand. 
+;CHECK: select i1 %cond, <4 x i32> +;CHECK: store <4 x i32> +;CHECK: ret void +define void @example1(i1 %cond) nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = add nsw i32 %5, %3 + %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv + %sel = select i1 %cond, i32 %6, i32 zeroinitializer + store i32 %sel, i32* %7, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 256 + br i1 %exitcond, label %8, label %1 + +; <label>:8 ; preds = %1 + ret void +} + diff --git a/test/Transforms/LoopVectorize/small-loop.ll b/test/Transforms/LoopVectorize/small-loop.ll new file mode 100644 index 0000000..4a6e4b2 --- /dev/null +++ b/test/Transforms/LoopVectorize/small-loop.ll @@ -0,0 +1,33 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +@a = common global [2048 x i32] zeroinitializer, align 16 +@b = common global [2048 x i32] zeroinitializer, align 16 +@c = common global [2048 x i32] zeroinitializer, align 16 + +;CHECK: @example1 +;CHECK-NOT: load <4 x i32> +;CHECK: ret void +define void @example1() nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = add 
nsw i32 %5, %3 + %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %6, i32* %7, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 8 ; <----- A really small trip count. + br i1 %exitcond, label %8, label %1 + +; <label>:8 ; preds = %1 + ret void +} + diff --git a/test/Transforms/LoopVectorize/start-non-zero.ll b/test/Transforms/LoopVectorize/start-non-zero.ll new file mode 100644 index 0000000..5aa3bc0 --- /dev/null +++ b/test/Transforms/LoopVectorize/start-non-zero.ll @@ -0,0 +1,35 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -instcombine -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +;CHECK: @start_at_nonzero +;CHECK: mul nuw <4 x i32> +;CHECK: ret i32 +define i32 @start_at_nonzero(i32* nocapture %a, i32 %start, i32 %end) nounwind uwtable ssp { +entry: + %cmp3 = icmp slt i32 %start, %end + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + %0 = sext i32 %start to i64 + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %indvars.iv = phi i64 [ %0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv + %1 = load i32* %arrayidx, align 4, !tbaa !0 + %mul = mul nuw i32 %1, 333 + store i32 %mul, i32* %arrayidx, align 4, !tbaa !0 + %indvars.iv.next = add i64 %indvars.iv, 1 + %2 = trunc i64 %indvars.iv.next to i32 + %cmp = icmp slt i32 %2, %end + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret i32 4 +} + +!0 = metadata !{metadata !"int", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} diff --git 
a/test/Transforms/LoopVectorize/write-only.ll b/test/Transforms/LoopVectorize/write-only.ll new file mode 100644 index 0000000..eb02760 --- /dev/null +++ b/test/Transforms/LoopVectorize/write-only.ll @@ -0,0 +1,26 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +;CHECK: @read_mod_write_single_ptr +;CHECK: load <4 x float> +;CHECK: ret i32 +define i32 @read_mod_write_single_ptr(float* nocapture %a, i32 %n) nounwind uwtable ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %2 = getelementptr inbounds float* %a, i64 %indvars.iv + %3 = load float* %2, align 4 + %4 = fmul float %3, 3.000000e+00 + store float %4, float* %2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + ret i32 undef +} diff --git a/test/Transforms/MemCpyOpt/2008-03-13-ReturnSlotBitcast.ll b/test/Transforms/MemCpyOpt/2008-03-13-ReturnSlotBitcast.ll index 61ba3c7..597b69d 100644 --- a/test/Transforms/MemCpyOpt/2008-03-13-ReturnSlotBitcast.ll +++ b/test/Transforms/MemCpyOpt/2008-03-13-ReturnSlotBitcast.ll @@ -9,11 +9,11 @@ declare void @g(%a*) define float @f() { entry: %a_var = alloca %a - %b_var = alloca %b + %b_var = alloca %b, align 1 call void @g(%a* %a_var) %a_i8 = bitcast %a* %a_var to i8* %b_i8 = bitcast %b* %b_var to i8* - call void @llvm.memcpy.p0i8.p0i8.i32(i8* %b_i8, i8* %a_i8, i32 4, i32 4, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %b_i8, i8* %a_i8, i32 4, i32 1, i1 false) %tmp1 = getelementptr %b* 
%b_var, i32 0, i32 0 %tmp2 = load float* %tmp1 ret float %tmp2 diff --git a/test/Transforms/MemCpyOpt/align.ll b/test/Transforms/MemCpyOpt/align.ll index b1f900d..1b98f6a 100644 --- a/test/Transforms/MemCpyOpt/align.ll +++ b/test/Transforms/MemCpyOpt/align.ll @@ -1,12 +1,15 @@ -; RUN: opt < %s -S -memcpyopt | FileCheck %s +; RUN: opt < %s -S -basicaa -memcpyopt | FileCheck %s target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64" +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind + ; The resulting memset is only 4-byte aligned, despite containing ; a 16-byte aligned store in the middle. -; CHECK: call void @llvm.memset.p0i8.i64(i8* {{.*}}, i8 0, i64 16, i32 4, i1 false) - define void @foo(i32* %p) { +; CHECK: @foo +; CHECK: call void @llvm.memset.p0i8.i64(i8* {{.*}}, i8 0, i64 16, i32 4, i1 false) %a0 = getelementptr i32* %p, i64 0 store i32 0, i32* %a0, align 4 %a1 = getelementptr i32* %p, i64 1 @@ -17,3 +20,18 @@ define void @foo(i32* %p) { store i32 0, i32* %a3, align 4 ret void } + +; Replacing %a8 with %a4 in the memset requires boosting the alignment of %a4. 
+ +define void @bar() { +; CHECK: @bar +; CHECK: %a4 = alloca i32, align 8 +; CHECK-NOT: memcpy + %a4 = alloca i32, align 4 + %a8 = alloca i32, align 8 + %a8.cast = bitcast i32* %a8 to i8* + %a4.cast = bitcast i32* %a4 to i8* + call void @llvm.memset.p0i8.i64(i8* %a8.cast, i8 0, i64 4, i32 8, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a4.cast, i8* %a8.cast, i64 4, i32 4, i1 false) + ret void +} diff --git a/test/Transforms/MemCpyOpt/form-memset.ll b/test/Transforms/MemCpyOpt/form-memset.ll index 8832f89..f63b1dc 100644 --- a/test/Transforms/MemCpyOpt/form-memset.ll +++ b/test/Transforms/MemCpyOpt/form-memset.ll @@ -248,3 +248,27 @@ entry: ; CHECK: @test8 ; CHECK: store <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32>* %0, align 16 } + +@test9buf = internal unnamed_addr global [16 x i64] zeroinitializer, align 16 + +define void @test9() nounwind { + store i8 -1, i8* bitcast ([16 x i64]* @test9buf to i8*), align 16 + store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 1), align 1 + store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 2), align 2 + store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 3), align 1 + store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 4), align 4 + store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 5), align 1 + store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 6), align 2 + store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 7), align 1 + store i8 -1, i8* bitcast (i64* getelementptr inbounds ([16 x i64]* @test9buf, i64 0, i64 1) to i8*), align 8 + store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 9), align 1 + store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 10), align 2 + store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 11), align 1 + 
store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 12), align 4 + store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 13), align 1 + store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 14), align 2 + store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 15), align 1 + ret void +; CHECK: @test9( +; CHECK: call void @llvm.memset.p0i8.i64(i8* bitcast ([16 x i64]* @test9buf to i8*), i8 -1, i64 16, i32 16, i1 false) +} diff --git a/test/Transforms/MetaRenamer/lit.local.cfg b/test/Transforms/MetaRenamer/lit.local.cfg new file mode 100644 index 0000000..c6106e4 --- /dev/null +++ b/test/Transforms/MetaRenamer/lit.local.cfg @@ -0,0 +1 @@ +config.suffixes = ['.ll'] diff --git a/test/Transforms/MetaRenamer/metarenamer.ll b/test/Transforms/MetaRenamer/metarenamer.ll new file mode 100644 index 0000000..ad41bcf --- /dev/null +++ b/test/Transforms/MetaRenamer/metarenamer.ll @@ -0,0 +1,96 @@ +; RUN: opt %s -metarenamer -S | FileCheck %s + +; CHECK: target triple {{.*}} +; CHECK-NOT: {{^x*}}xxx{{^x*}} +; CHECK: ret i32 6 + +target triple = "x86_64-pc-linux-gnu" + +%struct.bar_xxx = type { i32, double } +%struct.foo_xxx = type { i32, float, %struct.bar_xxx } + +@func_5_xxx.static_local_3_xxx = internal global i32 3, align 4 +@global_3_xxx = common global i32 0, align 4 + +@func_7_xxx = alias weak i32 (...)* @aliased_func_7_xxx + +declare i32 @aliased_func_7_xxx(...) 
+ +define i32 @func_3_xxx() nounwind uwtable ssp { + ret i32 3 +} + +define void @func_4_xxx(%struct.foo_xxx* sret %agg.result) nounwind uwtable ssp { + %1 = alloca %struct.foo_xxx, align 8 + %2 = getelementptr inbounds %struct.foo_xxx* %1, i32 0, i32 0 + store i32 1, i32* %2, align 4 + %3 = getelementptr inbounds %struct.foo_xxx* %1, i32 0, i32 1 + store float 2.000000e+00, float* %3, align 4 + %4 = getelementptr inbounds %struct.foo_xxx* %1, i32 0, i32 2 + %5 = getelementptr inbounds %struct.bar_xxx* %4, i32 0, i32 0 + store i32 3, i32* %5, align 4 + %6 = getelementptr inbounds %struct.bar_xxx* %4, i32 0, i32 1 + store double 4.000000e+00, double* %6, align 8 + %7 = bitcast %struct.foo_xxx* %agg.result to i8* + %8 = bitcast %struct.foo_xxx* %1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %7, i8* %8, i64 24, i32 8, i1 false) + ret void +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind + +define i32 @func_5_xxx(i32 %arg_1_xxx, i32 %arg_2_xxx, i32 %arg_3_xxx, i32 %arg_4_xxx) nounwind uwtable ssp { + %1 = alloca i32, align 4 + %2 = alloca i32, align 4 + %3 = alloca i32, align 4 + %4 = alloca i32, align 4 + %local_1_xxx = alloca i32, align 4 + %local_2_xxx = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 %arg_1_xxx, i32* %1, align 4 + store i32 %arg_2_xxx, i32* %2, align 4 + store i32 %arg_3_xxx, i32* %3, align 4 + store i32 %arg_4_xxx, i32* %4, align 4 + store i32 1, i32* %local_1_xxx, align 4 + store i32 2, i32* %local_2_xxx, align 4 + store i32 0, i32* %i, align 4 + br label %5 + +; <label>:5 ; preds = %9, %0 + %6 = load i32* %i, align 4 + %7 = icmp slt i32 %6, 10 + br i1 %7, label %8, label %12 + +; <label>:8 ; preds = %5 + br label %9 + +; <label>:9 ; preds = %8 + %10 = load i32* %i, align 4 + %11 = add nsw i32 %10, 1 + store i32 %11, i32* %i, align 4 + br label %5 + +; <label>:12 ; preds = %5 + %13 = load i32* %local_1_xxx, align 4 + %14 = load i32* %1, align 4 + %15 = add nsw i32 %13, %14 + %16 = 
load i32* %local_2_xxx, align 4 + %17 = add nsw i32 %15, %16 + %18 = load i32* %2, align 4 + %19 = add nsw i32 %17, %18 + %20 = load i32* @func_5_xxx.static_local_3_xxx, align 4 + %21 = add nsw i32 %19, %20 + %22 = load i32* %3, align 4 + %23 = add nsw i32 %21, %22 + %24 = load i32* %4, align 4 + %25 = add nsw i32 %23, %24 + ret i32 %25 +} + +define i32 @varargs_func_6_xxx(i32 %arg_1_xxx, i32 %arg_2_xxx, ...) nounwind uwtable ssp { + %1 = alloca i32, align 4 + %2 = alloca i32, align 4 + store i32 %arg_1_xxx, i32* %1, align 4 + store i32 %arg_2_xxx, i32* %2, align 4 + ret i32 6 +} diff --git a/test/Transforms/ObjCARC/basic.ll b/test/Transforms/ObjCARC/basic.ll index 0a7ba5d..7b64b1b 100644 --- a/test/Transforms/ObjCARC/basic.ll +++ b/test/Transforms/ObjCARC/basic.ll @@ -1,4 +1,4 @@ -; RUN: opt -objc-arc -S < %s | FileCheck %s +; RUN: opt -basicaa -objc-arc -S < %s | FileCheck %s target datalayout = "e-p:64:64:64" @@ -1498,7 +1498,7 @@ define i8* @test49(i8* %p) nounwind { } ; Do delete retain+release with intervening stores of the -; address value; +; address value. ; CHECK: define void @test50( ; CHECK-NOT: @objc_ diff --git a/test/Transforms/ObjCARC/nested.ll b/test/Transforms/ObjCARC/nested.ll index a618a21..32be03e 100644 --- a/test/Transforms/ObjCARC/nested.ll +++ b/test/Transforms/ObjCARC/nested.ll @@ -16,6 +16,10 @@ declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind declare i8* @objc_msgSend(i8*, i8*, ...) nonlazybind declare void @use(i8*) declare void @objc_release(i8*) +declare i8* @def() +declare void @__crasher_block_invoke(i8* nocapture) +declare i8* @objc_retainBlock(i8*) +declare void @__crasher_block_invoke1(i8* nocapture) !0 = metadata !{} @@ -279,11 +283,13 @@ forcoll.empty: ret void } -; Delete a nested retain+release pair. +; TODO: Delete a nested retain+release pair. +; The optimizer currently can't do this, because isn't isn't sophisticated enough in +; reasnoning about nesting. 
; CHECK: define void @test6( ; CHECK: call i8* @objc_retain -; CHECK-NOT: @objc_retain +; CHECK: @objc_retain ; CHECK: } define void @test6() nounwind { entry: @@ -345,11 +351,13 @@ forcoll.empty: ret void } -; Delete a nested retain+release pair. +; TODO: Delete a nested retain+release pair. +; The optimizer currently can't do this, because isn't isn't sophisticated enough in +; reasnoning about nesting. ; CHECK: define void @test7( ; CHECK: call i8* @objc_retain -; CHECK-NOT: @objc_retain +; CHECK: @objc_retain ; CHECK: } define void @test7() nounwind { entry: @@ -553,12 +561,12 @@ forcoll.empty: ret void } -; Like test9, but without a split backedge. This we can optimize. +; Like test9, but without a split backedge. TODO: optimize this. ; CHECK: define void @test9b( ; CHECK: call i8* @objc_retain ; CHECK: call i8* @objc_retain -; CHECK-NOT: @objc_retain +; CHECK: @objc_retain ; CHECK: } define void @test9b() nounwind { entry: @@ -687,12 +695,12 @@ forcoll.empty: ret void } -; Like test10, but without a split backedge. This we can optimize. +; Like test10, but without a split backedge. TODO: optimize this. ; CHECK: define void @test10b( ; CHECK: call i8* @objc_retain ; CHECK: call i8* @objc_retain -; CHECK-NOT: @objc_retain +; CHECK: @objc_retain ; CHECK: } define void @test10b() nounwind { entry: @@ -751,3 +759,64 @@ forcoll.empty: call void @objc_release(i8* %0) nounwind, !clang.imprecise_release !0 ret void } + +; Pointers to strong pointers can obscure provenance relationships. Be conservative +; in the face of escaping pointers. rdar://12150909. 
+ +%struct.__block_d = type { i64, i64 } + +@_NSConcreteStackBlock = external global i8* +@__block_d_tmp = external hidden constant { i64, i64, i8*, i8*, i8*, i8* } +@__block_d_tmp5 = external hidden constant { i64, i64, i8*, i8*, i8*, i8* } + +; CHECK: define void @test11( +; CHECK: tail call i8* @objc_retain(i8* %call) nounwind +; CHECK: tail call i8* @objc_retain(i8* %call) nounwind +; CHECK: call void @objc_release(i8* %call) nounwind, !clang.imprecise_release !0 +; CHECK: } +define void @test11() { +entry: + %block = alloca <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>, align 8 + %block9 = alloca <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>, align 8 + %call = call i8* @def(), !clang.arc.no_objc_arc_exceptions !0 + %foo = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block, i64 0, i32 5 + %block.isa = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block, i64 0, i32 0 + store i8* bitcast (i8** @_NSConcreteStackBlock to i8*), i8** %block.isa, align 8 + %block.flags = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block, i64 0, i32 1 + store i32 1107296256, i32* %block.flags, align 8 + %block.reserved = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block, i64 0, i32 2 + store i32 0, i32* %block.reserved, align 4 + %block.invoke = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block, i64 0, i32 3 + store i8* bitcast (void (i8*)* @__crasher_block_invoke to i8*), i8** %block.invoke, align 8 + %block.d = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block, i64 0, i32 4 + store %struct.__block_d* bitcast ({ i64, i64, i8*, i8*, i8*, i8* }* @__block_d_tmp to %struct.__block_d*), %struct.__block_d** %block.d, align 8 + %foo2 = tail call i8* @objc_retain(i8* %call) nounwind + store i8* %foo2, i8** %foo, align 8 + %foo4 = bitcast <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block to i8* + 
%foo5 = call i8* @objc_retainBlock(i8* %foo4) nounwind + call void @use(i8* %foo5), !clang.arc.no_objc_arc_exceptions !0 + call void @objc_release(i8* %foo5) nounwind + %strongdestroy = load i8** %foo, align 8 + call void @objc_release(i8* %strongdestroy) nounwind, !clang.imprecise_release !0 + %foo10 = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block9, i64 0, i32 5 + %block.isa11 = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block9, i64 0, i32 0 + store i8* bitcast (i8** @_NSConcreteStackBlock to i8*), i8** %block.isa11, align 8 + %block.flags12 = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block9, i64 0, i32 1 + store i32 1107296256, i32* %block.flags12, align 8 + %block.reserved13 = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block9, i64 0, i32 2 + store i32 0, i32* %block.reserved13, align 4 + %block.invoke14 = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block9, i64 0, i32 3 + store i8* bitcast (void (i8*)* @__crasher_block_invoke1 to i8*), i8** %block.invoke14, align 8 + %block.d15 = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block9, i64 0, i32 4 + store %struct.__block_d* bitcast ({ i64, i64, i8*, i8*, i8*, i8* }* @__block_d_tmp5 to %struct.__block_d*), %struct.__block_d** %block.d15, align 8 + %foo18 = call i8* @objc_retain(i8* %call) nounwind + store i8* %call, i8** %foo10, align 8 + %foo20 = bitcast <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block9 to i8* + %foo21 = call i8* @objc_retainBlock(i8* %foo20) nounwind + call void @use(i8* %foo21), !clang.arc.no_objc_arc_exceptions !0 + call void @objc_release(i8* %foo21) nounwind + %strongdestroy25 = load i8** %foo10, align 8 + call void @objc_release(i8* %strongdestroy25) nounwind, !clang.imprecise_release !0 + call void @objc_release(i8* %call) nounwind, !clang.imprecise_release !0 + ret void +} diff --git 
a/test/Transforms/ObjCARC/path-overflow.ll b/test/Transforms/ObjCARC/path-overflow.ll new file mode 100644 index 0000000..e7866ed --- /dev/null +++ b/test/Transforms/ObjCARC/path-overflow.ll @@ -0,0 +1,329 @@ +; RUN: opt -objc-arc -S < %s +; rdar://12277446 + +; The total number of paths grows exponentially with the number of branches, and a +; computation of this number can overflow any reasonable fixed-sized integer. + +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" +target triple = "thumbv7-apple-ios5.0.0" + +%struct.NSConstantString.11.33.55.77.99.121.143.332.1130.1340.2768 = type { i32*, i32, i8*, i32 } + +@_unnamed_cfstring_591 = external constant %struct.NSConstantString.11.33.55.77.99.121.143.332.1130.1340.2768, section "__DATA,__cfstring" + +declare i8* @objc_retain(i8*) nonlazybind + +declare void @objc_release(i8*) nonlazybind + +define hidden void @foo() { +entry: + br i1 undef, label %msgSend.nullinit, label %msgSend.call + +msgSend.call: ; preds = %entry + br label %msgSend.cont + +msgSend.nullinit: ; preds = %entry + br label %msgSend.cont + +msgSend.cont: ; preds = %msgSend.nullinit, %msgSend.call + %0 = bitcast %struct.NSConstantString.11.33.55.77.99.121.143.332.1130.1340.2768* @_unnamed_cfstring_591 to i8* + %1 = call i8* @objc_retain(i8* %0) nounwind + br i1 undef, label %msgSend.nullinit33, label %msgSend.call32 + +msgSend.call32: ; preds = %if.end10 + br label %msgSend.cont34 + +msgSend.nullinit33: ; preds = %if.end10 + br label %msgSend.cont34 + +msgSend.cont34: ; preds = %msgSend.nullinit33, %msgSend.call32 + br i1 undef, label %msgSend.nullinit38, label %msgSend.call37 + +msgSend.call37: ; preds = %msgSend.cont34 + br label %msgSend.cont39 + +msgSend.nullinit38: ; preds = %msgSend.cont34 + br label %msgSend.cont39 + +msgSend.cont39: ; preds = %msgSend.nullinit38, %msgSend.call37 + br i1 undef, label %msgSend.nullinit49, label %msgSend.call48 + 
+msgSend.call48: ; preds = %msgSend.cont39 + br label %msgSend.cont50 + +msgSend.nullinit49: ; preds = %msgSend.cont39 + br label %msgSend.cont50 + +msgSend.cont50: ; preds = %msgSend.nullinit49, %msgSend.call48 + br i1 undef, label %msgSend.nullinit61, label %msgSend.call60 + +msgSend.call60: ; preds = %msgSend.cont50 + br label %msgSend.cont62 + +msgSend.nullinit61: ; preds = %msgSend.cont50 + br label %msgSend.cont62 + +msgSend.cont62: ; preds = %msgSend.nullinit61, %msgSend.call60 + br i1 undef, label %msgSend.nullinit67, label %msgSend.call66 + +msgSend.call66: ; preds = %msgSend.cont62 + br label %msgSend.cont68 + +msgSend.nullinit67: ; preds = %msgSend.cont62 + br label %msgSend.cont68 + +msgSend.cont68: ; preds = %msgSend.nullinit67, %msgSend.call66 + br i1 undef, label %msgSend.nullinit84, label %msgSend.call83 + +msgSend.call83: ; preds = %msgSend.cont68 + br label %msgSend.cont85 + +msgSend.nullinit84: ; preds = %msgSend.cont68 + br label %msgSend.cont85 + +msgSend.cont85: ; preds = %msgSend.nullinit84, %msgSend.call83 + br i1 undef, label %msgSend.nullinit90, label %msgSend.call89 + +msgSend.call89: ; preds = %msgSend.cont85 + br label %msgSend.cont91 + +msgSend.nullinit90: ; preds = %msgSend.cont85 + br label %msgSend.cont91 + +msgSend.cont91: ; preds = %msgSend.nullinit90, %msgSend.call89 + br i1 undef, label %msgSend.nullinit104, label %msgSend.call103 + +msgSend.call103: ; preds = %msgSend.cont91 + br label %msgSend.cont105 + +msgSend.nullinit104: ; preds = %msgSend.cont91 + br label %msgSend.cont105 + +msgSend.cont105: ; preds = %msgSend.nullinit104, %msgSend.call103 + br i1 undef, label %land.lhs.true, label %if.end123 + +land.lhs.true: ; preds = %msgSend.cont105 + br i1 undef, label %if.then117, label %if.end123 + +if.then117: ; preds = %land.lhs.true + br label %if.end123 + +if.end123: ; preds = %if.then117, %land.lhs.true, %msgSend.cont105 + br i1 undef, label %msgSend.nullinit132, label %msgSend.call131 + +msgSend.call131: ; preds = %if.end123 
+ br label %msgSend.cont133 + +msgSend.nullinit132: ; preds = %if.end123 + br label %msgSend.cont133 + +msgSend.cont133: ; preds = %msgSend.nullinit132, %msgSend.call131 + br i1 undef, label %msgSend.nullinit139, label %msgSend.call138 + +msgSend.call138: ; preds = %msgSend.cont133 + br label %msgSend.cont140 + +msgSend.nullinit139: ; preds = %msgSend.cont133 + br label %msgSend.cont140 + +msgSend.cont140: ; preds = %msgSend.nullinit139, %msgSend.call138 + br i1 undef, label %if.then151, label %if.end157 + +if.then151: ; preds = %msgSend.cont140 + br label %if.end157 + +if.end157: ; preds = %if.then151, %msgSend.cont140 + br i1 undef, label %msgSend.nullinit164, label %msgSend.call163 + +msgSend.call163: ; preds = %if.end157 + br label %msgSend.cont165 + +msgSend.nullinit164: ; preds = %if.end157 + br label %msgSend.cont165 + +msgSend.cont165: ; preds = %msgSend.nullinit164, %msgSend.call163 + br i1 undef, label %msgSend.nullinit176, label %msgSend.call175 + +msgSend.call175: ; preds = %msgSend.cont165 + br label %msgSend.cont177 + +msgSend.nullinit176: ; preds = %msgSend.cont165 + br label %msgSend.cont177 + +msgSend.cont177: ; preds = %msgSend.nullinit176, %msgSend.call175 + br i1 undef, label %land.lhs.true181, label %if.end202 + +land.lhs.true181: ; preds = %msgSend.cont177 + br i1 undef, label %if.then187, label %if.end202 + +if.then187: ; preds = %land.lhs.true181 + br i1 undef, label %msgSend.nullinit199, label %msgSend.call198 + +msgSend.call198: ; preds = %if.then187 + br label %msgSend.cont200 + +msgSend.nullinit199: ; preds = %if.then187 + br label %msgSend.cont200 + +msgSend.cont200: ; preds = %msgSend.nullinit199, %msgSend.call198 + br label %if.end202 + +if.end202: ; preds = %msgSend.cont200, %land.lhs.true181, %msgSend.cont177 + br i1 undef, label %msgSend.nullinit236, label %msgSend.call235 + +msgSend.call235: ; preds = %if.end202 + br label %msgSend.cont237 + +msgSend.nullinit236: ; preds = %if.end202 + br label %msgSend.cont237 + +msgSend.cont237: 
; preds = %msgSend.nullinit236, %msgSend.call235 + br i1 undef, label %msgSend.nullinit254, label %msgSend.call253 + +msgSend.call253: ; preds = %msgSend.cont237 + br label %msgSend.cont255 + +msgSend.nullinit254: ; preds = %msgSend.cont237 + br label %msgSend.cont255 + +msgSend.cont255: ; preds = %msgSend.nullinit254, %msgSend.call253 + br i1 undef, label %msgSend.nullinit269, label %msgSend.call268 + +msgSend.call268: ; preds = %msgSend.cont255 + br label %msgSend.cont270 + +msgSend.nullinit269: ; preds = %msgSend.cont255 + br label %msgSend.cont270 + +msgSend.cont270: ; preds = %msgSend.nullinit269, %msgSend.call268 + br i1 undef, label %msgSend.nullinit281, label %msgSend.call280 + +msgSend.call280: ; preds = %msgSend.cont270 + br label %msgSend.cont282 + +msgSend.nullinit281: ; preds = %msgSend.cont270 + br label %msgSend.cont282 + +msgSend.cont282: ; preds = %msgSend.nullinit281, %msgSend.call280 + br i1 undef, label %msgSend.nullinit287, label %msgSend.call286 + +msgSend.call286: ; preds = %msgSend.cont282 + br label %msgSend.cont288 + +msgSend.nullinit287: ; preds = %msgSend.cont282 + br label %msgSend.cont288 + +msgSend.cont288: ; preds = %msgSend.nullinit287, %msgSend.call286 + br i1 undef, label %msgSend.nullinit303, label %msgSend.call302 + +msgSend.call302: ; preds = %msgSend.cont288 + br label %msgSend.cont304 + +msgSend.nullinit303: ; preds = %msgSend.cont288 + br label %msgSend.cont304 + +msgSend.cont304: ; preds = %msgSend.nullinit303, %msgSend.call302 + br i1 undef, label %msgSend.nullinit344, label %msgSend.call343 + +msgSend.call343: ; preds = %msgSend.cont304 + br label %msgSend.cont345 + +msgSend.nullinit344: ; preds = %msgSend.cont304 + br label %msgSend.cont345 + +msgSend.cont345: ; preds = %msgSend.nullinit344, %msgSend.call343 + br i1 undef, label %msgSend.nullinit350, label %msgSend.call349 + +msgSend.call349: ; preds = %msgSend.cont345 + br label %msgSend.cont351 + +msgSend.nullinit350: ; preds = %msgSend.cont345 + br label 
%msgSend.cont351 + +msgSend.cont351: ; preds = %msgSend.nullinit350, %msgSend.call349 + br i1 undef, label %msgSend.nullinit366, label %msgSend.call365 + +msgSend.call365: ; preds = %msgSend.cont351 + br label %msgSend.cont367 + +msgSend.nullinit366: ; preds = %msgSend.cont351 + br label %msgSend.cont367 + +msgSend.cont367: ; preds = %msgSend.nullinit366, %msgSend.call365 + br i1 undef, label %msgSend.nullinit376, label %msgSend.call375 + +msgSend.call375: ; preds = %msgSend.cont367 + br label %msgSend.cont377 + +msgSend.nullinit376: ; preds = %msgSend.cont367 + br label %msgSend.cont377 + +msgSend.cont377: ; preds = %msgSend.nullinit376, %msgSend.call375 + br i1 undef, label %if.then384, label %if.else401 + +if.then384: ; preds = %msgSend.cont377 + br i1 undef, label %msgSend.nullinit392, label %msgSend.call391 + +msgSend.call391: ; preds = %if.then384 + br label %msgSend.cont393 + +msgSend.nullinit392: ; preds = %if.then384 + br label %msgSend.cont393 + +msgSend.cont393: ; preds = %msgSend.nullinit392, %msgSend.call391 + br label %if.end418 + +if.else401: ; preds = %msgSend.cont377 + br i1 undef, label %msgSend.nullinit409, label %msgSend.call408 + +msgSend.call408: ; preds = %if.else401 + br label %msgSend.cont410 + +msgSend.nullinit409: ; preds = %if.else401 + br label %msgSend.cont410 + +msgSend.cont410: ; preds = %msgSend.nullinit409, %msgSend.call408 + br label %if.end418 + +if.end418: ; preds = %msgSend.cont410, %msgSend.cont393 + br i1 undef, label %msgSend.nullinit470, label %msgSend.call469 + +msgSend.call469: ; preds = %if.end418 + br label %msgSend.cont471 + +msgSend.nullinit470: ; preds = %if.end418 + br label %msgSend.cont471 + +msgSend.cont471: ; preds = %msgSend.nullinit470, %msgSend.call469 + br i1 undef, label %msgSend.nullinit484, label %msgSend.call483 + +msgSend.call483: ; preds = %msgSend.cont471 + br label %msgSend.cont485 + +msgSend.nullinit484: ; preds = %msgSend.cont471 + br label %msgSend.cont485 + +msgSend.cont485: ; preds = 
%msgSend.nullinit484, %msgSend.call483 + br i1 undef, label %msgSend.nullinit500, label %msgSend.call499 + +msgSend.call499: ; preds = %msgSend.cont485 + br label %msgSend.cont501 + +msgSend.nullinit500: ; preds = %msgSend.cont485 + br label %msgSend.cont501 + +msgSend.cont501: ; preds = %msgSend.nullinit500, %msgSend.call499 + br i1 undef, label %msgSend.nullinit506, label %msgSend.call505 + +msgSend.call505: ; preds = %msgSend.cont501 + br label %msgSend.cont507 + +msgSend.nullinit506: ; preds = %msgSend.cont501 + br label %msgSend.cont507 + +msgSend.cont507: ; preds = %msgSend.nullinit506, %msgSend.call505 + call void @objc_release(i8* %0) nounwind, !clang.imprecise_release !0 + ret void +} + +!0 = metadata !{} diff --git a/test/Transforms/PhaseOrdering/gdce.ll b/test/Transforms/PhaseOrdering/gdce.ll new file mode 100644 index 0000000..273e47e --- /dev/null +++ b/test/Transforms/PhaseOrdering/gdce.ll @@ -0,0 +1,106 @@ +; RUN: opt -O2 -S %s | FileCheck %s + +; Run global DCE to eliminate unused ctor and dtor. 
+; rdar://9142819 + +; CHECK: main +; CHECK-NOT: _ZN4BaseC1Ev +; CHECK-NOT: _ZN4BaseD1Ev +; CHECK-NOT: _ZN4BaseD2Ev +; CHECK-NOT: _ZN4BaseC2Ev +; CHECK-NOT: _ZN4BaseD0Ev + +%class.Base = type { i32 (...)** } + +@_ZTV4Base = linkonce_odr unnamed_addr constant [4 x i8*] [i8* null, i8* bitcast ({ i8*, i8* }* @_ZTI4Base to i8*), i8* bitcast (void (%class.Base*)* @_ZN4BaseD1Ev to i8*), i8* bitcast (void (%class.Base*)* @_ZN4BaseD0Ev to i8*)] +@_ZTVN10__cxxabiv117__class_type_infoE = external global i8* +@_ZTS4Base = linkonce_odr constant [6 x i8] c"4Base\00" +@_ZTI4Base = linkonce_odr unnamed_addr constant { i8*, i8* } { i8* bitcast (i8** getelementptr inbounds (i8** @_ZTVN10__cxxabiv117__class_type_infoE, i64 2) to i8*), i8* getelementptr inbounds ([6 x i8]* @_ZTS4Base, i32 0, i32 0) } + +define i32 @main() uwtable ssp { +entry: + %retval = alloca i32, align 4 + %b = alloca %class.Base, align 8 + %cleanup.dest.slot = alloca i32 + store i32 0, i32* %retval + call void @_ZN4BaseC1Ev(%class.Base* %b) + store i32 0, i32* %retval + store i32 1, i32* %cleanup.dest.slot + call void @_ZN4BaseD1Ev(%class.Base* %b) + %0 = load i32* %retval + ret i32 %0 +} + +define linkonce_odr void @_ZN4BaseC1Ev(%class.Base* %this) unnamed_addr uwtable ssp align 2 { +entry: + %this.addr = alloca %class.Base*, align 8 + store %class.Base* %this, %class.Base** %this.addr, align 8 + %this1 = load %class.Base** %this.addr + call void @_ZN4BaseC2Ev(%class.Base* %this1) + ret void +} + +define linkonce_odr void @_ZN4BaseD1Ev(%class.Base* %this) unnamed_addr uwtable ssp align 2 { +entry: + %this.addr = alloca %class.Base*, align 8 + store %class.Base* %this, %class.Base** %this.addr, align 8 + %this1 = load %class.Base** %this.addr + call void @_ZN4BaseD2Ev(%class.Base* %this1) + ret void +} + +define linkonce_odr void @_ZN4BaseD2Ev(%class.Base* %this) unnamed_addr nounwind uwtable ssp align 2 { +entry: + %this.addr = alloca %class.Base*, align 8 + store %class.Base* %this, %class.Base** %this.addr, 
align 8 + %this1 = load %class.Base** %this.addr + ret void +} + +define linkonce_odr void @_ZN4BaseC2Ev(%class.Base* %this) unnamed_addr nounwind uwtable ssp align 2 { +entry: + %this.addr = alloca %class.Base*, align 8 + store %class.Base* %this, %class.Base** %this.addr, align 8 + %this1 = load %class.Base** %this.addr + %0 = bitcast %class.Base* %this1 to i8*** + store i8** getelementptr inbounds ([4 x i8*]* @_ZTV4Base, i64 0, i64 2), i8*** %0 + ret void +} + +define linkonce_odr void @_ZN4BaseD0Ev(%class.Base* %this) unnamed_addr uwtable ssp align 2 { +entry: + %this.addr = alloca %class.Base*, align 8 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + store %class.Base* %this, %class.Base** %this.addr, align 8 + %this1 = load %class.Base** %this.addr + invoke void @_ZN4BaseD1Ev(%class.Base* %this1) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %entry + %0 = bitcast %class.Base* %this1 to i8* + call void @_ZdlPv(i8* %0) nounwind + ret void + +lpad: ; preds = %entry + %1 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + cleanup + %2 = extractvalue { i8*, i32 } %1, 0 + store i8* %2, i8** %exn.slot + %3 = extractvalue { i8*, i32 } %1, 1 + store i32 %3, i32* %ehselector.slot + %4 = bitcast %class.Base* %this1 to i8* + call void @_ZdlPv(i8* %4) nounwind + br label %eh.resume + +eh.resume: ; preds = %lpad + %exn = load i8** %exn.slot + %sel = load i32* %ehselector.slot + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0 + %lpad.val2 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1 + resume { i8*, i32 } %lpad.val2 +} + +declare i32 @__gxx_personality_v0(...) 
+ +declare void @_ZdlPv(i8*) nounwind diff --git a/test/Transforms/Reassociate/crash.ll b/test/Transforms/Reassociate/crash.ll index ce586e1..e29b5dc 100644 --- a/test/Transforms/Reassociate/crash.ll +++ b/test/Transforms/Reassociate/crash.ll @@ -144,3 +144,31 @@ define i32 @sozefx_(i32 %x, i32 %y) { %t6 = add i32 %t4, %t5 ret i32 %t6 } + +define i32 @bar(i32 %arg, i32 %arg1, i32 %arg2) { + %tmp1 = mul i32 %arg1, 2 + %tmp2 = mul i32 %tmp1, 3 + %tmp3 = mul i32 %arg2, 2 + %tmp4 = add i32 %tmp1, 1 ; dead code + %ret = add i32 %tmp2, %tmp3 + ret i32 %ret +} + +; PR14060 +define i8 @hang(i8 %p, i8 %p0, i8 %p1, i8 %p2, i8 %p3, i8 %p4, i8 %p5, i8 %p6, i8 %p7, i8 %p8, i8 %p9) { + %tmp = zext i1 false to i8 + %tmp16 = or i8 %tmp, 1 + %tmp22 = or i8 %p7, %p0 + %tmp23 = or i8 %tmp16, %tmp22 + %tmp28 = or i8 %p9, %p1 + %tmp31 = or i8 %tmp23, %p2 + %tmp32 = or i8 %tmp31, %tmp28 + %tmp38 = or i8 %p8, %p3 + %tmp39 = or i8 %tmp16, %tmp38 + %tmp43 = or i8 %tmp39, %p4 + %tmp44 = or i8 %tmp43, 1 + %tmp47 = or i8 %tmp32, %p5 + %tmp50 = or i8 %tmp47, %p6 + %tmp51 = or i8 %tmp44, %tmp50 + ret i8 %tmp51 +} diff --git a/test/Transforms/SCCP/loadtest.ll b/test/Transforms/SCCP/loadtest.ll index add2af4..dd1dba6 100644 --- a/test/Transforms/SCCP/loadtest.ll +++ b/test/Transforms/SCCP/loadtest.ll @@ -1,8 +1,9 @@ ; This test makes sure that these instructions are properly constant propagated. 
-target datalayout = "e-p:32:32" +; RUN: opt < %s -default-data-layout="e-p:32:32" -sccp -S | FileCheck %s +; RUN: opt < %s -default-data-layout="E-p:32:32" -sccp -S | FileCheck %s -; RUN: opt < %s -sccp -S | not grep load +; CHECK-NOT: load @X = constant i32 42 ; <i32*> [#uses=1] diff --git a/test/Transforms/SROA/alignment.ll b/test/Transforms/SROA/alignment.ll new file mode 100644 index 0000000..ad5fb6c --- /dev/null +++ b/test/Transforms/SROA/alignment.ll @@ -0,0 +1,171 @@ +; RUN: opt < %s -sroa -S | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64" + +declare void @llvm.memcpy.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1) + +define void @test1({ i8, i8 }* %a, { i8, i8 }* %b) { +; CHECK: @test1 +; CHECK: %[[gep_a0:.*]] = getelementptr inbounds { i8, i8 }* %a, i64 0, i32 0 +; CHECK: %[[a0:.*]] = load i8* %[[gep_a0]], align 16 +; CHECK: %[[gep_a1:.*]] = getelementptr inbounds { i8, i8 }* %a, i64 0, i32 1 +; CHECK: %[[a1:.*]] = load i8* %[[gep_a1]], align 1 +; CHECK: %[[gep_b0:.*]] = getelementptr inbounds { i8, i8 }* %b, i64 0, i32 0 +; CHECK: store i8 %[[a0]], i8* %[[gep_b0]], align 16 +; CHECK: %[[gep_b1:.*]] = getelementptr inbounds { i8, i8 }* %b, i64 0, i32 1 +; CHECK: store i8 %[[a1]], i8* %[[gep_b1]], align 1 +; CHECK: ret void + +entry: + %alloca = alloca { i8, i8 }, align 16 + %gep_a = getelementptr { i8, i8 }* %a, i32 0, i32 0 + %gep_alloca = getelementptr { i8, i8 }* %alloca, i32 0, i32 0 + %gep_b = getelementptr { i8, i8 }* %b, i32 0, i32 0 + + store i8 420, i8* %gep_alloca, align 16 + + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %gep_alloca, i8* %gep_a, i32 2, i32 16, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %gep_b, i8* %gep_alloca, i32 2, i32 16, i1 false) + ret void +} + +define void @test2() { +; CHECK: @test2 +; CHECK: alloca i16 +; CHECK: load i8* %{{.*}} +; CHECK: store i8 42, i8* %{{.*}} +; CHECK: ret void + +entry: + %a = alloca { i8, 
i8, i8, i8 }, align 2 + %gep1 = getelementptr { i8, i8, i8, i8 }* %a, i32 0, i32 1 + %cast1 = bitcast i8* %gep1 to i16* + store volatile i16 0, i16* %cast1 + %gep2 = getelementptr { i8, i8, i8, i8 }* %a, i32 0, i32 2 + %result = load i8* %gep2 + store i8 42, i8* %gep2 + ret void +} + +define void @PR13920(<2 x i64>* %a, i16* %b) { +; Test that alignments on memcpy intrinsics get propagated to loads and stores. +; CHECK: @PR13920 +; CHECK: load <2 x i64>* %a, align 2 +; CHECK: store <2 x i64> {{.*}}, <2 x i64>* {{.*}}, align 2 +; CHECK: ret void + +entry: + %aa = alloca <2 x i64>, align 16 + %aptr = bitcast <2 x i64>* %a to i8* + %aaptr = bitcast <2 x i64>* %aa to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %aaptr, i8* %aptr, i32 16, i32 2, i1 false) + %bptr = bitcast i16* %b to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %bptr, i8* %aaptr, i32 16, i32 2, i1 false) + ret void +} + +define void @test3(i8* %x) { +; Test that when we promote an alloca to a type with lower ABI alignment, we +; provide the needed explicit alignment that code using the alloca may be +; expecting. However, also check that any offset within an alloca can in turn +; reduce the alignment. +; CHECK: @test3 +; CHECK: alloca [22 x i8], align 8 +; CHECK: alloca [18 x i8], align 2 +; CHECK: ret void + +entry: + %a = alloca { i8*, i8*, i8* } + %b = alloca { i8*, i8*, i8* } + %a_raw = bitcast { i8*, i8*, i8* }* %a to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a_raw, i8* %x, i32 22, i32 8, i1 false) + %b_raw = bitcast { i8*, i8*, i8* }* %b to i8* + %b_gep = getelementptr i8* %b_raw, i32 6 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %b_gep, i8* %x, i32 18, i32 2, i1 false) + ret void +} + +define void @test5() { +; Test that we preserve underaligned loads and stores when splitting. 
+; CHECK: @test5 +; CHECK: alloca [9 x i8] +; CHECK: alloca [9 x i8] +; CHECK: store volatile double 0.0{{.*}}, double* %{{.*}}, align 1 +; CHECK: load i16* %{{.*}}, align 1 +; CHECK: load double* %{{.*}}, align 1 +; CHECK: store volatile double %{{.*}}, double* %{{.*}}, align 1 +; CHECK: load i16* %{{.*}}, align 1 +; CHECK: ret void + +entry: + %a = alloca [18 x i8] + %raw1 = getelementptr inbounds [18 x i8]* %a, i32 0, i32 0 + %ptr1 = bitcast i8* %raw1 to double* + store volatile double 0.0, double* %ptr1, align 1 + %weird_gep1 = getelementptr inbounds [18 x i8]* %a, i32 0, i32 7 + %weird_cast1 = bitcast i8* %weird_gep1 to i16* + %weird_load1 = load i16* %weird_cast1, align 1 + + %raw2 = getelementptr inbounds [18 x i8]* %a, i32 0, i32 9 + %ptr2 = bitcast i8* %raw2 to double* + %d1 = load double* %ptr1, align 1 + store volatile double %d1, double* %ptr2, align 1 + %weird_gep2 = getelementptr inbounds [18 x i8]* %a, i32 0, i32 16 + %weird_cast2 = bitcast i8* %weird_gep2 to i16* + %weird_load2 = load i16* %weird_cast2, align 1 + + ret void +} + +define void @test6() { +; Test that we promote alignment when the underlying alloca switches to one +; that innately provides it. +; CHECK: @test6 +; CHECK: alloca double +; CHECK: alloca double +; CHECK-NOT: align +; CHECK: ret void + +entry: + %a = alloca [16 x i8] + %raw1 = getelementptr inbounds [16 x i8]* %a, i32 0, i32 0 + %ptr1 = bitcast i8* %raw1 to double* + store volatile double 0.0, double* %ptr1, align 1 + + %raw2 = getelementptr inbounds [16 x i8]* %a, i32 0, i32 8 + %ptr2 = bitcast i8* %raw2 to double* + %val = load double* %ptr1, align 1 + store volatile double %val, double* %ptr2, align 1 + + ret void +} + +define void @test7(i8* %out) { +; Test that we properly compute the destination alignment when rewriting +; memcpys as direct loads or stores. 
+; CHECK: @test7 +; CHECK-NOT: alloca + +entry: + %a = alloca [16 x i8] + %raw1 = getelementptr inbounds [16 x i8]* %a, i32 0, i32 0 + %ptr1 = bitcast i8* %raw1 to double* + %raw2 = getelementptr inbounds [16 x i8]* %a, i32 0, i32 8 + %ptr2 = bitcast i8* %raw2 to double* + + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %raw1, i8* %out, i32 16, i32 0, i1 false) +; CHECK: %[[val2:.*]] = load double* %{{.*}}, align 1 +; CHECK: %[[val1:.*]] = load double* %{{.*}}, align 1 + + %val1 = load double* %ptr2, align 1 + %val2 = load double* %ptr1, align 1 + + store double %val1, double* %ptr1, align 1 + store double %val2, double* %ptr2, align 1 + + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %out, i8* %raw1, i32 16, i32 0, i1 false) +; CHECK: store double %[[val1]], double* %{{.*}}, align 1 +; CHECK: store double %[[val2]], double* %{{.*}}, align 1 + + ret void +; CHECK: ret void +} diff --git a/test/Transforms/SROA/basictest.ll b/test/Transforms/SROA/basictest.ll new file mode 100644 index 0000000..b363eef --- /dev/null +++ b/test/Transforms/SROA/basictest.ll @@ -0,0 +1,1136 @@ +; RUN: opt < %s -sroa -S | FileCheck %s +; RUN: opt < %s -sroa -force-ssa-updater -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64" + +declare void @llvm.lifetime.start(i64, i8* nocapture) +declare void @llvm.lifetime.end(i64, i8* nocapture) + +define i32 @test0() { +; CHECK: @test0 +; CHECK-NOT: alloca +; CHECK: ret i32 + +entry: + %a1 = alloca i32 + %a2 = alloca float + + %a1.i8 = bitcast i32* %a1 to i8* + call void @llvm.lifetime.start(i64 4, i8* %a1.i8) + + store i32 0, i32* %a1 + %v1 = load i32* %a1 + + call void @llvm.lifetime.end(i64 4, i8* %a1.i8) + + %a2.i8 = bitcast float* %a2 to i8* + call void @llvm.lifetime.start(i64 4, i8* %a2.i8) + + store float 0.0, float* %a2 + %v2 = load float * %a2 + %v2.int = bitcast float %v2 to i32 + %sum1 = add i32 %v1, %v2.int + + call void 
@llvm.lifetime.end(i64 4, i8* %a2.i8) + + ret i32 %sum1 +} + +define i32 @test1() { +; CHECK: @test1 +; CHECK-NOT: alloca +; CHECK: ret i32 0 + +entry: + %X = alloca { i32, float } + %Y = getelementptr { i32, float }* %X, i64 0, i32 0 + store i32 0, i32* %Y + %Z = load i32* %Y + ret i32 %Z +} + +define i64 @test2(i64 %X) { +; CHECK: @test2 +; CHECK-NOT: alloca +; CHECK: ret i64 %X + +entry: + %A = alloca [8 x i8] + %B = bitcast [8 x i8]* %A to i64* + store i64 %X, i64* %B + br label %L2 + +L2: + %Z = load i64* %B + ret i64 %Z +} + +define void @test3(i8* %dst, i8* %src) { +; CHECK: @test3 + +entry: + %a = alloca [300 x i8] +; CHECK-NOT: alloca +; CHECK: %[[test3_a1:.*]] = alloca [42 x i8] +; CHECK-NEXT: %[[test3_a2:.*]] = alloca [99 x i8] +; CHECK-NEXT: %[[test3_a3:.*]] = alloca [16 x i8] +; CHECK-NEXT: %[[test3_a4:.*]] = alloca [42 x i8] +; CHECK-NEXT: %[[test3_a5:.*]] = alloca [7 x i8] +; CHECK-NEXT: %[[test3_a6:.*]] = alloca [7 x i8] +; CHECK-NEXT: %[[test3_a7:.*]] = alloca [85 x i8] + + %b = getelementptr [300 x i8]* %a, i64 0, i64 0 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %b, i8* %src, i32 300, i32 1, i1 false) +; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [42 x i8]* %[[test3_a1]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %src, i32 42 +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %src, i64 42 +; CHECK-NEXT: %[[test3_r1:.*]] = load i8* %[[gep]] +; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 43 +; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [99 x i8]* %[[test3_a2]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 99 +; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 142 +; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 16 +; CHECK-NEXT: %[[gep_src:.*]] = 
getelementptr inbounds i8* %src, i64 158 +; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [42 x i8]* %[[test3_a4]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 42 +; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 200 +; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a5]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7 +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %src, i64 207 +; CHECK-NEXT: %[[test3_r2:.*]] = load i8* %[[gep]] +; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 208 +; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7 +; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 215 +; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [85 x i8]* %[[test3_a7]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 85 + + ; Clobber a single element of the array, this should be promotable. + %c = getelementptr [300 x i8]* %a, i64 0, i64 42 + store i8 0, i8* %c + + ; Make a sequence of overlapping stores to the array. These overlap both in + ; forward strides and in shrinking accesses. 
+ %overlap.1.i8 = getelementptr [300 x i8]* %a, i64 0, i64 142 + %overlap.2.i8 = getelementptr [300 x i8]* %a, i64 0, i64 143 + %overlap.3.i8 = getelementptr [300 x i8]* %a, i64 0, i64 144 + %overlap.4.i8 = getelementptr [300 x i8]* %a, i64 0, i64 145 + %overlap.5.i8 = getelementptr [300 x i8]* %a, i64 0, i64 146 + %overlap.6.i8 = getelementptr [300 x i8]* %a, i64 0, i64 147 + %overlap.7.i8 = getelementptr [300 x i8]* %a, i64 0, i64 148 + %overlap.8.i8 = getelementptr [300 x i8]* %a, i64 0, i64 149 + %overlap.9.i8 = getelementptr [300 x i8]* %a, i64 0, i64 150 + %overlap.1.i16 = bitcast i8* %overlap.1.i8 to i16* + %overlap.1.i32 = bitcast i8* %overlap.1.i8 to i32* + %overlap.1.i64 = bitcast i8* %overlap.1.i8 to i64* + %overlap.2.i64 = bitcast i8* %overlap.2.i8 to i64* + %overlap.3.i64 = bitcast i8* %overlap.3.i8 to i64* + %overlap.4.i64 = bitcast i8* %overlap.4.i8 to i64* + %overlap.5.i64 = bitcast i8* %overlap.5.i8 to i64* + %overlap.6.i64 = bitcast i8* %overlap.6.i8 to i64* + %overlap.7.i64 = bitcast i8* %overlap.7.i8 to i64* + %overlap.8.i64 = bitcast i8* %overlap.8.i8 to i64* + %overlap.9.i64 = bitcast i8* %overlap.9.i8 to i64* + store i8 1, i8* %overlap.1.i8 +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 0 +; CHECK-NEXT: store i8 1, i8* %[[gep]] + store i16 1, i16* %overlap.1.i16 +; CHECK-NEXT: %[[bitcast:.*]] = bitcast [16 x i8]* %[[test3_a3]] to i16* +; CHECK-NEXT: store i16 1, i16* %[[bitcast]] + store i32 1, i32* %overlap.1.i32 +; CHECK-NEXT: %[[bitcast:.*]] = bitcast [16 x i8]* %[[test3_a3]] to i32* +; CHECK-NEXT: store i32 1, i32* %[[bitcast]] + store i64 1, i64* %overlap.1.i64 +; CHECK-NEXT: %[[bitcast:.*]] = bitcast [16 x i8]* %[[test3_a3]] to i64* +; CHECK-NEXT: store i64 1, i64* %[[bitcast]] + store i64 2, i64* %overlap.2.i64 +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 1 +; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i64* +; CHECK-NEXT: store i64 2, i64* 
%[[bitcast]] + store i64 3, i64* %overlap.3.i64 +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 2 +; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i64* +; CHECK-NEXT: store i64 3, i64* %[[bitcast]] + store i64 4, i64* %overlap.4.i64 +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 3 +; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i64* +; CHECK-NEXT: store i64 4, i64* %[[bitcast]] + store i64 5, i64* %overlap.5.i64 +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 4 +; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i64* +; CHECK-NEXT: store i64 5, i64* %[[bitcast]] + store i64 6, i64* %overlap.6.i64 +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 5 +; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i64* +; CHECK-NEXT: store i64 6, i64* %[[bitcast]] + store i64 7, i64* %overlap.7.i64 +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 6 +; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i64* +; CHECK-NEXT: store i64 7, i64* %[[bitcast]] + store i64 8, i64* %overlap.8.i64 +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 7 +; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i64* +; CHECK-NEXT: store i64 8, i64* %[[bitcast]] + store i64 9, i64* %overlap.9.i64 +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 8 +; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i64* +; CHECK-NEXT: store i64 9, i64* %[[bitcast]] + + ; Make two sequences of overlapping stores with more gaps and irregularities. 
+ %overlap2.1.0.i8 = getelementptr [300 x i8]* %a, i64 0, i64 200 + %overlap2.1.1.i8 = getelementptr [300 x i8]* %a, i64 0, i64 201 + %overlap2.1.2.i8 = getelementptr [300 x i8]* %a, i64 0, i64 202 + %overlap2.1.3.i8 = getelementptr [300 x i8]* %a, i64 0, i64 203 + + %overlap2.2.0.i8 = getelementptr [300 x i8]* %a, i64 0, i64 208 + %overlap2.2.1.i8 = getelementptr [300 x i8]* %a, i64 0, i64 209 + %overlap2.2.2.i8 = getelementptr [300 x i8]* %a, i64 0, i64 210 + %overlap2.2.3.i8 = getelementptr [300 x i8]* %a, i64 0, i64 211 + + %overlap2.1.0.i16 = bitcast i8* %overlap2.1.0.i8 to i16* + %overlap2.1.0.i32 = bitcast i8* %overlap2.1.0.i8 to i32* + %overlap2.1.1.i32 = bitcast i8* %overlap2.1.1.i8 to i32* + %overlap2.1.2.i32 = bitcast i8* %overlap2.1.2.i8 to i32* + %overlap2.1.3.i32 = bitcast i8* %overlap2.1.3.i8 to i32* + store i8 1, i8* %overlap2.1.0.i8 +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a5]], i64 0, i64 0 +; CHECK-NEXT: store i8 1, i8* %[[gep]] + store i16 1, i16* %overlap2.1.0.i16 +; CHECK-NEXT: %[[bitcast:.*]] = bitcast [7 x i8]* %[[test3_a5]] to i16* +; CHECK-NEXT: store i16 1, i16* %[[bitcast]] + store i32 1, i32* %overlap2.1.0.i32 +; CHECK-NEXT: %[[bitcast:.*]] = bitcast [7 x i8]* %[[test3_a5]] to i32* +; CHECK-NEXT: store i32 1, i32* %[[bitcast]] + store i32 2, i32* %overlap2.1.1.i32 +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a5]], i64 0, i64 1 +; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i32* +; CHECK-NEXT: store i32 2, i32* %[[bitcast]] + store i32 3, i32* %overlap2.1.2.i32 +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a5]], i64 0, i64 2 +; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i32* +; CHECK-NEXT: store i32 3, i32* %[[bitcast]] + store i32 4, i32* %overlap2.1.3.i32 +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a5]], i64 0, i64 3 +; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i32* +; CHECK-NEXT: store i32 4, 
i32* %[[bitcast]] + + %overlap2.2.0.i32 = bitcast i8* %overlap2.2.0.i8 to i32* + %overlap2.2.1.i16 = bitcast i8* %overlap2.2.1.i8 to i16* + %overlap2.2.1.i32 = bitcast i8* %overlap2.2.1.i8 to i32* + %overlap2.2.2.i32 = bitcast i8* %overlap2.2.2.i8 to i32* + %overlap2.2.3.i32 = bitcast i8* %overlap2.2.3.i8 to i32* + store i32 1, i32* %overlap2.2.0.i32 +; CHECK-NEXT: %[[bitcast:.*]] = bitcast [7 x i8]* %[[test3_a6]] to i32* +; CHECK-NEXT: store i32 1, i32* %[[bitcast]] + store i8 1, i8* %overlap2.2.1.i8 +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 1 +; CHECK-NEXT: store i8 1, i8* %[[gep]] + store i16 1, i16* %overlap2.2.1.i16 +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 1 +; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16* +; CHECK-NEXT: store i16 1, i16* %[[bitcast]] + store i32 1, i32* %overlap2.2.1.i32 +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 1 +; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i32* +; CHECK-NEXT: store i32 1, i32* %[[bitcast]] + store i32 3, i32* %overlap2.2.2.i32 +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 2 +; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i32* +; CHECK-NEXT: store i32 3, i32* %[[bitcast]] + store i32 4, i32* %overlap2.2.3.i32 +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 3 +; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i32* +; CHECK-NEXT: store i32 4, i32* %[[bitcast]] + + %overlap2.prefix = getelementptr i8* %overlap2.1.1.i8, i64 -4 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %overlap2.prefix, i8* %src, i32 8, i32 1, i1 false) +; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [42 x i8]* %[[test3_a4]], i64 0, i64 39 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %src, i32 3 +; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 3 +; 
CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a5]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 5 + + ; Bridge between the overlapping areas + call void @llvm.memset.p0i8.i32(i8* %overlap2.1.2.i8, i8 42, i32 8, i32 1, i1 false) +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a5]], i64 0, i64 2 +; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* %[[gep]], i8 42, i32 5 +; ...promoted i8 store... +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* %[[gep]], i8 42, i32 2 + + ; Entirely within the second overlap. + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %overlap2.2.1.i8, i8* %src, i32 5, i32 1, i1 false) +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 1 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep]], i8* %src, i32 5 + + ; Trailing past the second overlap. 
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %overlap2.2.2.i8, i8* %src, i32 8, i32 1, i1 false) +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 2 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep]], i8* %src, i32 5 +; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 5 +; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [85 x i8]* %[[test3_a7]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 3 + + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %b, i32 300, i32 1, i1 false) +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [42 x i8]* %[[test3_a1]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %[[gep]], i32 42 +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %dst, i64 42 +; CHECK-NEXT: store i8 0, i8* %[[gep]] +; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 43 +; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [99 x i8]* %[[test3_a2]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 99 +; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 142 +; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 16 +; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 158 +; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [42 x i8]* %[[test3_a4]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 42 +; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 200 +; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a5]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7 +; CHECK-NEXT: %[[gep:.*]] = getelementptr 
inbounds i8* %dst, i64 207 +; CHECK-NEXT: store i8 42, i8* %[[gep]] +; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 208 +; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7 +; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 215 +; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [85 x i8]* %[[test3_a7]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 85 + + ret void +} + +define void @test4(i8* %dst, i8* %src) { +; CHECK: @test4 + +entry: + %a = alloca [100 x i8] +; CHECK-NOT: alloca +; CHECK: %[[test4_a1:.*]] = alloca [20 x i8] +; CHECK-NEXT: %[[test4_a2:.*]] = alloca [7 x i8] +; CHECK-NEXT: %[[test4_a3:.*]] = alloca [10 x i8] +; CHECK-NEXT: %[[test4_a4:.*]] = alloca [7 x i8] +; CHECK-NEXT: %[[test4_a5:.*]] = alloca [7 x i8] +; CHECK-NEXT: %[[test4_a6:.*]] = alloca [40 x i8] + + %b = getelementptr [100 x i8]* %a, i64 0, i64 0 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %b, i8* %src, i32 100, i32 1, i1 false) +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [20 x i8]* %[[test4_a1]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep]], i8* %src, i32 20 +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %src, i64 20 +; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16* +; CHECK-NEXT: %[[test4_r1:.*]] = load i16* %[[bitcast]] +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %src, i64 22 +; CHECK-NEXT: %[[test4_r2:.*]] = load i8* %[[gep]] +; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 23 +; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a2]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7 +; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 30 +; CHECK-NEXT: 
%[[gep_dst:.*]] = getelementptr inbounds [10 x i8]* %[[test4_a3]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 10 +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %src, i64 40 +; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16* +; CHECK-NEXT: %[[test4_r3:.*]] = load i16* %[[bitcast]] +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %src, i64 42 +; CHECK-NEXT: %[[test4_r4:.*]] = load i8* %[[gep]] +; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 43 +; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a4]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7 +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %src, i64 50 +; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16* +; CHECK-NEXT: %[[test4_r5:.*]] = load i16* %[[bitcast]] +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %src, i64 52 +; CHECK-NEXT: %[[test4_r6:.*]] = load i8* %[[gep]] +; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 53 +; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a5]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7 +; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 60 +; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [40 x i8]* %[[test4_a6]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 40 + + %a.src.1 = getelementptr [100 x i8]* %a, i64 0, i64 20 + %a.dst.1 = getelementptr [100 x i8]* %a, i64 0, i64 40 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.dst.1, i8* %a.src.1, i32 10, i32 1, i1 false) +; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a4]], i64 0, i64 0 +; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a2]], i64 0, i64 0 +; CHECK-NEXT: call void 
@llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7 + + ; Clobber a single element of the array, this should be promotable, and be deleted. + %c = getelementptr [100 x i8]* %a, i64 0, i64 42 + store i8 0, i8* %c + + %a.src.2 = getelementptr [100 x i8]* %a, i64 0, i64 50 + call void @llvm.memmove.p0i8.p0i8.i32(i8* %a.dst.1, i8* %a.src.2, i32 10, i32 1, i1 false) +; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a4]], i64 0, i64 0 +; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a5]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7 + + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %b, i32 100, i32 1, i1 false) +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [20 x i8]* %[[test4_a1]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %[[gep]], i32 20 +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %dst, i64 20 +; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16* +; CHECK-NEXT: store i16 %[[test4_r1]], i16* %[[bitcast]] +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %dst, i64 22 +; CHECK-NEXT: store i8 %[[test4_r2]], i8* %[[gep]] +; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 23 +; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a2]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7 +; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 30 +; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [10 x i8]* %[[test4_a3]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 10 +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %dst, i64 40 +; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16* +; CHECK-NEXT: store i16 %[[test4_r5]], i16* %[[bitcast]] +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds 
i8* %dst, i64 42 +; CHECK-NEXT: store i8 %[[test4_r6]], i8* %[[gep]] +; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 43 +; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a4]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7 +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %dst, i64 50 +; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16* +; CHECK-NEXT: store i16 %[[test4_r5]], i16* %[[bitcast]] +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %dst, i64 52 +; CHECK-NEXT: store i8 %[[test4_r6]], i8* %[[gep]] +; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 53 +; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a5]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7 +; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 60 +; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [40 x i8]* %[[test4_a6]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 40 + + ret void +} + +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind +declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind +declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind + +define i16 @test5() { +; CHECK: @test5 +; CHECK-NOT: alloca float +; CHECK: %[[cast:.*]] = bitcast float 0.0{{.*}} to i32 +; CHECK-NEXT: %[[shr:.*]] = lshr i32 %[[cast]], 16 +; CHECK-NEXT: %[[trunc:.*]] = trunc i32 %[[shr]] to i16 +; CHECK-NEXT: ret i16 %[[trunc]] + +entry: + %a = alloca [4 x i8] + %fptr = bitcast [4 x i8]* %a to float* + store float 0.0, float* %fptr + %ptr = getelementptr [4 x i8]* %a, i32 0, i32 2 + %iptr = bitcast i8* %ptr to i16* + %val = load i16* %iptr + ret i16 %val +} + +define i32 @test6() { +; CHECK: @test6 +; 
CHECK: alloca i32 +; CHECK-NEXT: store volatile i32 +; CHECK-NEXT: load i32* +; CHECK-NEXT: ret i32 + +entry: + %a = alloca [4 x i8] + %ptr = getelementptr [4 x i8]* %a, i32 0, i32 0 + call void @llvm.memset.p0i8.i32(i8* %ptr, i8 42, i32 4, i32 1, i1 true) + %iptr = bitcast i8* %ptr to i32* + %val = load i32* %iptr + ret i32 %val +} + +define void @test7(i8* %src, i8* %dst) { +; CHECK: @test7 +; CHECK: alloca i32 +; CHECK-NEXT: bitcast i8* %src to i32* +; CHECK-NEXT: load volatile i32* +; CHECK-NEXT: store volatile i32 +; CHECK-NEXT: bitcast i8* %dst to i32* +; CHECK-NEXT: load volatile i32* +; CHECK-NEXT: store volatile i32 +; CHECK-NEXT: ret + +entry: + %a = alloca [4 x i8] + %ptr = getelementptr [4 x i8]* %a, i32 0, i32 0 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %ptr, i8* %src, i32 4, i32 1, i1 true) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %ptr, i32 4, i32 1, i1 true) + ret void +} + + +%S1 = type { i32, i32, [16 x i8] } +%S2 = type { %S1*, %S2* } + +define %S2 @test8(%S2* %s2) { +; CHECK: @test8 +entry: + %new = alloca %S2 +; CHECK-NOT: alloca + + %s2.next.ptr = getelementptr %S2* %s2, i64 0, i32 1 + %s2.next = load %S2** %s2.next.ptr +; CHECK: %[[gep:.*]] = getelementptr %S2* %s2, i64 0, i32 1 +; CHECK-NEXT: %[[next:.*]] = load %S2** %[[gep]] + + %s2.next.s1.ptr = getelementptr %S2* %s2.next, i64 0, i32 0 + %s2.next.s1 = load %S1** %s2.next.s1.ptr + %new.s1.ptr = getelementptr %S2* %new, i64 0, i32 0 + store %S1* %s2.next.s1, %S1** %new.s1.ptr + %s2.next.next.ptr = getelementptr %S2* %s2.next, i64 0, i32 1 + %s2.next.next = load %S2** %s2.next.next.ptr + %new.next.ptr = getelementptr %S2* %new, i64 0, i32 1 + store %S2* %s2.next.next, %S2** %new.next.ptr +; CHECK-NEXT: %[[gep:.*]] = getelementptr %S2* %[[next]], i64 0, i32 0 +; CHECK-NEXT: %[[next_s1:.*]] = load %S1** %[[gep]] +; CHECK-NEXT: %[[gep:.*]] = getelementptr %S2* %[[next]], i64 0, i32 1 +; CHECK-NEXT: %[[next_next:.*]] = load %S2** %[[gep]] + + %new.s1 = load %S1** %new.s1.ptr + 
%result1 = insertvalue %S2 undef, %S1* %new.s1, 0 +; CHECK-NEXT: %[[result1:.*]] = insertvalue %S2 undef, %S1* %[[next_s1]], 0 + %new.next = load %S2** %new.next.ptr + %result2 = insertvalue %S2 %result1, %S2* %new.next, 1 +; CHECK-NEXT: %[[result2:.*]] = insertvalue %S2 %[[result1]], %S2* %[[next_next]], 1 + ret %S2 %result2 +; CHECK-NEXT: ret %S2 %[[result2]] +} + +define i64 @test9() { +; Ensure we can handle loads off the end of an alloca even when wrapped in +; weird bit casts and types. The result is undef, but this shouldn't crash +; anything. +; CHECK: @test9 +; CHECK-NOT: alloca +; CHECK: ret i64 undef + +entry: + %a = alloca { [3 x i8] } + %gep1 = getelementptr inbounds { [3 x i8] }* %a, i32 0, i32 0, i32 0 + store i8 0, i8* %gep1, align 1 + %gep2 = getelementptr inbounds { [3 x i8] }* %a, i32 0, i32 0, i32 1 + store i8 0, i8* %gep2, align 1 + %gep3 = getelementptr inbounds { [3 x i8] }* %a, i32 0, i32 0, i32 2 + store i8 26, i8* %gep3, align 1 + %cast = bitcast { [3 x i8] }* %a to { i64 }* + %elt = getelementptr inbounds { i64 }* %cast, i32 0, i32 0 + %result = load i64* %elt + ret i64 %result +} + +define %S2* @test10() { +; CHECK: @test10 +; CHECK-NOT: alloca %S2* +; CHECK: ret %S2* null + +entry: + %a = alloca [8 x i8] + %ptr = getelementptr [8 x i8]* %a, i32 0, i32 0 + call void @llvm.memset.p0i8.i32(i8* %ptr, i8 0, i32 8, i32 1, i1 false) + %s2ptrptr = bitcast i8* %ptr to %S2** + %s2ptr = load %S2** %s2ptrptr + ret %S2* %s2ptr +} + +define i32 @test11() { +; CHECK: @test11 +; CHECK-NOT: alloca +; CHECK: ret i32 0 + +entry: + %X = alloca i32 + br i1 undef, label %good, label %bad + +good: + %Y = getelementptr i32* %X, i64 0 + store i32 0, i32* %Y + %Z = load i32* %Y + ret i32 %Z + +bad: + %Y2 = getelementptr i32* %X, i64 1 + store i32 0, i32* %Y2 + %Z2 = load i32* %Y2 + ret i32 %Z2 +} + +define i8 @test12() { +; We fully promote these to the i24 load or store size, resulting in just masks +; and other operations that instcombine will fold, but no 
alloca. +; +; CHECK: @test12 + +entry: + %a = alloca [3 x i8] + %b = alloca [3 x i8] +; CHECK-NOT: alloca + + %a0ptr = getelementptr [3 x i8]* %a, i64 0, i32 0 + store i8 0, i8* %a0ptr + %a1ptr = getelementptr [3 x i8]* %a, i64 0, i32 1 + store i8 0, i8* %a1ptr + %a2ptr = getelementptr [3 x i8]* %a, i64 0, i32 2 + store i8 0, i8* %a2ptr + %aiptr = bitcast [3 x i8]* %a to i24* + %ai = load i24* %aiptr +; CHCEK-NOT: store +; CHCEK-NOT: load +; CHECK: %[[ext2:.*]] = zext i8 0 to i24 +; CHECK-NEXT: %[[shift2:.*]] = shl i24 %[[ext2]], 16 +; CHECK-NEXT: %[[mask2:.*]] = and i24 undef, 65535 +; CHECK-NEXT: %[[insert2:.*]] = or i24 %[[mask2]], %[[shift2]] +; CHECK-NEXT: %[[ext1:.*]] = zext i8 0 to i24 +; CHECK-NEXT: %[[shift1:.*]] = shl i24 %[[ext1]], 8 +; CHECK-NEXT: %[[mask1:.*]] = and i24 %[[insert2]], -65281 +; CHECK-NEXT: %[[insert1:.*]] = or i24 %[[mask1]], %[[shift1]] +; CHECK-NEXT: %[[ext0:.*]] = zext i8 0 to i24 +; CHECK-NEXT: %[[mask0:.*]] = and i24 %[[insert1]], -256 +; CHECK-NEXT: %[[insert0:.*]] = or i24 %[[mask0]], %[[ext0]] + + %biptr = bitcast [3 x i8]* %b to i24* + store i24 %ai, i24* %biptr + %b0ptr = getelementptr [3 x i8]* %b, i64 0, i32 0 + %b0 = load i8* %b0ptr + %b1ptr = getelementptr [3 x i8]* %b, i64 0, i32 1 + %b1 = load i8* %b1ptr + %b2ptr = getelementptr [3 x i8]* %b, i64 0, i32 2 + %b2 = load i8* %b2ptr +; CHCEK-NOT: store +; CHCEK-NOT: load +; CHECK: %[[trunc0:.*]] = trunc i24 %[[insert0]] to i8 +; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[insert0]], 8 +; CHECK-NEXT: %[[trunc1:.*]] = trunc i24 %[[shift1]] to i8 +; CHECK-NEXT: %[[shift2:.*]] = lshr i24 %[[insert0]], 16 +; CHECK-NEXT: %[[trunc2:.*]] = trunc i24 %[[shift2]] to i8 + + %bsum0 = add i8 %b0, %b1 + %bsum1 = add i8 %bsum0, %b2 + ret i8 %bsum1 +; CHECK: %[[sum0:.*]] = add i8 %[[trunc0]], %[[trunc1]] +; CHECK-NEXT: %[[sum1:.*]] = add i8 %[[sum0]], %[[trunc2]] +; CHECK-NEXT: ret i8 %[[sum1]] +} + +define i32 @test13() { +; Ensure we don't crash and handle undefined loads that straddle the 
end of the +; allocation. +; CHECK: @test13 +; CHECK: %[[ret:.*]] = zext i16 undef to i32 +; CHECK: ret i32 %[[ret]] + +entry: + %a = alloca [3 x i8] + %b0ptr = getelementptr [3 x i8]* %a, i64 0, i32 0 + store i8 0, i8* %b0ptr + %b1ptr = getelementptr [3 x i8]* %a, i64 0, i32 1 + store i8 0, i8* %b1ptr + %b2ptr = getelementptr [3 x i8]* %a, i64 0, i32 2 + store i8 0, i8* %b2ptr + %iptrcast = bitcast [3 x i8]* %a to i16* + %iptrgep = getelementptr i16* %iptrcast, i64 1 + %i = load i16* %iptrgep + %ret = zext i16 %i to i32 + ret i32 %ret +} + +%test14.struct = type { [3 x i32] } + +define void @test14(...) nounwind uwtable { +; This is a strange case where we split allocas into promotable partitions, but +; also gain enough data to prove they must be dead allocas due to GEPs that walk +; across two adjacent allocas. Test that we don't try to promote or otherwise +; do bad things to these dead allocas, they should just be removed. +; CHECK: @test14 +; CHECK-NEXT: entry: +; CHECK-NEXT: ret void + +entry: + %a = alloca %test14.struct + %p = alloca %test14.struct* + %0 = bitcast %test14.struct* %a to i8* + %1 = getelementptr i8* %0, i64 12 + %2 = bitcast i8* %1 to %test14.struct* + %3 = getelementptr inbounds %test14.struct* %2, i32 0, i32 0 + %4 = getelementptr inbounds %test14.struct* %a, i32 0, i32 0 + %5 = bitcast [3 x i32]* %3 to i32* + %6 = bitcast [3 x i32]* %4 to i32* + %7 = load i32* %6, align 4 + store i32 %7, i32* %5, align 4 + %8 = getelementptr inbounds i32* %5, i32 1 + %9 = getelementptr inbounds i32* %6, i32 1 + %10 = load i32* %9, align 4 + store i32 %10, i32* %8, align 4 + %11 = getelementptr inbounds i32* %5, i32 2 + %12 = getelementptr inbounds i32* %6, i32 2 + %13 = load i32* %12, align 4 + store i32 %13, i32* %11, align 4 + ret void +} + +define i32 @test15(i1 %flag) nounwind uwtable { +; Ensure that when there are dead instructions using an alloca that are not +; loads or stores we still delete them during partitioning and rewriting. 
+; Otherwise we'll go to promote them while thy still have unpromotable uses. +; CHECK: @test15 +; CHECK-NEXT: entry: +; CHECK-NEXT: br label %loop +; CHECK: loop: +; CHECK-NEXT: br label %loop + +entry: + %l0 = alloca i64 + %l1 = alloca i64 + %l2 = alloca i64 + %l3 = alloca i64 + br label %loop + +loop: + %dead3 = phi i8* [ %gep3, %loop ], [ null, %entry ] + + store i64 1879048192, i64* %l0, align 8 + %bc0 = bitcast i64* %l0 to i8* + %gep0 = getelementptr i8* %bc0, i64 3 + %dead0 = bitcast i8* %gep0 to i64* + + store i64 1879048192, i64* %l1, align 8 + %bc1 = bitcast i64* %l1 to i8* + %gep1 = getelementptr i8* %bc1, i64 3 + %dead1 = getelementptr i8* %gep1, i64 1 + + store i64 1879048192, i64* %l2, align 8 + %bc2 = bitcast i64* %l2 to i8* + %gep2.1 = getelementptr i8* %bc2, i64 1 + %gep2.2 = getelementptr i8* %bc2, i64 3 + ; Note that this select should get visited multiple times due to using two + ; different GEPs off the same alloca. We should only delete it once. + %dead2 = select i1 %flag, i8* %gep2.1, i8* %gep2.2 + + store i64 1879048192, i64* %l3, align 8 + %bc3 = bitcast i64* %l3 to i8* + %gep3 = getelementptr i8* %bc3, i64 3 + + br label %loop +} + +define void @test16(i8* %src, i8* %dst) { +; Ensure that we can promote an alloca of [3 x i8] to an i24 SSA value. 
+; CHECK: @test16 +; CHECK-NOT: alloca +; CHECK: %[[srccast:.*]] = bitcast i8* %src to i24* +; CHECK-NEXT: load i24* %[[srccast]] +; CHECK-NEXT: %[[dstcast:.*]] = bitcast i8* %dst to i24* +; CHECK-NEXT: store i24 0, i24* %[[dstcast]] +; CHECK-NEXT: ret void + +entry: + %a = alloca [3 x i8] + %ptr = getelementptr [3 x i8]* %a, i32 0, i32 0 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %ptr, i8* %src, i32 4, i32 1, i1 false) + %cast = bitcast i8* %ptr to i24* + store i24 0, i24* %cast + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %ptr, i32 4, i32 1, i1 false) + ret void +} + +define void @test17(i8* %src, i8* %dst) { +; Ensure that we can rewrite unpromotable memcpys which extend past the end of +; the alloca. +; CHECK: @test17 +; CHECK: %[[a:.*]] = alloca [3 x i8] +; CHECK-NEXT: %[[ptr:.*]] = getelementptr [3 x i8]* %[[a]], i32 0, i32 0 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[ptr]], i8* %src, +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %[[ptr]], +; CHECK-NEXT: ret void + +entry: + %a = alloca [3 x i8] + %ptr = getelementptr [3 x i8]* %a, i32 0, i32 0 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %ptr, i8* %src, i32 4, i32 1, i1 true) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %ptr, i32 4, i32 1, i1 true) + ret void +} + +define void @test18(i8* %src, i8* %dst, i32 %size) { +; Preserve transfer instrinsics with a variable size, even if they overlap with +; fixed size operations. Further, continue to split and promote allocas preceding +; the variable sized intrinsic. 
+; CHECK: @test18 +; CHECK: %[[a:.*]] = alloca [34 x i8] +; CHECK: %[[srcgep1:.*]] = getelementptr inbounds i8* %src, i64 4 +; CHECK-NEXT: %[[srccast1:.*]] = bitcast i8* %[[srcgep1]] to i32* +; CHECK-NEXT: %[[srcload:.*]] = load i32* %[[srccast1]] +; CHECK-NEXT: %[[agep1:.*]] = getelementptr inbounds [34 x i8]* %[[a]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[agep1]], i8* %src, i32 %size, +; CHECK-NEXT: %[[agep2:.*]] = getelementptr inbounds [34 x i8]* %[[a]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* %[[agep2]], i8 42, i32 %size, +; CHECK-NEXT: %[[dstcast1:.*]] = bitcast i8* %dst to i32* +; CHECK-NEXT: store i32 42, i32* %[[dstcast1]] +; CHECK-NEXT: %[[dstgep1:.*]] = getelementptr inbounds i8* %dst, i64 4 +; CHECK-NEXT: %[[dstcast2:.*]] = bitcast i8* %[[dstgep1]] to i32* +; CHECK-NEXT: store i32 %[[srcload]], i32* %[[dstcast2]] +; CHECK-NEXT: %[[agep3:.*]] = getelementptr inbounds [34 x i8]* %[[a]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %[[agep3]], i32 %size, +; CHECK-NEXT: ret void + +entry: + %a = alloca [42 x i8] + %ptr = getelementptr [42 x i8]* %a, i32 0, i32 0 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %ptr, i8* %src, i32 8, i32 1, i1 false) + %ptr2 = getelementptr [42 x i8]* %a, i32 0, i32 8 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %ptr2, i8* %src, i32 %size, i32 1, i1 false) + call void @llvm.memset.p0i8.i32(i8* %ptr2, i8 42, i32 %size, i32 1, i1 false) + %cast = bitcast i8* %ptr to i32* + store i32 42, i32* %cast + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %ptr, i32 8, i32 1, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %ptr2, i32 %size, i32 1, i1 false) + ret void +} + +%opaque = type opaque + +define i32 @test19(%opaque* %x) { +; This input will cause us to try to compute a natural GEP when rewriting +; pointers in such a way that we try to GEP through the opaque type. 
Previously, +; a check for an unsized type was missing and this crashed. Ensure it behaves +; reasonably now. +; CHECK: @test19 +; CHECK-NOT: alloca +; CHECK: ret i32 undef + +entry: + %a = alloca { i64, i8* } + %cast1 = bitcast %opaque* %x to i8* + %cast2 = bitcast { i64, i8* }* %a to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %cast2, i8* %cast1, i32 16, i32 1, i1 false) + %gep = getelementptr inbounds { i64, i8* }* %a, i32 0, i32 0 + %val = load i64* %gep + ret i32 undef +} + +define i32 @test20() { +; Ensure we can track negative offsets (before the beginning of the alloca) and +; negative relative offsets from offsets starting past the end of the alloca. +; CHECK: @test20 +; CHECK-NOT: alloca +; CHECK: %[[sum1:.*]] = add i32 1, 2 +; CHECK: %[[sum2:.*]] = add i32 %[[sum1]], 3 +; CHECK: ret i32 %[[sum2]] + +entry: + %a = alloca [3 x i32] + %gep1 = getelementptr [3 x i32]* %a, i32 0, i32 0 + store i32 1, i32* %gep1 + %gep2.1 = getelementptr [3 x i32]* %a, i32 0, i32 -2 + %gep2.2 = getelementptr i32* %gep2.1, i32 3 + store i32 2, i32* %gep2.2 + %gep3.1 = getelementptr [3 x i32]* %a, i32 0, i32 14 + %gep3.2 = getelementptr i32* %gep3.1, i32 -12 + store i32 3, i32* %gep3.2 + + %load1 = load i32* %gep1 + %load2 = load i32* %gep2.2 + %load3 = load i32* %gep3.2 + %sum1 = add i32 %load1, %load2 + %sum2 = add i32 %sum1, %load3 + ret i32 %sum2 +} + +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind + +define i8 @test21() { +; Test allocations and offsets which border on overflow of the int64_t used +; internally. This is really awkward to really test as LLVM doesn't really +; support such extreme constructs cleanly. 
+; CHECK: @test21 +; CHECK-NOT: alloca +; CHECK: or i8 -1, -1 + +entry: + %a = alloca [2305843009213693951 x i8] + %gep0 = getelementptr [2305843009213693951 x i8]* %a, i64 0, i64 2305843009213693949 + store i8 255, i8* %gep0 + %gep1 = getelementptr [2305843009213693951 x i8]* %a, i64 0, i64 -9223372036854775807 + %gep2 = getelementptr i8* %gep1, i64 -1 + call void @llvm.memset.p0i8.i64(i8* %gep2, i8 0, i64 18446744073709551615, i32 1, i1 false) + %gep3 = getelementptr i8* %gep1, i64 9223372036854775807 + %gep4 = getelementptr i8* %gep3, i64 9223372036854775807 + %gep5 = getelementptr i8* %gep4, i64 -6917529027641081857 + store i8 255, i8* %gep5 + %cast1 = bitcast i8* %gep4 to i32* + store i32 0, i32* %cast1 + %load = load i8* %gep0 + %gep6 = getelementptr i8* %gep0, i32 1 + %load2 = load i8* %gep6 + %result = or i8 %load, %load2 + ret i8 %result +} + +%PR13916.struct = type { i8 } + +define void @PR13916.1() { +; Ensure that we handle overlapping memcpy intrinsics correctly, especially in +; the case where there is a directly identical value for both source and dest. +; CHECK: @PR13916.1 +; CHECK-NOT: alloca +; CHECK: ret void + +entry: + %a = alloca i8 + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a, i8* %a, i32 1, i32 1, i1 false) + %tmp2 = load i8* %a + ret void +} + +define void @PR13916.2() { +; Check whether we continue to handle them correctly when they start off with +; different pointer value chains, but during rewriting we coalesce them into the +; same value. 
+; CHECK: @PR13916.2 +; CHECK-NOT: alloca +; CHECK: ret void + +entry: + %a = alloca %PR13916.struct, align 1 + br i1 undef, label %if.then, label %if.end + +if.then: + %tmp0 = bitcast %PR13916.struct* %a to i8* + %tmp1 = bitcast %PR13916.struct* %a to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %tmp0, i8* %tmp1, i32 1, i32 1, i1 false) + br label %if.end + +if.end: + %gep = getelementptr %PR13916.struct* %a, i32 0, i32 0 + %tmp2 = load i8* %gep + ret void +} + +define void @PR13990() { +; Ensure we can handle cases where processing one alloca causes the other +; alloca to become dead and get deleted. This might crash or fail under +; Valgrind if we regress. +; CHECK: @PR13990 +; CHECK-NOT: alloca +; CHECK: unreachable +; CHECK: unreachable + +entry: + %tmp1 = alloca i8* + %tmp2 = alloca i8* + br i1 undef, label %bb1, label %bb2 + +bb1: + store i8* undef, i8** %tmp2 + br i1 undef, label %bb2, label %bb3 + +bb2: + %tmp50 = select i1 undef, i8** %tmp2, i8** %tmp1 + br i1 undef, label %bb3, label %bb4 + +bb3: + unreachable + +bb4: + unreachable +} + +define double @PR13969(double %x) { +; Check that we detect when promotion will un-escape an alloca and iterate to +; re-try running SROA over that alloca. Without that, the two allocas that are +; stored into a dead alloca don't get rewritten and promoted. +; CHECK: @PR13969 + +entry: + %a = alloca double + %b = alloca double* + %c = alloca double +; CHECK-NOT: alloca + + store double %x, double* %a + store double* %c, double** %b + store double* %a, double** %b + store double %x, double* %c + %ret = load double* %a +; CHECK-NOT: store +; CHECK-NOT: load + + ret double %ret +; CHECK: ret double %x +} + +%PR14034.struct = type { { {} }, i32, %PR14034.list } +%PR14034.list = type { %PR14034.list*, %PR14034.list* } + +define void @PR14034() { +; This test case tries to form GEPs into the empty leading struct members, and +; subsequently crashed (under valgrind) before we fixed the PR. 
The important +; thing is to handle empty structs gracefully. +; CHECK: @PR14034 + +entry: + %a = alloca %PR14034.struct + %list = getelementptr %PR14034.struct* %a, i32 0, i32 2 + %prev = getelementptr %PR14034.list* %list, i32 0, i32 1 + store %PR14034.list* undef, %PR14034.list** %prev + %cast0 = bitcast %PR14034.struct* undef to i8* + %cast1 = bitcast %PR14034.struct* %a to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %cast0, i8* %cast1, i32 12, i32 0, i1 false) + ret void +} + +define i32 @test22(i32 %x) { +; Test that SROA and promotion is not confused by a grab bax mixture of pointer +; types involving wrapper aggregates and zero-length aggregate members. +; CHECK: @test22 + +entry: + %a1 = alloca { { [1 x { i32 }] } } + %a2 = alloca { {}, { float }, [0 x i8] } + %a3 = alloca { [0 x i8], { [0 x double], [1 x [1 x <4 x i8>]], {} }, { { {} } } } +; CHECK-NOT: alloca + + %wrap1 = insertvalue [1 x { i32 }] undef, i32 %x, 0, 0 + %gep1 = getelementptr { { [1 x { i32 }] } }* %a1, i32 0, i32 0, i32 0 + store [1 x { i32 }] %wrap1, [1 x { i32 }]* %gep1 + + %gep2 = getelementptr { { [1 x { i32 }] } }* %a1, i32 0, i32 0 + %ptrcast1 = bitcast { [1 x { i32 }] }* %gep2 to { [1 x { float }] }* + %load1 = load { [1 x { float }] }* %ptrcast1 + %unwrap1 = extractvalue { [1 x { float }] } %load1, 0, 0 + + %wrap2 = insertvalue { {}, { float }, [0 x i8] } undef, { float } %unwrap1, 1 + store { {}, { float }, [0 x i8] } %wrap2, { {}, { float }, [0 x i8] }* %a2 + + %gep3 = getelementptr { {}, { float }, [0 x i8] }* %a2, i32 0, i32 1, i32 0 + %ptrcast2 = bitcast float* %gep3 to <4 x i8>* + %load3 = load <4 x i8>* %ptrcast2 + %valcast1 = bitcast <4 x i8> %load3 to i32 + + %wrap3 = insertvalue [1 x [1 x i32]] undef, i32 %valcast1, 0, 0 + %wrap4 = insertvalue { [1 x [1 x i32]], {} } undef, [1 x [1 x i32]] %wrap3, 0 + %gep4 = getelementptr { [0 x i8], { [0 x double], [1 x [1 x <4 x i8>]], {} }, { { {} } } }* %a3, i32 0, i32 1 + %ptrcast3 = bitcast { [0 x double], [1 x [1 x <4 x i8>]], 
{} }* %gep4 to { [1 x [1 x i32]], {} }* + store { [1 x [1 x i32]], {} } %wrap4, { [1 x [1 x i32]], {} }* %ptrcast3 + + %gep5 = getelementptr { [0 x i8], { [0 x double], [1 x [1 x <4 x i8>]], {} }, { { {} } } }* %a3, i32 0, i32 1, i32 1, i32 0 + %ptrcast4 = bitcast [1 x <4 x i8>]* %gep5 to { {}, float, {} }* + %load4 = load { {}, float, {} }* %ptrcast4 + %unwrap2 = extractvalue { {}, float, {} } %load4, 1 + %valcast2 = bitcast float %unwrap2 to i32 + + ret i32 %valcast2 +; CHECK: ret i32 +} + +define void @PR14059.1(double* %d) { +; In PR14059 a peculiar construct was identified as something that is used +; pervasively in ARM's ABI-calling-convention lowering: the passing of a struct +; of doubles via an array of i32 in order to place the data into integer +; registers. This in turn was missed as an optimization by SROA due to the +; partial loads and stores of integers to the double alloca we were trying to +; form and promote. The solution is to widen the integer operations to be +; whole-alloca operations, and perform the appropriate bitcasting on the +; *values* rather than the pointers. When this works, partial reads and writes +; via integers can be promoted away. +; CHECK: @PR14059.1 +; CHECK-NOT: alloca +; CHECK: ret void + +entry: + %X.sroa.0.i = alloca double, align 8 + %0 = bitcast double* %X.sroa.0.i to i8* + call void @llvm.lifetime.start(i64 -1, i8* %0) + + ; Store to the low 32-bits... + %X.sroa.0.0.cast2.i = bitcast double* %X.sroa.0.i to i32* + store i32 0, i32* %X.sroa.0.0.cast2.i, align 8 + + ; Also use a memset to the middle 32-bits for fun. + %X.sroa.0.2.raw_idx2.i = getelementptr inbounds i8* %0, i32 2 + call void @llvm.memset.p0i8.i64(i8* %X.sroa.0.2.raw_idx2.i, i8 0, i64 4, i32 1, i1 false) + + ; Or a memset of the whole thing. + call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 8, i32 1, i1 false) + + ; Write to the high 32-bits with a memcpy. 
+ %X.sroa.0.4.raw_idx4.i = getelementptr inbounds i8* %0, i32 4 + %d.raw = bitcast double* %d to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %X.sroa.0.4.raw_idx4.i, i8* %d.raw, i32 4, i32 1, i1 false) + + ; Store to the high 32-bits... + %X.sroa.0.4.cast5.i = bitcast i8* %X.sroa.0.4.raw_idx4.i to i32* + store i32 1072693248, i32* %X.sroa.0.4.cast5.i, align 4 + + ; Do the actual math... + %X.sroa.0.0.load1.i = load double* %X.sroa.0.i, align 8 + %accum.real.i = load double* %d, align 8 + %add.r.i = fadd double %accum.real.i, %X.sroa.0.0.load1.i + store double %add.r.i, double* %d, align 8 + call void @llvm.lifetime.end(i64 -1, i8* %0) + ret void +} + +define i64 @PR14059.2({ float, float }* %phi) { +; Check that SROA can split up alloca-wide integer loads and stores where the +; underlying alloca has smaller components that are accessed independently. This +; shows up particularly with ABI lowering patterns coming out of Clang that rely +; on the particular register placement of a single large integer return value. 
+; CHECK: @PR14059.2 + +entry: + %retval = alloca { float, float }, align 4 + ; CHECK-NOT: alloca + + %0 = bitcast { float, float }* %retval to i64* + store i64 0, i64* %0 + ; CHECK-NOT: store + + %phi.realp = getelementptr inbounds { float, float }* %phi, i32 0, i32 0 + %phi.real = load float* %phi.realp + %phi.imagp = getelementptr inbounds { float, float }* %phi, i32 0, i32 1 + %phi.imag = load float* %phi.imagp + ; CHECK: %[[realp:.*]] = getelementptr inbounds { float, float }* %phi, i32 0, i32 0 + ; CHECK-NEXT: %[[real:.*]] = load float* %[[realp]] + ; CHECK-NEXT: %[[imagp:.*]] = getelementptr inbounds { float, float }* %phi, i32 0, i32 1 + ; CHECK-NEXT: %[[imag:.*]] = load float* %[[imagp]] + + %real = getelementptr inbounds { float, float }* %retval, i32 0, i32 0 + %imag = getelementptr inbounds { float, float }* %retval, i32 0, i32 1 + store float %phi.real, float* %real + store float %phi.imag, float* %imag + ; CHECK-NEXT: %[[real_convert:.*]] = bitcast float %[[real]] to i32 + ; CHECK-NEXT: %[[imag_convert:.*]] = bitcast float %[[imag]] to i32 + ; CHECK-NEXT: %[[imag_ext:.*]] = zext i32 %[[imag_convert]] to i64 + ; CHECK-NEXT: %[[imag_shift:.*]] = shl i64 %[[imag_ext]], 32 + ; CHECK-NEXT: %[[imag_mask:.*]] = and i64 undef, 4294967295 + ; CHECK-NEXT: %[[imag_insert:.*]] = or i64 %[[imag_mask]], %[[imag_shift]] + ; CHECK-NEXT: %[[real_ext:.*]] = zext i32 %[[real_convert]] to i64 + ; CHECK-NEXT: %[[real_mask:.*]] = and i64 %[[imag_insert]], -4294967296 + ; CHECK-NEXT: %[[real_insert:.*]] = or i64 %[[real_mask]], %[[real_ext]] + + %1 = load i64* %0, align 1 + ret i64 %1 + ; CHECK-NEXT: ret i64 %[[real_insert]] +} + +define void @PR14105({ [16 x i8] }* %ptr) { +; Ensure that when rewriting the GEP index '-1' for this alloca we preserve is +; sign as negative. We use a volatile memcpy to ensure promotion never actually +; occurs. 
+; CHECK: @PR14105 + +entry: + %a = alloca { [16 x i8] }, align 8 +; CHECK: alloca [16 x i8], align 8 + + %gep = getelementptr inbounds { [16 x i8] }* %ptr, i64 -1 +; CHECK-NEXT: getelementptr inbounds { [16 x i8] }* %ptr, i64 -1, i32 0, i64 0 + + %cast1 = bitcast { [16 x i8 ] }* %gep to i8* + %cast2 = bitcast { [16 x i8 ] }* %a to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %cast1, i8* %cast2, i32 16, i32 8, i1 true) + ret void +; CHECK: ret +} diff --git a/test/Transforms/SROA/big-endian.ll b/test/Transforms/SROA/big-endian.ll new file mode 100644 index 0000000..ce82d1f --- /dev/null +++ b/test/Transforms/SROA/big-endian.ll @@ -0,0 +1,119 @@ +; RUN: opt < %s -sroa -S | FileCheck %s +; RUN: opt < %s -sroa -force-ssa-updater -S | FileCheck %s + +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64" + +define i8 @test1() { +; We fully promote these to the i24 load or store size, resulting in just masks +; and other operations that instcombine will fold, but no alloca. Note this is +; the same as test12 in basictest.ll, but here we assert big-endian byte +; ordering. 
+; +; CHECK: @test1 + +entry: + %a = alloca [3 x i8] + %b = alloca [3 x i8] +; CHECK-NOT: alloca + + %a0ptr = getelementptr [3 x i8]* %a, i64 0, i32 0 + store i8 0, i8* %a0ptr + %a1ptr = getelementptr [3 x i8]* %a, i64 0, i32 1 + store i8 0, i8* %a1ptr + %a2ptr = getelementptr [3 x i8]* %a, i64 0, i32 2 + store i8 0, i8* %a2ptr + %aiptr = bitcast [3 x i8]* %a to i24* + %ai = load i24* %aiptr +; CHCEK-NOT: store +; CHCEK-NOT: load +; CHECK: %[[ext2:.*]] = zext i8 0 to i24 +; CHECK-NEXT: %[[mask2:.*]] = and i24 undef, -256 +; CHECK-NEXT: %[[insert2:.*]] = or i24 %[[mask2]], %[[ext2]] +; CHECK-NEXT: %[[ext1:.*]] = zext i8 0 to i24 +; CHECK-NEXT: %[[shift1:.*]] = shl i24 %[[ext1]], 8 +; CHECK-NEXT: %[[mask1:.*]] = and i24 %[[insert2]], -65281 +; CHECK-NEXT: %[[insert1:.*]] = or i24 %[[mask1]], %[[shift1]] +; CHECK-NEXT: %[[ext0:.*]] = zext i8 0 to i24 +; CHECK-NEXT: %[[shift0:.*]] = shl i24 %[[ext0]], 16 +; CHECK-NEXT: %[[mask0:.*]] = and i24 %[[insert1]], 65535 +; CHECK-NEXT: %[[insert0:.*]] = or i24 %[[mask0]], %[[shift0]] + + %biptr = bitcast [3 x i8]* %b to i24* + store i24 %ai, i24* %biptr + %b0ptr = getelementptr [3 x i8]* %b, i64 0, i32 0 + %b0 = load i8* %b0ptr + %b1ptr = getelementptr [3 x i8]* %b, i64 0, i32 1 + %b1 = load i8* %b1ptr + %b2ptr = getelementptr [3 x i8]* %b, i64 0, i32 2 + %b2 = load i8* %b2ptr +; CHCEK-NOT: store +; CHCEK-NOT: load +; CHECK: %[[shift0:.*]] = lshr i24 %[[insert0]], 16 +; CHECK-NEXT: %[[trunc0:.*]] = trunc i24 %[[shift0]] to i8 +; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[insert0]], 8 +; CHECK-NEXT: %[[trunc1:.*]] = trunc i24 %[[shift1]] to i8 +; CHECK-NEXT: %[[trunc2:.*]] = trunc i24 %[[insert0]] to i8 + + %bsum0 = add i8 %b0, %b1 + %bsum1 = add i8 %bsum0, %b2 + ret i8 %bsum1 +; CHECK: %[[sum0:.*]] = add i8 %[[trunc0]], %[[trunc1]] +; CHECK-NEXT: %[[sum1:.*]] = add i8 %[[sum0]], %[[trunc2]] +; CHECK-NEXT: ret i8 %[[sum1]] +} + +define i64 @test2() { +; Test for various mixed sizes of integer loads and stores all getting +; 
promoted. +; +; CHECK: @test2 + +entry: + %a = alloca [7 x i8] +; CHECK-NOT: alloca + + %a0ptr = getelementptr [7 x i8]* %a, i64 0, i32 0 + %a1ptr = getelementptr [7 x i8]* %a, i64 0, i32 1 + %a2ptr = getelementptr [7 x i8]* %a, i64 0, i32 2 + %a3ptr = getelementptr [7 x i8]* %a, i64 0, i32 3 + +; CHCEK-NOT: store +; CHCEK-NOT: load + + %a0i16ptr = bitcast i8* %a0ptr to i16* + store i16 1, i16* %a0i16ptr +; CHECK: %[[mask0:.*]] = and i16 1, -16 + + %a1i4ptr = bitcast i8* %a1ptr to i4* + store i4 1, i4* %a1i4ptr +; CHECK-NEXT: %[[insert0:.*]] = or i16 %[[mask0]], 1 + + store i8 1, i8* %a2ptr +; CHECK-NEXT: %[[mask1:.*]] = and i40 undef, 4294967295 +; CHECK-NEXT: %[[insert1:.*]] = or i40 %[[mask1]], 4294967296 + + %a3i24ptr = bitcast i8* %a3ptr to i24* + store i24 1, i24* %a3i24ptr +; CHECK-NEXT: %[[mask2:.*]] = and i40 %[[insert1]], -4294967041 +; CHECK-NEXT: %[[insert2:.*]] = or i40 %[[mask2]], 256 + + %a2i40ptr = bitcast i8* %a2ptr to i40* + store i40 1, i40* %a2i40ptr +; CHECK-NEXT: %[[ext3:.*]] = zext i40 1 to i56 +; CHECK-NEXT: %[[mask3:.*]] = and i56 undef, -1099511627776 +; CHECK-NEXT: %[[insert3:.*]] = or i56 %[[mask3]], %[[ext3]] + +; CHCEK-NOT: store +; CHCEK-NOT: load + + %aiptr = bitcast [7 x i8]* %a to i56* + %ai = load i56* %aiptr + %ret = zext i56 %ai to i64 + ret i64 %ret +; CHECK-NEXT: %[[ext4:.*]] = zext i16 %[[insert0]] to i56 +; CHECK-NEXT: %[[shift4:.*]] = shl i56 %[[ext4]], 40 +; CHECK-NEXT: %[[mask4:.*]] = and i56 %[[insert3]], 1099511627775 +; CHECK-NEXT: %[[insert4:.*]] = or i56 %[[mask4]], %[[shift4]] +; CHECK-NEXT: %[[ret:.*]] = zext i56 %[[insert4]] to i64 +; CHECK-NEXT: ret i64 %[[ret]] +} diff --git a/test/Transforms/SROA/fca.ll b/test/Transforms/SROA/fca.ll new file mode 100644 index 0000000..c30a5cc --- /dev/null +++ b/test/Transforms/SROA/fca.ll @@ -0,0 +1,49 @@ +; RUN: opt < %s -sroa -S | FileCheck %s +; RUN: opt < %s -sroa -force-ssa-updater -S | FileCheck %s +target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64" + +define { i32, i32 } @test0(i32 %x, i32 %y) { +; CHECK: @test0 +; CHECK-NOT: alloca +; CHECK: insertvalue { i32, i32 } +; CHECK: insertvalue { i32, i32 } +; CHECK: ret { i32, i32 } + +entry: + %a = alloca { i32, i32 } + + store { i32, i32 } undef, { i32, i32 }* %a + + %gep1 = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 0 + store i32 %x, i32* %gep1 + %gep2 = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 1 + store i32 %y, i32* %gep2 + + %result = load { i32, i32 }* %a + ret { i32, i32 } %result +} + +define { i32, i32 } @test1(i32 %x, i32 %y) { +; FIXME: This may be too conservative. Duncan argues that we are allowed to +; split the volatile load and store here but must produce volatile scalar loads +; and stores from them. +; CHECK: @test1 +; CHECK: alloca +; CHECK: alloca +; CHECK: load volatile { i32, i32 }* +; CHECK: store volatile { i32, i32 } +; CHECK: ret { i32, i32 } + +entry: + %a = alloca { i32, i32 } + %b = alloca { i32, i32 } + + %gep1 = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 0 + store i32 %x, i32* %gep1 + %gep2 = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 1 + store i32 %y, i32* %gep2 + + %result = load volatile { i32, i32 }* %a + store volatile { i32, i32 } %result, { i32, i32 }* %b + ret { i32, i32 } %result +} diff --git a/test/Transforms/SROA/lit.local.cfg b/test/Transforms/SROA/lit.local.cfg new file mode 100644 index 0000000..c6106e4 --- /dev/null +++ b/test/Transforms/SROA/lit.local.cfg @@ -0,0 +1 @@ +config.suffixes = ['.ll'] diff --git a/test/Transforms/SROA/phi-and-select.ll b/test/Transforms/SROA/phi-and-select.ll new file mode 100644 index 0000000..921016a --- /dev/null +++ b/test/Transforms/SROA/phi-and-select.ll @@ -0,0 +1,427 @@ +; RUN: opt < %s -sroa -S | FileCheck %s +target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64" + +define i32 @test1() { +; CHECK: @test1 +entry: + %a = alloca [2 x i32] +; CHECK-NOT: alloca + + %a0 = getelementptr [2 x i32]* %a, i64 0, i32 0 + %a1 = getelementptr [2 x i32]* %a, i64 0, i32 1 + store i32 0, i32* %a0 + store i32 1, i32* %a1 + %v0 = load i32* %a0 + %v1 = load i32* %a1 +; CHECK-NOT: store +; CHECK-NOT: load + + %cond = icmp sle i32 %v0, %v1 + br i1 %cond, label %then, label %exit + +then: + br label %exit + +exit: + %phi = phi i32* [ %a1, %then ], [ %a0, %entry ] +; CHECK: phi i32 [ 1, %{{.*}} ], [ 0, %{{.*}} ] + + %result = load i32* %phi + ret i32 %result +} + +define i32 @test2() { +; CHECK: @test2 +entry: + %a = alloca [2 x i32] +; CHECK-NOT: alloca + + %a0 = getelementptr [2 x i32]* %a, i64 0, i32 0 + %a1 = getelementptr [2 x i32]* %a, i64 0, i32 1 + store i32 0, i32* %a0 + store i32 1, i32* %a1 + %v0 = load i32* %a0 + %v1 = load i32* %a1 +; CHECK-NOT: store +; CHECK-NOT: load + + %cond = icmp sle i32 %v0, %v1 + %select = select i1 %cond, i32* %a1, i32* %a0 +; CHECK: select i1 %{{.*}}, i32 1, i32 0 + + %result = load i32* %select + ret i32 %result +} + +define i32 @test3(i32 %x) { +; CHECK: @test3 +entry: + %a = alloca [2 x i32] +; CHECK-NOT: alloca + + ; Note that we build redundant GEPs here to ensure that having different GEPs + ; into the same alloca partation continues to work with PHI speculation. This + ; was the underlying cause of PR13926. 
+ %a0 = getelementptr [2 x i32]* %a, i64 0, i32 0 + %a0b = getelementptr [2 x i32]* %a, i64 0, i32 0 + %a1 = getelementptr [2 x i32]* %a, i64 0, i32 1 + %a1b = getelementptr [2 x i32]* %a, i64 0, i32 1 + store i32 0, i32* %a0 + store i32 1, i32* %a1 +; CHECK-NOT: store + + switch i32 %x, label %bb0 [ i32 1, label %bb1 + i32 2, label %bb2 + i32 3, label %bb3 + i32 4, label %bb4 + i32 5, label %bb5 + i32 6, label %bb6 + i32 7, label %bb7 ] + +bb0: + br label %exit +bb1: + br label %exit +bb2: + br label %exit +bb3: + br label %exit +bb4: + br label %exit +bb5: + br label %exit +bb6: + br label %exit +bb7: + br label %exit + +exit: + %phi = phi i32* [ %a1, %bb0 ], [ %a0, %bb1 ], [ %a0, %bb2 ], [ %a1, %bb3 ], + [ %a1b, %bb4 ], [ %a0b, %bb5 ], [ %a0b, %bb6 ], [ %a1b, %bb7 ] +; CHECK: phi i32 [ 1, %{{.*}} ], [ 0, %{{.*}} ], [ 0, %{{.*}} ], [ 1, %{{.*}} ], [ 1, %{{.*}} ], [ 0, %{{.*}} ], [ 0, %{{.*}} ], [ 1, %{{.*}} ] + + %result = load i32* %phi + ret i32 %result +} + +define i32 @test4() { +; CHECK: @test4 +entry: + %a = alloca [2 x i32] +; CHECK-NOT: alloca + + %a0 = getelementptr [2 x i32]* %a, i64 0, i32 0 + %a1 = getelementptr [2 x i32]* %a, i64 0, i32 1 + store i32 0, i32* %a0 + store i32 1, i32* %a1 + %v0 = load i32* %a0 + %v1 = load i32* %a1 +; CHECK-NOT: store +; CHECK-NOT: load + + %cond = icmp sle i32 %v0, %v1 + %select = select i1 %cond, i32* %a0, i32* %a0 +; CHECK-NOT: select + + %result = load i32* %select + ret i32 %result +; CHECK: ret i32 0 +} + +define i32 @test5(i32* %b) { +; CHECK: @test5 +entry: + %a = alloca [2 x i32] +; CHECK-NOT: alloca + + %a1 = getelementptr [2 x i32]* %a, i64 0, i32 1 + store i32 1, i32* %a1 +; CHECK-NOT: store + + %select = select i1 true, i32* %a1, i32* %b +; CHECK-NOT: select + + %result = load i32* %select +; CHECK-NOT: load + + ret i32 %result +; CHECK: ret i32 1 +} + +declare void @f(i32*, i32*) + +define i32 @test6(i32* %b) { +; CHECK: @test6 +entry: + %a = alloca [2 x i32] + %c = alloca i32 +; CHECK-NOT: alloca + + %a1 
= getelementptr [2 x i32]* %a, i64 0, i32 1 + store i32 1, i32* %a1 + + %select = select i1 true, i32* %a1, i32* %b + %select2 = select i1 false, i32* %a1, i32* %b + %select3 = select i1 false, i32* %c, i32* %b +; CHECK: %[[select2:.*]] = select i1 false, i32* undef, i32* %b +; CHECK: %[[select3:.*]] = select i1 false, i32* undef, i32* %b + + ; Note, this would potentially escape the alloca pointer except for the + ; constant folding of the select. + call void @f(i32* %select2, i32* %select3) +; CHECK: call void @f(i32* %[[select2]], i32* %[[select3]]) + + + %result = load i32* %select +; CHECK-NOT: load + + %dead = load i32* %c + + ret i32 %result +; CHECK: ret i32 1 +} + +define i32 @test7() { +; CHECK: @test7 +; CHECK-NOT: alloca + +entry: + %X = alloca i32 + br i1 undef, label %good, label %bad + +good: + %Y1 = getelementptr i32* %X, i64 0 + store i32 0, i32* %Y1 + br label %exit + +bad: + %Y2 = getelementptr i32* %X, i64 1 + store i32 0, i32* %Y2 + br label %exit + +exit: + %P = phi i32* [ %Y1, %good ], [ %Y2, %bad ] +; CHECK: %[[phi:.*]] = phi i32 [ 0, %good ], + %Z2 = load i32* %P + ret i32 %Z2 +; CHECK: ret i32 %[[phi]] +} + +define i32 @test8(i32 %b, i32* %ptr) { +; Ensure that we rewrite allocas to the used type when that use is hidden by +; a PHI that can be speculated. +; CHECK: @test8 +; CHECK-NOT: alloca +; CHECK-NOT: load +; CHECK: %[[value:.*]] = load i32* %ptr +; CHECK-NOT: load +; CHECK: %[[result:.*]] = phi i32 [ undef, %else ], [ %[[value]], %then ] +; CHECK-NEXT: ret i32 %[[result]] + +entry: + %f = alloca float + %test = icmp ne i32 %b, 0 + br i1 %test, label %then, label %else + +then: + br label %exit + +else: + %bitcast = bitcast float* %f to i32* + br label %exit + +exit: + %phi = phi i32* [ %bitcast, %else ], [ %ptr, %then ] + %loaded = load i32* %phi, align 4 + ret i32 %loaded +} + +define i32 @test9(i32 %b, i32* %ptr) { +; Same as @test8 but for a select rather than a PHI node. 
+; CHECK: @test9 +; CHECK-NOT: alloca +; CHECK-NOT: load +; CHECK: %[[value:.*]] = load i32* %ptr +; CHECK-NOT: load +; CHECK: %[[result:.*]] = select i1 %{{.*}}, i32 undef, i32 %[[value]] +; CHECK-NEXT: ret i32 %[[result]] + +entry: + %f = alloca float + store i32 0, i32* %ptr + %test = icmp ne i32 %b, 0 + %bitcast = bitcast float* %f to i32* + %select = select i1 %test, i32* %bitcast, i32* %ptr + %loaded = load i32* %select, align 4 + ret i32 %loaded +} + +define float @test10(i32 %b, float* %ptr) { +; Don't try to promote allocas which are not elligible for it even after +; rewriting due to the necessity of inserting bitcasts when speculating a PHI +; node. +; CHECK: @test10 +; CHECK: %[[alloca:.*]] = alloca +; CHECK: %[[argvalue:.*]] = load float* %ptr +; CHECK: %[[cast:.*]] = bitcast double* %[[alloca]] to float* +; CHECK: %[[allocavalue:.*]] = load float* %[[cast]] +; CHECK: %[[result:.*]] = phi float [ %[[allocavalue]], %else ], [ %[[argvalue]], %then ] +; CHECK-NEXT: ret float %[[result]] + +entry: + %f = alloca double + store double 0.0, double* %f + %test = icmp ne i32 %b, 0 + br i1 %test, label %then, label %else + +then: + br label %exit + +else: + %bitcast = bitcast double* %f to float* + br label %exit + +exit: + %phi = phi float* [ %bitcast, %else ], [ %ptr, %then ] + %loaded = load float* %phi, align 4 + ret float %loaded +} + +define float @test11(i32 %b, float* %ptr) { +; Same as @test10 but for a select rather than a PHI node. 
+; CHECK: @test11 +; CHECK: %[[alloca:.*]] = alloca +; CHECK: %[[cast:.*]] = bitcast double* %[[alloca]] to float* +; CHECK: %[[allocavalue:.*]] = load float* %[[cast]] +; CHECK: %[[argvalue:.*]] = load float* %ptr +; CHECK: %[[result:.*]] = select i1 %{{.*}}, float %[[allocavalue]], float %[[argvalue]] +; CHECK-NEXT: ret float %[[result]] + +entry: + %f = alloca double + store double 0.0, double* %f + store float 0.0, float* %ptr + %test = icmp ne i32 %b, 0 + %bitcast = bitcast double* %f to float* + %select = select i1 %test, float* %bitcast, float* %ptr + %loaded = load float* %select, align 4 + ret float %loaded +} + +define i32 @test12(i32 %x, i32* %p) { +; Ensure we don't crash or fail to nuke dead selects of allocas if no load is +; never found. +; CHECK: @test12 +; CHECK-NOT: alloca +; CHECK-NOT: select +; CHECK: ret i32 %x + +entry: + %a = alloca i32 + store i32 %x, i32* %a + %dead = select i1 undef, i32* %a, i32* %p + %load = load i32* %a + ret i32 %load +} + +define i32 @test13(i32 %x, i32* %p) { +; Ensure we don't crash or fail to nuke dead phis of allocas if no load is ever +; found. +; CHECK: @test13 +; CHECK-NOT: alloca +; CHECK-NOT: phi +; CHECK: ret i32 %x + +entry: + %a = alloca i32 + store i32 %x, i32* %a + br label %loop + +loop: + %phi = phi i32* [ %p, %entry ], [ %a, %loop ] + br i1 undef, label %loop, label %exit + +exit: + %load = load i32* %a + ret i32 %load +} + +define i32 @PR13905() { +; Check a pattern where we have a chain of dead phi nodes to ensure they are +; deleted and promotion can proceed. 
+; CHECK: @PR13905 +; CHECK-NOT: alloca i32 +; CHECK: ret i32 undef + +entry: + %h = alloca i32 + store i32 0, i32* %h + br i1 undef, label %loop1, label %exit + +loop1: + %phi1 = phi i32* [ null, %entry ], [ %h, %loop1 ], [ %h, %loop2 ] + br i1 undef, label %loop1, label %loop2 + +loop2: + br i1 undef, label %loop1, label %exit + +exit: + %phi2 = phi i32* [ %phi1, %loop2 ], [ null, %entry ] + ret i32 undef +} + +define i32 @PR13906() { +; Another pattern which can lead to crashes due to failing to clear out dead +; PHI nodes or select nodes. This triggers subtly differently from the above +; cases because the PHI node is (recursively) alive, but the select is dead. +; CHECK: @PR13906 +; CHECK-NOT: alloca + +entry: + %c = alloca i32 + store i32 0, i32* %c + br label %for.cond + +for.cond: + %d.0 = phi i32* [ undef, %entry ], [ %c, %if.then ], [ %d.0, %for.cond ] + br i1 undef, label %if.then, label %for.cond + +if.then: + %tmpcast.d.0 = select i1 undef, i32* %c, i32* %d.0 + br label %for.cond +} + +define i64 @PR14132(i1 %flag) { +; CHECK: @PR14132 +; Here we form a PHI-node by promoting the pointer alloca first, and then in +; order to promote the other two allocas, we speculate the load of the +; now-phi-node-pointer. In doing so we end up loading a 64-bit value from an i8 +; alloca, which is completely bogus. However, we were asserting on trying to +; rewrite it. Now it is replaced with undef. Eventually we may replace it with +; unreachable and even the CFG will go away here. 
+entry: + %a = alloca i64 + %b = alloca i8 + %ptr = alloca i64* +; CHECK-NOT: alloca + + %ptr.cast = bitcast i64** %ptr to i8** + store i64 0, i64* %a + store i8 1, i8* %b + store i64* %a, i64** %ptr + br i1 %flag, label %if.then, label %if.end + +if.then: + store i8* %b, i8** %ptr.cast + br label %if.end + +if.end: + %tmp = load i64** %ptr + %result = load i64* %tmp +; CHECK-NOT: store +; CHECK-NOT: load +; CHECK: %[[result:.*]] = phi i64 [ undef, %if.then ], [ 0, %entry ] + + ret i64 %result +; CHECK-NEXT: ret i64 %[[result]] +} diff --git a/test/Transforms/SROA/vector-promotion.ll b/test/Transforms/SROA/vector-promotion.ll new file mode 100644 index 0000000..ea28f5d --- /dev/null +++ b/test/Transforms/SROA/vector-promotion.ll @@ -0,0 +1,267 @@ +; RUN: opt < %s -sroa -S | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64" + +%S1 = type { i64, [42 x float] } + +define i32 @test1(<4 x i32> %x, <4 x i32> %y) { +; CHECK: @test1 +entry: + %a = alloca [2 x <4 x i32>] +; CHECK-NOT: alloca + + %a.x = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0 + store <4 x i32> %x, <4 x i32>* %a.x + %a.y = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1 + store <4 x i32> %y, <4 x i32>* %a.y +; CHECK-NOT: store + + %a.tmp1 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2 + %tmp1 = load i32* %a.tmp1 + %a.tmp2 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 3 + %tmp2 = load i32* %a.tmp2 + %a.tmp3 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 0 + %tmp3 = load i32* %a.tmp3 +; CHECK-NOT: load +; CHECK: extractelement <4 x i32> %x, i32 2 +; CHECK-NEXT: extractelement <4 x i32> %y, i32 3 +; CHECK-NEXT: extractelement <4 x i32> %y, i32 0 + + %tmp4 = add i32 %tmp1, %tmp2 + %tmp5 = add i32 %tmp3, %tmp4 + ret i32 %tmp5 +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: ret +} + +define i32 @test2(<4 x i32> %x, <4 x i32> 
%y) { +; CHECK: @test2 +; FIXME: This should be handled! +entry: + %a = alloca [2 x <4 x i32>] +; CHECK: alloca <4 x i32> + + %a.x = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0 + store <4 x i32> %x, <4 x i32>* %a.x + %a.y = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1 + store <4 x i32> %y, <4 x i32>* %a.y + + %a.tmp1 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2 + %tmp1 = load i32* %a.tmp1 + %a.tmp2 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 3 + %tmp2 = load i32* %a.tmp2 + %a.tmp3 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 0 + %a.tmp3.cast = bitcast i32* %a.tmp3 to <2 x i32>* + %tmp3.vec = load <2 x i32>* %a.tmp3.cast + %tmp3 = extractelement <2 x i32> %tmp3.vec, i32 0 + + %tmp4 = add i32 %tmp1, %tmp2 + %tmp5 = add i32 %tmp3, %tmp4 + ret i32 %tmp5 +} + +define i32 @test3(<4 x i32> %x, <4 x i32> %y) { +; CHECK: @test3 +entry: + %a = alloca [2 x <4 x i32>] +; CHECK-NOT: alloca + + %a.x = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0 + store <4 x i32> %x, <4 x i32>* %a.x + %a.y = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1 + store <4 x i32> %y, <4 x i32>* %a.y +; CHECK-NOT: store + + %a.y.cast = bitcast <4 x i32>* %a.y to i8* + call void @llvm.memset.p0i8.i32(i8* %a.y.cast, i8 0, i32 16, i32 1, i1 false) +; CHECK-NOT: memset + + %a.tmp1 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2 + %a.tmp1.cast = bitcast i32* %a.tmp1 to i8* + call void @llvm.memset.p0i8.i32(i8* %a.tmp1.cast, i8 -1, i32 4, i32 1, i1 false) + %tmp1 = load i32* %a.tmp1 + %a.tmp2 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 3 + %tmp2 = load i32* %a.tmp2 + %a.tmp3 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 0 + %tmp3 = load i32* %a.tmp3 +; CHECK-NOT: load +; CHECK: %[[insert:.*]] = insertelement <4 x i32> %x, i32 -1, i32 2 +; CHECK-NEXT: extractelement <4 x i32> %[[insert]], i32 2 +; CHECK-NEXT: extractelement <4 x i32> zeroinitializer, 
i32 3 +; CHECK-NEXT: extractelement <4 x i32> zeroinitializer, i32 0 + + %tmp4 = add i32 %tmp1, %tmp2 + %tmp5 = add i32 %tmp3, %tmp4 + ret i32 %tmp5 +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: ret +} + +define i32 @test4(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %z) { +; CHECK: @test4 +entry: + %a = alloca [2 x <4 x i32>] +; CHECK-NOT: alloca + + %a.x = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0 + store <4 x i32> %x, <4 x i32>* %a.x + %a.y = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1 + store <4 x i32> %y, <4 x i32>* %a.y +; CHECK-NOT: store + + %a.y.cast = bitcast <4 x i32>* %a.y to i8* + %z.cast = bitcast <4 x i32>* %z to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.y.cast, i8* %z.cast, i32 16, i32 1, i1 false) +; CHECK-NOT: memcpy + + %a.tmp1 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2 + %a.tmp1.cast = bitcast i32* %a.tmp1 to i8* + %z.tmp1 = getelementptr inbounds <4 x i32>* %z, i64 0, i64 2 + %z.tmp1.cast = bitcast i32* %z.tmp1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.tmp1.cast, i8* %z.tmp1.cast, i32 4, i32 1, i1 false) + %tmp1 = load i32* %a.tmp1 + %a.tmp2 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 3 + %tmp2 = load i32* %a.tmp2 + %a.tmp3 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 0 + %tmp3 = load i32* %a.tmp3 +; CHECK-NOT: memcpy +; CHECK: %[[load:.*]] = load <4 x i32>* %z +; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds <4 x i32>* %z, i64 0, i64 2 +; CHECK-NEXT: %[[element_load:.*]] = load i32* %[[gep]] +; CHECK-NEXT: %[[insert:.*]] = insertelement <4 x i32> %x, i32 %[[element_load]], i32 2 +; CHECK-NEXT: extractelement <4 x i32> %[[insert]], i32 2 +; CHECK-NEXT: extractelement <4 x i32> %[[load]], i32 3 +; CHECK-NEXT: extractelement <4 x i32> %[[load]], i32 0 + + %tmp4 = add i32 %tmp1, %tmp2 + %tmp5 = add i32 %tmp3, %tmp4 + ret i32 %tmp5 +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: ret +} + +define i32 @test5(<4 x i32> %x, <4 x i32> 
%y, <4 x i32>* %z) { +; CHECK: @test5 +; The same as the above, but with reversed source and destination for the +; element memcpy, and a self copy. +entry: + %a = alloca [2 x <4 x i32>] +; CHECK-NOT: alloca + + %a.x = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0 + store <4 x i32> %x, <4 x i32>* %a.x + %a.y = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1 + store <4 x i32> %y, <4 x i32>* %a.y +; CHECK-NOT: store + + %a.y.cast = bitcast <4 x i32>* %a.y to i8* + %a.x.cast = bitcast <4 x i32>* %a.x to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.x.cast, i8* %a.y.cast, i32 16, i32 1, i1 false) +; CHECK-NOT: memcpy + + %a.tmp1 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2 + %a.tmp1.cast = bitcast i32* %a.tmp1 to i8* + %z.tmp1 = getelementptr inbounds <4 x i32>* %z, i64 0, i64 2 + %z.tmp1.cast = bitcast i32* %z.tmp1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %z.tmp1.cast, i8* %a.tmp1.cast, i32 4, i32 1, i1 false) + %tmp1 = load i32* %a.tmp1 + %a.tmp2 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 3 + %tmp2 = load i32* %a.tmp2 + %a.tmp3 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 0 + %tmp3 = load i32* %a.tmp3 +; CHECK-NOT: memcpy +; CHECK: %[[gep:.*]] = getelementptr inbounds <4 x i32>* %z, i64 0, i64 2 +; CHECK-NEXT: %[[extract:.*]] = extractelement <4 x i32> %y, i32 2 +; CHECK-NEXT: store i32 %[[extract]], i32* %[[gep]] +; CHECK-NEXT: extractelement <4 x i32> %y, i32 2 +; CHECK-NEXT: extractelement <4 x i32> %y, i32 3 +; CHECK-NEXT: extractelement <4 x i32> %y, i32 0 + + %tmp4 = add i32 %tmp1, %tmp2 + %tmp5 = add i32 %tmp3, %tmp4 + ret i32 %tmp5 +; CHECK-NEXT: add +; CHECK-NEXT: add +; CHECK-NEXT: ret +} + +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind +declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind + +define i64 @test6(<4 x i64> %x, <4 x i64> %y, i64 %n) { +; CHECK: @test6 +; The old scalarrepl pass 
would wrongly drop the store to the second alloca. +; PR13254 + %tmp = alloca { <4 x i64>, <4 x i64> } + %p0 = getelementptr inbounds { <4 x i64>, <4 x i64> }* %tmp, i32 0, i32 0 + store <4 x i64> %x, <4 x i64>* %p0 +; CHECK: store <4 x i64> %x, + %p1 = getelementptr inbounds { <4 x i64>, <4 x i64> }* %tmp, i32 0, i32 1 + store <4 x i64> %y, <4 x i64>* %p1 +; CHECK: store <4 x i64> %y, + %addr = getelementptr inbounds { <4 x i64>, <4 x i64> }* %tmp, i32 0, i32 0, i64 %n + %res = load i64* %addr, align 4 + ret i64 %res +} + +define i32 @PR14212() { +; CHECK: @PR14212 +; This caused a crash when "splitting" the load of the i32 in order to promote +; the store of <3 x i8> properly. Heavily reduced from an OpenCL test case. +entry: + %retval = alloca <3 x i8>, align 4 +; CHECK-NOT: alloca + + store <3 x i8> undef, <3 x i8>* %retval, align 4 + %cast = bitcast <3 x i8>* %retval to i32* + %load = load i32* %cast, align 4 + ret i32 %load +; CHECK: ret i32 +} + +define <2 x i8> @PR14349.1(i32 %x) { +; CHECK: @PR14349.1 +; The first testcase for broken SROA rewriting of split integer loads and +; stores due to smaller vector loads and stores. This particular test ensures +; that we can rewrite a split store of an integer to a store of a vector. +entry: + %a = alloca i32 +; CHECK-NOT: alloca + + store i32 %x, i32* %a +; CHECK-NOT: store + + %cast = bitcast i32* %a to <2 x i8>* + %vec = load <2 x i8>* %cast +; CHECK-NOT: load + + ret <2 x i8> %vec +; CHECK: %[[trunc:.*]] = trunc i32 %x to i16 +; CHECK: %[[cast:.*]] = bitcast i16 %[[trunc]] to <2 x i8> +; CHECK: ret <2 x i8> %[[cast]] +} + +define i32 @PR14349.2(<2 x i8> %x) { +; CHECK: @PR14349.2 +; The second testcase for broken SROA rewriting of split integer loads and +; stores due to smaller vector loads and stores. This particular test ensures +; that we can rewrite a split load of an integer to a load of a vector. 
+entry: + %a = alloca i32 +; CHECK-NOT: alloca + + %cast = bitcast i32* %a to <2 x i8>* + store <2 x i8> %x, <2 x i8>* %cast +; CHECK-NOT: store + + %int = load i32* %a +; CHECK-NOT: load + + ret i32 %int +; CHECK: %[[cast:.*]] = bitcast <2 x i8> %x to i16 +; CHECK: %[[trunc:.*]] = zext i16 %[[cast]] to i32 +; CHECK: %[[insert:.*]] = or i32 %{{.*}}, %[[trunc]] +; CHECK: ret i32 %[[insert]] +} diff --git a/test/Transforms/SimplifyCFG/SPARC/lit.local.cfg b/test/Transforms/SimplifyCFG/SPARC/lit.local.cfg new file mode 100644 index 0000000..786fee9 --- /dev/null +++ b/test/Transforms/SimplifyCFG/SPARC/lit.local.cfg @@ -0,0 +1,6 @@ +config.suffixes = ['.ll', '.c', '.cpp'] + +targets = set(config.root.targets_to_build.split()) +if not 'Sparc' in targets: + config.unsupported = True + diff --git a/test/Transforms/SimplifyCFG/SPARC/switch_to_lookup_table.ll b/test/Transforms/SimplifyCFG/SPARC/switch_to_lookup_table.ll new file mode 100644 index 0000000..9d15685 --- /dev/null +++ b/test/Transforms/SimplifyCFG/SPARC/switch_to_lookup_table.ll @@ -0,0 +1,32 @@ +; RUN: opt < %s -simplifycfg -S -mtriple=sparc-unknown-unknown | FileCheck %s + +; Check that switches are not turned into lookup tables, as this is not +; considered profitable on the target. 
+ +define i32 @f(i32 %c) nounwind uwtable readnone { +entry: + switch i32 %c, label %sw.default [ + i32 42, label %return + i32 43, label %sw.bb1 + i32 44, label %sw.bb2 + i32 45, label %sw.bb3 + i32 46, label %sw.bb4 + i32 47, label %sw.bb5 + i32 48, label %sw.bb6 + ] + +sw.bb1: br label %return +sw.bb2: br label %return +sw.bb3: br label %return +sw.bb4: br label %return +sw.bb5: br label %return +sw.bb6: br label %return +sw.default: br label %return +return: + %retval.0 = phi i32 [ 15, %sw.default ], [ 1, %sw.bb6 ], [ 62, %sw.bb5 ], [ 27, %sw.bb4 ], [ -1, %sw.bb3 ], [ 0, %sw.bb2 ], [ 123, %sw.bb1 ], [ 55, %entry ] + ret i32 %retval.0 + +; CHECK: @f +; CHECK-NOT: getelementptr +; CHECK: switch i32 %c +} diff --git a/test/Transforms/SimplifyCFG/X86/lit.local.cfg b/test/Transforms/SimplifyCFG/X86/lit.local.cfg new file mode 100644 index 0000000..a8ad0f1 --- /dev/null +++ b/test/Transforms/SimplifyCFG/X86/lit.local.cfg @@ -0,0 +1,6 @@ +config.suffixes = ['.ll', '.c', '.cpp'] + +targets = set(config.root.targets_to_build.split()) +if not 'X86' in targets: + config.unsupported = True + diff --git a/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll b/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll new file mode 100644 index 0000000..8a59992 --- /dev/null +++ b/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll @@ -0,0 +1,779 @@ +; RUN: opt < %s -simplifycfg -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; The table for @f +; CHECK: @switch.table = private unnamed_addr constant [7 x i32] [i32 55, i32 123, i32 0, i32 -1, i32 27, i32 62, i32 1] + +; The float table for @h +; CHECK: @switch.table1 = private unnamed_addr constant [4 x float] [float 0x40091EB860000000, float 0x3FF3BE76C0000000, float 0x4012449BA0000000, float 
0x4001AE1480000000] + +; The table for @foostring +; CHECK: @switch.table2 = private unnamed_addr constant [4 x i8*] [i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8]* @.str1, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8]* @.str2, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8]* @.str3, i64 0, i64 0)] + +; The table for @earlyreturncrash +; CHECK: @switch.table3 = private unnamed_addr constant [4 x i32] [i32 42, i32 9, i32 88, i32 5] + +; The table for @large. +; CHECK: @switch.table4 = private unnamed_addr constant [199 x i32] [i32 1, i32 4, i32 9, + +; The table for @cprop +; CHECK: @switch.table5 = private unnamed_addr constant [7 x i32] [i32 5, i32 42, i32 126, i32 -452, i32 128, i32 6, i32 7] + +; The table for @unreachable +; CHECK: @switch.table6 = private unnamed_addr constant [5 x i32] [i32 0, i32 0, i32 0, i32 1, i32 -1] + +; A simple int-to-int selection switch. +; It is dense enough to be replaced by table lookup. +; The result is directly by a ret from an otherwise empty bb, +; so we return early, directly from the lookup bb. 
+ +define i32 @f(i32 %c) { +entry: + switch i32 %c, label %sw.default [ + i32 42, label %return + i32 43, label %sw.bb1 + i32 44, label %sw.bb2 + i32 45, label %sw.bb3 + i32 46, label %sw.bb4 + i32 47, label %sw.bb5 + i32 48, label %sw.bb6 + ] + +sw.bb1: br label %return +sw.bb2: br label %return +sw.bb3: br label %return +sw.bb4: br label %return +sw.bb5: br label %return +sw.bb6: br label %return +sw.default: br label %return +return: + %retval.0 = phi i32 [ 15, %sw.default ], [ 1, %sw.bb6 ], [ 62, %sw.bb5 ], [ 27, %sw.bb4 ], [ -1, %sw.bb3 ], [ 0, %sw.bb2 ], [ 123, %sw.bb1 ], [ 55, %entry ] + ret i32 %retval.0 + +; CHECK: @f +; CHECK: entry: +; CHECK-NEXT: %switch.tableidx = sub i32 %c, 42 +; CHECK-NEXT: %0 = icmp ult i32 %switch.tableidx, 7 +; CHECK-NEXT: br i1 %0, label %switch.lookup, label %return +; CHECK: switch.lookup: +; CHECK-NEXT: %switch.gep = getelementptr inbounds [7 x i32]* @switch.table, i32 0, i32 %switch.tableidx +; CHECK-NEXT: %switch.load = load i32* %switch.gep +; CHECK-NEXT: ret i32 %switch.load +; CHECK: return: +; CHECK-NEXT: ret i32 15 +} + +; A switch used to initialize two variables, an i8 and a float. 
+ +declare void @dummy(i8 signext, float) +define void @h(i32 %x) { +entry: + switch i32 %x, label %sw.default [ + i32 0, label %sw.epilog + i32 1, label %sw.bb1 + i32 2, label %sw.bb2 + i32 3, label %sw.bb3 + ] + +sw.bb1: br label %sw.epilog +sw.bb2: br label %sw.epilog +sw.bb3: br label %sw.epilog +sw.default: br label %sw.epilog + +sw.epilog: + %a.0 = phi i8 [ 7, %sw.default ], [ 5, %sw.bb3 ], [ 88, %sw.bb2 ], [ 9, %sw.bb1 ], [ 42, %entry ] + %b.0 = phi float [ 0x4023FAE140000000, %sw.default ], [ 0x4001AE1480000000, %sw.bb3 ], [ 0x4012449BA0000000, %sw.bb2 ], [ 0x3FF3BE76C0000000, %sw.bb1 ], [ 0x40091EB860000000, %entry ] + call void @dummy(i8 signext %a.0, float %b.0) + ret void + +; CHECK: @h +; CHECK: entry: +; CHECK-NEXT: %switch.tableidx = sub i32 %x, 0 +; CHECK-NEXT: %0 = icmp ult i32 %switch.tableidx, 4 +; CHECK-NEXT: br i1 %0, label %switch.lookup, label %sw.epilog +; CHECK: switch.lookup: +; CHECK-NEXT: %switch.shiftamt = mul i32 %switch.tableidx, 8 +; CHECK-NEXT: %switch.downshift = lshr i32 89655594, %switch.shiftamt +; CHECK-NEXT: %switch.masked = trunc i32 %switch.downshift to i8 +; CHECK-NEXT: %switch.gep = getelementptr inbounds [4 x float]* @switch.table1, i32 0, i32 %switch.tableidx +; CHECK-NEXT: %switch.load = load float* %switch.gep +; CHECK-NEXT: br label %sw.epilog +; CHECK: sw.epilog: +; CHECK-NEXT: %a.0 = phi i8 [ %switch.masked, %switch.lookup ], [ 7, %entry ] +; CHECK-NEXT: %b.0 = phi float [ %switch.load, %switch.lookup ], [ 0x4023FAE140000000, %entry ] +; CHECK-NEXT: call void @dummy(i8 signext %a.0, float %b.0) +; CHECK-NEXT: ret void +} + + +; Switch used to return a string. 
+ +@.str = private unnamed_addr constant [4 x i8] c"foo\00", align 1 +@.str1 = private unnamed_addr constant [4 x i8] c"bar\00", align 1 +@.str2 = private unnamed_addr constant [4 x i8] c"baz\00", align 1 +@.str3 = private unnamed_addr constant [4 x i8] c"qux\00", align 1 +@.str4 = private unnamed_addr constant [6 x i8] c"error\00", align 1 + +define i8* @foostring(i32 %x) { +entry: + switch i32 %x, label %sw.default [ + i32 0, label %return + i32 1, label %sw.bb1 + i32 2, label %sw.bb2 + i32 3, label %sw.bb3 + ] + +sw.bb1: br label %return +sw.bb2: br label %return +sw.bb3: br label %return +sw.default: br label %return + +return: + %retval.0 = phi i8* [ getelementptr inbounds ([6 x i8]* @.str4, i64 0, i64 0), %sw.default ], + [ getelementptr inbounds ([4 x i8]* @.str3, i64 0, i64 0), %sw.bb3 ], + [ getelementptr inbounds ([4 x i8]* @.str2, i64 0, i64 0), %sw.bb2 ], + [ getelementptr inbounds ([4 x i8]* @.str1, i64 0, i64 0), %sw.bb1 ], + [ getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), %entry ] + ret i8* %retval.0 + +; CHECK: @foostring +; CHECK: entry: +; CHECK-NEXT: %switch.tableidx = sub i32 %x, 0 +; CHECK-NEXT: %0 = icmp ult i32 %switch.tableidx, 4 +; CHECK-NEXT: br i1 %0, label %switch.lookup, label %return +; CHECK: switch.lookup: +; CHECK-NEXT: %switch.gep = getelementptr inbounds [4 x i8*]* @switch.table2, i32 0, i32 %switch.tableidx +; CHECK-NEXT: %switch.load = load i8** %switch.gep +; CHECK-NEXT: ret i8* %switch.load +} + +; Switch used to initialize two values. The first value is returned, the second +; value is not used. This used to make the transformation generate illegal code. 
+ +define i32 @earlyreturncrash(i32 %x) { +entry: + switch i32 %x, label %sw.default [ + i32 0, label %sw.epilog + i32 1, label %sw.bb1 + i32 2, label %sw.bb2 + i32 3, label %sw.bb3 + ] + +sw.bb1: br label %sw.epilog +sw.bb2: br label %sw.epilog +sw.bb3: br label %sw.epilog +sw.default: br label %sw.epilog + +sw.epilog: + %a.0 = phi i32 [ 7, %sw.default ], [ 5, %sw.bb3 ], [ 88, %sw.bb2 ], [ 9, %sw.bb1 ], [ 42, %entry ] + %b.0 = phi i32 [ 10, %sw.default ], [ 5, %sw.bb3 ], [ 1, %sw.bb2 ], [ 4, %sw.bb1 ], [ 3, %entry ] + ret i32 %a.0 + +; CHECK: @earlyreturncrash +; CHECK: switch.lookup: +; CHECK-NEXT: %switch.gep = getelementptr inbounds [4 x i32]* @switch.table3, i32 0, i32 %switch.tableidx +; CHECK-NEXT: %switch.load = load i32* %switch.gep +; CHECK-NEXT: ret i32 %switch.load +; CHECK: sw.epilog: +; CHECK-NEXT: ret i32 7 +} + + +; Example 7 from http://blog.regehr.org/archives/320 +; It is not dense enough for a regular table, but the results +; can be packed into a bitmap. + +define i32 @crud(i8 zeroext %c) { +entry: + %cmp = icmp ult i8 %c, 33 + br i1 %cmp, label %lor.end, label %switch.early.test + +switch.early.test: + switch i8 %c, label %lor.rhs [ + i8 92, label %lor.end + i8 62, label %lor.end + i8 60, label %lor.end + i8 59, label %lor.end + i8 58, label %lor.end + i8 46, label %lor.end + i8 44, label %lor.end + i8 34, label %lor.end + i8 39, label %switch.edge + ] + +switch.edge: br label %lor.end +lor.rhs: br label %lor.end + +lor.end: + %0 = phi i1 [ true, %switch.early.test ], + [ false, %lor.rhs ], + [ true, %entry ], + [ true, %switch.early.test ], + [ true, %switch.early.test ], + [ true, %switch.early.test ], + [ true, %switch.early.test ], + [ true, %switch.early.test ], + [ true, %switch.early.test ], + [ true, %switch.early.test ], + [ true, %switch.edge ] + %lor.ext = zext i1 %0 to i32 + ret i32 %lor.ext + +; CHECK: @crud +; CHECK: entry: +; CHECK-NEXT: %cmp = icmp ult i8 %c, 33 +; CHECK-NEXT: br i1 %cmp, label %lor.end, label 
%switch.early.test +; CHECK: switch.early.test: +; CHECK-NEXT: %switch.tableidx = sub i8 %c, 34 +; CHECK-NEXT: %0 = icmp ult i8 %switch.tableidx, 59 +; CHECK-NEXT: br i1 %0, label %switch.lookup, label %lor.end +; CHECK: switch.lookup: +; CHECK-NEXT: %switch.cast = zext i8 %switch.tableidx to i59 +; CHECK-NEXT: %switch.shiftamt = mul i59 %switch.cast, 1 +; CHECK-NEXT: %switch.downshift = lshr i59 -288230375765830623, %switch.shiftamt +; CHECK-NEXT: %switch.masked = trunc i59 %switch.downshift to i1 +; CHECK-NEXT: br label %lor.end +; CHECK: lor.end: +; CHECK-NEXT: %1 = phi i1 [ true, %entry ], [ %switch.masked, %switch.lookup ], [ false, %switch.early.test ] +; CHECK-NEXT: %lor.ext = zext i1 %1 to i32 +; CHECK-NEXT: ret i32 %lor.ext +} + +; PR13946 +define i32 @overflow(i32 %type) { +entry: + switch i32 %type, label %sw.default [ + i32 -2147483648, label %sw.bb + i32 0, label %sw.bb + i32 1, label %sw.bb1 + i32 2, label %sw.bb2 + i32 -2147483645, label %sw.bb3 + i32 3, label %sw.bb3 + ] + +sw.bb: br label %if.end +sw.bb1: br label %if.end +sw.bb2: br label %if.end +sw.bb3: br label %if.end +sw.default: br label %if.end +if.else: br label %if.end + +if.end: + %dirent_type.0 = phi i32 [ 3, %sw.default ], [ 6, %sw.bb3 ], [ 5, %sw.bb2 ], [ 0, %sw.bb1 ], [ 3, %sw.bb ], [ 0, %if.else ] + ret i32 %dirent_type.0 +; CHECK: define i32 @overflow +; CHECK: switch +; CHECK: phi +} + +; PR13985 +define i1 @undef(i32 %tmp) { +bb: + switch i32 %tmp, label %bb3 [ + i32 0, label %bb1 + i32 1, label %bb1 + i32 7, label %bb2 + i32 8, label %bb2 + ] + +bb1: br label %bb3 +bb2: br label %bb3 + +bb3: + %tmp4 = phi i1 [ undef, %bb ], [ false, %bb2 ], [ true, %bb1 ] + ret i1 %tmp4 +; CHECK: define i1 @undef +; CHECK: %switch.cast = trunc i32 %switch.tableidx to i9 +; CHECK: %switch.downshift = lshr i9 3, %switch.shiftamt +} + +; Also handle large switches that would be rejected by +; isValueEqualityComparison() +; CHECK: large +; CHECK-NOT: switch i32 +define i32 @large(i32 %x) { +entry: + 
%cmp = icmp slt i32 %x, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: + %mul = mul i32 %x, -10 + br label %if.end + +if.end: + %x.addr.0 = phi i32 [ %mul, %if.then ], [ %x, %entry ] + switch i32 %x.addr.0, label %return [ + i32 199, label %sw.bb203 + i32 1, label %sw.bb1 + i32 2, label %sw.bb2 + i32 3, label %sw.bb3 + i32 4, label %sw.bb4 + i32 5, label %sw.bb5 + i32 6, label %sw.bb6 + i32 7, label %sw.bb7 + i32 8, label %sw.bb8 + i32 9, label %sw.bb9 + i32 10, label %sw.bb10 + i32 11, label %sw.bb11 + i32 12, label %sw.bb12 + i32 13, label %sw.bb13 + i32 14, label %sw.bb14 + i32 15, label %sw.bb15 + i32 16, label %sw.bb16 + i32 17, label %sw.bb17 + i32 18, label %sw.bb18 + i32 19, label %sw.bb19 + i32 20, label %sw.bb20 + i32 21, label %sw.bb21 + i32 22, label %sw.bb22 + i32 23, label %sw.bb23 + i32 24, label %sw.bb24 + i32 25, label %sw.bb25 + i32 26, label %sw.bb26 + i32 27, label %sw.bb27 + i32 28, label %sw.bb28 + i32 29, label %sw.bb29 + i32 30, label %sw.bb30 + i32 31, label %sw.bb31 + i32 32, label %sw.bb32 + i32 33, label %sw.bb33 + i32 34, label %sw.bb34 + i32 35, label %sw.bb35 + i32 36, label %sw.bb37 + i32 37, label %sw.bb38 + i32 38, label %sw.bb39 + i32 39, label %sw.bb40 + i32 40, label %sw.bb41 + i32 41, label %sw.bb42 + i32 42, label %sw.bb43 + i32 43, label %sw.bb44 + i32 44, label %sw.bb45 + i32 45, label %sw.bb47 + i32 46, label %sw.bb48 + i32 47, label %sw.bb49 + i32 48, label %sw.bb50 + i32 49, label %sw.bb51 + i32 50, label %sw.bb52 + i32 51, label %sw.bb53 + i32 52, label %sw.bb54 + i32 53, label %sw.bb55 + i32 54, label %sw.bb56 + i32 55, label %sw.bb58 + i32 56, label %sw.bb59 + i32 57, label %sw.bb60 + i32 58, label %sw.bb61 + i32 59, label %sw.bb62 + i32 60, label %sw.bb63 + i32 61, label %sw.bb64 + i32 62, label %sw.bb65 + i32 63, label %sw.bb66 + i32 64, label %sw.bb67 + i32 65, label %sw.bb68 + i32 66, label %sw.bb69 + i32 67, label %sw.bb70 + i32 68, label %sw.bb71 + i32 69, label %sw.bb72 + i32 70, label %sw.bb73 + i32 
71, label %sw.bb74 + i32 72, label %sw.bb76 + i32 73, label %sw.bb77 + i32 74, label %sw.bb78 + i32 75, label %sw.bb79 + i32 76, label %sw.bb80 + i32 77, label %sw.bb81 + i32 78, label %sw.bb82 + i32 79, label %sw.bb83 + i32 80, label %sw.bb84 + i32 81, label %sw.bb85 + i32 82, label %sw.bb86 + i32 83, label %sw.bb87 + i32 84, label %sw.bb88 + i32 85, label %sw.bb89 + i32 86, label %sw.bb90 + i32 87, label %sw.bb91 + i32 88, label %sw.bb92 + i32 89, label %sw.bb93 + i32 90, label %sw.bb94 + i32 91, label %sw.bb95 + i32 92, label %sw.bb96 + i32 93, label %sw.bb97 + i32 94, label %sw.bb98 + i32 95, label %sw.bb99 + i32 96, label %sw.bb100 + i32 97, label %sw.bb101 + i32 98, label %sw.bb102 + i32 99, label %sw.bb103 + i32 100, label %sw.bb104 + i32 101, label %sw.bb105 + i32 102, label %sw.bb106 + i32 103, label %sw.bb107 + i32 104, label %sw.bb108 + i32 105, label %sw.bb109 + i32 106, label %sw.bb110 + i32 107, label %sw.bb111 + i32 108, label %sw.bb112 + i32 109, label %sw.bb113 + i32 110, label %sw.bb114 + i32 111, label %sw.bb115 + i32 112, label %sw.bb116 + i32 113, label %sw.bb117 + i32 114, label %sw.bb118 + i32 115, label %sw.bb119 + i32 116, label %sw.bb120 + i32 117, label %sw.bb121 + i32 118, label %sw.bb122 + i32 119, label %sw.bb123 + i32 120, label %sw.bb124 + i32 121, label %sw.bb125 + i32 122, label %sw.bb126 + i32 123, label %sw.bb127 + i32 124, label %sw.bb128 + i32 125, label %sw.bb129 + i32 126, label %sw.bb130 + i32 127, label %sw.bb131 + i32 128, label %sw.bb132 + i32 129, label %sw.bb133 + i32 130, label %sw.bb134 + i32 131, label %sw.bb135 + i32 132, label %sw.bb136 + i32 133, label %sw.bb137 + i32 134, label %sw.bb138 + i32 135, label %sw.bb139 + i32 136, label %sw.bb140 + i32 137, label %sw.bb141 + i32 138, label %sw.bb142 + i32 139, label %sw.bb143 + i32 140, label %sw.bb144 + i32 141, label %sw.bb145 + i32 142, label %sw.bb146 + i32 143, label %sw.bb147 + i32 144, label %sw.bb148 + i32 145, label %sw.bb149 + i32 146, label %sw.bb150 + i32 
147, label %sw.bb151 + i32 148, label %sw.bb152 + i32 149, label %sw.bb153 + i32 150, label %sw.bb154 + i32 151, label %sw.bb155 + i32 152, label %sw.bb156 + i32 153, label %sw.bb157 + i32 154, label %sw.bb158 + i32 155, label %sw.bb159 + i32 156, label %sw.bb160 + i32 157, label %sw.bb161 + i32 158, label %sw.bb162 + i32 159, label %sw.bb163 + i32 160, label %sw.bb164 + i32 161, label %sw.bb165 + i32 162, label %sw.bb166 + i32 163, label %sw.bb167 + i32 164, label %sw.bb168 + i32 165, label %sw.bb169 + i32 166, label %sw.bb170 + i32 167, label %sw.bb171 + i32 168, label %sw.bb172 + i32 169, label %sw.bb173 + i32 170, label %sw.bb174 + i32 171, label %sw.bb175 + i32 172, label %sw.bb176 + i32 173, label %sw.bb177 + i32 174, label %sw.bb178 + i32 175, label %sw.bb179 + i32 176, label %sw.bb180 + i32 177, label %sw.bb181 + i32 178, label %sw.bb182 + i32 179, label %sw.bb183 + i32 180, label %sw.bb184 + i32 181, label %sw.bb185 + i32 182, label %sw.bb186 + i32 183, label %sw.bb187 + i32 184, label %sw.bb188 + i32 185, label %sw.bb189 + i32 186, label %sw.bb190 + i32 187, label %sw.bb191 + i32 188, label %sw.bb192 + i32 189, label %sw.bb193 + i32 190, label %sw.bb194 + i32 191, label %sw.bb195 + i32 192, label %sw.bb196 + i32 193, label %sw.bb197 + i32 194, label %sw.bb198 + i32 195, label %sw.bb199 + i32 196, label %sw.bb200 + i32 197, label %sw.bb201 + i32 198, label %sw.bb202 + ] + +sw.bb1: br label %return +sw.bb2: br label %return +sw.bb3: br label %return +sw.bb4: br label %return +sw.bb5: br label %return +sw.bb6: br label %return +sw.bb7: br label %return +sw.bb8: br label %return +sw.bb9: br label %return +sw.bb10: br label %return +sw.bb11: br label %return +sw.bb12: br label %return +sw.bb13: br label %return +sw.bb14: br label %return +sw.bb15: br label %return +sw.bb16: br label %return +sw.bb17: br label %return +sw.bb18: br label %return +sw.bb19: br label %return +sw.bb20: br label %return +sw.bb21: br label %return +sw.bb22: br label %return +sw.bb23: 
br label %return +sw.bb24: br label %return +sw.bb25: br label %return +sw.bb26: br label %return +sw.bb27: br label %return +sw.bb28: br label %return +sw.bb29: br label %return +sw.bb30: br label %return +sw.bb31: br label %return +sw.bb32: br label %return +sw.bb33: br label %return +sw.bb34: br label %return +sw.bb35: br label %return +sw.bb37: br label %return +sw.bb38: br label %return +sw.bb39: br label %return +sw.bb40: br label %return +sw.bb41: br label %return +sw.bb42: br label %return +sw.bb43: br label %return +sw.bb44: br label %return +sw.bb45: br label %return +sw.bb47: br label %return +sw.bb48: br label %return +sw.bb49: br label %return +sw.bb50: br label %return +sw.bb51: br label %return +sw.bb52: br label %return +sw.bb53: br label %return +sw.bb54: br label %return +sw.bb55: br label %return +sw.bb56: br label %return +sw.bb58: br label %return +sw.bb59: br label %return +sw.bb60: br label %return +sw.bb61: br label %return +sw.bb62: br label %return +sw.bb63: br label %return +sw.bb64: br label %return +sw.bb65: br label %return +sw.bb66: br label %return +sw.bb67: br label %return +sw.bb68: br label %return +sw.bb69: br label %return +sw.bb70: br label %return +sw.bb71: br label %return +sw.bb72: br label %return +sw.bb73: br label %return +sw.bb74: br label %return +sw.bb76: br label %return +sw.bb77: br label %return +sw.bb78: br label %return +sw.bb79: br label %return +sw.bb80: br label %return +sw.bb81: br label %return +sw.bb82: br label %return +sw.bb83: br label %return +sw.bb84: br label %return +sw.bb85: br label %return +sw.bb86: br label %return +sw.bb87: br label %return +sw.bb88: br label %return +sw.bb89: br label %return +sw.bb90: br label %return +sw.bb91: br label %return +sw.bb92: br label %return +sw.bb93: br label %return +sw.bb94: br label %return +sw.bb95: br label %return +sw.bb96: br label %return +sw.bb97: br label %return +sw.bb98: br label %return +sw.bb99: br label %return +sw.bb100: br label %return +sw.bb101: 
br label %return +sw.bb102: br label %return +sw.bb103: br label %return +sw.bb104: br label %return +sw.bb105: br label %return +sw.bb106: br label %return +sw.bb107: br label %return +sw.bb108: br label %return +sw.bb109: br label %return +sw.bb110: br label %return +sw.bb111: br label %return +sw.bb112: br label %return +sw.bb113: br label %return +sw.bb114: br label %return +sw.bb115: br label %return +sw.bb116: br label %return +sw.bb117: br label %return +sw.bb118: br label %return +sw.bb119: br label %return +sw.bb120: br label %return +sw.bb121: br label %return +sw.bb122: br label %return +sw.bb123: br label %return +sw.bb124: br label %return +sw.bb125: br label %return +sw.bb126: br label %return +sw.bb127: br label %return +sw.bb128: br label %return +sw.bb129: br label %return +sw.bb130: br label %return +sw.bb131: br label %return +sw.bb132: br label %return +sw.bb133: br label %return +sw.bb134: br label %return +sw.bb135: br label %return +sw.bb136: br label %return +sw.bb137: br label %return +sw.bb138: br label %return +sw.bb139: br label %return +sw.bb140: br label %return +sw.bb141: br label %return +sw.bb142: br label %return +sw.bb143: br label %return +sw.bb144: br label %return +sw.bb145: br label %return +sw.bb146: br label %return +sw.bb147: br label %return +sw.bb148: br label %return +sw.bb149: br label %return +sw.bb150: br label %return +sw.bb151: br label %return +sw.bb152: br label %return +sw.bb153: br label %return +sw.bb154: br label %return +sw.bb155: br label %return +sw.bb156: br label %return +sw.bb157: br label %return +sw.bb158: br label %return +sw.bb159: br label %return +sw.bb160: br label %return +sw.bb161: br label %return +sw.bb162: br label %return +sw.bb163: br label %return +sw.bb164: br label %return +sw.bb165: br label %return +sw.bb166: br label %return +sw.bb167: br label %return +sw.bb168: br label %return +sw.bb169: br label %return +sw.bb170: br label %return +sw.bb171: br label %return +sw.bb172: br label 
%return +sw.bb173: br label %return +sw.bb174: br label %return +sw.bb175: br label %return +sw.bb176: br label %return +sw.bb177: br label %return +sw.bb178: br label %return +sw.bb179: br label %return +sw.bb180: br label %return +sw.bb181: br label %return +sw.bb182: br label %return +sw.bb183: br label %return +sw.bb184: br label %return +sw.bb185: br label %return +sw.bb186: br label %return +sw.bb187: br label %return +sw.bb188: br label %return +sw.bb189: br label %return +sw.bb190: br label %return +sw.bb191: br label %return +sw.bb192: br label %return +sw.bb193: br label %return +sw.bb194: br label %return +sw.bb195: br label %return +sw.bb196: br label %return +sw.bb197: br label %return +sw.bb198: br label %return +sw.bb199: br label %return +sw.bb200: br label %return +sw.bb201: br label %return +sw.bb202: br label %return +sw.bb203: br label %return + +return: + %retval.0 = phi i32 [ 39204, %sw.bb202 ], [ 38809, %sw.bb201 ], [ 38416, %sw.bb200 ], [ 38025, %sw.bb199 ], [ 37636, %sw.bb198 ], [ 37249, %sw.bb197 ], [ 36864, %sw.bb196 ], [ 36481, %sw.bb195 ], [ 36100, %sw.bb194 ], [ 35721, %sw.bb193 ], [ 35344, %sw.bb192 ], [ 34969, %sw.bb191 ], [ 34596, %sw.bb190 ], [ 34225, %sw.bb189 ], [ 33856, %sw.bb188 ], [ 33489, %sw.bb187 ], [ 33124, %sw.bb186 ], [ 32761, %sw.bb185 ], [ 32400, %sw.bb184 ], [ 32041, %sw.bb183 ], [ 31684, %sw.bb182 ], [ 31329, %sw.bb181 ], [ 30976, %sw.bb180 ], [ 30625, %sw.bb179 ], [ 30276, %sw.bb178 ], [ 29929, %sw.bb177 ], [ 29584, %sw.bb176 ], [ 29241, %sw.bb175 ], [ 28900, %sw.bb174 ], [ 28561, %sw.bb173 ], [ 28224, %sw.bb172 ], [ 27889, %sw.bb171 ], [ 27556, %sw.bb170 ], [ 27225, %sw.bb169 ], [ 26896, %sw.bb168 ], [ 26569, %sw.bb167 ], [ 26244, %sw.bb166 ], [ 25921, %sw.bb165 ], [ 25600, %sw.bb164 ], [ 25281, %sw.bb163 ], [ 24964, %sw.bb162 ], [ 24649, %sw.bb161 ], [ 24336, %sw.bb160 ], [ 24025, %sw.bb159 ], [ 23716, %sw.bb158 ], [ 23409, %sw.bb157 ], [ 23104, %sw.bb156 ], [ 22801, %sw.bb155 ], [ 22500, %sw.bb154 ], [ 22201, 
%sw.bb153 ], [ 21904, %sw.bb152 ], [ 21609, %sw.bb151 ], [ 21316, %sw.bb150 ], [ 21025, %sw.bb149 ], [ 20736, %sw.bb148 ], [ 20449, %sw.bb147 ], [ 20164, %sw.bb146 ], [ 19881, %sw.bb145 ], [ 19600, %sw.bb144 ], [ 19321, %sw.bb143 ], [ 19044, %sw.bb142 ], [ 18769, %sw.bb141 ], [ 18496, %sw.bb140 ], [ 18225, %sw.bb139 ], [ 17956, %sw.bb138 ], [ 17689, %sw.bb137 ], [ 17424, %sw.bb136 ], [ 17161, %sw.bb135 ], [ 16900, %sw.bb134 ], [ 16641, %sw.bb133 ], [ 16384, %sw.bb132 ], [ 16129, %sw.bb131 ], [ 15876, %sw.bb130 ], [ 15625, %sw.bb129 ], [ 15376, %sw.bb128 ], [ 15129, %sw.bb127 ], [ 14884, %sw.bb126 ], [ 14641, %sw.bb125 ], [ 14400, %sw.bb124 ], [ 14161, %sw.bb123 ], [ 13924, %sw.bb122 ], [ 13689, %sw.bb121 ], [ 13456, %sw.bb120 ], [ 13225, %sw.bb119 ], [ 12996, %sw.bb118 ], [ 12769, %sw.bb117 ], [ 12544, %sw.bb116 ], [ 12321, %sw.bb115 ], [ 12100, %sw.bb114 ], [ 11881, %sw.bb113 ], [ 11664, %sw.bb112 ], [ 11449, %sw.bb111 ], [ 11236, %sw.bb110 ], [ 11025, %sw.bb109 ], [ 10816, %sw.bb108 ], [ 10609, %sw.bb107 ], [ 10404, %sw.bb106 ], [ 10201, %sw.bb105 ], [ 10000, %sw.bb104 ], [ 9801, %sw.bb103 ], [ 9604, %sw.bb102 ], [ 9409, %sw.bb101 ], [ 9216, %sw.bb100 ], [ 9025, %sw.bb99 ], [ 8836, %sw.bb98 ], [ 8649, %sw.bb97 ], [ 8464, %sw.bb96 ], [ 8281, %sw.bb95 ], [ 8100, %sw.bb94 ], [ 7921, %sw.bb93 ], [ 7744, %sw.bb92 ], [ 7569, %sw.bb91 ], [ 7396, %sw.bb90 ], [ 7225, %sw.bb89 ], [ 7056, %sw.bb88 ], [ 6889, %sw.bb87 ], [ 6724, %sw.bb86 ], [ 6561, %sw.bb85 ], [ 6400, %sw.bb84 ], [ 6241, %sw.bb83 ], [ 6084, %sw.bb82 ], [ 5929, %sw.bb81 ], [ 5776, %sw.bb80 ], [ 5625, %sw.bb79 ], [ 5476, %sw.bb78 ], [ 5329, %sw.bb77 ], [ 5184, %sw.bb76 ], [ 5112, %sw.bb74 ], [ 4900, %sw.bb73 ], [ 4761, %sw.bb72 ], [ 4624, %sw.bb71 ], [ 4489, %sw.bb70 ], [ 4356, %sw.bb69 ], [ 4225, %sw.bb68 ], [ 4096, %sw.bb67 ], [ 3969, %sw.bb66 ], [ 3844, %sw.bb65 ], [ 3721, %sw.bb64 ], [ 3600, %sw.bb63 ], [ 3481, %sw.bb62 ], [ 3364, %sw.bb61 ], [ 3249, %sw.bb60 ], [ 3136, %sw.bb59 ], [ 3025, %sw.bb58 ], [ 
2970, %sw.bb56 ], [ 2809, %sw.bb55 ], [ 2704, %sw.bb54 ], [ 2601, %sw.bb53 ], [ 2500, %sw.bb52 ], [ 2401, %sw.bb51 ], [ 2304, %sw.bb50 ], [ 2209, %sw.bb49 ], [ 2116, %sw.bb48 ], [ 2025, %sw.bb47 ], [ 1980, %sw.bb45 ], [ 1849, %sw.bb44 ], [ 1764, %sw.bb43 ], [ 1681, %sw.bb42 ], [ 1600, %sw.bb41 ], [ 1521, %sw.bb40 ], [ 1444, %sw.bb39 ], [ 1369, %sw.bb38 ], [ 1296, %sw.bb37 ], [ 1260, %sw.bb35 ], [ 1156, %sw.bb34 ], [ 1089, %sw.bb33 ], [ 1024, %sw.bb32 ], [ 961, %sw.bb31 ], [ 900, %sw.bb30 ], [ 841, %sw.bb29 ], [ 784, %sw.bb28 ], [ 729, %sw.bb27 ], [ 676, %sw.bb26 ], [ 625, %sw.bb25 ], [ 576, %sw.bb24 ], [ 529, %sw.bb23 ], [ 484, %sw.bb22 ], [ 441, %sw.bb21 ], [ 400, %sw.bb20 ], [ 361, %sw.bb19 ], [ 342, %sw.bb18 ], [ 289, %sw.bb17 ], [ 256, %sw.bb16 ], [ 225, %sw.bb15 ], [ 196, %sw.bb14 ], [ 169, %sw.bb13 ], [ 144, %sw.bb12 ], [ 121, %sw.bb11 ], [ 100, %sw.bb10 ], [ 81, %sw.bb9 ], [ 64, %sw.bb8 ], [ 49, %sw.bb7 ], [ 36, %sw.bb6 ], [ 25, %sw.bb5 ], [ 16, %sw.bb4 ], [ 9, %sw.bb3 ], [ 4, %sw.bb2 ], [ 1, %sw.bb1 ], [ 39601, %sw.bb203 ], [ 0, %if.end ] + ret i32 %retval.0 +} + +define i32 @cprop(i32 %x) { +entry: + switch i32 %x, label %sw.default [ + i32 1, label %return + i32 2, label %sw.bb1 + i32 3, label %sw.bb2 + i32 4, label %sw.bb2 + i32 5, label %sw.bb2 + i32 6, label %sw.bb3 + i32 7, label %sw.bb3 + ] + +sw.bb1: br label %return + +sw.bb2: + %and = and i32 %x, 1 + %tobool = icmp ne i32 %and, 0 + %cond = select i1 %tobool, i32 -123, i32 456 + %sub = sub nsw i32 %x, %cond + br label %return + +sw.bb3: + %trunc = trunc i32 %x to i8 + %sext = sext i8 %trunc to i32 + br label %return + +sw.default: + br label %return + +return: + %retval.0 = phi i32 [ 123, %sw.default ], [ %sext, %sw.bb3 ], [ %sub, %sw.bb2 ], [ 42, %sw.bb1 ], [ 5, %entry ] + ret i32 %retval.0 + +; CHECK: @cprop +; CHECK: switch.lookup: +; CHECK: %switch.gep = getelementptr inbounds [7 x i32]* @switch.table5, i32 0, i32 %switch.tableidx +} + +define i32 @unreachable(i32 %x) { +entry: + switch i32 %x, 
label %sw.default [ + i32 0, label %sw.bb + i32 1, label %sw.bb + i32 2, label %sw.bb + i32 3, label %sw.bb1 + i32 4, label %sw.bb2 + i32 5, label %sw.bb3 + i32 6, label %sw.bb3 + i32 7, label %sw.bb3 + i32 8, label %sw.bb3 + ] + +sw.bb: br label %return +sw.bb1: unreachable +sw.bb2: br label %return +sw.bb3: br label %return +sw.default: unreachable + +return: + %retval.0 = phi i32 [ 1, %sw.bb3 ], [ -1, %sw.bb2 ], [ 0, %sw.bb ] + ret i32 %retval.0 + +; CHECK: @unreachable +; CHECK: switch.lookup: +; CHECK: getelementptr inbounds [5 x i32]* @switch.table6, i32 0, i32 %switch.tableidx +} diff --git a/test/Transforms/SimplifyCFG/phi-undef-loadstore.ll b/test/Transforms/SimplifyCFG/phi-undef-loadstore.ll index 65d888e..028fb07 100644 --- a/test/Transforms/SimplifyCFG/phi-undef-loadstore.ll +++ b/test/Transforms/SimplifyCFG/phi-undef-loadstore.ll @@ -85,3 +85,31 @@ if.end7: ; preds = %if.else, %if.then4, ; CHECK: if.end7: ; CHECK: phi i32* [ %a, %if.then ], [ null, %if.then4 ], [ null, %if.else ] } + +define i32 @test4(i32* %a, i32 %b, i32* %c, i32 %d) nounwind { +entry: + %tobool = icmp eq i32 %b, 0 + br i1 %tobool, label %if.else, label %if.then + +if.then: ; preds = %entry + tail call void @bar() nounwind + br label %if.end7 + +if.else: ; preds = %entry + %tobool3 = icmp eq i32 %d, 0 + br i1 %tobool3, label %if.end7, label %if.then4 + +if.then4: ; preds = %if.else + tail call void @bar() nounwind + br label %if.end7 + +if.end7: ; preds = %if.else, %if.then4, %if.then + %x.0 = phi i32* [ %a, %if.then ], [ null, %if.then4 ], [ null, %if.else ] + %gep = getelementptr i32* %x.0, i32 10 + %tmp9 = load i32* %gep + %tmp10 = or i32 %tmp9, 1 + store i32 %tmp10, i32* %gep + ret i32 %tmp9 +; CHECK: @test4 +; CHECK-NOT: phi +} diff --git a/test/Transforms/SimplifyCFG/preserve-branchweights-partial.ll b/test/Transforms/SimplifyCFG/preserve-branchweights-partial.ll new file mode 100644 index 0000000..53d5448 --- /dev/null +++ 
b/test/Transforms/SimplifyCFG/preserve-branchweights-partial.ll @@ -0,0 +1,37 @@ +; RUN: opt -simplifycfg -S -o - < %s | FileCheck %s + +; This test case was written to trigger an incorrect assert statement in +; -simplifycfg. Thus we don't actually want to check the output, just that +; -simplifycfg ran successfully. Thus we only check that the function still +; exists, and that it still calls foo(). +; +; NOTE: There are some obviously dead blocks and missing branch weight +; metadata. Both of these features were key to triggering the assert. +; Additionally, the not-taken weight of the branch with a weight had to +; be 0 to trigger the assert. + +declare void @foo() nounwind uwtable + +define void @func(i32 %A) nounwind uwtable { +; CHECK: define void @func +entry: + %cmp11 = icmp eq i32 %A, 1 + br i1 %cmp11, label %if.then, label %if.else, !prof !0 + +if.then: + call void @foo() +; CHECK: call void @foo() + br label %if.else + +if.else: + %cmp17 = icmp eq i32 %A, 2 + br i1 %cmp17, label %if.then2, label %if.end + +if.then2: + br label %if.end + +if.end: + ret void +} + +!0 = metadata !{metadata !"branch_weights", i32 1, i32 0} diff --git a/test/Transforms/SimplifyCFG/preserve-branchweights-switch-create.ll b/test/Transforms/SimplifyCFG/preserve-branchweights-switch-create.ll new file mode 100644 index 0000000..941f5ad --- /dev/null +++ b/test/Transforms/SimplifyCFG/preserve-branchweights-switch-create.ll @@ -0,0 +1,140 @@ +; RUN: opt -simplifycfg -S -o - < %s | FileCheck %s + +declare void @func2(i32) +declare void @func4(i32) +declare void @func6(i32) +declare void @func8(i32) + +;; test1 - create a switch with case 2 and case 4 from two branches: N == 2 +;; and N == 4. 
+define void @test1(i32 %N) nounwind uwtable { +entry: + %cmp = icmp eq i32 %N, 2 + br i1 %cmp, label %if.then, label %if.else, !prof !0 +; CHECK: test1 +; CHECK: switch i32 %N +; CHECK: ], !prof !0 + +if.then: + call void @func2(i32 %N) nounwind + br label %if.end9 + +if.else: + %cmp2 = icmp eq i32 %N, 4 + br i1 %cmp2, label %if.then7, label %if.else8, !prof !1 + +if.then7: + call void @func4(i32 %N) nounwind + br label %if.end + +if.else8: + call void @func8(i32 %N) nounwind + br label %if.end + +if.end: + br label %if.end9 + +if.end9: + ret void +} + +;; test2 - Merge two switches where PredDefault == BB. +define void @test2(i32 %M, i32 %N) nounwind uwtable { +entry: + %cmp = icmp sgt i32 %M, 2 + br i1 %cmp, label %sw1, label %sw2 + +sw1: + switch i32 %N, label %sw2 [ + i32 2, label %sw.bb + i32 3, label %sw.bb1 + ], !prof !2 +; CHECK: test2 +; CHECK: switch i32 %N, label %sw.epilog +; CHECK: i32 2, label %sw.bb +; CHECK: i32 3, label %sw.bb1 +; CHECK: i32 4, label %sw.bb5 +; CHECK: ], !prof !1 + +sw.bb: + call void @func2(i32 %N) nounwind + br label %sw.epilog + +sw.bb1: + call void @func4(i32 %N) nounwind + br label %sw.epilog + +sw2: +;; Here "case 2" is invalidated if control is transferred through default case +;; of the first switch. + switch i32 %N, label %sw.epilog [ + i32 2, label %sw.bb4 + i32 4, label %sw.bb5 + ], !prof !3 + +sw.bb4: + call void @func6(i32 %N) nounwind + br label %sw.epilog + +sw.bb5: + call void @func8(i32 %N) nounwind + br label %sw.epilog + +sw.epilog: + ret void +} + +;; test3 - Merge two switches where PredDefault != BB. 
+define void @test3(i32 %M, i32 %N) nounwind uwtable { +entry: + %cmp = icmp sgt i32 %M, 2 + br i1 %cmp, label %sw1, label %sw2 + +sw1: + switch i32 %N, label %sw.bb [ + i32 2, label %sw2 + i32 3, label %sw2 + i32 1, label %sw.bb1 + ], !prof !4 +; CHECK: test3 +; CHECK: switch i32 %N, label %sw.bb +; CHECK: i32 1, label %sw.bb1 +; CHECK: i32 3, label %sw.bb4 +; CHECK: i32 2, label %sw.epilog +; CHECK: ], !prof !3 + +sw.bb: + call void @func2(i32 %N) nounwind + br label %sw.epilog + +sw.bb1: + call void @func4(i32 %N) nounwind + br label %sw.epilog + +sw2: + switch i32 %N, label %sw.epilog [ + i32 3, label %sw.bb4 + i32 4, label %sw.bb5 + ], !prof !5 + +sw.bb4: + call void @func6(i32 %N) nounwind + br label %sw.epilog + +sw.bb5: + call void @func8(i32 %N) nounwind + br label %sw.epilog + +sw.epilog: + ret void +} + +!0 = metadata !{metadata !"branch_weights", i32 64, i32 4} +!1 = metadata !{metadata !"branch_weights", i32 4, i32 64} +; CHECK: !0 = metadata !{metadata !"branch_weights", i32 256, i32 4352, i32 16} +!2 = metadata !{metadata !"branch_weights", i32 4, i32 4, i32 8} +!3 = metadata !{metadata !"branch_weights", i32 8, i32 8, i32 4} +; CHECK: !1 = metadata !{metadata !"branch_weights", i32 32, i32 48, i32 96, i32 16} +!4 = metadata !{metadata !"branch_weights", i32 7, i32 6, i32 4, i32 3} +!5 = metadata !{metadata !"branch_weights", i32 17, i32 13, i32 9} +; CHECK: !3 = metadata !{metadata !"branch_weights", i32 7, i32 3, i32 4, i32 6} diff --git a/test/Transforms/SimplifyCFG/preserve-branchweights.ll b/test/Transforms/SimplifyCFG/preserve-branchweights.ll index c791785..beef527 100644 --- a/test/Transforms/SimplifyCFG/preserve-branchweights.ll +++ b/test/Transforms/SimplifyCFG/preserve-branchweights.ll @@ -79,10 +79,238 @@ Z: ret void } +;; test5 - The case where it jumps to the default target will be removed. 
+define void @test5(i32 %M, i32 %N) nounwind uwtable { +entry: + switch i32 %N, label %sw2 [ + i32 1, label %sw2 + i32 2, label %sw.bb + i32 3, label %sw.bb1 + ], !prof !3 +; CHECK: test5 +; CHECK: switch i32 %N, label %sw2 [ +; CHECK: i32 3, label %sw.bb1 +; CHECK: i32 2, label %sw.bb +; CHECK: ], !prof !2 + +sw.bb: + call void @helper(i32 0) + br label %sw.epilog + +sw.bb1: + call void @helper(i32 1) + br label %sw.epilog + +sw2: + call void @helper(i32 2) + br label %sw.epilog + +sw.epilog: + ret void +} + +;; test6 - Some cases of the second switch are pruned during optimization. +;; Then the second switch will be converted to a branch, finally, the first +;; switch and the branch will be merged into a single switch. +define void @test6(i32 %M, i32 %N) nounwind uwtable { +entry: + switch i32 %N, label %sw2 [ + i32 1, label %sw2 + i32 2, label %sw.bb + i32 3, label %sw.bb1 + ], !prof !4 +; CHECK: test6 +; CHECK: switch i32 %N, label %sw.epilog +; CHECK: i32 3, label %sw.bb1 +; CHECK: i32 2, label %sw.bb +; CHECK: i32 4, label %sw.bb5 +; CHECK: ], !prof !3 + +sw.bb: + call void @helper(i32 0) + br label %sw.epilog + +sw.bb1: + call void @helper(i32 1) + br label %sw.epilog + +sw2: +;; Here "case 2" is invalidated since the default case of the first switch +;; does not include "case 2". + switch i32 %N, label %sw.epilog [ + i32 2, label %sw.bb4 + i32 4, label %sw.bb5 + ], !prof !5 + +sw.bb4: + call void @helper(i32 2) + br label %sw.epilog + +sw.bb5: + call void @helper(i32 3) + br label %sw.epilog + +sw.epilog: + ret void +} + +;; This test is based on test1 but swapped the targets of the second branch. 
+define void @test1_swap(i1 %a, i1 %b) { +; CHECK: @test1_swap +entry: + br i1 %a, label %Y, label %X, !prof !0 +; CHECK: br i1 %or.cond, label %Y, label %Z, !prof !4 + +X: + %c = or i1 %b, false + br i1 %c, label %Y, label %Z, !prof !1 + +Y: + call void @helper(i32 0) + ret void + +Z: + call void @helper(i32 1) + ret void +} + +define void @test7(i1 %a, i1 %b) { +; CHECK: @test7 +entry: + %c = or i1 %b, false + br i1 %a, label %Y, label %X, !prof !0 +; CHECK: br i1 %brmerge, label %Y, label %Z, !prof !5 + +X: + br i1 %c, label %Y, label %Z, !prof !6 + +Y: + call void @helper(i32 0) + ret void + +Z: + call void @helper(i32 1) + ret void +} + +; Test basic folding to a conditional branch. +define void @test8(i64 %x, i64 %y) nounwind { +; CHECK: @test8 +entry: + %lt = icmp slt i64 %x, %y +; CHECK: br i1 %lt, label %a, label %b, !prof !6 + %qux = select i1 %lt, i32 0, i32 2 + switch i32 %qux, label %bees [ + i32 0, label %a + i32 1, label %b + i32 2, label %b + ], !prof !7 +a: + call void @helper(i32 0) nounwind + ret void +b: + call void @helper(i32 1) nounwind + ret void +bees: + call void @helper(i32 2) nounwind + ret void +} + +; Test edge splitting when the default target has icmp and unconditinal +; branch +define i1 @test9(i32 %x, i32 %y) nounwind { +; CHECK: @test9 +entry: + switch i32 %x, label %bees [ + i32 0, label %a + i32 1, label %end + i32 2, label %end + ], !prof !7 +; CHECK: switch i32 %x, label %bees [ +; CHECK: i32 0, label %a +; CHECK: i32 1, label %end +; CHECK: i32 2, label %end +; CHECK: i32 92, label %end +; CHECK: ], !prof !7 + +a: + call void @helper(i32 0) nounwind + %reta = icmp slt i32 %x, %y + ret i1 %reta + +bees: + %tmp = icmp eq i32 %x, 92 + br label %end + +end: +; CHECK: end: +; CHECK: %ret = phi i1 [ true, %entry ], [ false, %bees ], [ true, %entry ], [ true, %entry ] + %ret = phi i1 [ true, %entry ], [%tmp, %bees], [true, %entry] + call void @helper(i32 2) nounwind + ret i1 %ret +} + +define void @test10(i32 %x) nounwind readnone 
ssp noredzone { +entry: + switch i32 %x, label %lor.rhs [ + i32 2, label %lor.end + i32 1, label %lor.end + i32 3, label %lor.end + ], !prof !7 + +lor.rhs: + call void @helper(i32 1) nounwind + ret void + +lor.end: + call void @helper(i32 0) nounwind + ret void + +; CHECK: test10 +; CHECK: %x.off = add i32 %x, -1 +; CHECK: %switch = icmp ult i32 %x.off, 3 +; CHECK: br i1 %switch, label %lor.end, label %lor.rhs, !prof !8 +} + +; Remove dead cases from the switch. +define void @test11(i32 %x) nounwind { + %i = shl i32 %x, 1 + switch i32 %i, label %a [ + i32 21, label %b + i32 24, label %c + ], !prof !8 +; CHECK: %cond = icmp eq i32 %i, 24 +; CHECK: br i1 %cond, label %c, label %a, !prof !9 + +a: + call void @helper(i32 0) nounwind + ret void +b: + call void @helper(i32 1) nounwind + ret void +c: + call void @helper(i32 2) nounwind + ret void +} + !0 = metadata !{metadata !"branch_weights", i32 3, i32 5} !1 = metadata !{metadata !"branch_weights", i32 1, i32 1} !2 = metadata !{metadata !"branch_weights", i32 1, i32 2} +!3 = metadata !{metadata !"branch_weights", i32 4, i32 3, i32 2, i32 1} +!4 = metadata !{metadata !"branch_weights", i32 4, i32 3, i32 2, i32 1} +!5 = metadata !{metadata !"branch_weights", i32 7, i32 6, i32 5} +!6 = metadata !{metadata !"branch_weights", i32 1, i32 3} +!7 = metadata !{metadata !"branch_weights", i32 33, i32 9, i32 8, i32 7} +!8 = metadata !{metadata !"branch_weights", i32 33, i32 9, i32 8} ; CHECK: !0 = metadata !{metadata !"branch_weights", i32 5, i32 11} ; CHECK: !1 = metadata !{metadata !"branch_weights", i32 1, i32 5} -; CHECK-NOT: !2 +; CHECK: !2 = metadata !{metadata !"branch_weights", i32 7, i32 1, i32 2} +; CHECK: !3 = metadata !{metadata !"branch_weights", i32 49, i32 12, i32 24, i32 35} +; CHECK: !4 = metadata !{metadata !"branch_weights", i32 11, i32 5} +; CHECK: !5 = metadata !{metadata !"branch_weights", i32 17, i32 15} +; CHECK: !6 = metadata !{metadata !"branch_weights", i32 9, i32 7} +; CHECK: !7 = metadata !{metadata 
!"branch_weights", i32 17, i32 9, i32 8, i32 7, i32 17} +; CHECK: !8 = metadata !{metadata !"branch_weights", i32 24, i32 33} +; CHECK: !9 = metadata !{metadata !"branch_weights", i32 8, i32 33} +; CHECK-NOT: !9 diff --git a/test/Transforms/SimplifyCFG/sink-common-code.ll b/test/Transforms/SimplifyCFG/sink-common-code.ll new file mode 100644 index 0000000..28d7279 --- /dev/null +++ b/test/Transforms/SimplifyCFG/sink-common-code.ll @@ -0,0 +1,53 @@ +; RUN: opt < %s -simplifycfg -S | FileCheck %s + +define zeroext i1 @test1(i1 zeroext %flag, i32 %blksA, i32 %blksB, i32 %nblks) { +entry: + br i1 %flag, label %if.then, label %if.else + +; CHECK: test1 +; CHECK: add +; CHECK: select +; CHECK: icmp +; CHECK-NOT: br +if.then: + %cmp = icmp uge i32 %blksA, %nblks + %frombool1 = zext i1 %cmp to i8 + br label %if.end + +if.else: + %add = add i32 %nblks, %blksB + %cmp2 = icmp ule i32 %add, %blksA + %frombool3 = zext i1 %cmp2 to i8 + br label %if.end + +if.end: + %obeys.0 = phi i8 [ %frombool1, %if.then ], [ %frombool3, %if.else ] + %tobool4 = icmp ne i8 %obeys.0, 0 + ret i1 %tobool4 +} + +define zeroext i1 @test2(i1 zeroext %flag, i32 %blksA, i32 %blksB, i32 %nblks) { +entry: + br i1 %flag, label %if.then, label %if.else + +; CHECK: test2 +; CHECK: add +; CHECK: select +; CHECK: icmp +; CHECK-NOT: br +if.then: + %cmp = icmp uge i32 %blksA, %nblks + %frombool1 = zext i1 %cmp to i8 + br label %if.end + +if.else: + %add = add i32 %nblks, %blksB + %cmp2 = icmp uge i32 %blksA, %add + %frombool3 = zext i1 %cmp2 to i8 + br label %if.end + +if.end: + %obeys.0 = phi i8 [ %frombool1, %if.then ], [ %frombool3, %if.else ] + %tobool4 = icmp ne i8 %obeys.0, 0 + ret i1 %tobool4 +} diff --git a/test/Transforms/SimplifyLibCalls/2009-02-12-StrTo.ll b/test/Transforms/SimplifyLibCalls/2009-02-12-StrTo.ll deleted file mode 100644 index 2717228..0000000 --- a/test/Transforms/SimplifyLibCalls/2009-02-12-StrTo.ll +++ /dev/null @@ -1,14 +0,0 @@ -; RUN: opt < %s -simplify-libcalls -S | FileCheck %s - 
-; Test that we add nocapture to the declaration, and to the second call only. - -; CHECK: declare float @strtol(i8*, i8** nocapture, i32) nounwind -declare float @strtol(i8* %s, i8** %endptr, i32 %base) - -define void @foo(i8* %x, i8** %endptr) { -; CHECK: call float @strtol(i8* %x, i8** %endptr, i32 10) - call float @strtol(i8* %x, i8** %endptr, i32 10) -; CHECK: %2 = call float @strtol(i8* nocapture %x, i8** null, i32 10) - call float @strtol(i8* %x, i8** null, i32 10) - ret void -} diff --git a/test/Transforms/SimplifyLibCalls/FFS.ll b/test/Transforms/SimplifyLibCalls/FFS.ll index e38d783..6aecbea 100644 --- a/test/Transforms/SimplifyLibCalls/FFS.ll +++ b/test/Transforms/SimplifyLibCalls/FFS.ll @@ -1,6 +1,7 @@ -; Test that the ToAsciiOptimizer works correctly -; RUN: opt < %s -simplify-libcalls -S | \ -; RUN: not grep "call.*@ffs" +; Test that FFSOpt works correctly +; RUN: opt < %s -simplify-libcalls -S | FileCheck %s + +; CHECK-NOT: call{{.*}}@ffs @non_const = external global i32 ; <i32*> [#uses=1] @@ -34,3 +35,11 @@ define i32 @a(i64) nounwind { %2 = call i32 @ffsll(i64 %0) ; <i32> [#uses=1] ret i32 %2 } + +; PR13028 +define i32 @b() nounwind { + %ffs = call i32 @ffsll(i64 0) + ret i32 %ffs +; CHECK: @b +; CHECK-NEXT: ret i32 0 +} diff --git a/test/Transforms/SimplifyLibCalls/StpCpy.ll b/test/Transforms/SimplifyLibCalls/StpCpy.ll deleted file mode 100644 index 914b095..0000000 --- a/test/Transforms/SimplifyLibCalls/StpCpy.ll +++ /dev/null @@ -1,43 +0,0 @@ -; Test that the StpCpyOptimizer works correctly -; RUN: opt < %s -simplify-libcalls -S | FileCheck %s - -; This transformation requires the pointer size, as it assumes that size_t is -; the size of a pointer. 
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32" - -@hello = constant [6 x i8] c"hello\00" - -declare i8* @stpcpy(i8*, i8*) - -declare i8* @__stpcpy_chk(i8*, i8*, i32) nounwind - -declare i32 @llvm.objectsize.i32(i8*, i1) nounwind readonly - -define i32 @t1() { -; CHECK: @t1 - %target = alloca [1024 x i8] - %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0 - %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0 - %rslt1 = call i8* @stpcpy( i8* %arg1, i8* %arg2 ) -; CHECK: @llvm.memcpy.p0i8.p0i8.i32 - ret i32 0 -} - -define i32 @t2() { -; CHECK: @t2 - %target = alloca [1024 x i8] - %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0 - %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0 - %tmp1 = call i32 @llvm.objectsize.i32(i8* %arg1, i1 false) - %rslt1 = call i8* @__stpcpy_chk(i8* %arg1, i8* %arg2, i32 %tmp1) -; CHECK: @__memcpy_chk - ret i32 0 -} - -define i8* @t3(i8* %arg) { -; CHECK: @t3 - %stpcpy = tail call i8* @stpcpy(i8* %arg, i8* %arg) -; CHECK: [[LEN:%[a-z]+]] = call i32 @strlen(i8* %arg) -; CHECK-NEXT: getelementptr inbounds i8* %arg, i32 [[LEN]] - ret i8* %stpcpy -} diff --git a/test/Transforms/SimplifyLibCalls/StrCat.ll b/test/Transforms/SimplifyLibCalls/StrCat.ll deleted file mode 100644 index 3ea691a..0000000 --- a/test/Transforms/SimplifyLibCalls/StrCat.ll +++ /dev/null @@ -1,33 +0,0 @@ -; Test that the StrCatOptimizer works correctly -; PR3661 -; RUN: opt < %s -simplify-libcalls -S | \ -; RUN: not grep "call.*strcat" -; RUN: opt < %s -simplify-libcalls -S | \ -; RUN: grep "puts.*%arg1" - -; This transformation requires the pointer size, as it assumes that size_t is -; the size of a pointer. 
-target datalayout = "-p:64:64:64" - -@hello = constant [6 x i8] c"hello\00" ; <[6 x i8]*> [#uses=1] -@null = constant [1 x i8] zeroinitializer ; <[1 x i8]*> [#uses=1] -@null_hello = constant [7 x i8] c"\00hello\00" ; <[7 x i8]*> [#uses=1] - -declare i8* @strcat(i8*, i8*) - -declare i32 @puts(i8*) - -define i32 @main() { - %target = alloca [1024 x i8] ; <[1024 x i8]*> [#uses=1] - %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0 ; <i8*> [#uses=2] - store i8 0, i8* %arg1 - %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0 ; <i8*> [#uses=1] - %rslt1 = call i8* @strcat( i8* %arg1, i8* %arg2 ) ; <i8*> [#uses=1] - %arg3 = getelementptr [1 x i8]* @null, i32 0, i32 0 ; <i8*> [#uses=1] - %rslt2 = call i8* @strcat( i8* %rslt1, i8* %arg3 ) ; <i8*> [#uses=1] - %arg4 = getelementptr [7 x i8]* @null_hello, i32 0, i32 0 ; <i8*> [#uses=1] - %rslt3 = call i8* @strcat( i8* %rslt2, i8* %arg4 ) ; <i8*> [#uses=1] - call i32 @puts( i8* %rslt3 ) ; <i32>:1 [#uses=0] - ret i32 0 -} - diff --git a/test/Transforms/SimplifyLibCalls/StrChr.ll b/test/Transforms/SimplifyLibCalls/StrChr.ll deleted file mode 100644 index eaabeb2..0000000 --- a/test/Transforms/SimplifyLibCalls/StrChr.ll +++ /dev/null @@ -1,26 +0,0 @@ -; Test that the StrChrOptimizer works correctly -; RUN: opt < %s -simplify-libcalls -S | FileCheck %s - -; This transformation requires the pointer size, as it assumes that size_t is -; the size of a pointer. 
-target datalayout = "-p:64:64:64" - -@hello = constant [14 x i8] c"hello world\5Cn\00" -@null = constant [1 x i8] zeroinitializer - -declare i8* @strchr(i8*, i32) - -define i32 @foo(i32 %index) { - %hello_p = getelementptr [14 x i8]* @hello, i32 0, i32 0 - %null_p = getelementptr [1 x i8]* @null, i32 0, i32 0 - %world = call i8* @strchr(i8* %hello_p, i32 119) -; CHECK: getelementptr i8* %hello_p, i64 6 - %ignore = call i8* @strchr(i8* %null_p, i32 119) -; CHECK-NOT: call i8* strchr - %null = call i8* @strchr(i8* %hello_p, i32 0) -; CHECK: getelementptr i8* %hello_p, i64 13 - %result = call i8* @strchr(i8* %hello_p, i32 %index) -; CHECK: call i8* @memchr(i8* %hello_p, i32 %index, i64 14) - ret i32 %index -} - diff --git a/test/Transforms/SimplifyLibCalls/StrCmp.ll b/test/Transforms/SimplifyLibCalls/StrCmp.ll deleted file mode 100644 index 60854d7..0000000 --- a/test/Transforms/SimplifyLibCalls/StrCmp.ll +++ /dev/null @@ -1,65 +0,0 @@ -; Test that the StrCmpOptimizer works correctly -; RUN: opt < %s -simplify-libcalls -S | FileCheck %s - -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" - -@hello = constant [6 x i8] c"hello\00" ; <[6 x i8]*> [#uses=1] -@hell = constant [5 x i8] c"hell\00" ; <[5 x i8]*> [#uses=1] -@bell = constant [5 x i8] c"bell\00" ; <[5 x i8]*> [#uses=1] -@null = constant [1 x i8] zeroinitializer ; <[1 x i8]*> [#uses=1] - -declare i32 @strcmp(i8*, i8*) - -; strcmp("", x) -> -*x -define i32 @test1(i8* %str) { - %temp1 = call i32 @strcmp(i8* getelementptr inbounds ([1 x i8]* @null, i32 0, i32 0), i8* %str) - ret i32 %temp1 - ; CHECK: @test1 - ; CHECK: %strcmpload = load i8* %str - ; CHECK: %1 = zext i8 %strcmpload to i32 - ; CHECK: %temp1 = sub i32 0, %1 - ; CHECK: ret i32 %temp1 -} - -; strcmp(x, "") -> *x -define i32 @test2(i8* %str) { - %temp1 = call i32 @strcmp(i8* %str, i8* getelementptr inbounds ([1 x i8]* @null, i32 0, i32 0)) - ret i32 %temp1 - ; 
CHECK: @test2 - ; CHECK: %strcmpload = load i8* %str - ; CHECK: %temp1 = zext i8 %strcmpload to i32 - ; CHECK: ret i32 %temp1 -} - -; strcmp(x, y) -> cnst -define i32 @test3() { - %temp1 = call i32 @strcmp(i8* getelementptr inbounds ([5 x i8]* @hell, i32 0, i32 0), i8* getelementptr inbounds ([6 x i8]* @hello, i32 0, i32 0)) - ret i32 %temp1 - ; CHECK: @test3 - ; CHECK: ret i32 -1 -} -define i32 @test4() { - %temp1 = call i32 @strcmp(i8* getelementptr inbounds ([5 x i8]* @hell, i32 0, i32 0), i8* getelementptr inbounds ([1 x i8]* @null, i32 0, i32 0)) - ret i32 %temp1 - ; CHECK: @test4 - ; CHECK: ret i32 1 -} - -; strcmp(x, y) -> memcmp(x, y, <known length>) -; (This transform is rather difficult to trigger in a useful manner) -define i32 @test5(i1 %b) { - %sel = select i1 %b, i8* getelementptr inbounds ([5 x i8]* @hell, i32 0, i32 0), i8* getelementptr inbounds ([5 x i8]* @bell, i32 0, i32 0) - %temp1 = call i32 @strcmp(i8* getelementptr inbounds ([6 x i8]* @hello, i32 0, i32 0), i8* %sel) - ret i32 %temp1 - ; CHECK: @test5 - ; CHECK: %memcmp = call i32 @memcmp(i8* getelementptr inbounds ([6 x i8]* @hello, i32 0, i32 0), i8* %sel, i32 5) - ; CHECK: ret i32 %memcmp -} - -; strcmp(x,x) -> 0 -define i32 @test6(i8* %str) { - %temp1 = call i32 @strcmp(i8* %str, i8* %str) - ret i32 %temp1 - ; CHECK: @test6 - ; CHECK: ret i32 0 -} diff --git a/test/Transforms/SimplifyLibCalls/StrCpy.ll b/test/Transforms/SimplifyLibCalls/StrCpy.ll deleted file mode 100644 index 83406ff..0000000 --- a/test/Transforms/SimplifyLibCalls/StrCpy.ll +++ /dev/null @@ -1,37 +0,0 @@ -; Test that the StrCpyOptimizer works correctly -; RUN: opt < %s -simplify-libcalls -S | FileCheck %s - -; This transformation requires the pointer size, as it assumes that size_t is -; the size of a pointer. 
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32" - -@hello = constant [6 x i8] c"hello\00" - -declare i8* @strcpy(i8*, i8*) - -declare i8* @__strcpy_chk(i8*, i8*, i32) nounwind - -declare i32 @llvm.objectsize.i32(i8*, i1) nounwind readonly - -; rdar://6839935 - -define i32 @t1() { -; CHECK: @t1 - %target = alloca [1024 x i8] - %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0 - %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0 - %rslt1 = call i8* @strcpy( i8* %arg1, i8* %arg2 ) -; CHECK: @llvm.memcpy.p0i8.p0i8.i32 - ret i32 0 -} - -define i32 @t2() { -; CHECK: @t2 - %target = alloca [1024 x i8] - %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0 - %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0 - %tmp1 = call i32 @llvm.objectsize.i32(i8* %arg1, i1 false) - %rslt1 = call i8* @__strcpy_chk(i8* %arg1, i8* %arg2, i32 %tmp1) -; CHECK: @__memcpy_chk - ret i32 0 -} diff --git a/test/Transforms/SimplifyLibCalls/StrLen.ll b/test/Transforms/SimplifyLibCalls/StrLen.ll deleted file mode 100644 index 4a20bbd..0000000 --- a/test/Transforms/SimplifyLibCalls/StrLen.ll +++ /dev/null @@ -1,62 +0,0 @@ -; Test that the StrCatOptimizer works correctly -; RUN: opt < %s -simplify-libcalls -S | \ -; RUN: not grep "call.*strlen" - -target datalayout = "e-p:32:32" -@hello = constant [6 x i8] c"hello\00" ; <[6 x i8]*> [#uses=3] -@null = constant [1 x i8] zeroinitializer ; <[1 x i8]*> [#uses=3] -@null_hello = constant [7 x i8] c"\00hello\00" ; <[7 x i8]*> [#uses=1] -@nullstring = constant i8 0 - -declare i32 @strlen(i8*) - -define i32 @test1() { - %hello_p = getelementptr [6 x i8]* @hello, i32 0, i32 0 ; <i8*> [#uses=1] - %hello_l = call i32 @strlen( i8* %hello_p ) ; <i32> [#uses=1] - ret i32 %hello_l -} - -define i32 @test2() { - %null_p = getelementptr [1 x i8]* @null, i32 0, i32 0 ; <i8*> [#uses=1] - %null_l = call i32 @strlen( i8* %null_p ) ; <i32> [#uses=1] - ret 
i32 %null_l -} - -define i32 @test3() { - %null_hello_p = getelementptr [7 x i8]* @null_hello, i32 0, i32 0 ; <i8*> [#uses=1] - %null_hello_l = call i32 @strlen( i8* %null_hello_p ) ; <i32> [#uses=1] - ret i32 %null_hello_l -} - -define i1 @test4() { - %hello_p = getelementptr [6 x i8]* @hello, i32 0, i32 0 ; <i8*> [#uses=1] - %hello_l = call i32 @strlen( i8* %hello_p ) ; <i32> [#uses=1] - %eq_hello = icmp eq i32 %hello_l, 0 ; <i1> [#uses=1] - ret i1 %eq_hello -} - -define i1 @test5() { - %null_p = getelementptr [1 x i8]* @null, i32 0, i32 0 ; <i8*> [#uses=1] - %null_l = call i32 @strlen( i8* %null_p ) ; <i32> [#uses=1] - %eq_null = icmp eq i32 %null_l, 0 ; <i1> [#uses=1] - ret i1 %eq_null -} - -define i1 @test6() { - %hello_p = getelementptr [6 x i8]* @hello, i32 0, i32 0 ; <i8*> [#uses=1] - %hello_l = call i32 @strlen( i8* %hello_p ) ; <i32> [#uses=1] - %ne_hello = icmp ne i32 %hello_l, 0 ; <i1> [#uses=1] - ret i1 %ne_hello -} - -define i1 @test7() { - %null_p = getelementptr [1 x i8]* @null, i32 0, i32 0 ; <i8*> [#uses=1] - %null_l = call i32 @strlen( i8* %null_p ) ; <i32> [#uses=1] - %ne_null = icmp ne i32 %null_l, 0 ; <i1> [#uses=1] - ret i1 %ne_null -} - -define i32 @test8() { - %len = tail call i32 @strlen(i8* @nullstring) nounwind - ret i32 %len -} diff --git a/test/Transforms/SimplifyLibCalls/StrNCat.ll b/test/Transforms/SimplifyLibCalls/StrNCat.ll deleted file mode 100644 index 073792b..0000000 --- a/test/Transforms/SimplifyLibCalls/StrNCat.ll +++ /dev/null @@ -1,31 +0,0 @@ -; Test that the StrNCatOptimizer works correctly -; RUN: opt < %s -simplify-libcalls -S | \ -; RUN: not grep "call.*strncat" -; RUN: opt < %s -simplify-libcalls -S | \ -; RUN: grep "puts.*%arg1" - -; This transformation requires the pointer size, as it assumes that size_t is -; the size of a pointer. 
-target datalayout = "-p:64:64:64" - -@hello = constant [6 x i8] c"hello\00" ; <[6 x i8]*> [#uses=1] -@null = constant [1 x i8] zeroinitializer ; <[1 x i8]*> [#uses=1] -@null_hello = constant [7 x i8] c"\00hello\00" ; <[7 x i8]*> [#uses=1] - -declare i8* @strncat(i8*, i8*, i32) - -declare i32 @puts(i8*) - -define i32 @main() { - %target = alloca [1024 x i8] ; <[1024 x i8]*> [#uses=1] - %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0 ; <i8*> [#uses=2] - store i8 0, i8* %arg1 - %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0 ; <i8*> [#uses=1] - %rslt1 = call i8* @strncat( i8* %arg1, i8* %arg2, i32 6 ) ; <i8*> [#uses=1] - %arg3 = getelementptr [1 x i8]* @null, i32 0, i32 0 ; <i8*> [#uses=1] - %rslt2 = call i8* @strncat( i8* %rslt1, i8* %arg3, i32 42 ) ; <i8*> [#uses=1] - %arg4 = getelementptr [7 x i8]* @null_hello, i32 0, i32 0 ; <i8*> [#uses=1] - %rslt3 = call i8* @strncat( i8* %rslt2, i8* %arg4, i32 42 ) ; <i8*> [#uses=1] - call i32 @puts( i8* %rslt3 ) ; <i32>:1 [#uses=0] - ret i32 0 -} diff --git a/test/Transforms/SimplifyLibCalls/StrNCmp.ll b/test/Transforms/SimplifyLibCalls/StrNCmp.ll deleted file mode 100644 index 0b2a501..0000000 --- a/test/Transforms/SimplifyLibCalls/StrNCmp.ll +++ /dev/null @@ -1,78 +0,0 @@ -; Test that the StrCmpOptimizer works correctly -; RUN: opt < %s -simplify-libcalls -S | FileCheck %s - -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" - -@hello = constant [6 x i8] c"hello\00" ; <[6 x i8]*> [#uses=1] -@hell = constant [5 x i8] c"hell\00" ; <[5 x i8]*> [#uses=1] -@bell = constant [5 x i8] c"bell\00" ; <[5 x i8]*> [#uses=1] -@null = constant [1 x i8] zeroinitializer ; <[1 x i8]*> [#uses=1] - -declare i32 @strncmp(i8*, i8*, i32) - -; strcmp("", x) -> -*x -define i32 @test1(i8* %str) { - %temp1 = call i32 @strncmp(i8* getelementptr inbounds ([1 x i8]* @null, i32 0, i32 0), i8* %str, i32 10) - ret i32 %temp1 - ; CHECK: @test1 - ; 
CHECK: %strcmpload = load i8* %str - ; CHECK: %1 = zext i8 %strcmpload to i32 - ; CHECK: %temp1 = sub i32 0, %1 - ; CHECK: ret i32 %temp1 -} - -; strcmp(x, "") -> *x -define i32 @test2(i8* %str) { - %temp1 = call i32 @strncmp(i8* %str, i8* getelementptr inbounds ([1 x i8]* @null, i32 0, i32 0), i32 10) - ret i32 %temp1 - ; CHECK: @test2 - ; CHECK: %strcmpload = load i8* %str - ; CHECK: %temp1 = zext i8 %strcmpload to i32 - ; CHECK: ret i32 %temp1 -} - -; strncmp(x, y, n) -> cnst -define i32 @test3() { - %temp1 = call i32 @strncmp(i8* getelementptr inbounds ([5 x i8]* @hell, i32 0, i32 0), i8* getelementptr inbounds ([6 x i8]* @hello, i32 0, i32 0), i32 10) - ret i32 %temp1 - ; CHECK: @test3 - ; CHECK: ret i32 -1 -} -define i32 @test4() { - %temp1 = call i32 @strncmp(i8* getelementptr inbounds ([5 x i8]* @hell, i32 0, i32 0), i8* getelementptr inbounds ([1 x i8]* @null, i32 0, i32 0), i32 10) - ret i32 %temp1 - ; CHECK: @test4 - ; CHECK: ret i32 1 -} -define i32 @test5() { - %temp1 = call i32 @strncmp(i8* getelementptr inbounds ([5 x i8]* @hell, i32 0, i32 0), i8* getelementptr inbounds ([6 x i8]* @hello, i32 0, i32 0), i32 4) - ret i32 %temp1 - ; CHECK: @test5 - ; CHECK: ret i32 0 -} - -; strncmp(x,y,1) -> memcmp(x,y,1) -define i32 @test6(i8* %str1, i8* %str2) { - %temp1 = call i32 @strncmp(i8* %str1, i8* %str2, i32 1) - ret i32 %temp1 - ; CHECK: @test6 - ; CHECK: load i8* - ; CHECK: load i8* - ; CHECK: sub i32 -} - -; strncmp(x,y,0) -> 0 -define i32 @test7(i8* %str1, i8* %str2) { - %temp1 = call i32 @strncmp(i8* %str1, i8* %str2, i32 0) - ret i32 %temp1 - ; CHECK: @test7 - ; CHECK: ret i32 0 -} - -; strncmp(x,x,n) -> 0 -define i32 @test8(i8* %str, i32 %n) { - %temp1 = call i32 @strncmp(i8* %str, i8* %str, i32 %n) - ret i32 %temp1 - ; CHECK: @test8 - ; CHECK: ret i32 0 -} diff --git a/test/Transforms/SimplifyLibCalls/StrNCpy.ll b/test/Transforms/SimplifyLibCalls/StrNCpy.ll deleted file mode 100644 index 4e47b31..0000000 --- 
a/test/Transforms/SimplifyLibCalls/StrNCpy.ll +++ /dev/null @@ -1,29 +0,0 @@ -; Test that the StrNCpyOptimizer works correctly -; RUN: opt < %s -simplify-libcalls -S | \ -; RUN: not grep "call.*strncpy" - -; This transformation requires the pointer size, as it assumes that size_t is -; the size of a pointer. -target datalayout = "-p:64:64:64" - -@hello = constant [6 x i8] c"hello\00" ; <[6 x i8]*> [#uses=1] -@null = constant [1 x i8] zeroinitializer ; <[1 x i8]*> [#uses=1] -@null_hello = constant [7 x i8] c"\00hello\00" ; <[7 x i8]*> [#uses=1] - -declare i8* @strncpy(i8*, i8*, i32) - -declare i32 @puts(i8*) - -define i32 @main() { - %target = alloca [1024 x i8] ; <[1024 x i8]*> [#uses=1] - %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0 ; <i8*> [#uses=2] - store i8 0, i8* %arg1 - %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0 ; <i8*> [#uses=1] - %rslt1 = call i8* @strncpy( i8* %arg1, i8* %arg2, i32 6 ) ; <i8*> [#uses=1] - %arg3 = getelementptr [1 x i8]* @null, i32 0, i32 0 ; <i8*> [#uses=1] - %rslt2 = call i8* @strncpy( i8* %rslt1, i8* %arg3, i32 42 ) ; <i8*> [#uses=1] - %arg4 = getelementptr [7 x i8]* @null_hello, i32 0, i32 0 ; <i8*> [#uses=1] - %rslt3 = call i8* @strncpy( i8* %rslt2, i8* %arg4, i32 42 ) ; <i8*> [#uses=1] - call i32 @puts( i8* %rslt3 ) ; <i32>:1 [#uses=0] - ret i32 0 -} diff --git a/test/Transforms/SimplifyLibCalls/StrPBrk.ll b/test/Transforms/SimplifyLibCalls/StrPBrk.ll deleted file mode 100644 index 29c3b74..0000000 --- a/test/Transforms/SimplifyLibCalls/StrPBrk.ll +++ /dev/null @@ -1,25 +0,0 @@ -; RUN: opt < %s -simplify-libcalls -S | FileCheck %s - -target datalayout = "-p:64:64:64" - -@hello = constant [12 x i8] c"hello world\00" -@w = constant [2 x i8] c"w\00" -@null = constant [1 x i8] zeroinitializer - -declare i8* @strpbrk(i8*, i8*) - -define void @test(i8* %s1, i8* %s2) { - %hello_p = getelementptr [12 x i8]* @hello, i32 0, i32 0 - %w_p = getelementptr [2 x i8]* @w, i32 0, i32 0 - %null_p = getelementptr [1 x i8]* @null, 
i32 0, i32 0 - %test1 = call i8* @strpbrk(i8* %null_p, i8* %s2) - %test2 = call i8* @strpbrk(i8* %s1, i8* %null_p) -; CHECK-NOT: call i8* @strpbrk - %test3 = call i8* @strpbrk(i8* %s1, i8* %w_p) -; CHECK: call i8* @strchr(i8* %s1, i32 119) - %test4 = call i8* @strpbrk(i8* %hello_p, i8* %w_p) -; CHECK: getelementptr i8* %hello_p, i64 6 - %test5 = call i8* @strpbrk(i8* %s1, i8* %s2) -; CHECK: call i8* @strpbrk(i8* %s1, i8* %s2) - ret void -} diff --git a/test/Transforms/SimplifyLibCalls/StrRChr.ll b/test/Transforms/SimplifyLibCalls/StrRChr.ll deleted file mode 100644 index 2259fc0..0000000 --- a/test/Transforms/SimplifyLibCalls/StrRChr.ll +++ /dev/null @@ -1,23 +0,0 @@ -; Test that the StrRChrOptimizer works correctly -; RUN: opt < %s -simplify-libcalls -S | FileCheck %s - -target datalayout = "-p:64:64:64" - -@hello = constant [14 x i8] c"hello world\5Cn\00" -@null = constant [1 x i8] zeroinitializer - -declare i8* @strrchr(i8*, i32) - -define void @foo(i8* %bar) { - %hello_p = getelementptr [14 x i8]* @hello, i32 0, i32 0 - %null_p = getelementptr [1 x i8]* @null, i32 0, i32 0 - %world = call i8* @strrchr(i8* %hello_p, i32 119) -; CHECK: getelementptr i8* %hello_p, i64 6 - %ignore = call i8* @strrchr(i8* %null_p, i32 119) -; CHECK-NOT: call i8* strrchr - %null = call i8* @strrchr(i8* %hello_p, i32 0) -; CHECK: getelementptr i8* %hello_p, i64 13 - %strchr = call i8* @strrchr(i8* %bar, i32 0) -; CHECK: call i8* @strchr(i8* %bar, i32 0) - ret void -} diff --git a/test/Transforms/SimplifyLibCalls/StrSpn.ll b/test/Transforms/SimplifyLibCalls/StrSpn.ll deleted file mode 100644 index 800c190..0000000 --- a/test/Transforms/SimplifyLibCalls/StrSpn.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: opt < %s -simplify-libcalls -S | FileCheck %s - -target datalayout = "-p:64:64:64" - -@abcba = constant [6 x i8] c"abcba\00" -@abc = constant [4 x i8] c"abc\00" -@null = constant [1 x i8] zeroinitializer - -declare i64 @strspn(i8*, i8*) - -define i64 @testspn(i8* %s1, i8* %s2) { - %abcba_p 
= getelementptr [6 x i8]* @abcba, i32 0, i32 0 - %abc_p = getelementptr [4 x i8]* @abc, i32 0, i32 0 - %null_p = getelementptr [1 x i8]* @null, i32 0, i32 0 - %test1 = call i64 @strspn(i8* %s1, i8* %null_p) - %test2 = call i64 @strspn(i8* %null_p, i8* %s2) - %test3 = call i64 @strspn(i8* %abcba_p, i8* %abc_p) -; CHECK-NOT: call i64 @strspn - %test4 = call i64 @strspn(i8* %s1, i8* %s2) -; CHECK: call i64 @strspn(i8* %s1, i8* %s2) - ret i64 %test3 -; CHECK: ret i64 5 -} - -declare i64 @strcspn(i8*, i8*) - -define i64 @testcspn(i8* %s1, i8* %s2) { - %abcba_p = getelementptr [6 x i8]* @abcba, i32 0, i32 0 - %abc_p = getelementptr [4 x i8]* @abc, i32 0, i32 0 - %null_p = getelementptr [1 x i8]* @null, i32 0, i32 0 - %test1 = call i64 @strcspn(i8* %s1, i8* %null_p) -; CHECK: call i64 @strlen(i8* %s1) - %test2 = call i64 @strcspn(i8* %null_p, i8* %s2) - %test3 = call i64 @strcspn(i8* %abcba_p, i8* %abc_p) -; CHECK-NOT: call i64 @strcspn - %test4 = call i64 @strcspn(i8* %s1, i8* %s2) -; CHECK: call i64 @strcspn(i8* %s1, i8* %s2) - %add0 = add i64 %test1, %test3 -; CHECK: add i64 %{{.+}}, 0 - ret i64 %add0 -} diff --git a/test/Transforms/SimplifyLibCalls/StrStr.ll b/test/Transforms/SimplifyLibCalls/StrStr.ll deleted file mode 100644 index eefd2e8..0000000 --- a/test/Transforms/SimplifyLibCalls/StrStr.ll +++ /dev/null @@ -1,60 +0,0 @@ -; RUN: opt < %s -simplify-libcalls -S | FileCheck %s -; PR5783 - -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32" -target triple = "i386-apple-darwin9.0" - -@.str = private constant [1 x i8] zeroinitializer ; <[1 x i8]*> [#uses=1] -@.str1 = private constant [2 x i8] c"a\00" ; <[2 x i8]*> [#uses=1] -@.str2 = private constant [6 x i8] c"abcde\00" ; <[6 x i8]*> [#uses=1] -@.str3 = private constant [4 x i8] c"bcd\00" ; <[4 x i8]*> [#uses=1] - -define i8* @test1(i8* %P) nounwind readonly { -entry: - %call = tail call i8* @strstr(i8* %P, i8* 
getelementptr inbounds ([1 x i8]* @.str, i32 0, i32 0)) nounwind ; <i8*> [#uses=1] - ret i8* %call -; strstr(P, "") -> P -; CHECK: @test1 -; CHECK: ret i8* %P -} - -declare i8* @strstr(i8*, i8* nocapture) nounwind readonly - -define i8* @test2(i8* %P) nounwind readonly { -entry: - %call = tail call i8* @strstr(i8* %P, i8* getelementptr inbounds ([2 x i8]* @.str1, i32 0, i32 0)) nounwind ; <i8*> [#uses=1] - ret i8* %call -; strstr(P, "a") -> strchr(P, 'a') -; CHECK: @test2 -; CHECK: @strchr(i8* %P, i32 97) -} - -define i8* @test3(i8* nocapture %P) nounwind readonly { -entry: - %call = tail call i8* @strstr(i8* getelementptr inbounds ([6 x i8]* @.str2, i32 0, i32 0), i8* getelementptr inbounds ([4 x i8]* @.str3, i32 0, i32 0)) nounwind ; <i8*> [#uses=1] - ret i8* %call -; strstr("abcde", "bcd") -> "abcde"+1 -; CHECK: @test3 -; CHECK: getelementptr inbounds ([6 x i8]* @.str2, i32 0, i64 1) -} - -define i8* @test4(i8* %P) nounwind readonly { -entry: - %call = tail call i8* @strstr(i8* %P, i8* %P) nounwind ; <i8*> [#uses=1] - ret i8* %call -; strstr(P, P) -> P -; CHECK: @test4 -; CHECK: ret i8* %P -} - -define i1 @test5(i8* %P, i8* %Q) nounwind readonly { -entry: - %call = tail call i8* @strstr(i8* %P, i8* %Q) nounwind ; <i8*> [#uses=1] - %cmp = icmp eq i8* %call, %P - ret i1 %cmp -; CHECK: @test5 -; CHECK: [[LEN:%[a-z]+]] = call {{i[0-9]+}} @strlen(i8* %Q) -; CHECK: [[NCMP:%[a-z]+]] = call {{i[0-9]+}} @strncmp(i8* %P, i8* %Q, {{i[0-9]+}} [[LEN]]) -; CHECK: icmp eq {{i[0-9]+}} [[NCMP]], 0 -; CHECK: ret i1 -} diff --git a/test/Transforms/SimplifyLibCalls/double-float-shrink.ll b/test/Transforms/SimplifyLibCalls/double-float-shrink.ll new file mode 100644 index 0000000..b4ab8b4 --- /dev/null +++ b/test/Transforms/SimplifyLibCalls/double-float-shrink.ll @@ -0,0 +1,333 @@ +; RUN: opt < %s -simplify-libcalls -enable-double-float-shrink -S | FileCheck %s + +target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define float @acos_test(float %f) nounwind readnone { +; CHECK: acos_test + %conv = fpext float %f to double + %call = call double @acos(double %conv) + %conv1 = fptrunc double %call to float + ret float %conv1 +; CHECK: call float @acosf(float %f) +} + +define double @acos_test2(float %f) nounwind readnone { +; CHECK: acos_test2 + %conv = fpext float %f to double + %call = call double @acos(double %conv) + ret double %call +; CHECK: call double @acos(double %conv) +} + +define float @acosh_test(float %f) nounwind readnone { +; CHECK: acosh_test + %conv = fpext float %f to double + %call = call double @acosh(double %conv) + %conv1 = fptrunc double %call to float + ret float %conv1 +; CHECK: call float @acoshf(float %f) +} + +define double @acosh_test2(float %f) nounwind readnone { +; CHECK: acosh_test2 + %conv = fpext float %f to double + %call = call double @acosh(double %conv) + ret double %call +; CHECK: call double @acosh(double %conv) +} + +define float @asin_test(float %f) nounwind readnone { +; CHECK: asin_test + %conv = fpext float %f to double + %call = call double @asin(double %conv) + %conv1 = fptrunc double %call to float + ret float %conv1 +; CHECK: call float @asinf(float %f) +} + +define double @asin_test2(float %f) nounwind readnone { +; CHECK: asin_test2 + %conv = fpext float %f to double + %call = call double @asin(double %conv) + ret double %call +; CHECK: call double @asin(double %conv) +} + +define float @asinh_test(float %f) nounwind readnone { +; CHECK: asinh_test + %conv = fpext float %f to double + %call = call double @asinh(double %conv) + %conv1 = fptrunc double %call to float + ret float %conv1 +; CHECK: call float @asinhf(float %f) +} + +define double @asinh_test2(float %f) nounwind readnone { +; CHECK: asinh_test2 + %conv = fpext float %f to 
double + %call = call double @asinh(double %conv) + ret double %call +; CHECK: call double @asinh(double %conv) +} + +define float @atan_test(float %f) nounwind readnone { +; CHECK: atan_test + %conv = fpext float %f to double + %call = call double @atan(double %conv) + %conv1 = fptrunc double %call to float + ret float %conv1 +; CHECK: call float @atanf(float %f) +} + +define double @atan_test2(float %f) nounwind readnone { +; CHECK: atan_test2 + %conv = fpext float %f to double + %call = call double @atan(double %conv) + ret double %call +; CHECK: call double @atan(double %conv) +} +define float @atanh_test(float %f) nounwind readnone { +; CHECK: atanh_test + %conv = fpext float %f to double + %call = call double @atanh(double %conv) + %conv1 = fptrunc double %call to float + ret float %conv1 +; CHECK: call float @atanhf(float %f) +} + +define double @atanh_test2(float %f) nounwind readnone { +; CHECK: atanh_test2 + %conv = fpext float %f to double + %call = call double @atanh(double %conv) + ret double %call +; CHECK: call double @atanh(double %conv) +} +define float @cbrt_test(float %f) nounwind readnone { +; CHECK: cbrt_test + %conv = fpext float %f to double + %call = call double @cbrt(double %conv) + %conv1 = fptrunc double %call to float + ret float %conv1 +; CHECK: call float @cbrtf(float %f) +} + +define double @cbrt_test2(float %f) nounwind readnone { +; CHECK: cbrt_test2 + %conv = fpext float %f to double + %call = call double @cbrt(double %conv) + ret double %call +; CHECK: call double @cbrt(double %conv) +} +define float @exp_test(float %f) nounwind readnone { +; CHECK: exp_test + %conv = fpext float %f to double + %call = call double @exp(double %conv) + %conv1 = fptrunc double %call to float + ret float %conv1 +; CHECK: call float @expf(float %f) +} + +define double @exp_test2(float %f) nounwind readnone { +; CHECK: exp_test2 + %conv = fpext float %f to double + %call = call double @exp(double %conv) + ret double %call +; CHECK: call double 
@exp(double %conv) +} +define float @expm1_test(float %f) nounwind readnone { +; CHECK: expm1_test + %conv = fpext float %f to double + %call = call double @expm1(double %conv) + %conv1 = fptrunc double %call to float + ret float %conv1 +; CHECK: call float @expm1f(float %f) +} + +define double @expm1_test2(float %f) nounwind readnone { +; CHECK: expm1_test2 + %conv = fpext float %f to double + %call = call double @expm1(double %conv) + ret double %call +; CHECK: call double @expm1(double %conv) +} +define float @exp10_test(float %f) nounwind readnone { +; CHECK: exp10_test + %conv = fpext float %f to double + %call = call double @exp10(double %conv) + %conv1 = fptrunc double %call to float + ret float %conv1 +; CHECK: call float @exp10f(float %f) +} + +define double @exp10_test2(float %f) nounwind readnone { +; CHECK: exp10_test2 + %conv = fpext float %f to double + %call = call double @exp10(double %conv) + ret double %call +; CHECK: call double @exp10(double %conv) +} +define float @log_test(float %f) nounwind readnone { +; CHECK: log_test + %conv = fpext float %f to double + %call = call double @log(double %conv) + %conv1 = fptrunc double %call to float + ret float %conv1 +; CHECK: call float @logf(float %f) +} + +define double @log_test2(float %f) nounwind readnone { +; CHECK: log_test2 + %conv = fpext float %f to double + %call = call double @log(double %conv) + ret double %call +; CHECK: call double @log(double %conv) +} +define float @log10_test(float %f) nounwind readnone { +; CHECK: log10_test + %conv = fpext float %f to double + %call = call double @log10(double %conv) + %conv1 = fptrunc double %call to float + ret float %conv1 +; CHECK: call float @log10f(float %f) +} + +define double @log10_test2(float %f) nounwind readnone { +; CHECK: log10_test2 + %conv = fpext float %f to double + %call = call double @log10(double %conv) + ret double %call +; CHECK: call double @log10(double %conv) +} +define float @log1p_test(float %f) nounwind readnone { +; CHECK: 
log1p_test + %conv = fpext float %f to double + %call = call double @log1p(double %conv) + %conv1 = fptrunc double %call to float + ret float %conv1 +; CHECK: call float @log1pf(float %f) +} + +define double @log1p_test2(float %f) nounwind readnone { +; CHECK: log1p_test2 + %conv = fpext float %f to double + %call = call double @log1p(double %conv) + ret double %call +; CHECK: call double @log1p(double %conv) +} +define float @log2_test(float %f) nounwind readnone { +; CHECK: log2_test + %conv = fpext float %f to double + %call = call double @log2(double %conv) + %conv1 = fptrunc double %call to float + ret float %conv1 +; CHECK: call float @log2f(float %f) +} + +define double @log2_test2(float %f) nounwind readnone { +; CHECK: log2_test2 + %conv = fpext float %f to double + %call = call double @log2(double %conv) + ret double %call +; CHECK: call double @log2(double %conv) +} +define float @logb_test(float %f) nounwind readnone { +; CHECK: logb_test + %conv = fpext float %f to double + %call = call double @logb(double %conv) + %conv1 = fptrunc double %call to float + ret float %conv1 +; CHECK: call float @logbf(float %f) +} + +define double @logb_test2(float %f) nounwind readnone { +; CHECK: logb_test2 + %conv = fpext float %f to double + %call = call double @logb(double %conv) + ret double %call +; CHECK: call double @logb(double %conv) +} +define float @sin_test(float %f) nounwind readnone { +; CHECK: sin_test + %conv = fpext float %f to double + %call = call double @sin(double %conv) + %conv1 = fptrunc double %call to float + ret float %conv1 +; CHECK: call float @sinf(float %f) +} + +define double @sin_test2(float %f) nounwind readnone { +; CHECK: sin_test2 + %conv = fpext float %f to double + %call = call double @sin(double %conv) + ret double %call +; CHECK: call double @sin(double %conv) +} +define float @sqrt_test(float %f) nounwind readnone { +; CHECK: sqrt_test + %conv = fpext float %f to double + %call = call double @sqrt(double %conv) + %conv1 = 
fptrunc double %call to float + ret float %conv1 +; CHECK: call float @sqrtf(float %f) +} + +define double @sqrt_test2(float %f) nounwind readnone { +; CHECK: sqrt_test2 + %conv = fpext float %f to double + %call = call double @sqrt(double %conv) + ret double %call +; CHECK: call double @sqrt(double %conv) +} +define float @tan_test(float %f) nounwind readnone { +; CHECK: tan_test + %conv = fpext float %f to double + %call = call double @tan(double %conv) + %conv1 = fptrunc double %call to float + ret float %conv1 +; CHECK: call float @tanf(float %f) +} + +define double @tan_test2(float %f) nounwind readnone { +; CHECK: tan_test2 + %conv = fpext float %f to double + %call = call double @tan(double %conv) + ret double %call +; CHECK: call double @tan(double %conv) +} +define float @tanh_test(float %f) nounwind readnone { +; CHECK: tanh_test + %conv = fpext float %f to double + %call = call double @tanh(double %conv) + %conv1 = fptrunc double %call to float + ret float %conv1 +; CHECK: call float @tanhf(float %f) +} + +define double @tanh_test2(float %f) nounwind readnone { +; CHECK: tanh_test2 + %conv = fpext float %f to double + %call = call double @tanh(double %conv) + ret double %call +; CHECK: call double @tanh(double %conv) +} + +declare double @tanh(double) nounwind readnone +declare double @tan(double) nounwind readnone +declare double @sqrt(double) nounwind readnone +declare double @sin(double) nounwind readnone +declare double @log2(double) nounwind readnone +declare double @log1p(double) nounwind readnone +declare double @log10(double) nounwind readnone +declare double @log(double) nounwind readnone +declare double @logb(double) nounwind readnone +declare double @exp10(double) nounwind readnone +declare double @expm1(double) nounwind readnone +declare double @exp(double) nounwind readnone +declare double @cbrt(double) nounwind readnone +declare double @atanh(double) nounwind readnone +declare double @atan(double) nounwind readnone +declare double 
@acos(double) nounwind readnone +declare double @acosh(double) nounwind readnone +declare double @asin(double) nounwind readnone +declare double @asinh(double) nounwind readnone diff --git a/test/Transforms/SimplifyLibCalls/float-shrink-compare.ll b/test/Transforms/SimplifyLibCalls/float-shrink-compare.ll new file mode 100644 index 0000000..aecb887 --- /dev/null +++ b/test/Transforms/SimplifyLibCalls/float-shrink-compare.ll @@ -0,0 +1,179 @@ +; RUN: opt -S -simplify-libcalls -instcombine %s | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +define i32 @test1(float %x, float %y) nounwind uwtable { + %1 = fpext float %x to double + %2 = call double @ceil(double %1) nounwind readnone + %3 = fpext float %y to double + %4 = fcmp oeq double %2, %3 + %5 = zext i1 %4 to i32 + ret i32 %5 +; CHECK: @test1 +; CHECK-NEXT: %ceilf = call float @ceilf(float %x) +; CHECK-NEXT: fcmp oeq float %ceilf, %y +} + +define i32 @test2(float %x, float %y) nounwind uwtable { + %1 = fpext float %x to double + %2 = call double @fabs(double %1) nounwind readnone + %3 = fpext float %y to double + %4 = fcmp oeq double %2, %3 + %5 = zext i1 %4 to i32 + ret i32 %5 +; CHECK: @test2 +; CHECK-NEXT: %fabsf = call float @fabsf(float %x) +; CHECK-NEXT: fcmp oeq float %fabsf, %y +} + +define i32 @test3(float %x, float %y) nounwind uwtable { + %1 = fpext float %x to double + %2 = call double @floor(double %1) nounwind readnone + %3 = fpext float %y to double + %4 = fcmp oeq double %2, %3 + %5 = zext i1 %4 to i32 + ret i32 %5 +; CHECK: @test3 +; CHECK-NEXT: %floorf = call float @floorf(float %x) +; CHECK-NEXT: fcmp oeq float %floorf, %y +} + +define i32 @test4(float %x, float %y) nounwind uwtable { + %1 = fpext float %x to double + %2 = call double @nearbyint(double %1) nounwind + %3 = fpext float %y to double + %4 = fcmp oeq 
double %2, %3 + %5 = zext i1 %4 to i32 + ret i32 %5 +; CHECK: @test4 +; CHECK-NEXT: %nearbyintf = call float @nearbyintf(float %x) +; CHECK-NEXT: fcmp oeq float %nearbyintf, %y +} + +define i32 @test5(float %x, float %y) nounwind uwtable { + %1 = fpext float %x to double + %2 = call double @rint(double %1) nounwind + %3 = fpext float %y to double + %4 = fcmp oeq double %2, %3 + %5 = zext i1 %4 to i32 + ret i32 %5 +; CHECK: @test5 +; CHECK-NEXT: %rintf = call float @rintf(float %x) +; CHECK-NEXT: fcmp oeq float %rintf, %y +} + +define i32 @test6(float %x, float %y) nounwind uwtable { + %1 = fpext float %x to double + %2 = call double @round(double %1) nounwind readnone + %3 = fpext float %y to double + %4 = fcmp oeq double %2, %3 + %5 = zext i1 %4 to i32 + ret i32 %5 +; CHECK: @test6 +; CHECK-NEXT: %roundf = call float @roundf(float %x) +; CHECK-NEXT: fcmp oeq float %roundf, %y +} + +define i32 @test7(float %x, float %y) nounwind uwtable { + %1 = fpext float %x to double + %2 = call double @trunc(double %1) nounwind + %3 = fpext float %y to double + %4 = fcmp oeq double %2, %3 + %5 = zext i1 %4 to i32 + ret i32 %5 +; CHECK: @test7 +; CHECK-NEXT: %truncf = call float @truncf(float %x) +; CHECK-NEXT: fcmp oeq float %truncf, %y +} + +define i32 @test8(float %x, float %y) nounwind uwtable { + %1 = fpext float %y to double + %2 = fpext float %x to double + %3 = call double @ceil(double %2) nounwind readnone + %4 = fcmp oeq double %1, %3 + %5 = zext i1 %4 to i32 + ret i32 %5 +; CHECK: @test8 +; CHECK-NEXT: %ceilf = call float @ceilf(float %x) +; CHECK-NEXT: fcmp oeq float %ceilf, %y +} + +define i32 @test9(float %x, float %y) nounwind uwtable { + %1 = fpext float %y to double + %2 = fpext float %x to double + %3 = call double @fabs(double %2) nounwind readnone + %4 = fcmp oeq double %1, %3 + %5 = zext i1 %4 to i32 + ret i32 %5 +; CHECK: @test9 +; CHECK-NEXT: %fabsf = call float @fabsf(float %x) +; CHECK-NEXT: fcmp oeq float %fabsf, %y +} + +define i32 @test10(float %x, 
float %y) nounwind uwtable { + %1 = fpext float %y to double + %2 = fpext float %x to double + %3 = call double @floor(double %2) nounwind readnone + %4 = fcmp oeq double %1, %3 + %5 = zext i1 %4 to i32 + ret i32 %5 +; CHECK: @test10 +; CHECK-NEXT: %floorf = call float @floorf(float %x) +; CHECK-NEXT: fcmp oeq float %floorf, %y +} + +define i32 @test11(float %x, float %y) nounwind uwtable { + %1 = fpext float %y to double + %2 = fpext float %x to double + %3 = call double @nearbyint(double %2) nounwind + %4 = fcmp oeq double %1, %3 + %5 = zext i1 %4 to i32 + ret i32 %5 +; CHECK: @test11 +; CHECK-NEXT: %nearbyintf = call float @nearbyintf(float %x) +; CHECK-NEXT: fcmp oeq float %nearbyintf, %y +} + +define i32 @test12(float %x, float %y) nounwind uwtable { + %1 = fpext float %y to double + %2 = fpext float %x to double + %3 = call double @rint(double %2) nounwind + %4 = fcmp oeq double %1, %3 + %5 = zext i1 %4 to i32 + ret i32 %5 +; CHECK: @test12 +; CHECK-NEXT: %rintf = call float @rintf(float %x) +; CHECK-NEXT: fcmp oeq float %rintf, %y +} + +define i32 @test13(float %x, float %y) nounwind uwtable { + %1 = fpext float %y to double + %2 = fpext float %x to double + %3 = call double @round(double %2) nounwind readnone + %4 = fcmp oeq double %1, %3 + %5 = zext i1 %4 to i32 + ret i32 %5 +; CHECK: @test13 +; CHECK-NEXT: %roundf = call float @roundf(float %x) +; CHECK-NEXT: fcmp oeq float %roundf, %y +} + +define i32 @test14(float %x, float %y) nounwind uwtable { + %1 = fpext float %y to double + %2 = fpext float %x to double + %3 = call double @trunc(double %2) nounwind + %4 = fcmp oeq double %1, %3 + %5 = zext i1 %4 to i32 + ret i32 %5 +; CHECK: @test14 +; CHECK-NEXT: %truncf = call float @truncf(float %x) +; CHECK-NEXT: fcmp oeq float %truncf, %y +} + +declare double @fabs(double) nounwind readnone +declare double @ceil(double) nounwind readnone +declare double @floor(double) nounwind readnone +declare double @nearbyint(double) nounwind readnone +declare double 
@rint(double) nounwind readnone +declare double @round(double) nounwind readnone +declare double @trunc(double) nounwind readnone diff --git a/test/Transforms/SimplifyLibCalls/floor.ll b/test/Transforms/SimplifyLibCalls/floor.ll index 03dcdf5..93c62c2 100644 --- a/test/Transforms/SimplifyLibCalls/floor.ll +++ b/test/Transforms/SimplifyLibCalls/floor.ll @@ -9,6 +9,8 @@ ; DO-SIMPLIFY: call float @ceilf( ; DO-SIMPLIFY: call float @roundf( ; DO-SIMPLIFY: call float @nearbyintf( +; DO-SIMPLIFY: call float @truncf( +; DO-SIMPLIFY: call float @fabsf( ; C89-SIMPLIFY: call float @floorf( ; C89-SIMPLIFY: call float @ceilf( @@ -19,6 +21,8 @@ ; DONT-SIMPLIFY: call double @ceil( ; DONT-SIMPLIFY: call double @round( ; DONT-SIMPLIFY: call double @nearbyint( +; DONT-SIMPLIFY: call double @trunc( +; DONT-SIMPLIFY: call double @fabs( declare double @floor(double) @@ -28,6 +32,10 @@ declare double @round(double) declare double @nearbyint(double) +declare double @trunc(double) + +declare double @fabs(double) + define float @test_floor(float %C) { %D = fpext float %C to double ; <double> [#uses=1] ; --> floorf @@ -60,3 +68,18 @@ define float @test_nearbyint(float %C) { ret float %F } +define float @test_trunc(float %C) { + %D = fpext float %C to double + ; --> truncf + %E = call double @trunc(double %D) + %F = fptrunc double %E to float + ret float %F +} + +define float @test_fabs(float %C) { + %D = fpext float %C to double + ; --> fabsf + %E = call double @fabs(double %D) + %F = fptrunc double %E to float + ret float %F +} diff --git a/test/Transforms/SimplifyLibCalls/memcmp.ll b/test/Transforms/SimplifyLibCalls/memcmp.ll deleted file mode 100644 index 6ca4dc9..0000000 --- a/test/Transforms/SimplifyLibCalls/memcmp.ll +++ /dev/null @@ -1,35 +0,0 @@ -; Test that the memcmpOptimizer works correctly -; RUN: opt < %s -simplify-libcalls -S | FileCheck %s - -@h = constant [2 x i8] c"h\00" ; <[2 x i8]*> [#uses=0] -@hel = constant [4 x i8] c"hel\00" ; <[4 x i8]*> [#uses=0] -@hello_u = constant 
[8 x i8] c"hello_u\00" ; <[8 x i8]*> [#uses=0] - -declare i32 @memcmp(i8*, i8*, i32) - -define void @test(i8* %P, i8* %Q, i32 %N, i32* %IP, i1* %BP) { - %A = call i32 @memcmp( i8* %P, i8* %P, i32 %N ) ; <i32> [#uses=1] -; CHECK-NOT: call {{.*}} memcmp -; CHECK: store volatile - store volatile i32 %A, i32* %IP - %B = call i32 @memcmp( i8* %P, i8* %Q, i32 0 ) ; <i32> [#uses=1] -; CHECK-NOT: call {{.*}} memcmp -; CHECK: store volatile - store volatile i32 %B, i32* %IP - %C = call i32 @memcmp( i8* %P, i8* %Q, i32 1 ) ; <i32> [#uses=1] -; CHECK: load -; CHECK: zext -; CHECK: load -; CHECK: zext -; CHECK: sub -; CHECK: store volatile - store volatile i32 %C, i32* %IP - %F = call i32 @memcmp(i8* getelementptr ([4 x i8]* @hel, i32 0, i32 0), - i8* getelementptr ([8 x i8]* @hello_u, i32 0, i32 0), - i32 3) -; CHECK-NOT: call {{.*}} memcmp -; CHECK: store volatile - store volatile i32 %F, i32* %IP - ret void -} - diff --git a/test/Transforms/SimplifyLibCalls/memmove.ll b/test/Transforms/SimplifyLibCalls/memmove.ll deleted file mode 100644 index 5aaeeeb..0000000 --- a/test/Transforms/SimplifyLibCalls/memmove.ll +++ /dev/null @@ -1,12 +0,0 @@ -; RUN: opt < %s -simplify-libcalls -S | grep "llvm.memmove" -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32" -target triple = "i686-pc-linux-gnu" - -define i8* @test(i8* %a, i8* %b, i32 %x) { -entry: - %call = call i8* @memmove(i8* %a, i8* %b, i32 %x ) - ret i8* %call -} - -declare i8* @memmove(i8*,i8*,i32) - diff --git a/test/Transforms/SimplifyLibCalls/memset-64.ll b/test/Transforms/SimplifyLibCalls/memset-64.ll deleted file mode 100644 index 92412de..0000000 --- a/test/Transforms/SimplifyLibCalls/memset-64.ll +++ /dev/null @@ -1,12 +0,0 @@ -; RUN: opt < %s -simplify-libcalls -S | grep "llvm.memset" -target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" -target triple = "x86_64-pc-linux-gnu" - -define void @a(i8* %x) nounwind { -entry: - %call = call i8* @memset(i8* %x, i32 1, i64 100) ; <i8*> [#uses=0] - ret void -} - -declare i8* @memset(i8*, i32, i64) - diff --git a/test/Transforms/SimplifyLibCalls/memset.ll b/test/Transforms/SimplifyLibCalls/memset.ll deleted file mode 100644 index 853215a..0000000 --- a/test/Transforms/SimplifyLibCalls/memset.ll +++ /dev/null @@ -1,12 +0,0 @@ -; RUN: opt < %s -simplify-libcalls -S | grep "llvm.memset" -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32" -target triple = "i686-pc-linux-gnu" - -define i8* @test(i8* %a, i32 %b, i32 %x) { -entry: - %call = call i8* @memset(i8* %a, i32 %b, i32 %x ) - ret i8* %call -} - -declare i8* @memset(i8*,i32,i32) - diff --git a/test/Transforms/SimplifyLibCalls/weak-symbols.ll b/test/Transforms/SimplifyLibCalls/weak-symbols.ll deleted file mode 100644 index 5875b21..0000000 --- a/test/Transforms/SimplifyLibCalls/weak-symbols.ll +++ /dev/null @@ -1,26 +0,0 @@ -; RUN: opt < %s -simplify-libcalls -S | FileCheck %s -; PR4738 - -; SimplifyLibcalls shouldn't assume anything about weak symbols. 
- -@real_init = weak_odr constant [2 x i8] c"y\00" -@fake_init = weak constant [2 x i8] c"y\00" -@.str = private constant [2 x i8] c"y\00" - -; CHECK: define i32 @foo -; CHECK: call i32 @strcmp -define i32 @foo() nounwind { -entry: - %t0 = call i32 @strcmp(i8* getelementptr inbounds ([2 x i8]* @fake_init, i64 0, i64 0), i8* getelementptr inbounds ([2 x i8]* @.str, i64 0, i64 0)) nounwind readonly - ret i32 %t0 -} - -; CHECK: define i32 @bar -; CHECK: ret i32 0 -define i32 @bar() nounwind { -entry: - %t0 = call i32 @strcmp(i8* getelementptr inbounds ([2 x i8]* @real_init, i64 0, i64 0), i8* getelementptr inbounds ([2 x i8]* @.str, i64 0, i64 0)) nounwind readonly - ret i32 %t0 -} - -declare i32 @strcmp(i8*, i8*) nounwind readonly diff --git a/test/Verifier/invoke.ll b/test/Verifier/invoke.ll index a48f9b6..c2750bb 100644 --- a/test/Verifier/invoke.ll +++ b/test/Verifier/invoke.ll @@ -19,7 +19,6 @@ L2: ; preds = %0 br label %L L: ; preds = %L2, %L1, %L1 ; CHECK: The unwind destination does not have a landingpad instruction -; CHECK: Instruction does not dominate all uses ret i32 %A } diff --git a/test/lit.cfg b/test/lit.cfg index 6f44bb3..79eaa23 100644 --- a/test/lit.cfg +++ b/test/lit.cfg @@ -5,6 +5,7 @@ import os import sys import re +import platform # name: The name of this test suite. config.name = 'LLVM' @@ -139,9 +140,22 @@ if config.test_exec_root is None: ### -# When running under valgrind, we mangle '-vg' or '-vg_leak' onto the end of the -# triple so we can check it with XFAIL and XTARGET. 
-config.target_triple += lit.valgrindTriple +# Provide a target triple for mcjit tests +mcjit_triple = config.target_triple +# Force ELF format on Windows +if re.search(r'cygwin|mingw32|win32', mcjit_triple): + mcjit_triple += "-elf" +config.substitutions.append( ('%mcjit_triple', mcjit_triple) ) + +# Provide a substition for those tests that need to run the jit to obtain data +# but simply want use the currently considered most reliable jit for platform +# FIXME: ppc32 is not ready for mcjit. +if 'arm' in config.target_triple \ + or 'powerpc64' in config.target_triple: + defaultIsMCJIT = 'true' +else: + defaultIsMCJIT = 'false' +config.substitutions.append( ('%defaultjit', '-use-mcjit='+defaultIsMCJIT) ) # Process jit implementation option jit_impl_cfg = lit.params.get('jit_impl', None) @@ -230,6 +244,10 @@ else: if loadable_module: config.available_features.add('loadable_module') +# LTO on OS X +if config.lto_is_enabled == "1" and platform.system() == "Darwin": + config.available_features.add('lto_on_osx') + # llc knows whether he is compiled with -DNDEBUG. import subprocess try: diff --git a/test/lit.site.cfg.in b/test/lit.site.cfg.in index 178b22f..2bbe63e 100644 --- a/test/lit.site.cfg.in +++ b/test/lit.site.cfg.in @@ -11,6 +11,7 @@ config.python_executable = "@PYTHON_EXECUTABLE@" config.ocamlopt_executable = "@OCAMLOPT@" config.enable_shared = @ENABLE_SHARED@ config.enable_assertions = @ENABLE_ASSERTIONS@ +config.lto_is_enabled = "@LTO_IS_ENABLED@" config.targets_to_build = "@TARGETS_TO_BUILD@" config.llvm_bindings = "@LLVM_BINDINGS@" config.host_os = "@HOST_OS@" |