author    dim <dim@FreeBSD.org>  2012-12-02 13:10:19 +0000
committer dim <dim@FreeBSD.org>  2012-12-02 13:10:19 +0000
commit    6de2c08bc400b4aca9fb46684e8bdb56eed9b09f (patch)
tree      32b4679ab4b8f28e5228daafc65e9dc436935353 /test/CodeGen/X86
parent    4dc93743c9d40c29c0a3bec2aae328cac0d289e8 (diff)
Vendor import of llvm release_32 branch r168974 (effectively, 3.2 RC2):
http://llvm.org/svn/llvm-project/llvm/branches/release_32@168974
Diffstat (limited to 'test/CodeGen/X86')
-rw-r--r--  test/CodeGen/X86/2010-01-08-Atomic64Bug.ll | 19
-rw-r--r--  test/CodeGen/X86/2012-01-18-vbitcast.ll | 4
-rw-r--r--  test/CodeGen/X86/2012-03-15-build_vector_wl.ll | 2
-rw-r--r--  test/CodeGen/X86/2012-04-26-sdglue.ll | 2
-rw-r--r--  test/CodeGen/X86/2012-07-10-extload64.ll | 4
-rw-r--r--  test/CodeGen/X86/2012-08-16-setcc.ll | 45
-rw-r--r--  test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll | 20
-rw-r--r--  test/CodeGen/X86/2012-09-13-dagco-fneg.ll | 21
-rw-r--r--  test/CodeGen/X86/2012-09-28-CGPBug.ll | 53
-rw-r--r--  test/CodeGen/X86/2012-10-02-DAGCycle.ll | 52
-rw-r--r--  test/CodeGen/X86/2012-10-03-DAGCycle.ll | 31
-rw-r--r--  test/CodeGen/X86/2012-10-18-crash-dagco.ll | 61
-rw-r--r--  test/CodeGen/X86/MergeConsecutiveStores.ll | 305
-rw-r--r--  test/CodeGen/X86/StackColoring-dbg.ll | 30
-rw-r--r--  test/CodeGen/X86/StackColoring.ll | 410
-rw-r--r--  test/CodeGen/X86/add-of-carry.ll | 13
-rw-r--r--  test/CodeGen/X86/atom-bypass-slow-division.ll | 112
-rw-r--r--  test/CodeGen/X86/atom-shuf.ll | 9
-rw-r--r--  test/CodeGen/X86/atomic-minmax-i6432.ll | 67
-rw-r--r--  test/CodeGen/X86/atomic-pointer.ll | 22
-rw-r--r--  test/CodeGen/X86/atomic16.ll | 250
-rw-r--r--  test/CodeGen/X86/atomic32.ll | 250
-rw-r--r--  test/CodeGen/X86/atomic64.ll | 216
-rw-r--r--  test/CodeGen/X86/atomic6432.ll | 208
-rw-r--r--  test/CodeGen/X86/atomic8.ll | 250
-rw-r--r--  test/CodeGen/X86/atomic_add.ll | 3
-rw-r--r--  test/CodeGen/X86/atomic_op.ll | 11
-rw-r--r--  test/CodeGen/X86/avx-basic.ll | 4
-rw-r--r--  test/CodeGen/X86/avx-intel-ocl.ll | 107
-rw-r--r--  test/CodeGen/X86/avx-intrinsics-x86.ll | 52
-rw-r--r--  test/CodeGen/X86/avx-shuffle.ll | 10
-rw-r--r--  test/CodeGen/X86/avx-vextractf128.ll | 88
-rw-r--r--  test/CodeGen/X86/avx2-shuffle.ll | 34
-rw-r--r--  test/CodeGen/X86/bitcast-i256.ll | 11
-rw-r--r--  test/CodeGen/X86/bool-simplify.ll | 18
-rw-r--r--  test/CodeGen/X86/buildvec-insertvec.ll | 15
-rw-r--r--  test/CodeGen/X86/cmov-fp.ll | 451
-rw-r--r--  test/CodeGen/X86/crash.ll | 147
-rw-r--r--  test/CodeGen/X86/cvtv2f32.ll | 25
-rw-r--r--  test/CodeGen/X86/early-ifcvt-crash.ll | 32
-rw-r--r--  test/CodeGen/X86/early-ifcvt.ll | 77
-rw-r--r--  test/CodeGen/X86/extract-concat.ll | 17
-rw-r--r--  test/CodeGen/X86/fast-cc-callee-pops.ll | 4
-rw-r--r--  test/CodeGen/X86/fast-cc-merge-stack-adj.ll | 2
-rw-r--r--  test/CodeGen/X86/fast-cc-pass-in-regs.ll | 4
-rw-r--r--  test/CodeGen/X86/fast-isel-x86-64.ll | 21
-rw-r--r--  test/CodeGen/X86/fma.ll | 16
-rwxr-xr-x  test/CodeGen/X86/fma3-intrinsics.ll | 4
-rw-r--r--  test/CodeGen/X86/fma4-intrinsics-x86_64.ll | 1
-rw-r--r--  test/CodeGen/X86/fma_patterns.ll | 103
-rw-r--r--  test/CodeGen/X86/fold-load.ll | 4
-rw-r--r--  test/CodeGen/X86/fp-fast.ll | 57
-rw-r--r--  test/CodeGen/X86/fp-load-trunc.ll | 61
-rw-r--r--  test/CodeGen/X86/fp-trunc.ll | 53
-rw-r--r--  test/CodeGen/X86/handle-move.ll | 74
-rw-r--r--  test/CodeGen/X86/inline-asm-tied.ll | 9
-rw-r--r--  test/CodeGen/X86/inline-asm.ll | 7
-rw-r--r--  test/CodeGen/X86/inlineasm-sched-bug.ll | 13
-rw-r--r--  test/CodeGen/X86/jump_sign.ll | 56
-rw-r--r--  test/CodeGen/X86/misched-balance.ll | 230
-rw-r--r--  test/CodeGen/X86/misched-ilp.ll | 25
-rw-r--r--  test/CodeGen/X86/misched-new.ll | 28
-rw-r--r--  test/CodeGen/X86/mmx-builtins.ll | 14
-rw-r--r--  test/CodeGen/X86/ms-inline-asm.ll | 63
-rw-r--r--  test/CodeGen/X86/mulx32.ll | 22
-rw-r--r--  test/CodeGen/X86/mulx64.ll | 22
-rw-r--r--  test/CodeGen/X86/phys_subreg_coalesce-3.ll | 6
-rw-r--r--  test/CodeGen/X86/pic_jumptable.ll | 12
-rw-r--r--  test/CodeGen/X86/pmovext.ll | 22
-rw-r--r--  test/CodeGen/X86/pointer-vector.ll | 5
-rw-r--r--  test/CodeGen/X86/pr11334.ll | 8
-rw-r--r--  test/CodeGen/X86/pr11985.ll | 19
-rw-r--r--  test/CodeGen/X86/pr12312.ll | 155
-rw-r--r--  test/CodeGen/X86/pr12359.ll | 10
-rw-r--r--  test/CodeGen/X86/pr13458.ll | 14
-rw-r--r--  test/CodeGen/X86/pr13859.ll | 28
-rw-r--r--  test/CodeGen/X86/pr13899.ll | 58
-rw-r--r--  test/CodeGen/X86/pr14088.ll | 25
-rw-r--r--  test/CodeGen/X86/pr14090.ll | 76
-rw-r--r--  test/CodeGen/X86/pr14098.ll | 23
-rw-r--r--  test/CodeGen/X86/pr14161.ll | 38
-rw-r--r--  test/CodeGen/X86/pr14204.ll | 15
-rw-r--r--  test/CodeGen/X86/pr14314.ll | 13
-rw-r--r--  test/CodeGen/X86/pr14333.ll | 12
-rw-r--r--  test/CodeGen/X86/pr5145.ll | 35
-rw-r--r--  test/CodeGen/X86/promote.ll | 2
-rw-r--r--  test/CodeGen/X86/ptr-rotate.ll | 2
-rw-r--r--  test/CodeGen/X86/red-zone2.ll | 7
-rw-r--r--  test/CodeGen/X86/rot32.ll | 29
-rw-r--r--  test/CodeGen/X86/rot64.ll | 31
-rw-r--r--  test/CodeGen/X86/rotate2.ll | 2
-rw-r--r--  test/CodeGen/X86/rtm.ll | 30
-rw-r--r--  test/CodeGen/X86/select.ll | 13
-rw-r--r--  test/CodeGen/X86/select_const.ll | 16
-rw-r--r--  test/CodeGen/X86/shift-bmi2.ll | 178
-rw-r--r--  test/CodeGen/X86/sincos.ll | 13
-rw-r--r--  test/CodeGen/X86/sjlj.ll | 60
-rw-r--r--  test/CodeGen/X86/smul-with-overflow.ll | 14
-rw-r--r--  test/CodeGen/X86/sse-intel-ocl.ll | 93
-rw-r--r--  test/CodeGen/X86/sse-minmax.ll | 80
-rw-r--r--  test/CodeGen/X86/sse_partial_update.ll | 36
-rw-r--r--  test/CodeGen/X86/tailcall-64.ll | 67
-rw-r--r--  test/CodeGen/X86/targetLoweringGeneric.ll | 2
-rw-r--r--  test/CodeGen/X86/tls-pic.ll | 12
-rw-r--r--  test/CodeGen/X86/trunc-ext-ld-st.ll | 15
-rw-r--r--  test/CodeGen/X86/vec_compare-2.ll | 3
-rw-r--r--  test/CodeGen/X86/vec_fabs.ll | 38
-rw-r--r--  test/CodeGen/X86/vec_floor.ll | 38
-rw-r--r--  test/CodeGen/X86/vec_fpext.ll | 32
-rw-r--r--  test/CodeGen/X86/vec_shuffle-26.ll | 45
-rw-r--r--  test/CodeGen/X86/vec_shuffle-30.ll | 14
-rw-r--r--  test/CodeGen/X86/widen_cast-1.ll | 2
-rw-r--r--  test/CodeGen/X86/widen_load-1.ll | 13
-rw-r--r--  test/CodeGen/X86/widen_load-2.ll | 2
-rw-r--r--  test/CodeGen/X86/xmulo.ll | 50
115 files changed, 5728 insertions(+), 236 deletions(-)
diff --git a/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll b/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll
index 8b55bd7..3d058bc 100644
--- a/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll
+++ b/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=corei7 | FileCheck %s
; rdar://r7512579
; PHI defs in the atomic loop should be used by the add / adc
@@ -7,17 +7,16 @@
define void @t(i64* nocapture %p) nounwind ssp {
entry:
; CHECK: t:
-; CHECK: movl $1
-; CHECK: movl (%ebp), %eax
-; CHECK: movl 4(%ebp), %edx
+; CHECK: movl ([[REG:%[a-z]+]]), %eax
+; CHECK: movl 4([[REG]]), %edx
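+; ([[REG:%[a-z]+]] is a FileCheck pattern variable: the regex capture from the
+; first match is bound to REG, and every later [[REG]] use must match the same
+; register text; CHECK-NEXT additionally pins its pattern to the very next
+; line of the llc output.)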
; CHECK: LBB0_1:
-; CHECK-NOT: movl $1
-; CHECK-NOT: movl $0
-; CHECK: addl
-; CHECK: adcl
+; CHECK: movl %eax, %ebx
+; CHECK: addl {{%[a-z]+}}, %ebx
+; CHECK: movl %edx, %ecx
+; CHECK: adcl {{%[a-z]+}}, %ecx
; CHECK: lock
-; CHECK: cmpxchg8b
-; CHECK: jne
+; CHECK-NEXT: cmpxchg8b ([[REG]])
+; CHECK-NEXT: jne
%0 = atomicrmw add i64* %p, i64 1 seq_cst
ret void
}
diff --git a/test/CodeGen/X86/2012-01-18-vbitcast.ll b/test/CodeGen/X86/2012-01-18-vbitcast.ll
index 8a3ccc8..3ce7db6 100644
--- a/test/CodeGen/X86/2012-01-18-vbitcast.ll
+++ b/test/CodeGen/X86/2012-01-18-vbitcast.ll
@@ -2,8 +2,8 @@
;CHECK: vcast
define <2 x i32> @vcast(<2 x float> %a, <2 x float> %b) {
-;CHECK: pshufd
-;CHECK: pshufd
+;CHECK: pmovzxdq
+;CHECK: pmovzxdq
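+;(pmovzxdq is the SSE4.1 zero-extend of packed i32 lanes to i64 lanes; the
+; illegal <2 x i32> operands are now promoted to wider lanes instead of being
+; rearranged with pshufd.)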
%af = bitcast <2 x float> %a to <2 x i32>
%bf = bitcast <2 x float> %b to <2 x i32>
%x = sub <2 x i32> %af, %bf
diff --git a/test/CodeGen/X86/2012-03-15-build_vector_wl.ll b/test/CodeGen/X86/2012-03-15-build_vector_wl.ll
index fec17e9..c4b307e 100644
--- a/test/CodeGen/X86/2012-03-15-build_vector_wl.ll
+++ b/test/CodeGen/X86/2012-03-15-build_vector_wl.ll
@@ -4,7 +4,7 @@
define <4 x i8> @build_vector_again(<16 x i8> %in) nounwind readnone {
entry:
%out = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK: shufb
+; CHECK: pmovzxbd
ret <4 x i8> %out
; CHECK: ret
}
diff --git a/test/CodeGen/X86/2012-04-26-sdglue.ll b/test/CodeGen/X86/2012-04-26-sdglue.ll
index 9a66b67..0465952 100644
--- a/test/CodeGen/X86/2012-04-26-sdglue.ll
+++ b/test/CodeGen/X86/2012-04-26-sdglue.ll
@@ -5,7 +5,7 @@
; It's hard to test for the ISEL condition because CodeGen optimizes
; away the bugpointed code. Just ensure the basics are still there.
;CHECK: func:
-;CHECK: vpxor
+;CHECK: vxorps
;CHECK: vinsertf128
;CHECK: vpshufd
;CHECK: vpshufd
diff --git a/test/CodeGen/X86/2012-07-10-extload64.ll b/test/CodeGen/X86/2012-07-10-extload64.ll
index 906b748..4abdded 100644
--- a/test/CodeGen/X86/2012-07-10-extload64.ll
+++ b/test/CodeGen/X86/2012-07-10-extload64.ll
@@ -3,7 +3,7 @@
; CHECK: load_store
define void @load_store(<4 x i16>* %in) {
entry:
-; CHECK: movsd
+; CHECK: pmovzxwd
%A27 = load <4 x i16>* %in, align 4
%A28 = add <4 x i16> %A27, %A27
; CHECK: movlpd
@@ -27,6 +27,6 @@ define <2 x i32> @load_64(<2 x i32>* %ptr) {
BB:
%t = load <2 x i32>* %ptr
ret <2 x i32> %t
-;CHECK: movsd
+;CHECK: pmovzxdq
;CHECK: ret
}
diff --git a/test/CodeGen/X86/2012-08-16-setcc.ll b/test/CodeGen/X86/2012-08-16-setcc.ll
new file mode 100644
index 0000000..ed51156
--- /dev/null
+++ b/test/CodeGen/X86/2012-08-16-setcc.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx | FileCheck %s
+
+; rdar://12081007
+
+; CHECK: and_1:
+; CHECK: andb
+; CHECK-NEXT: cmovnel
+; CHECK: ret
+define i32 @and_1(i8 zeroext %a, i8 zeroext %b, i32 %x) {
+ %1 = and i8 %b, %a
+ %2 = icmp ne i8 %1, 0
+ %3 = select i1 %2, i32 %x, i32 0
+ ret i32 %3
+}
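+; (Rough C source for the pattern above: return (a & b) ? x : 0. The andb
+; already sets ZF, so no separate test instruction is needed before cmovnel.)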
+
+; CHECK: and_2:
+; CHECK: andb
+; CHECK-NEXT: setne
+; CHECK: ret
+define zeroext i1 @and_2(i8 zeroext %a, i8 zeroext %b) {
+ %1 = and i8 %b, %a
+ %2 = icmp ne i8 %1, 0
+ ret i1 %2
+}
+
+; CHECK: xor_1:
+; CHECK: xorb
+; CHECK-NEXT: cmovnel
+; CHECK: ret
+define i32 @xor_1(i8 zeroext %a, i8 zeroext %b, i32 %x) {
+ %1 = xor i8 %b, %a
+ %2 = icmp ne i8 %1, 0
+ %3 = select i1 %2, i32 %x, i32 0
+ ret i32 %3
+}
+
+; CHECK: xor_2:
+; CHECK: xorb
+; CHECK-NEXT: setne
+; CHECK: ret
+define zeroext i1 @xor_2(i8 zeroext %a, i8 zeroext %b) {
+ %1 = xor i8 %b, %a
+ %2 = icmp ne i8 %1, 0
+ ret i1 %2
+}
diff --git a/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll b/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll
new file mode 100644
index 0000000..6ebbb2e
--- /dev/null
+++ b/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -enable-unsafe-fp-math
+; <rdar://problem/12180135>
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.8.0"
+
+define i32 @foo(float %mean) nounwind readnone ssp align 2 {
+entry:
+ %cmp = fcmp olt float %mean, -3.000000e+00
+ %f.0 = select i1 %cmp, float -3.000000e+00, float %mean
+ %cmp2 = fcmp ult float %f.0, 3.000000e+00
+ %f.1 = select i1 %cmp2, float %f.0, float 0x4007EB8520000000
+ %add = fadd float %f.1, 3.000000e+00
+ %div = fdiv float %add, 2.343750e-02
+ %0 = fpext float %div to double
+ %conv = select i1 undef, double 2.550000e+02, double %0
+ %add8 = fadd double %conv, 5.000000e-01
+ %conv9 = fptosi double %add8 to i32
+ %.conv9 = select i1 undef, i32 255, i32 %conv9
+ ret i32 %.conv9
+}
diff --git a/test/CodeGen/X86/2012-09-13-dagco-fneg.ll b/test/CodeGen/X86/2012-09-13-dagco-fneg.ll
new file mode 100644
index 0000000..7b9bab9
--- /dev/null
+++ b/test/CodeGen/X86/2012-09-13-dagco-fneg.ll
@@ -0,0 +1,21 @@
+; RUN: llc -march=x86-64 -mcpu=corei7 < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; CHECK: foo
+; Make sure we are not trying to use scalar xor on the high bits of the vector.
+; CHECK-NOT: xorq
+; CHECK: xorl
+; CHECK-NEXT: ret
+
+define i32 @foo() {
+bb:
+ %tmp44.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, <float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00>
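+  ; (fsub from a splat of -0.0 is the canonical LLVM vector-fneg idiom: it
+  ; flips only the sign bit of each lane, which lowers to an xor against
+  ; 0x80000000 per 32-bit element.)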
+ %0 = bitcast <4 x float> %tmp44.i to i128
+ %1 = zext i128 %0 to i512
+ %2 = shl nuw nsw i512 %1, 256
+ %ins = or i512 %2, 3325764857622480139933400731976840738652108318779753826115024029985671937147149347761402413803120180680770390816681124225944317364750115981129923635970048
+ store i512 %ins, i512* undef, align 64
+ ret i32 0
+}
diff --git a/test/CodeGen/X86/2012-09-28-CGPBug.ll b/test/CodeGen/X86/2012-09-28-CGPBug.ll
new file mode 100644
index 0000000..32d7d01
--- /dev/null
+++ b/test/CodeGen/X86/2012-09-28-CGPBug.ll
@@ -0,0 +1,53 @@
+; RUN: llc -mtriple=i386-apple-macosx < %s | FileCheck %s
+; rdar://12396696
+
+@JT = global [4 x i32] [i32 sub (i32 ptrtoint (i8* blockaddress(@h, %18) to i32), i32 ptrtoint (i8* blockaddress(@h, %11) to i32)), i32 sub (i32 ptrtoint (i8* blockaddress(@h, %17) to i32), i32 ptrtoint (i8* blockaddress(@h, %11) to i32)), i32 sub (i32 ptrtoint (i8* blockaddress(@h, %22) to i32), i32 ptrtoint (i8* blockaddress(@h, %18) to i32)), i32 sub (i32 ptrtoint (i8* blockaddress(@h, %22) to i32), i32 ptrtoint (i8* blockaddress(@h, %17) to i32))]
+@gGlobalLock = external global i8*
+@.str40 = external global [35 x i8]
+
+; CHECK: _JT:
+; CHECK-NOT: .long Ltmp{{[0-9]+}}-1
+; CHECK-NOT: .long 1-Ltmp{{[0-9]+}}
+; CHECK: .long Ltmp{{[0-9]+}}-Ltmp{{[0-9]+}}
+; CHECK: .long Ltmp{{[0-9]+}}-Ltmp{{[0-9]+}}
+; CHECK: .long Ltmp{{[0-9]+}}-Ltmp{{[0-9]+}}
+; CHECK: .long Ltmp{{[0-9]+}}-Ltmp{{[0-9]+}}
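+; (When a basic block referenced by a blockaddress is erased, LLVM substitutes
+; the constant 1 for the blockaddress; the CHECK-NOT lines above would catch
+; that constant leaking into the label-difference entries of @JT.)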
+
+define void @h(i8*) nounwind ssp {
+ %2 = alloca i8*
+ store i8* %0, i8** %2
+ %3 = load i8** %2
+ %4 = bitcast i8* %3 to { i32, i32 }*
+ %5 = getelementptr { i32, i32 }* %4, i32 0, i32 0
+ %6 = load i32* %5
+ %7 = srem i32 %6, 2
+ %8 = icmp slt i32 %6, 2
+ %9 = select i1 %8, i32 %6, i32 %7
+ %10 = icmp eq i32 %9, 0
+ br label %11
+
+; <label>:11 ; preds = %1
+ %12 = zext i1 %10 to i32
+ %13 = getelementptr [4 x i32]* @JT, i32 0, i32 %12
+ %14 = load i32* %13
+ %15 = add i32 %14, ptrtoint (i8* blockaddress(@h, %11) to i32)
+ %16 = inttoptr i32 %15 to i8*
+ indirectbr i8* %16, [label %17, label %18]
+
+; <label>:17 ; preds = %11
+ tail call void (i8*, ...)* @g(i8* getelementptr inbounds ([35 x i8]* @.str40, i32 0, i32 0))
+ br label %22
+
+; <label>:18 ; preds = %11
+ %19 = call i32 @f(i32 -1037694186) nounwind
+ %20 = inttoptr i32 %19 to i32 (i8**)*
+ %21 = tail call i32 %20(i8** @gGlobalLock)
+ br label %22
+
+; <label>:22 ; preds = %18, %17
+ ret void
+}
+
+declare i32 @f(i32)
+
+declare void @g(i8*, ...)
diff --git a/test/CodeGen/X86/2012-10-02-DAGCycle.ll b/test/CodeGen/X86/2012-10-02-DAGCycle.ll
new file mode 100644
index 0000000..8d914db
--- /dev/null
+++ b/test/CodeGen/X86/2012-10-02-DAGCycle.ll
@@ -0,0 +1,52 @@
+; RUN: llc -mtriple=i386-apple-macosx -relocation-model=pic < %s
+; RUN: llc -mtriple=x86_64-apple-macosx -relocation-model=pic < %s
+
+; rdar://12393897
+
+%TRp = type { i32, %TRH*, i32, i32 }
+%TRH = type { i8*, i8*, i8*, i8*, {}* }
+
+define i32 @t(%TRp* inreg %rp) nounwind optsize ssp {
+entry:
+ %handler = getelementptr inbounds %TRp* %rp, i32 0, i32 1
+ %0 = load %TRH** %handler, align 4
+ %sync = getelementptr inbounds %TRH* %0, i32 0, i32 4
+ %sync12 = load {}** %sync, align 4
+ %1 = bitcast {}* %sync12 to i32 (%TRp*)*
+ %call = tail call i32 %1(%TRp* inreg %rp) nounwind optsize
+ ret i32 %call
+}
+
+%btConeShape = type { %btConvexInternalShape, float, float, float, [3 x i32] }
+%btConvexInternalShape = type { %btConvexShape, %btVector, %btVector, float, float }
+%btConvexShape = type { %btCollisionShape }
+%btCollisionShape = type { i32 (...)**, i32, i8* }
+%btVector = type { [4 x float] }
+
+define { <2 x float>, <2 x float> } @t2(%btConeShape* %this) unnamed_addr uwtable ssp align 2 {
+entry:
+ %0 = getelementptr inbounds %btConeShape* %this, i64 0, i32 0
+ br i1 undef, label %if.then, label %if.end17
+
+if.then: ; preds = %entry
+ %vecnorm.sroa.2.8.copyload = load float* undef, align 4
+ %cmp4 = fcmp olt float undef, 0x3D10000000000000
+ %vecnorm.sroa.2.8.copyload36 = select i1 %cmp4, float -1.000000e+00, float %vecnorm.sroa.2.8.copyload
+ %call.i.i.i = tail call float @sqrtf(float 0.000000e+00) nounwind readnone
+ %div.i.i = fdiv float 1.000000e+00, %call.i.i.i
+ %mul7.i.i.i = fmul float %div.i.i, %vecnorm.sroa.2.8.copyload36
+ %1 = load float (%btConvexInternalShape*)** undef, align 8
+ %call12 = tail call float %1(%btConvexInternalShape* %0)
+ %mul7.i.i = fmul float %call12, %mul7.i.i.i
+ %retval.sroa.0.4.insert = insertelement <2 x float> zeroinitializer, float undef, i32 1
+ %add13.i = fadd float undef, %mul7.i.i
+ %retval.sroa.1.8.insert = insertelement <2 x float> undef, float %add13.i, i32 0
+ br label %if.end17
+
+if.end17: ; preds = %if.then, %entry
+ %retval.sroa.1.8.load3338 = phi <2 x float> [ %retval.sroa.1.8.insert, %if.then ], [ undef, %entry ]
+ %retval.sroa.0.0.load3137 = phi <2 x float> [ %retval.sroa.0.4.insert, %if.then ], [ undef, %entry ]
+ ret { <2 x float>, <2 x float> } undef
+}
+
+declare float @sqrtf(float) nounwind readnone
diff --git a/test/CodeGen/X86/2012-10-03-DAGCycle.ll b/test/CodeGen/X86/2012-10-03-DAGCycle.ll
new file mode 100644
index 0000000..72083c7
--- /dev/null
+++ b/test/CodeGen/X86/2012-10-03-DAGCycle.ll
@@ -0,0 +1,31 @@
+; RUN: llc -mtriple=x86_64-apple-macosx -mcpu=corei7 < %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%struct.pluto.0 = type { %struct.bar.1, %struct.hoge.368* }
+%struct.bar.1 = type { %i8* }
+%i8 = type { i8 }
+%struct.hoge.368 = type { i32, i32 }
+%struct.widget.375 = type { i32, i32, %i8*, %struct.hoge.368* }
+
+define fastcc void @bar(%struct.pluto.0* %arg) nounwind uwtable ssp align 2 {
+bb:
+ %tmp1 = alloca %struct.widget.375, align 8
+ %tmp2 = getelementptr inbounds %struct.pluto.0* %arg, i64 0, i32 1
+ %tmp3 = load %struct.hoge.368** %tmp2, align 8
+ store %struct.pluto.0* %arg, %struct.pluto.0** undef, align 8
+ %tmp = getelementptr inbounds %struct.widget.375* %tmp1, i64 0, i32 2
+ %tmp4 = getelementptr %struct.pluto.0* %arg, i64 0, i32 0, i32 0
+ %tmp5 = load %i8** %tmp4, align 8
+ store %i8* %tmp5, %i8** %tmp, align 8
+ %tmp6 = getelementptr inbounds %struct.widget.375* %tmp1, i64 0, i32 3
+ store %struct.hoge.368* %tmp3, %struct.hoge.368** %tmp6, align 8
+ br i1 undef, label %bb8, label %bb7
+
+bb7: ; preds = %bb
+ unreachable
+
+bb8: ; preds = %bb
+ unreachable
+}
diff --git a/test/CodeGen/X86/2012-10-18-crash-dagco.ll b/test/CodeGen/X86/2012-10-18-crash-dagco.ll
new file mode 100644
index 0000000..5b98624
--- /dev/null
+++ b/test/CodeGen/X86/2012-10-18-crash-dagco.ll
@@ -0,0 +1,61 @@
+; RUN: llc -march=x86-64 -mcpu=corei7 -disable-cgp-select2branch < %s
+
+; We should not crash on this test.
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
+target triple = "i386-apple-darwin9.0.0"
+
+@global = external constant [411 x i8], align 1
+
+define void @snork() nounwind {
+bb:
+ br i1 undef, label %bb26, label %bb27
+
+bb26: ; preds = %bb48, %bb26, %bb
+ switch i32 undef, label %bb26 [
+ i32 142771596, label %bb28
+ ]
+
+bb27: ; preds = %bb48, %bb
+ switch i32 undef, label %bb49 [
+ i32 142771596, label %bb28
+ ]
+
+bb28: ; preds = %bb27, %bb26
+ %tmp = load i32* null
+ %tmp29 = trunc i32 %tmp to i8
+ store i8* undef, i8** undef
+ %tmp30 = load i32* null
+ %tmp31 = icmp eq i32 %tmp30, 0
+ %tmp32 = getelementptr inbounds [411 x i8]* @global, i32 0, i32 undef
+ %tmp33 = load i8* %tmp32, align 1
+ %tmp34 = getelementptr inbounds [411 x i8]* @global, i32 0, i32 0
+ %tmp35 = load i8* %tmp34, align 1
+ %tmp36 = select i1 %tmp31, i8 %tmp35, i8 %tmp33
+ %tmp37 = select i1 undef, i8 %tmp29, i8 %tmp36
+ %tmp38 = zext i8 %tmp37 to i32
+ %tmp39 = select i1 undef, i32 0, i32 %tmp38
+ %tmp40 = getelementptr inbounds i32* null, i32 %tmp39
+ %tmp41 = load i32* %tmp40, align 4
+ %tmp42 = load i32* undef, align 4
+ %tmp43 = load i32* undef
+ %tmp44 = xor i32 %tmp42, %tmp43
+ %tmp45 = lshr i32 %tmp44, 8
+ %tmp46 = lshr i32 %tmp44, 7
+ call void @spam()
+ unreachable
+
+bb47: ; No predecessors!
+ ret void
+
+bb48: ; No predecessors!
+ br i1 undef, label %bb27, label %bb26
+
+bb49: ; preds = %bb49, %bb27
+ br label %bb49
+
+bb50: ; preds = %bb50
+ br label %bb50
+}
+
+declare void @spam() noreturn nounwind
diff --git a/test/CodeGen/X86/MergeConsecutiveStores.ll b/test/CodeGen/X86/MergeConsecutiveStores.ll
new file mode 100644
index 0000000..64825ba
--- /dev/null
+++ b/test/CodeGen/X86/MergeConsecutiveStores.ll
@@ -0,0 +1,305 @@
+; RUN: llc -march=x86-64 -mcpu=corei7 -mattr=+avx < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%struct.A = type { i8, i8, i8, i8, i8, i8, i8, i8 }
+%struct.B = type { i32, i32, i32, i32, i32, i32, i32, i32 }
+
+; CHECK: merge_const_store
+; Save the constant bytes 1,2,3,...,8 as one big integer.
+; CHECK: movabsq $578437695752307201
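+; (578437695752307201 == 0x0807060504030201, i.e. the eight byte values 1..8
+; packed little-endian into a single 64-bit store.)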
+; CHECK: ret
+define void @merge_const_store(i32 %count, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
+ %1 = icmp sgt i32 %count, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+.lr.ph:
+ %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
+ %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ]
+ %2 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0
+ store i8 1, i8* %2, align 1
+ %3 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1
+ store i8 2, i8* %3, align 1
+ %4 = getelementptr inbounds %struct.A* %.01, i64 0, i32 2
+ store i8 3, i8* %4, align 1
+ %5 = getelementptr inbounds %struct.A* %.01, i64 0, i32 3
+ store i8 4, i8* %5, align 1
+ %6 = getelementptr inbounds %struct.A* %.01, i64 0, i32 4
+ store i8 5, i8* %6, align 1
+ %7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 5
+ store i8 6, i8* %7, align 1
+ %8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 6
+ store i8 7, i8* %8, align 1
+ %9 = getelementptr inbounds %struct.A* %.01, i64 0, i32 7
+ store i8 8, i8* %9, align 1
+ %10 = add nsw i32 %i.02, 1
+ %11 = getelementptr inbounds %struct.A* %.01, i64 1
+ %exitcond = icmp eq i32 %10, %count
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+._crit_edge:
+ ret void
+}
+
+; Move the constants using a single vector store.
+; CHECK: merge_const_store_vec
+; CHECK: vmovups %ymm0, (%rsi)
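+; (Eight consecutive i32 zero stores are merged into one unaligned 256-bit
+; store through %ymm0; +avx in the RUN line makes the 32-byte store legal.)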
+; CHECK: ret
+define void @merge_const_store_vec(i32 %count, %struct.B* nocapture %p) nounwind uwtable noinline ssp {
+ %1 = icmp sgt i32 %count, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+.lr.ph:
+ %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
+ %.01 = phi %struct.B* [ %11, %.lr.ph ], [ %p, %0 ]
+ %2 = getelementptr inbounds %struct.B* %.01, i64 0, i32 0
+ store i32 0, i32* %2, align 4
+ %3 = getelementptr inbounds %struct.B* %.01, i64 0, i32 1
+ store i32 0, i32* %3, align 4
+ %4 = getelementptr inbounds %struct.B* %.01, i64 0, i32 2
+ store i32 0, i32* %4, align 4
+ %5 = getelementptr inbounds %struct.B* %.01, i64 0, i32 3
+ store i32 0, i32* %5, align 4
+ %6 = getelementptr inbounds %struct.B* %.01, i64 0, i32 4
+ store i32 0, i32* %6, align 4
+ %7 = getelementptr inbounds %struct.B* %.01, i64 0, i32 5
+ store i32 0, i32* %7, align 4
+ %8 = getelementptr inbounds %struct.B* %.01, i64 0, i32 6
+ store i32 0, i32* %8, align 4
+ %9 = getelementptr inbounds %struct.B* %.01, i64 0, i32 7
+ store i32 0, i32* %9, align 4
+ %10 = add nsw i32 %i.02, 1
+ %11 = getelementptr inbounds %struct.B* %.01, i64 1
+ %exitcond = icmp eq i32 %10, %count
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+._crit_edge:
+ ret void
+}
+
+; Merge the first four constant bytes into a single 32-bit store; the
+; remaining bytes, around the non-constant one, stay as scalar byte stores.
+; CHECK: merge_nonconst_store
+; CHECK: movl $67305985
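+; (67305985 == 0x04030201: the byte values 1..4 packed little-endian.)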
+; CHECK: movb
+; CHECK: movb
+; CHECK: movb
+; CHECK: movb
+; CHECK: ret
+define void @merge_nonconst_store(i32 %count, i8 %zz, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
+ %1 = icmp sgt i32 %count, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+.lr.ph:
+ %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
+ %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ]
+ %2 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0
+ store i8 1, i8* %2, align 1
+ %3 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1
+ store i8 2, i8* %3, align 1
+ %4 = getelementptr inbounds %struct.A* %.01, i64 0, i32 2
+ store i8 3, i8* %4, align 1
+ %5 = getelementptr inbounds %struct.A* %.01, i64 0, i32 3
+ store i8 4, i8* %5, align 1
+ %6 = getelementptr inbounds %struct.A* %.01, i64 0, i32 4
+ store i8 %zz, i8* %6, align 1 ; <----------- Not a const;
+ %7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 5
+ store i8 6, i8* %7, align 1
+ %8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 6
+ store i8 7, i8* %8, align 1
+ %9 = getelementptr inbounds %struct.A* %.01, i64 0, i32 7
+ store i8 8, i8* %9, align 1
+ %10 = add nsw i32 %i.02, 1
+ %11 = getelementptr inbounds %struct.A* %.01, i64 1
+ %exitcond = icmp eq i32 %10, %count
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+._crit_edge:
+ ret void
+}
+
+
+;CHECK: merge_loads_i16
+; load:
+;CHECK: movw
+; store:
+;CHECK: movw
+;CHECK: ret
+define void @merge_loads_i16(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
+ %1 = icmp sgt i32 %count, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0
+ %2 = getelementptr inbounds %struct.A* %q, i64 0, i32 0
+ %3 = getelementptr inbounds %struct.A* %q, i64 0, i32 1
+ br label %4
+
+; <label>:4 ; preds = %4, %.lr.ph
+ %i.02 = phi i32 [ 0, %.lr.ph ], [ %9, %4 ]
+ %.01 = phi %struct.A* [ %p, %.lr.ph ], [ %10, %4 ]
+ %5 = load i8* %2, align 1
+ %6 = load i8* %3, align 1
+ %7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0
+ store i8 %5, i8* %7, align 1
+ %8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1
+ store i8 %6, i8* %8, align 1
+ %9 = add nsw i32 %i.02, 1
+ %10 = getelementptr inbounds %struct.A* %.01, i64 1
+ %exitcond = icmp eq i32 %9, %count
+ br i1 %exitcond, label %._crit_edge, label %4
+
+._crit_edge: ; preds = %4, %0
+ ret void
+}
+
+; The loads and the stores are interleaved, so they can't be merged.
+;CHECK: no_merge_loads
+;CHECK: movb
+;CHECK: movb
+;CHECK: movb
+;CHECK: movb
+;CHECK: ret
+define void @no_merge_loads(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
+ %1 = icmp sgt i32 %count, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0
+ %2 = getelementptr inbounds %struct.A* %q, i64 0, i32 0
+ %3 = getelementptr inbounds %struct.A* %q, i64 0, i32 1
+ br label %a4
+
+a4: ; preds = %4, %.lr.ph
+ %i.02 = phi i32 [ 0, %.lr.ph ], [ %a9, %a4 ]
+ %.01 = phi %struct.A* [ %p, %.lr.ph ], [ %a10, %a4 ]
+ %a5 = load i8* %2, align 1
+ %a7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0
+ store i8 %a5, i8* %a7, align 1
+ %a8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1
+ %a6 = load i8* %3, align 1
+ store i8 %a6, i8* %a8, align 1
+ %a9 = add nsw i32 %i.02, 1
+ %a10 = getelementptr inbounds %struct.A* %.01, i64 1
+ %exitcond = icmp eq i32 %a9, %count
+ br i1 %exitcond, label %._crit_edge, label %a4
+
+._crit_edge: ; preds = %4, %0
+ ret void
+}
+
+
+;CHECK: merge_loads_integer
+; load:
+;CHECK: movq
+; store:
+;CHECK: movq
+;CHECK: ret
+define void @merge_loads_integer(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
+ %1 = icmp sgt i32 %count, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0
+ %2 = getelementptr inbounds %struct.B* %q, i64 0, i32 0
+ %3 = getelementptr inbounds %struct.B* %q, i64 0, i32 1
+ br label %4
+
+; <label>:4 ; preds = %4, %.lr.ph
+ %i.02 = phi i32 [ 0, %.lr.ph ], [ %9, %4 ]
+ %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %10, %4 ]
+ %5 = load i32* %2
+ %6 = load i32* %3
+ %7 = getelementptr inbounds %struct.B* %.01, i64 0, i32 0
+ store i32 %5, i32* %7
+ %8 = getelementptr inbounds %struct.B* %.01, i64 0, i32 1
+ store i32 %6, i32* %8
+ %9 = add nsw i32 %i.02, 1
+ %10 = getelementptr inbounds %struct.B* %.01, i64 1
+ %exitcond = icmp eq i32 %9, %count
+ br i1 %exitcond, label %._crit_edge, label %4
+
+._crit_edge: ; preds = %4, %0
+ ret void
+}
+
+
+;CHECK: merge_loads_vector
+; load:
+;CHECK: movups
+; store:
+;CHECK: movups
+;CHECK: ret
+define void @merge_loads_vector(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
+ %a1 = icmp sgt i32 %count, 0
+ br i1 %a1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0
+ %a2 = getelementptr inbounds %struct.B* %q, i64 0, i32 0
+ %a3 = getelementptr inbounds %struct.B* %q, i64 0, i32 1
+ %a4 = getelementptr inbounds %struct.B* %q, i64 0, i32 2
+ %a5 = getelementptr inbounds %struct.B* %q, i64 0, i32 3
+ br label %block4
+
+block4: ; preds = %4, %.lr.ph
+ %i.02 = phi i32 [ 0, %.lr.ph ], [ %c9, %block4 ]
+ %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %c10, %block4 ]
+ %a7 = getelementptr inbounds %struct.B* %.01, i64 0, i32 0
+ %a8 = getelementptr inbounds %struct.B* %.01, i64 0, i32 1
+ %a9 = getelementptr inbounds %struct.B* %.01, i64 0, i32 2
+ %a10 = getelementptr inbounds %struct.B* %.01, i64 0, i32 3
+ %b1 = load i32* %a2
+ %b2 = load i32* %a3
+ %b3 = load i32* %a4
+ %b4 = load i32* %a5
+ store i32 %b1, i32* %a7
+ store i32 %b2, i32* %a8
+ store i32 %b3, i32* %a9
+ store i32 %b4, i32* %a10
+ %c9 = add nsw i32 %i.02, 1
+ %c10 = getelementptr inbounds %struct.B* %.01, i64 1
+ %exitcond = icmp eq i32 %c9, %count
+ br i1 %exitcond, label %._crit_edge, label %block4
+
+._crit_edge: ; preds = %4, %0
+ ret void
+}
+
+;CHECK: merge_loads_no_align
+; load:
+;CHECK: movl
+;CHECK: movl
+;CHECK: movl
+;CHECK: movl
+; store:
+;CHECK: movl
+;CHECK: movl
+;CHECK: movl
+;CHECK: movl
+;CHECK: ret
+define void @merge_loads_no_align(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
+ %a1 = icmp sgt i32 %count, 0
+ br i1 %a1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0
+ %a2 = getelementptr inbounds %struct.B* %q, i64 0, i32 0
+ %a3 = getelementptr inbounds %struct.B* %q, i64 0, i32 1
+ %a4 = getelementptr inbounds %struct.B* %q, i64 0, i32 2
+ %a5 = getelementptr inbounds %struct.B* %q, i64 0, i32 3
+ br label %block4
+
+block4: ; preds = %4, %.lr.ph
+ %i.02 = phi i32 [ 0, %.lr.ph ], [ %c9, %block4 ]
+ %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %c10, %block4 ]
+ %a7 = getelementptr inbounds %struct.B* %.01, i64 0, i32 0
+ %a8 = getelementptr inbounds %struct.B* %.01, i64 0, i32 1
+ %a9 = getelementptr inbounds %struct.B* %.01, i64 0, i32 2
+ %a10 = getelementptr inbounds %struct.B* %.01, i64 0, i32 3
+ %b1 = load i32* %a2, align 1
+ %b2 = load i32* %a3, align 1
+ %b3 = load i32* %a4, align 1
+ %b4 = load i32* %a5, align 1
+ store i32 %b1, i32* %a7, align 1
+ store i32 %b2, i32* %a8, align 1
+ store i32 %b3, i32* %a9, align 1
+ store i32 %b4, i32* %a10, align 1
+ %c9 = add nsw i32 %i.02, 1
+ %c10 = getelementptr inbounds %struct.B* %.01, i64 1
+ %exitcond = icmp eq i32 %c9, %count
+ br i1 %exitcond, label %._crit_edge, label %block4
+
+._crit_edge: ; preds = %4, %0
+ ret void
+}
+
diff --git a/test/CodeGen/X86/StackColoring-dbg.ll b/test/CodeGen/X86/StackColoring-dbg.ll
new file mode 100644
index 0000000..5982544
--- /dev/null
+++ b/test/CodeGen/X86/StackColoring-dbg.ll
@@ -0,0 +1,30 @@
+; RUN: llc -mcpu=corei7 -no-stack-coloring=false < %s
+
+; Make sure that we don't crash when dbg values are used.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+define void @foo() nounwind uwtable ssp {
+entry:
+ %x.i = alloca i8, align 1
+ %y.i = alloca [256 x i8], align 16
+ %0 = getelementptr inbounds [256 x i8]* %y.i, i64 0, i64 0
+ br label %for.body
+
+for.body:
+ call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
+ call void @llvm.lifetime.start(i64 -1, i8* %x.i) nounwind
+ call void @llvm.dbg.declare(metadata !{i8* %x.i}, metadata !22) nounwind
+ br label %for.body
+}
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
+
+declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+
+!16 = metadata !{i32 786468, null, metadata !"char", null, i32 0, i64 8, i64 8, i64 0, i32 0, i32 6}
+!2 = metadata !{i32 0}
+!22 = metadata !{i32 786688, metadata !2, metadata !"x", metadata !2, i32 16, metadata !16, i32 0, i32 0}
diff --git a/test/CodeGen/X86/StackColoring.ll b/test/CodeGen/X86/StackColoring.ll
new file mode 100644
index 0000000..f8ae74f
--- /dev/null
+++ b/test/CodeGen/X86/StackColoring.ll
@@ -0,0 +1,410 @@
+; RUN: llc -mcpu=corei7 -no-stack-coloring=false < %s | FileCheck %s --check-prefix=YESCOLOR
+; RUN: llc -mcpu=corei7 -no-stack-coloring=true < %s | FileCheck %s --check-prefix=NOCOLOR
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;YESCOLOR: subq $136, %rsp
+;NOCOLOR: subq $264, %rsp
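+; (The two arrays need (17+16)*8 = 264 bytes as separate slots; with stack
+; coloring their disjoint lifetimes let them share one slot of 17*8 = 136.)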
+
+define i32 @myCall_w2(i32 %in) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b)
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ call void @llvm.lifetime.end(i64 -1, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+}
+
+
+;YESCOLOR: subq $272, %rsp
+;NOCOLOR: subq $272, %rsp
+
+define i32 @myCall2_no_merge(i32 %in, i1 %d) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b)
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ br i1 %d, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ call void @llvm.lifetime.end(i64 -1, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ ret i32 %t7
+bb3:
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ ret i32 0
+}
+
+;YESCOLOR: subq $144, %rsp
+;NOCOLOR: subq $272, %rsp
+
+define i32 @myCall2_w2(i32 %in, i1 %d) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b)
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ br i1 %d, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ call void @llvm.lifetime.end(i64 -1, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+bb3:
+ ret i32 0
+}
+;YESCOLOR: subq $208, %rsp
+;NOCOLOR: subq $400, %rsp
+
+
+
+
+define i32 @myCall_w4(i32 %in) {
+entry:
+ %a1 = alloca [14 x i8*], align 8
+ %a2 = alloca [13 x i8*], align 8
+ %a3 = alloca [12 x i8*], align 8
+ %a4 = alloca [11 x i8*], align 8
+ %b1 = bitcast [14 x i8*]* %a1 to i8*
+ %b2 = bitcast [13 x i8*]* %a2 to i8*
+ %b3 = bitcast [12 x i8*]* %a3 to i8*
+ %b4 = bitcast [11 x i8*]* %a4 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b4)
+ call void @llvm.lifetime.start(i64 -1, i8* %b1)
+ %t1 = call i32 @foo(i32 %in, i8* %b1)
+ %t2 = call i32 @foo(i32 %in, i8* %b1)
+ call void @llvm.lifetime.end(i64 -1, i8* %b1)
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t9 = call i32 @foo(i32 %in, i8* %b2)
+ %t8 = call i32 @foo(i32 %in, i8* %b2)
+ call void @llvm.lifetime.end(i64 -1, i8* %b2)
+ call void @llvm.lifetime.start(i64 -1, i8* %b3)
+ %t3 = call i32 @foo(i32 %in, i8* %b3)
+ %t4 = call i32 @foo(i32 %in, i8* %b3)
+ call void @llvm.lifetime.end(i64 -1, i8* %b3)
+ %t11 = call i32 @foo(i32 %in, i8* %b4)
+ call void @llvm.lifetime.end(i64 -1, i8* %b4)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+}
+
+;YESCOLOR: subq $112, %rsp
+;NOCOLOR: subq $400, %rsp
+
+define i32 @myCall2_w4(i32 %in) {
+entry:
+ %a1 = alloca [14 x i8*], align 8
+ %a2 = alloca [13 x i8*], align 8
+ %a3 = alloca [12 x i8*], align 8
+ %a4 = alloca [11 x i8*], align 8
+ %b1 = bitcast [14 x i8*]* %a1 to i8*
+ %b2 = bitcast [13 x i8*]* %a2 to i8*
+ %b3 = bitcast [12 x i8*]* %a3 to i8*
+ %b4 = bitcast [11 x i8*]* %a4 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b1)
+ %t1 = call i32 @foo(i32 %in, i8* %b1)
+ %t2 = call i32 @foo(i32 %in, i8* %b1)
+ call void @llvm.lifetime.end(i64 -1, i8* %b1)
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t9 = call i32 @foo(i32 %in, i8* %b2)
+ %t8 = call i32 @foo(i32 %in, i8* %b2)
+ call void @llvm.lifetime.end(i64 -1, i8* %b2)
+ call void @llvm.lifetime.start(i64 -1, i8* %b3)
+ %t3 = call i32 @foo(i32 %in, i8* %b3)
+ %t4 = call i32 @foo(i32 %in, i8* %b3)
+ call void @llvm.lifetime.end(i64 -1, i8* %b3)
+ br i1 undef, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.start(i64 -1, i8* %b4)
+ %t11 = call i32 @foo(i32 %in, i8* %b4)
+ call void @llvm.lifetime.end(i64 -1, i8* %b4)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+bb3:
+ ret i32 0
+}
+
+
+;YESCOLOR: subq $144, %rsp
+;NOCOLOR: subq $272, %rsp
+
+
+define i32 @myCall2_noend(i32 %in, i1 %d) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b)
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ br i1 %d, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+bb3:
+ ret i32 0
+}
+
+;YESCOLOR: subq $144, %rsp
+;NOCOLOR: subq $272, %rsp
+define i32 @myCall2_noend2(i32 %in, i1 %d) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b)
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ br i1 %d, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+bb3:
+ ret i32 0
+}
+
+
+;YESCOLOR: subq $144, %rsp
+;NOCOLOR: subq $272, %rsp
+define i32 @myCall2_nostart(i32 %in, i1 %d) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ br i1 %d, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+bb3:
+ ret i32 0
+}
+
+; Adapted from Transforms/Inline/array_merge.ll.
+;YESCOLOR: subq $816, %rsp
+;NOCOLOR: subq $1616, %rsp
+define void @array_merge() nounwind ssp {
+entry:
+ %A.i1 = alloca [100 x i32], align 4
+ %B.i2 = alloca [100 x i32], align 4
+ %A.i = alloca [100 x i32], align 4
+ %B.i = alloca [100 x i32], align 4
+ %0 = bitcast [100 x i32]* %A.i to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind
+ %1 = bitcast [100 x i32]* %B.i to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %1) nounwind
+ call void @bar([100 x i32]* %A.i, [100 x i32]* %B.i) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %1) nounwind
+ %2 = bitcast [100 x i32]* %A.i1 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %2) nounwind
+ %3 = bitcast [100 x i32]* %B.i2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %3) nounwind
+ call void @bar([100 x i32]* %A.i1, [100 x i32]* %B.i2) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %2) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %3) nounwind
+ ret void
+}
+
+;YESCOLOR: subq $272, %rsp
+;NOCOLOR: subq $272, %rsp
+define i32 @func_phi_lifetime(i32 %in, i1 %d) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ br i1 %d, label %bb0, label %bb1
+
+bb0:
+ %I1 = bitcast [17 x i8*]* %a to i8*
+ br label %bb2
+
+bb1:
+ %I2 = bitcast [16 x i8*]* %a2 to i8*
+ br label %bb2
+
+bb2:
+ %split = phi i8* [ %I1, %bb0 ], [ %I2, %bb1 ]
+ call void @llvm.lifetime.start(i64 -1, i8* %split)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ call void @llvm.lifetime.end(i64 -1, i8* %split)
+ ret i32 %t7
+bb3:
+ ret i32 0
+}
+
+
+;YESCOLOR: multi_region_bb
+;NOCOLOR: multi_region_bb
+define void @multi_region_bb() nounwind ssp {
+entry:
+ %A.i1 = alloca [100 x i32], align 4
+ %B.i2 = alloca [100 x i32], align 4
+ %A.i = alloca [100 x i32], align 4
+ %B.i = alloca [100 x i32], align 4
+ %0 = bitcast [100 x i32]* %A.i to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind ; <---- start #1
+ %1 = bitcast [100 x i32]* %B.i to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %1) nounwind
+ call void @bar([100 x i32]* %A.i, [100 x i32]* %B.i) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %1) nounwind
+ %2 = bitcast [100 x i32]* %A.i1 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %2) nounwind
+ %3 = bitcast [100 x i32]* %B.i2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %3) nounwind
+ call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind ; <---- start #2
+ call void @bar([100 x i32]* %A.i1, [100 x i32]* %B.i2) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %2) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %3) nounwind
+ ret void
+}
+
+
+;YESCOLOR: subq $272, %rsp
+;NOCOLOR: subq $272, %rsp
+define i32 @myCall_end_before_begin(i32 %in, i1 %d) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ call void @llvm.lifetime.start(i64 -1, i8* %b)
+ br i1 %d, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+bb3:
+ ret i32 0
+}
+
+; Check that we don't assert or crash even when allocas are used
+; outside their declared lifetime regions.
+;YESCOLOR: bad_range
+;NOCOLOR: bad_range
+define void @bad_range() nounwind ssp {
+entry:
+ %A.i1 = alloca [100 x i32], align 4
+ %B.i2 = alloca [100 x i32], align 4
+ %A.i = alloca [100 x i32], align 4
+ %B.i = alloca [100 x i32], align 4
+ %0 = bitcast [100 x i32]* %A.i to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind
+ %1 = bitcast [100 x i32]* %B.i to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %1) nounwind
+ call void @bar([100 x i32]* %A.i, [100 x i32]* %B.i) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %1) nounwind
+ br label %block2
+
+block2:
+ ; I am used outside the marked lifetime.
+ call void @bar([100 x i32]* %A.i, [100 x i32]* %B.i) nounwind
+ ret void
+}
+
+
+; Check that we don't assert or crash even when an alloca has uses that
+; neither read nor write memory outside its declared lifetime region.
+;YESCOLOR: shady_range
+;NOCOLOR: shady_range
+
+%struct.Klass = type { i32, i32 }
+
+define i32 @shady_range(i32 %argc, i8** nocapture %argv) uwtable {
+ %a.i = alloca [4 x %struct.Klass], align 16
+ %b.i = alloca [4 x %struct.Klass], align 16
+ %a8 = bitcast [4 x %struct.Klass]* %a.i to i8*
+ %b8 = bitcast [4 x %struct.Klass]* %b.i to i8*
+ ; I am used outside the lifetime zone below:
+ %z2 = getelementptr inbounds [4 x %struct.Klass]* %a.i, i64 0, i64 0, i32 0
+ call void @llvm.lifetime.start(i64 -1, i8* %a8)
+ call void @llvm.lifetime.start(i64 -1, i8* %b8)
+ %z3 = load i32* %z2, align 16
+ %r = call i32 @foo(i32 %z3, i8* %a8)
+ %r2 = call i32 @foo(i32 %z3, i8* %b8)
+ call void @llvm.lifetime.end(i64 -1, i8* %a8)
+ call void @llvm.lifetime.end(i64 -1, i8* %b8)
+ ret i32 9
+}
+
+declare void @bar([100 x i32]* , [100 x i32]*) nounwind
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
+
+declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+
+declare i32 @foo(i32, i8*)
+
diff --git a/test/CodeGen/X86/add-of-carry.ll b/test/CodeGen/X86/add-of-carry.ll
index a4abccb..4e30f2b 100644
--- a/test/CodeGen/X86/add-of-carry.ll
+++ b/test/CodeGen/X86/add-of-carry.ll
@@ -30,4 +30,17 @@ entry:
ret i32 %z.0
}
+; <rdar://problem/12579915>
+define i32 @test3(i32 %x, i32 %y, i32 %res) nounwind uwtable readnone ssp {
+entry:
+ %cmp = icmp ugt i32 %x, %y
+ %dec = sext i1 %cmp to i32
+ %dec.res = add nsw i32 %dec, %res
+ ret i32 %dec.res
+; CHECK: test3:
+; CHECK: cmpl
+; CHECK: sbbl
+; CHECK: ret
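+; (The compare is arranged so CF is set exactly when %x > %y; sbbl then folds
+; that borrow into the result, computing %res - (%x > %y) without a branch.)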
+}
+
declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
diff --git a/test/CodeGen/X86/atom-bypass-slow-division.ll b/test/CodeGen/X86/atom-bypass-slow-division.ll
new file mode 100644
index 0000000..e7c9605
--- /dev/null
+++ b/test/CodeGen/X86/atom-bypass-slow-division.ll
@@ -0,0 +1,112 @@
+; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s
+
+define i32 @test_get_quotient(i32 %a, i32 %b) nounwind {
+; CHECK: test_get_quotient
+; CHECK: orl %ecx, %edx
+; CHECK-NEXT: testl $-256, %edx
+; CHECK-NEXT: je
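+; ($-256 is the mask 0xffffff00: after or-ing the two operands together, the
+; test checks whether either has bits set above bit 7; if not, the branch
+; takes the fast 8-bit divb path instead of the slow idivl.)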
+; CHECK: idivl
+; CHECK: ret
+; CHECK: divb
+; CHECK: ret
+ %result = sdiv i32 %a, %b
+ ret i32 %result
+}
+
+define i32 @test_get_remainder(i32 %a, i32 %b) nounwind {
+; CHECK: test_get_remainder
+; CHECK: orl %ecx, %edx
+; CHECK-NEXT: testl $-256, %edx
+; CHECK-NEXT: je
+; CHECK: idivl
+; CHECK: ret
+; CHECK: divb
+; CHECK: ret
+ %result = srem i32 %a, %b
+ ret i32 %result
+}
+
+define i32 @test_get_quotient_and_remainder(i32 %a, i32 %b) nounwind {
+; CHECK: test_get_quotient_and_remainder
+; CHECK: orl %ecx, %edx
+; CHECK-NEXT: testl $-256, %edx
+; CHECK-NEXT: je
+; CHECK: idivl
+; CHECK: divb
+; CHECK: addl
+; CHECK: ret
+; CHECK-NOT: idivl
+; CHECK-NOT: divb
+ %resultdiv = sdiv i32 %a, %b
+ %resultrem = srem i32 %a, %b
+ %result = add i32 %resultdiv, %resultrem
+ ret i32 %result
+}
+
+define i32 @test_use_div_and_idiv(i32 %a, i32 %b) nounwind {
+; CHECK: test_use_div_and_idiv
+; CHECK: idivl
+; CHECK: divb
+; CHECK: divl
+; CHECK: divb
+; CHECK: addl
+; CHECK: ret
+ %resultidiv = sdiv i32 %a, %b
+ %resultdiv = udiv i32 %a, %b
+ %result = add i32 %resultidiv, %resultdiv
+ ret i32 %result
+}
+
+define i32 @test_use_div_imm_imm() nounwind {
+; CHECK: test_use_div_imm_imm
+; CHECK: movl $64
+ %resultdiv = sdiv i32 256, 4
+ ret i32 %resultdiv
+}
+
+define i32 @test_use_div_reg_imm(i32 %a) nounwind {
+; CHECK: test_use_div_reg_imm
+; CHECK-NOT: test
+; CHECK-NOT: idiv
+; CHECK-NOT: divb
+ %resultdiv = sdiv i32 %a, 33
+ ret i32 %resultdiv
+}
+
+define i32 @test_use_rem_reg_imm(i32 %a) nounwind {
+; CHECK: test_use_rem_reg_imm
+; CHECK-NOT: test
+; CHECK-NOT: idiv
+; CHECK-NOT: divb
+ %resultrem = srem i32 %a, 33
+ ret i32 %resultrem
+}
+
+define i32 @test_use_divrem_reg_imm(i32 %a) nounwind {
+; CHECK: test_use_divrem_reg_imm
+; CHECK-NOT: test
+; CHECK-NOT: idiv
+; CHECK-NOT: divb
+ %resultdiv = sdiv i32 %a, 33
+ %resultrem = srem i32 %a, 33
+ %result = add i32 %resultdiv, %resultrem
+ ret i32 %result
+}
+
+define i32 @test_use_div_imm_reg(i32 %a) nounwind {
+; CHECK: test_use_div_imm_reg
+; CHECK: test
+; CHECK: idiv
+; CHECK: divb
+ %resultdiv = sdiv i32 4, %a
+ ret i32 %resultdiv
+}
+
+define i32 @test_use_rem_imm_reg(i32 %a) nounwind {
+; CHECK: test_use_rem_imm_reg
+; CHECK: test
+; CHECK: idiv
+; CHECK: divb
+  %resultrem = srem i32 4, %a
+  ret i32 %resultrem
+}
diff --git a/test/CodeGen/X86/atom-shuf.ll b/test/CodeGen/X86/atom-shuf.ll
new file mode 100644
index 0000000..4c3f2f6
--- /dev/null
+++ b/test/CodeGen/X86/atom-shuf.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -mtriple=x86_64-linux-pc -mcpu=atom | FileCheck %s
+
+define <16 x i8> @foo(<16 x i8> %in) {
+ %r = shufflevector <16 x i8> %in, <16 x i8> undef, <16 x i32> < i32 7, i32 3, i32 2, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <16 x i8> %r
+; CHECK: foo
+; CHECK: pshufb
+; CHECK-NEXT: ret
+}
diff --git a/test/CodeGen/X86/atomic-minmax-i6432.ll b/test/CodeGen/X86/atomic-minmax-i6432.ll
new file mode 100644
index 0000000..e3ef605
--- /dev/null
+++ b/test/CodeGen/X86/atomic-minmax-i6432.ll
@@ -0,0 +1,67 @@
+; RUN: llc -march=x86 -mattr=+cmov -mtriple=i386-pc-linux < %s | FileCheck %s -check-prefix=LINUX
+; RUN: llc -march=x86 -mtriple=i386-macosx -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC
+
+@sc64 = external global i64
+
+define void @atomic_maxmin_i6432() {
+; LINUX: atomic_maxmin_i6432
+ %1 = atomicrmw max i64* @sc64, i64 5 acquire
+; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]]
+; LINUX: cmpl
+; LINUX: setl
+; LINUX: cmpl
+; LINUX: setl
+; LINUX: cmovne
+; LINUX: cmovne
+; LINUX: lock
+; LINUX-NEXT: cmpxchg8b
+; LINUX: jne [[LABEL]]
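+; (The pair of 32-bit cmpl/set sequences plus the two cmovne instructions
+; synthesize the 64-bit compare-and-select out of halves; lock cmpxchg8b then
+; retries the loop until the 8-byte exchange succeeds.)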
+ %2 = atomicrmw min i64* @sc64, i64 6 acquire
+; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]]
+; LINUX: cmpl
+; LINUX: setg
+; LINUX: cmpl
+; LINUX: setg
+; LINUX: cmovne
+; LINUX: cmovne
+; LINUX: lock
+; LINUX-NEXT: cmpxchg8b
+; LINUX: jne [[LABEL]]
+ %3 = atomicrmw umax i64* @sc64, i64 7 acquire
+; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]]
+; LINUX: cmpl
+; LINUX: setb
+; LINUX: cmpl
+; LINUX: setb
+; LINUX: cmovne
+; LINUX: cmovne
+; LINUX: lock
+; LINUX-NEXT: cmpxchg8b
+; LINUX: jne [[LABEL]]
+ %4 = atomicrmw umin i64* @sc64, i64 8 acquire
+; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]]
+; LINUX: cmpl
+; LINUX: seta
+; LINUX: cmpl
+; LINUX: seta
+; LINUX: cmovne
+; LINUX: cmovne
+; LINUX: lock
+; LINUX-NEXT: cmpxchg8b
+; LINUX: jne [[LABEL]]
+ ret void
+}
+
+; rdar://12453106
+@id = internal global i64 0, align 8
+
+define void @tf_bug(i8* %ptr) nounwind {
+; PIC: tf_bug:
+; PIC: movl _id-L1$pb(
+; PIC: movl (_id-L1$pb)+4(
+ %tmp1 = atomicrmw add i64* @id, i64 1 seq_cst
+ %tmp2 = add i64 %tmp1, 1
+ %tmp3 = bitcast i8* %ptr to i64*
+ store i64 %tmp2, i64* %tmp3, align 4
+ ret void
+}
diff --git a/test/CodeGen/X86/atomic-pointer.ll b/test/CodeGen/X86/atomic-pointer.ll
new file mode 100644
index 0000000..a455277
--- /dev/null
+++ b/test/CodeGen/X86/atomic-pointer.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -mtriple=i686-none-linux | FileCheck %s
+
+define i32* @test_atomic_ptr_load(i32** %a0) {
+; CHECK: test_atomic_ptr_load
+; CHECK: movl
+; CHECK: movl
+; CHECK: ret
+0:
+ %0 = load atomic i32** %a0 seq_cst, align 4
+ ret i32* %0
+}
+
+define void @test_atomic_ptr_store(i32* %a0, i32** %a1) {
+; CHECK: test_atomic_ptr_store
+; CHECK: movl
+; CHECK: movl
+; CHECK: xchgl
+; CHECK: ret
+0:
+ store atomic i32* %a0, i32** %a1 seq_cst, align 4
+ ret void
+}
diff --git a/test/CodeGen/X86/atomic16.ll b/test/CodeGen/X86/atomic16.ll
new file mode 100644
index 0000000..824995d
--- /dev/null
+++ b/test/CodeGen/X86/atomic16.ll
@@ -0,0 +1,250 @@
+; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mcpu=corei7 -show-mc-encoding | FileCheck %s --check-prefix X64
+; RUN: llc < %s -O0 -mtriple=i386-unknown-unknown -mcpu=corei7 | FileCheck %s --check-prefix X32
+
+@sc16 = external global i16
+
+define void @atomic_fetch_add16() nounwind {
+; X64: atomic_fetch_add16
+; X32: atomic_fetch_add16
+entry:
+; 32-bit
+ %t1 = atomicrmw add i16* @sc16, i16 1 acquire
+; X64: lock
+; X64: incw
+; X32: lock
+; X32: incw
+ %t2 = atomicrmw add i16* @sc16, i16 3 acquire
+; X64: lock
+; X64: addw $3, {{.*}} # encoding: [0xf0,0x66
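+; (In the encoding, 0xf0 is the LOCK prefix and 0x66 the operand-size
+; override; matching the bytes proves the lock prefix is actually emitted
+; for the 16-bit operation.)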
+; X32: lock
+; X32: addw $3
+ %t3 = atomicrmw add i16* @sc16, i16 5 acquire
+; X64: lock
+; X64: xaddw {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: xaddw
+ %t4 = atomicrmw add i16* @sc16, i16 %t3 acquire
+; X64: lock
+; X64: addw {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: addw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_sub16() nounwind {
+; X64: atomic_fetch_sub16
+; X32: atomic_fetch_sub16
+ %t1 = atomicrmw sub i16* @sc16, i16 1 acquire
+; X64: lock
+; X64: decw
+; X32: lock
+; X32: decw
+ %t2 = atomicrmw sub i16* @sc16, i16 3 acquire
+; X64: lock
+; X64: subw $3, {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: subw $3
+ %t3 = atomicrmw sub i16* @sc16, i16 5 acquire
+; X64: lock
+; X64: xaddw {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: xaddw
+ %t4 = atomicrmw sub i16* @sc16, i16 %t3 acquire
+; X64: lock
+; X64: subw {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: subw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_and16() nounwind {
+; X64: atomic_fetch_and16
+; X32: atomic_fetch_and16
+ %t1 = atomicrmw and i16* @sc16, i16 3 acquire
+; X64: lock
+; X64: andw $3, {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: andw $3
+ %t2 = atomicrmw and i16* @sc16, i16 5 acquire
+; X64: andw
+; X64: lock
+; X64: cmpxchgw
+; X32: andw
+; X32: lock
+; X32: cmpxchgw
+ %t3 = atomicrmw and i16* @sc16, i16 %t2 acquire
+; X64: lock
+; X64: andw {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: andw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_or16() nounwind {
+; X64: atomic_fetch_or16
+; X32: atomic_fetch_or16
+ %t1 = atomicrmw or i16* @sc16, i16 3 acquire
+; X64: lock
+; X64: orw $3, {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: orw $3
+ %t2 = atomicrmw or i16* @sc16, i16 5 acquire
+; X64: orw
+; X64: lock
+; X64: cmpxchgw
+; X32: orw
+; X32: lock
+; X32: cmpxchgw
+ %t3 = atomicrmw or i16* @sc16, i16 %t2 acquire
+; X64: lock
+; X64: orw {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: orw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_xor16() nounwind {
+; X64: atomic_fetch_xor16
+; X32: atomic_fetch_xor16
+ %t1 = atomicrmw xor i16* @sc16, i16 3 acquire
+; X64: lock
+; X64: xorw $3, {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: xorw $3
+ %t2 = atomicrmw xor i16* @sc16, i16 5 acquire
+; X64: xorw
+; X64: lock
+; X64: cmpxchgw
+; X32: xorw
+; X32: lock
+; X32: cmpxchgw
+ %t3 = atomicrmw xor i16* @sc16, i16 %t2 acquire
+; X64: lock
+; X64: xorw {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: xorw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_nand16(i16 %x) nounwind {
+; X64: atomic_fetch_nand16
+; X32: atomic_fetch_nand16
+ %t1 = atomicrmw nand i16* @sc16, i16 %x acquire
+; X64: andw
+; X64: notw
+; X64: lock
+; X64: cmpxchgw
+; X32: andw
+; X32: notw
+; X32: lock
+; X32: cmpxchgw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_max16(i16 %x) nounwind {
+ %t1 = atomicrmw max i16* @sc16, i16 %x acquire
+; X64: cmpw
+; X64: cmov
+; X64: lock
+; X64: cmpxchgw
+
+; X32: cmpw
+; X32: cmov
+; X32: lock
+; X32: cmpxchgw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_min16(i16 %x) nounwind {
+ %t1 = atomicrmw min i16* @sc16, i16 %x acquire
+; X64: cmpw
+; X64: cmov
+; X64: lock
+; X64: cmpxchgw
+
+; X32: cmpw
+; X32: cmov
+; X32: lock
+; X32: cmpxchgw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_umax16(i16 %x) nounwind {
+ %t1 = atomicrmw umax i16* @sc16, i16 %x acquire
+; X64: cmpw
+; X64: cmov
+; X64: lock
+; X64: cmpxchgw
+
+; X32: cmpw
+; X32: cmov
+; X32: lock
+; X32: cmpxchgw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_umin16(i16 %x) nounwind {
+ %t1 = atomicrmw umin i16* @sc16, i16 %x acquire
+; X64: cmpw
+; X64: cmov
+; X64: lock
+; X64: cmpxchgw
+; X32: cmpw
+; X32: cmov
+; X32: lock
+; X32: cmpxchgw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_cmpxchg16() nounwind {
+ %t1 = cmpxchg i16* @sc16, i16 0, i16 1 acquire
+; X64: lock
+; X64: cmpxchgw
+; X32: lock
+; X32: cmpxchgw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_store16(i16 %x) nounwind {
+ store atomic i16 %x, i16* @sc16 release, align 4
+; X64-NOT: lock
+; X64: movw
+; X32-NOT: lock
+; X32: movw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_swap16(i16 %x) nounwind {
+ %t1 = atomicrmw xchg i16* @sc16, i16 %x acquire
+; X64-NOT: lock
+; X64: xchgw
+; X32-NOT: lock
+; X32: xchgw
+ ret void
+; X64: ret
+; X32: ret
+}
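+;
+; The store and swap tests above lean on x86's strong memory ordering: a
+; release store needs no lock prefix or fence (a plain movw suffices), and
+; xchgw with a memory operand is implicitly locked, so an explicit lock
+; prefix would be redundant; hence the X64-NOT/X32-NOT checks.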
diff --git a/test/CodeGen/X86/atomic32.ll b/test/CodeGen/X86/atomic32.ll
new file mode 100644
index 0000000..dc927d8
--- /dev/null
+++ b/test/CodeGen/X86/atomic32.ll
@@ -0,0 +1,250 @@
+; RUN: llc < %s -O0 -march=x86-64 -mcpu=corei7 | FileCheck %s --check-prefix X64
+; RUN: llc < %s -O0 -march=x86 -mcpu=corei7 | FileCheck %s --check-prefix X32
+
+@sc32 = external global i32
+
+define void @atomic_fetch_add32() nounwind {
+; X64: atomic_fetch_add32
+; X32: atomic_fetch_add32
+entry:
+; 32-bit
+ %t1 = atomicrmw add i32* @sc32, i32 1 acquire
+; X64: lock
+; X64: incl
+; X32: lock
+; X32: incl
+ %t2 = atomicrmw add i32* @sc32, i32 3 acquire
+; X64: lock
+; X64: addl $3
+; X32: lock
+; X32: addl $3
+ %t3 = atomicrmw add i32* @sc32, i32 5 acquire
+; X64: lock
+; X64: xaddl
+; X32: lock
+; X32: xaddl
+ %t4 = atomicrmw add i32* @sc32, i32 %t3 acquire
+; X64: lock
+; X64: addl
+; X32: lock
+; X32: addl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_sub32() nounwind {
+; X64: atomic_fetch_sub32
+; X32: atomic_fetch_sub32
+ %t1 = atomicrmw sub i32* @sc32, i32 1 acquire
+; X64: lock
+; X64: decl
+; X32: lock
+; X32: decl
+ %t2 = atomicrmw sub i32* @sc32, i32 3 acquire
+; X64: lock
+; X64: subl $3
+; X32: lock
+; X32: subl $3
+ %t3 = atomicrmw sub i32* @sc32, i32 5 acquire
+; X64: lock
+; X64: xaddl
+; X32: lock
+; X32: xaddl
+ %t4 = atomicrmw sub i32* @sc32, i32 %t3 acquire
+; X64: lock
+; X64: subl
+; X32: lock
+; X32: subl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_and32() nounwind {
+; X64: atomic_fetch_and32
+; X32: atomic_fetch_and32
+ %t1 = atomicrmw and i32* @sc32, i32 3 acquire
+; X64: lock
+; X64: andl $3
+; X32: lock
+; X32: andl $3
+ %t2 = atomicrmw and i32* @sc32, i32 5 acquire
+; X64: andl
+; X64: lock
+; X64: cmpxchgl
+; X32: andl
+; X32: lock
+; X32: cmpxchgl
+ %t3 = atomicrmw and i32* @sc32, i32 %t2 acquire
+; X64: lock
+; X64: andl
+; X32: lock
+; X32: andl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_or32() nounwind {
+; X64: atomic_fetch_or32
+; X32: atomic_fetch_or32
+ %t1 = atomicrmw or i32* @sc32, i32 3 acquire
+; X64: lock
+; X64: orl $3
+; X32: lock
+; X32: orl $3
+ %t2 = atomicrmw or i32* @sc32, i32 5 acquire
+; X64: orl
+; X64: lock
+; X64: cmpxchgl
+; X32: orl
+; X32: lock
+; X32: cmpxchgl
+ %t3 = atomicrmw or i32* @sc32, i32 %t2 acquire
+; X64: lock
+; X64: orl
+; X32: lock
+; X32: orl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_xor32() nounwind {
+; X64: atomic_fetch_xor32
+; X32: atomic_fetch_xor32
+ %t1 = atomicrmw xor i32* @sc32, i32 3 acquire
+; X64: lock
+; X64: xorl $3
+; X32: lock
+; X32: xorl $3
+ %t2 = atomicrmw xor i32* @sc32, i32 5 acquire
+; X64: xorl
+; X64: lock
+; X64: cmpxchgl
+; X32: xorl
+; X32: lock
+; X32: cmpxchgl
+ %t3 = atomicrmw xor i32* @sc32, i32 %t2 acquire
+; X64: lock
+; X64: xorl
+; X32: lock
+; X32: xorl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_nand32(i32 %x) nounwind {
+; X64: atomic_fetch_nand32
+; X32: atomic_fetch_nand32
+ %t1 = atomicrmw nand i32* @sc32, i32 %x acquire
+; X64: andl
+; X64: notl
+; X64: lock
+; X64: cmpxchgl
+; X32: andl
+; X32: notl
+; X32: lock
+; X32: cmpxchgl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_max32(i32 %x) nounwind {
+ %t1 = atomicrmw max i32* @sc32, i32 %x acquire
+; X64: cmpl
+; X64: cmov
+; X64: lock
+; X64: cmpxchgl
+
+; X32: cmpl
+; X32: cmov
+; X32: lock
+; X32: cmpxchgl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_min32(i32 %x) nounwind {
+ %t1 = atomicrmw min i32* @sc32, i32 %x acquire
+; X64: cmpl
+; X64: cmov
+; X64: lock
+; X64: cmpxchgl
+
+; X32: cmpl
+; X32: cmov
+; X32: lock
+; X32: cmpxchgl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_umax32(i32 %x) nounwind {
+ %t1 = atomicrmw umax i32* @sc32, i32 %x acquire
+; X64: cmpl
+; X64: cmov
+; X64: lock
+; X64: cmpxchgl
+
+; X32: cmpl
+; X32: cmov
+; X32: lock
+; X32: cmpxchgl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_umin32(i32 %x) nounwind {
+ %t1 = atomicrmw umin i32* @sc32, i32 %x acquire
+; X64: cmpl
+; X64: cmov
+; X64: lock
+; X64: cmpxchgl
+; X32: cmpl
+; X32: cmov
+; X32: lock
+; X32: cmpxchgl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_cmpxchg32() nounwind {
+ %t1 = cmpxchg i32* @sc32, i32 0, i32 1 acquire
+; X64: lock
+; X64: cmpxchgl
+; X32: lock
+; X32: cmpxchgl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_store32(i32 %x) nounwind {
+ store atomic i32 %x, i32* @sc32 release, align 4
+; X64-NOT: lock
+; X64: movl
+; X32-NOT: lock
+; X32: movl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_swap32(i32 %x) nounwind {
+ %t1 = atomicrmw xchg i32* @sc32, i32 %x acquire
+; X64-NOT: lock
+; X64: xchgl
+; X32-NOT: lock
+; X32: xchgl
+ ret void
+; X64: ret
+; X32: ret
+}
diff --git a/test/CodeGen/X86/atomic64.ll b/test/CodeGen/X86/atomic64.ll
new file mode 100644
index 0000000..45785cc
--- /dev/null
+++ b/test/CodeGen/X86/atomic64.ll
@@ -0,0 +1,216 @@
+; RUN: llc < %s -O0 -march=x86-64 -mcpu=corei7 | FileCheck %s --check-prefix X64
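+; Note: this file has only an X64 RUN line, so the X32 check lines that
+; appear below are not verified here; the 32-bit lowering of 64-bit atomics
+; is exercised separately in atomic6432.ll.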
+
+@sc64 = external global i64
+
+define void @atomic_fetch_add64() nounwind {
+; X64: atomic_fetch_add64
+entry:
+ %t1 = atomicrmw add i64* @sc64, i64 1 acquire
+; X64: lock
+; X64: incq
+ %t2 = atomicrmw add i64* @sc64, i64 3 acquire
+; X64: lock
+; X64: addq $3
+ %t3 = atomicrmw add i64* @sc64, i64 5 acquire
+; X64: lock
+; X64: xaddq
+ %t4 = atomicrmw add i64* @sc64, i64 %t3 acquire
+; X64: lock
+; X64: addq
+ ret void
+; X64: ret
+}
+
+define void @atomic_fetch_sub64() nounwind {
+; X64: atomic_fetch_sub64
+ %t1 = atomicrmw sub i64* @sc64, i64 1 acquire
+; X64: lock
+; X64: decq
+ %t2 = atomicrmw sub i64* @sc64, i64 3 acquire
+; X64: lock
+; X64: subq $3
+ %t3 = atomicrmw sub i64* @sc64, i64 5 acquire
+; X64: lock
+; X64: xaddq
+ %t4 = atomicrmw sub i64* @sc64, i64 %t3 acquire
+; X64: lock
+; X64: subq
+ ret void
+; X64: ret
+}
+
+define void @atomic_fetch_and64() nounwind {
+; X64: atomic_fetch_and64
+ %t1 = atomicrmw and i64* @sc64, i64 3 acquire
+; X64: lock
+; X64: andq $3
+ %t2 = atomicrmw and i64* @sc64, i64 5 acquire
+; X64: andq
+; X64: lock
+; X64: cmpxchgq
+ %t3 = atomicrmw and i64* @sc64, i64 %t2 acquire
+; X64: lock
+; X64: andq
+ ret void
+; X64: ret
+}
+
+define void @atomic_fetch_or64() nounwind {
+; X64: atomic_fetch_or64
+ %t1 = atomicrmw or i64* @sc64, i64 3 acquire
+; X64: lock
+; X64: orq $3
+ %t2 = atomicrmw or i64* @sc64, i64 5 acquire
+; X64: orq
+; X64: lock
+; X64: cmpxchgq
+ %t3 = atomicrmw or i64* @sc64, i64 %t2 acquire
+; X64: lock
+; X64: orq
+ ret void
+; X64: ret
+}
+
+define void @atomic_fetch_xor64() nounwind {
+; X64: atomic_fetch_xor64
+ %t1 = atomicrmw xor i64* @sc64, i64 3 acquire
+; X64: lock
+; X64: xorq $3
+ %t2 = atomicrmw xor i64* @sc64, i64 5 acquire
+; X64: xorq
+; X64: lock
+; X64: cmpxchgq
+ %t3 = atomicrmw xor i64* @sc64, i64 %t2 acquire
+; X64: lock
+; X64: xorq
+ ret void
+; X64: ret
+}
+
+define void @atomic_fetch_nand64(i64 %x) nounwind {
+; X64: atomic_fetch_nand64
+; X32: atomic_fetch_nand64
+ %t1 = atomicrmw nand i64* @sc64, i64 %x acquire
+; X64: andq
+; X64: notq
+; X64: lock
+; X64: cmpxchgq
+; X32: andl
+; X32: andl
+; X32: notl
+; X32: notl
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_max64(i64 %x) nounwind {
+ %t1 = atomicrmw max i64* @sc64, i64 %x acquire
+; X64: cmpq
+; X64: cmov
+; X64: lock
+; X64: cmpxchgq
+
+; X32: cmpl
+; X32: cmpl
+; X32: cmov
+; X32: cmov
+; X32: cmov
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_min64(i64 %x) nounwind {
+ %t1 = atomicrmw min i64* @sc64, i64 %x acquire
+; X64: cmpq
+; X64: cmov
+; X64: lock
+; X64: cmpxchgq
+
+; X32: cmpl
+; X32: cmpl
+; X32: cmov
+; X32: cmov
+; X32: cmov
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_umax64(i64 %x) nounwind {
+ %t1 = atomicrmw umax i64* @sc64, i64 %x acquire
+; X64: cmpq
+; X64: cmov
+; X64: lock
+; X64: cmpxchgq
+
+; X32: cmpl
+; X32: cmpl
+; X32: cmov
+; X32: cmov
+; X32: cmov
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_umin64(i64 %x) nounwind {
+ %t1 = atomicrmw umin i64* @sc64, i64 %x acquire
+; X64: cmpq
+; X64: cmov
+; X64: lock
+; X64: cmpxchgq
+
+; X32: cmpl
+; X32: cmpl
+; X32: cmov
+; X32: cmov
+; X32: cmov
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_cmpxchg64() nounwind {
+ %t1 = cmpxchg i64* @sc64, i64 0, i64 1 acquire
+; X64: lock
+; X64: cmpxchgq
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_store64(i64 %x) nounwind {
+ store atomic i64 %x, i64* @sc64 release, align 8
+; X64-NOT: lock
+; X64: movq
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_swap64(i64 %x) nounwind {
+ %t1 = atomicrmw xchg i64* @sc64, i64 %x acquire
+; X64-NOT: lock
+; X64: xchgq
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X64: ret
+; X32: ret
+}
diff --git a/test/CodeGen/X86/atomic6432.ll b/test/CodeGen/X86/atomic6432.ll
new file mode 100644
index 0000000..f9b21c5
--- /dev/null
+++ b/test/CodeGen/X86/atomic6432.ll
@@ -0,0 +1,208 @@
+; RUN: llc < %s -O0 -march=x86 -mcpu=corei7 | FileCheck %s --check-prefix X32
+
+@sc64 = external global i64
+
+define void @atomic_fetch_add64() nounwind {
+; X32: atomic_fetch_add64
+entry:
+ %t1 = atomicrmw add i64* @sc64, i64 1 acquire
+; X32: addl
+; X32: adcl
+; X32: lock
+; X32: cmpxchg8b
+ %t2 = atomicrmw add i64* @sc64, i64 3 acquire
+; X32: addl
+; X32: adcl
+; X32: lock
+; X32: cmpxchg8b
+ %t3 = atomicrmw add i64* @sc64, i64 5 acquire
+; X32: addl
+; X32: adcl
+; X32: lock
+; X32: cmpxchg8b
+ %t4 = atomicrmw add i64* @sc64, i64 %t3 acquire
+; X32: addl
+; X32: adcl
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
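+;
+; i386 has no 8-byte integer RMW instruction other than cmpxchg8b, so each
+; 64-bit add above is split into an addl/adcl pair on the low and high halves
+; and wrapped in a lock cmpxchg8b retry loop; the remaining operations in
+; this file follow the same shape.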
+
+define void @atomic_fetch_sub64() nounwind {
+; X32: atomic_fetch_sub64
+ %t1 = atomicrmw sub i64* @sc64, i64 1 acquire
+; X32: subl
+; X32: sbbl
+; X32: lock
+; X32: cmpxchg8b
+ %t2 = atomicrmw sub i64* @sc64, i64 3 acquire
+; X32: subl
+; X32: sbbl
+; X32: lock
+; X32: cmpxchg8b
+ %t3 = atomicrmw sub i64* @sc64, i64 5 acquire
+; X32: subl
+; X32: sbbl
+; X32: lock
+; X32: cmpxchg8b
+ %t4 = atomicrmw sub i64* @sc64, i64 %t3 acquire
+; X32: subl
+; X32: sbbl
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_and64() nounwind {
+; X32: atomic_fetch_and64
+ %t1 = atomicrmw and i64* @sc64, i64 3 acquire
+; X32: andl
+; X32: andl
+; X32: lock
+; X32: cmpxchg8b
+ %t2 = atomicrmw and i64* @sc64, i64 5 acquire
+; X32: andl
+; X32: andl
+; X32: lock
+; X32: cmpxchg8b
+ %t3 = atomicrmw and i64* @sc64, i64 %t2 acquire
+; X32: andl
+; X32: andl
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_or64() nounwind {
+; X32: atomic_fetch_or64
+ %t1 = atomicrmw or i64* @sc64, i64 3 acquire
+; X32: orl
+; X32: orl
+; X32: lock
+; X32: cmpxchg8b
+ %t2 = atomicrmw or i64* @sc64, i64 5 acquire
+; X32: orl
+; X32: orl
+; X32: lock
+; X32: cmpxchg8b
+ %t3 = atomicrmw or i64* @sc64, i64 %t2 acquire
+; X32: orl
+; X32: orl
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_xor64() nounwind {
+; X32: atomic_fetch_xor64
+ %t1 = atomicrmw xor i64* @sc64, i64 3 acquire
+; X32: xorl
+; X32: xorl
+; X32: lock
+; X32: cmpxchg8b
+ %t2 = atomicrmw xor i64* @sc64, i64 5 acquire
+; X32: xorl
+; X32: xorl
+; X32: lock
+; X32: cmpxchg8b
+ %t3 = atomicrmw xor i64* @sc64, i64 %t2 acquire
+; X32: xorl
+; X32: xorl
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_nand64(i64 %x) nounwind {
+; X32: atomic_fetch_nand64
+ %t1 = atomicrmw nand i64* @sc64, i64 %x acquire
+; X32: andl
+; X32: andl
+; X32: notl
+; X32: notl
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_max64(i64 %x) nounwind {
+ %t1 = atomicrmw max i64* @sc64, i64 %x acquire
+; X32: cmpl
+; X32: cmpl
+; X32: cmov
+; X32: cmov
+; X32: cmov
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_min64(i64 %x) nounwind {
+ %t1 = atomicrmw min i64* @sc64, i64 %x acquire
+; X32: cmpl
+; X32: cmpl
+; X32: cmov
+; X32: cmov
+; X32: cmov
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_umax64(i64 %x) nounwind {
+ %t1 = atomicrmw umax i64* @sc64, i64 %x acquire
+; X32: cmpl
+; X32: cmpl
+; X32: cmov
+; X32: cmov
+; X32: cmov
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_umin64(i64 %x) nounwind {
+ %t1 = atomicrmw umin i64* @sc64, i64 %x acquire
+; X32: cmpl
+; X32: cmpl
+; X32: cmov
+; X32: cmov
+; X32: cmov
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_cmpxchg64() nounwind {
+ %t1 = cmpxchg i64* @sc64, i64 0, i64 1 acquire
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_store64(i64 %x) nounwind {
+ store atomic i64 %x, i64* @sc64 release, align 8
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_swap64(i64 %x) nounwind {
+ %t1 = atomicrmw xchg i64* @sc64, i64 %x acquire
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
diff --git a/test/CodeGen/X86/atomic8.ll b/test/CodeGen/X86/atomic8.ll
new file mode 100644
index 0000000..4124284
--- /dev/null
+++ b/test/CodeGen/X86/atomic8.ll
@@ -0,0 +1,250 @@
+; RUN: llc < %s -O0 -march=x86-64 -mcpu=corei7 | FileCheck %s --check-prefix X64
+; RUN: llc < %s -O0 -march=x86 -mcpu=corei7 | FileCheck %s --check-prefix X32
+
+@sc8 = external global i8
+
+define void @atomic_fetch_add8() nounwind {
+; X64: atomic_fetch_add8
+; X32: atomic_fetch_add8
+entry:
+; 8-bit
+ %t1 = atomicrmw add i8* @sc8, i8 1 acquire
+; X64: lock
+; X64: incb
+; X32: lock
+; X32: incb
+ %t2 = atomicrmw add i8* @sc8, i8 3 acquire
+; X64: lock
+; X64: addb $3
+; X32: lock
+; X32: addb $3
+ %t3 = atomicrmw add i8* @sc8, i8 5 acquire
+; X64: lock
+; X64: xaddb
+; X32: lock
+; X32: xaddb
+ %t4 = atomicrmw add i8* @sc8, i8 %t3 acquire
+; X64: lock
+; X64: addb
+; X32: lock
+; X32: addb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_sub8() nounwind {
+; X64: atomic_fetch_sub8
+; X32: atomic_fetch_sub8
+ %t1 = atomicrmw sub i8* @sc8, i8 1 acquire
+; X64: lock
+; X64: decb
+; X32: lock
+; X32: decb
+ %t2 = atomicrmw sub i8* @sc8, i8 3 acquire
+; X64: lock
+; X64: subb $3
+; X32: lock
+; X32: subb $3
+ %t3 = atomicrmw sub i8* @sc8, i8 5 acquire
+; X64: lock
+; X64: xaddb
+; X32: lock
+; X32: xaddb
+ %t4 = atomicrmw sub i8* @sc8, i8 %t3 acquire
+; X64: lock
+; X64: subb
+; X32: lock
+; X32: subb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_and8() nounwind {
+; X64: atomic_fetch_and8
+; X32: atomic_fetch_and8
+ %t1 = atomicrmw and i8* @sc8, i8 3 acquire
+; X64: lock
+; X64: andb $3
+; X32: lock
+; X32: andb $3
+ %t2 = atomicrmw and i8* @sc8, i8 5 acquire
+; X64: andb
+; X64: lock
+; X64: cmpxchgb
+; X32: andb
+; X32: lock
+; X32: cmpxchgb
+ %t3 = atomicrmw and i8* @sc8, i8 %t2 acquire
+; X64: lock
+; X64: andb
+; X32: lock
+; X32: andb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_or8() nounwind {
+; X64: atomic_fetch_or8
+; X32: atomic_fetch_or8
+ %t1 = atomicrmw or i8* @sc8, i8 3 acquire
+; X64: lock
+; X64: orb $3
+; X32: lock
+; X32: orb $3
+ %t2 = atomicrmw or i8* @sc8, i8 5 acquire
+; X64: orb
+; X64: lock
+; X64: cmpxchgb
+; X32: orb
+; X32: lock
+; X32: cmpxchgb
+ %t3 = atomicrmw or i8* @sc8, i8 %t2 acquire
+; X64: lock
+; X64: orb
+; X32: lock
+; X32: orb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_xor8() nounwind {
+; X64: atomic_fetch_xor8
+; X32: atomic_fetch_xor8
+ %t1 = atomicrmw xor i8* @sc8, i8 3 acquire
+; X64: lock
+; X64: xorb $3
+; X32: lock
+; X32: xorb $3
+ %t2 = atomicrmw xor i8* @sc8, i8 5 acquire
+; X64: xorb
+; X64: lock
+; X64: cmpxchgb
+; X32: xorb
+; X32: lock
+; X32: cmpxchgb
+ %t3 = atomicrmw xor i8* @sc8, i8 %t2 acquire
+; X64: lock
+; X64: xorb
+; X32: lock
+; X32: xorb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_nand8(i8 %x) nounwind {
+; X64: atomic_fetch_nand8
+; X32: atomic_fetch_nand8
+ %t1 = atomicrmw nand i8* @sc8, i8 %x acquire
+; X64: andb
+; X64: notb
+; X64: lock
+; X64: cmpxchgb
+; X32: andb
+; X32: notb
+; X32: lock
+; X32: cmpxchgb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_max8(i8 %x) nounwind {
+ %t1 = atomicrmw max i8* @sc8, i8 %x acquire
+; X64: cmpb
+; X64: cmov
+; X64: lock
+; X64: cmpxchgb
+
+; X32: cmpb
+; X32: cmov
+; X32: lock
+; X32: cmpxchgb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_min8(i8 %x) nounwind {
+ %t1 = atomicrmw min i8* @sc8, i8 %x acquire
+; X64: cmpb
+; X64: cmov
+; X64: lock
+; X64: cmpxchgb
+
+; X32: cmpb
+; X32: cmov
+; X32: lock
+; X32: cmpxchgb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_umax8(i8 %x) nounwind {
+ %t1 = atomicrmw umax i8* @sc8, i8 %x acquire
+; X64: cmpb
+; X64: cmov
+; X64: lock
+; X64: cmpxchgb
+
+; X32: cmpb
+; X32: cmov
+; X32: lock
+; X32: cmpxchgb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_umin8(i8 %x) nounwind {
+ %t1 = atomicrmw umin i8* @sc8, i8 %x acquire
+; X64: cmpb
+; X64: cmov
+; X64: lock
+; X64: cmpxchgb
+; X32: cmpb
+; X32: cmov
+; X32: lock
+; X32: cmpxchgb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_cmpxchg8() nounwind {
+ %t1 = cmpxchg i8* @sc8, i8 0, i8 1 acquire
+; X64: lock
+; X64: cmpxchgb
+; X32: lock
+; X32: cmpxchgb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_store8(i8 %x) nounwind {
+ store atomic i8 %x, i8* @sc8 release, align 4
+; X64-NOT: lock
+; X64: movb
+; X32-NOT: lock
+; X32: movb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_swap8(i8 %x) nounwind {
+ %t1 = atomicrmw xchg i8* @sc8, i8 %x acquire
+; X64-NOT: lock
+; X64: xchgb
+; X32-NOT: lock
+; X32: xchgb
+ ret void
+; X64: ret
+; X32: ret
+}
diff --git a/test/CodeGen/X86/atomic_add.ll b/test/CodeGen/X86/atomic_add.ll
index 1fce256..d944998 100644
--- a/test/CodeGen/X86/atomic_add.ll
+++ b/test/CodeGen/X86/atomic_add.ll
@@ -178,7 +178,8 @@ entry:
define void @sub2(i16* nocapture %p, i32 %v) nounwind ssp {
entry:
; CHECK: sub2:
-; CHECK: negl
+; CHECK-NOT: negl
+; CHECK: subw
%0 = trunc i32 %v to i16 ; <i16> [#uses=1]
%1 = atomicrmw sub i16* %p, i16 %0 monotonic
ret void
diff --git a/test/CodeGen/X86/atomic_op.ll b/test/CodeGen/X86/atomic_op.ll
index 152bece..c5fa07d 100644
--- a/test/CodeGen/X86/atomic_op.ll
+++ b/test/CodeGen/X86/atomic_op.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=generic -march=x86 | FileCheck %s
+; RUN: llc < %s -mcpu=generic -march=x86 -mattr=+cmov | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
@@ -107,13 +107,12 @@ entry:
; CHECK: cmpxchgl
%17 = cmpxchg i32* %val2, i32 1976, i32 1 monotonic
store i32 %17, i32* %old
+ ; CHECK: movl [[R17atomic:.*]], %eax
; CHECK: movl $1401, %[[R17mask:[a-z]*]]
- ; CHECK: movl [[R17atomic:.*]], %eax
- ; CHECK: movl %eax, %[[R17newval:[a-z]*]]
- ; CHECK: andl %[[R17mask]], %[[R17newval]]
- ; CHECK: notl %[[R17newval]]
+ ; CHECK: andl %eax, %[[R17mask]]
+ ; CHECK: notl %[[R17mask]]
; CHECK: lock
- ; CHECK: cmpxchgl %[[R17newval]], [[R17atomic]]
+ ; CHECK: cmpxchgl %[[R17mask]], [[R17atomic]]
; CHECK: jne
; CHECK: movl %eax,
%18 = atomicrmw nand i32* %val2, i32 1401 monotonic
diff --git a/test/CodeGen/X86/avx-basic.ll b/test/CodeGen/X86/avx-basic.ll
index 8ad0fa8..95854c7 100644
--- a/test/CodeGen/X86/avx-basic.ll
+++ b/test/CodeGen/X86/avx-basic.ll
@@ -109,8 +109,8 @@ allocas:
; rdar://10566486
; CHECK: fneg
; CHECK: vxorps
-define <16 x float> @fneg(<16 x float> addrspace(1)* nocapture %out) nounwind {
- %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+define <16 x float> @fneg(<16 x float> %a) nounwind {
+ %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
ret <16 x float> %1
}
diff --git a/test/CodeGen/X86/avx-intel-ocl.ll b/test/CodeGen/X86/avx-intel-ocl.ll
new file mode 100644
index 0000000..1446b36
--- /dev/null
+++ b/test/CodeGen/X86/avx-intel-ocl.ll
@@ -0,0 +1,107 @@
+; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=WIN32 %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=WIN64 %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=NOT_WIN %s
+
+declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *)
+declare <16 x float> @func_float16(<16 x float>, <16 x float>)
+; WIN64: testf16_inp
+; WIN64: vaddps {{.*}}, {{%ymm[0-1]}}
+; WIN64: vaddps {{.*}}, {{%ymm[0-1]}}
+; WIN64: leaq {{.*}}(%rsp), %rcx
+; WIN64: call
+; WIN64: ret
+
+; WIN32: testf16_inp
+; WIN32: movl %eax, (%esp)
+; WIN32: vaddps {{.*}}, {{%ymm[0-1]}}
+; WIN32: vaddps {{.*}}, {{%ymm[0-1]}}
+; WIN32: call
+; WIN32: ret
+
+; NOT_WIN: testf16_inp
+; NOT_WIN: vaddps {{.*}}, {{%ymm[0-1]}}
+; NOT_WIN: vaddps {{.*}}, {{%ymm[0-1]}}
+; NOT_WIN: leaq {{.*}}(%rsp), %rdi
+; NOT_WIN: call
+; NOT_WIN: ret
+
+; test calling conventions - input parameters
+define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
+ %y = alloca <16 x float>, align 16
+ %x = fadd <16 x float> %a, %b
+ %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
+ %2 = load <16 x float>* %y, align 16
+ %3 = fadd <16 x float> %2, %1
+ ret <16 x float> %3
+}
+
+; test calling conventions - preserved registers
+
+; preserved ymm6-ymm15
+; WIN64: testf16_regs
+; WIN64: call
+; WIN64: vaddps {{%ymm[6-7]}}, %ymm0, %ymm0
+; WIN64: vaddps {{%ymm[6-7]}}, %ymm1, %ymm1
+; WIN64: ret
+
+; preserved ymm8-ymm15
+; NOT_WIN: testf16_regs
+; NOT_WIN: call
+; NOT_WIN: vaddps {{%ymm[8-9]}}, %ymm0, %ymm0
+; NOT_WIN: vaddps {{%ymm[8-9]}}, %ymm1, %ymm1
+; NOT_WIN: ret
+
+define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
+ %y = alloca <16 x float>, align 16
+ %x = fadd <16 x float> %a, %b
+ %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
+ %2 = load <16 x float>* %y, align 16
+ %3 = fadd <16 x float> %1, %b
+ %4 = fadd <16 x float> %2, %3
+ ret <16 x float> %4
+}
+
+; test calling conventions - prolog and epilog
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
+; WIN64: call
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
+; NOT_WIN: call
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
+ %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
+ ret <16 x float> %c
+}
diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll
index c44beb4..88ecd5a 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -1140,9 +1140,9 @@ declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) noun
define i32 @test_x86_sse42_pcmpestri128(<16 x i8> %a0, <16 x i8> %a2) {
- ; CHECK: movl
- ; CHECK: movl
- ; CHECK: vpcmpestri
+ ; CHECK: movl $7
+ ; CHECK: movl $7
+ ; CHECK: vpcmpestri $7
; CHECK: movl
%res = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
@@ -1150,6 +1150,18 @@ define i32 @test_x86_sse42_pcmpestri128(<16 x i8> %a0, <16 x i8> %a2) {
declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+define i32 @test_x86_sse42_pcmpestri128_load(<16 x i8>* %a0, <16 x i8>* %a2) {
+ ; CHECK: movl $7
+ ; CHECK: movl $7
+ ; CHECK: vpcmpestri $7, (
+ ; CHECK: movl
+ %1 = load <16 x i8>* %a0
+ %2 = load <16 x i8>* %a2
+ %res = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %1, i32 7, <16 x i8> %2, i32 7, i8 7) ; <i32> [#uses=1]
+ ret i32 %res
+}
+
+
define i32 @test_x86_sse42_pcmpestria128(<16 x i8> %a0, <16 x i8> %a2) {
; CHECK: movl
; CHECK: movl
@@ -1216,8 +1228,19 @@ define <16 x i8> @test_x86_sse42_pcmpestrm128(<16 x i8> %a0, <16 x i8> %a2) {
declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+define <16 x i8> @test_x86_sse42_pcmpestrm128_load(<16 x i8> %a0, <16 x i8>* %a2) {
+ ; CHECK: movl $7
+ ; CHECK: movl $7
+ ; CHECK: vpcmpestrm $7,
+ ; CHECK-NOT: vmov
+ %1 = load <16 x i8>* %a2
+ %res = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %1, i32 7, i8 7) ; <<16 x i8>> [#uses=1]
+ ret <16 x i8> %res
+}
+
+
define i32 @test_x86_sse42_pcmpistri128(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcmpistri
+ ; CHECK: vpcmpistri $7
; CHECK: movl
%res = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
@@ -1225,6 +1248,16 @@ define i32 @test_x86_sse42_pcmpistri128(<16 x i8> %a0, <16 x i8> %a1) {
declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+define i32 @test_x86_sse42_pcmpistri128_load(<16 x i8>* %a0, <16 x i8>* %a1) {
+ ; CHECK: vpcmpistri $7, (
+ ; CHECK: movl
+ %1 = load <16 x i8>* %a0
+ %2 = load <16 x i8>* %a1
+ %res = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %1, <16 x i8> %2, i8 7) ; <i32> [#uses=1]
+ ret i32 %res
+}
+
+
define i32 @test_x86_sse42_pcmpistria128(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK: vpcmpistri
; CHECK: seta
@@ -1271,7 +1304,7 @@ declare i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8>, <16 x i8>, i8) nounwind rea
define <16 x i8> @test_x86_sse42_pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcmpistrm
+ ; CHECK: vpcmpistrm $7
; CHECK-NOT: vmov
%res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
@@ -1279,6 +1312,15 @@ define <16 x i8> @test_x86_sse42_pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1) {
declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+define <16 x i8> @test_x86_sse42_pcmpistrm128_load(<16 x i8> %a0, <16 x i8>* %a1) {
+ ; CHECK: vpcmpistrm $7, (
+ ; CHECK-NOT: vmov
+ %1 = load <16 x i8>* %a1
+ %res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %1, i8 7) ; <<16 x i8>> [#uses=1]
+ ret <16 x i8> %res
+}
+
+
define <4 x float> @test_x86_sse_add_ss(<4 x float> %a0, <4 x float> %a1) {
; CHECK: vaddss
%res = call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll
index 9b41709..ec11654 100644
--- a/test/CodeGen/X86/avx-shuffle.ll
+++ b/test/CodeGen/X86/avx-shuffle.ll
@@ -229,9 +229,8 @@ define <8 x float> @test17(<4 x float> %y) {
}
; CHECK: test18
-; CHECK: vshufps
-; CHECK: vshufps
-; CHECK: vunpcklps
+; CHECK: vmovshdup
+; CHECK: vblendps
; CHECK: ret
define <8 x float> @test18(<8 x float> %A, <8 x float>%B) nounwind {
%S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -239,9 +238,8 @@ define <8 x float> @test18(<8 x float> %A, <8 x float>%B) nounwind {
}
; CHECK: test19
-; CHECK: vshufps
-; CHECK: vshufps
-; CHECK: vunpcklps
+; CHECK: vmovsldup
+; CHECK: vblendps
; CHECK: ret
define <8 x float> @test19(<8 x float> %A, <8 x float>%B) nounwind {
%S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
diff --git a/test/CodeGen/X86/avx-vextractf128.ll b/test/CodeGen/X86/avx-vextractf128.ll
index fe0f6ca..ff56a45 100644
--- a/test/CodeGen/X86/avx-vextractf128.ll
+++ b/test/CodeGen/X86/avx-vextractf128.ll
@@ -19,12 +19,12 @@ entry:
}
; CHECK: @t0
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
+; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NOT: vmovaps %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
+; CHECK: vextractf128 $1, %ymm0, (%rdi)
define void @t0(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
entry:
- %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 0)
+ %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 1)
%1 = bitcast float* %addr to <4 x float>*
store <4 x float> %0, <4 x float>* %1, align 16
ret void
@@ -32,27 +32,13 @@ entry:
declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
-; CHECK: @t1
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
-; CHECK-NOT: vmovups %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
-define void @t1(float* %addr, <8 x float> %a) nounwind uwtable ssp {
-entry:
- %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 0)
- %1 = bitcast float* %addr to i8*
- tail call void @llvm.x86.sse.storeu.ps(i8* %1, <4 x float> %0)
- ret void
-}
-
-declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
-
; CHECK: @t2
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
+; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NOT: vmovaps %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
+; CHECK: vextractf128 $1, %ymm0, (%rdi)
define void @t2(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
entry:
- %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 0)
+ %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 1)
%1 = bitcast double* %addr to <2 x double>*
store <2 x double> %0, <2 x double>* %1, align 16
ret void
@@ -60,28 +46,14 @@ entry:
declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone
-; CHECK: @t3
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
-; CHECK-NOT: vmovups %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
-define void @t3(double* %addr, <4 x double> %a) nounwind uwtable ssp {
-entry:
- %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 0)
- %1 = bitcast double* %addr to i8*
- tail call void @llvm.x86.sse2.storeu.pd(i8* %1, <2 x double> %0)
- ret void
-}
-
-declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind
-
; CHECK: @t4
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
+; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NOT: vmovaps %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
+; CHECK: vextractf128 $1, %ymm0, (%rdi)
define void @t4(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
entry:
%0 = bitcast <4 x i64> %a to <8 x i32>
- %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0)
+ %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 1)
%2 = bitcast <4 x i32> %1 to <2 x i64>
store <2 x i64> %2, <2 x i64>* %addr, align 16
ret void
@@ -90,17 +62,43 @@ entry:
declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone
; CHECK: @t5
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
-; CHECK-NOT: vmovdqu %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
-define void @t5(<2 x i64>* %addr, <4 x i64> %a) nounwind uwtable ssp {
+; CHECK: vmovaps %xmm0, (%rdi)
+define void @t5(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
+entry:
+ %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 0)
+ %1 = bitcast float* %addr to <4 x float>*
+ store <4 x float> %0, <4 x float>* %1, align 16
+ ret void
+}
+
+; CHECK: @t6
+; CHECK: vmovaps %xmm0, (%rdi)
+define void @t6(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
+entry:
+ %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 0)
+ %1 = bitcast double* %addr to <2 x double>*
+ store <2 x double> %0, <2 x double>* %1, align 16
+ ret void
+}
+
+; CHECK: @t7
+; CHECK: vmovaps %xmm0, (%rdi)
+define void @t7(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
entry:
%0 = bitcast <4 x i64> %a to <8 x i32>
%1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0)
- %2 = bitcast <2 x i64>* %addr to i8*
- %3 = bitcast <4 x i32> %1 to <16 x i8>
- tail call void @llvm.x86.sse2.storeu.dq(i8* %2, <16 x i8> %3)
+ %2 = bitcast <4 x i32> %1 to <2 x i64>
+ store <2 x i64> %2, <2 x i64>* %addr, align 16
ret void
}
-declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
+; CHECK: @t8
+; CHECK: vmovups %xmm0, (%rdi)
+define void @t8(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
+entry:
+ %0 = bitcast <4 x i64> %a to <8 x i32>
+ %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0)
+ %2 = bitcast <4 x i32> %1 to <2 x i64>
+ store <2 x i64> %2, <2 x i64>* %addr, align 1
+ ret void
+}
diff --git a/test/CodeGen/X86/avx2-shuffle.ll b/test/CodeGen/X86/avx2-shuffle.ll
index c5899fa..a414e68 100644
--- a/test/CodeGen/X86/avx2-shuffle.ll
+++ b/test/CodeGen/X86/avx2-shuffle.ll
@@ -26,3 +26,37 @@ entry:
%shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 3, i32 undef, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %shuffle.i
}
+
+; CHECK: vpshufb_test
+; CHECK: vpshufb {{.*\(%r.*}}, %ymm
+; CHECK: ret
+define <32 x i8> @vpshufb_test(<32 x i8> %a) nounwind {
+ %S = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15,
+ i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15,
+ i32 18, i32 19, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25,
+ i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18>
+ ret <32 x i8>%S
+}
+
+; CHECK: vpshufb1_test
+; CHECK: vpshufb {{.*\(%r.*}}, %ymm
+; CHECK: ret
+define <32 x i8> @vpshufb1_test(<32 x i8> %a) nounwind {
+ %S = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15,
+ i32 1, i32 9, i32 36, i32 11, i32 5, i32 13, i32 7, i32 15,
+ i32 18, i32 49, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25,
+ i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18>
+ ret <32 x i8>%S
+}
+
+
+; CHECK: vpshufb2_test
+; CHECK: vpshufb {{.*\(%r.*}}, %ymm
+; CHECK: ret
+define <32 x i8> @vpshufb2_test(<32 x i8> %a) nounwind {
+ %S = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15,
+ i32 1, i32 9, i32 36, i32 11, i32 5, i32 13, i32 7, i32 15,
+ i32 18, i32 49, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25,
+ i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18>
+ ret <32 x i8>%S
+}
diff --git a/test/CodeGen/X86/bitcast-i256.ll b/test/CodeGen/X86/bitcast-i256.ll
new file mode 100644
index 0000000..85ac2fe
--- /dev/null
+++ b/test/CodeGen/X86/bitcast-i256.ll
@@ -0,0 +1,11 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=core-avx-i < %s | FileCheck %s --check-prefix CHECK
+
+define i256 @foo(<8 x i32> %a) {
+ %r = bitcast <8 x i32> %a to i256
+ ret i256 %r
+; CHECK: foo
+; CHECK: vextractf128
+; CHECK: vpextrq
+; CHECK: vpextrq
+; CHECK: ret
+}
diff --git a/test/CodeGen/X86/bool-simplify.ll b/test/CodeGen/X86/bool-simplify.ll
index 0cb9fd9..09eb5d1 100644
--- a/test/CodeGen/X86/bool-simplify.ll
+++ b/test/CodeGen/X86/bool-simplify.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse41,-avx | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=+sse41,-avx,+rdrand | FileCheck %s
define i32 @foo(<2 x i64> %c, i32 %a, i32 %b) {
%t1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> %c)
@@ -39,4 +39,20 @@ define i32 @bax(<2 x i64> %c) {
; CHECK: ret
}
+define i32 @rnd(i32 %arg) nounwind uwtable {
+ %1 = tail call { i32, i32 } @llvm.x86.rdrand.32() nounwind
+ %2 = extractvalue { i32, i32 } %1, 0
+ %3 = extractvalue { i32, i32 } %1, 1
+ %4 = icmp eq i32 %3, 0
+ %5 = select i1 %4, i32 0, i32 %arg
+ %6 = add i32 %5, %2
+ ret i32 %6
+; CHECK: rnd
+; CHECK: rdrand
+; CHECK: cmov
+; CHECK-NOT: cmov
+; CHECK: ret
+}
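+;
+; The rnd test appears to check that the validity flag of llvm.x86.rdrand.32
+; is consumed straight from CF (rdrand sets CF on success), so the select
+; folds into a single cmov and no second flag-materializing cmov is emitted.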
+
declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
+declare { i32, i32 } @llvm.x86.rdrand.32() nounwind
diff --git a/test/CodeGen/X86/buildvec-insertvec.ll b/test/CodeGen/X86/buildvec-insertvec.ll
new file mode 100644
index 0000000..3fb69a4
--- /dev/null
+++ b/test/CodeGen/X86/buildvec-insertvec.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+define void @foo(<3 x float> %in, <4 x i8>* nocapture %out) nounwind {
+ %t0 = fptoui <3 x float> %in to <3 x i8>
+ %t1 = shufflevector <3 x i8> %t0, <3 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+ %t2 = insertelement <4 x i8> %t1, i8 -1, i32 3
+ store <4 x i8> %t2, <4 x i8>* %out, align 4
+ ret void
+; CHECK: foo
+; CHECK: cvttps2dq
+; CHECK-NOT: pextrd
+; CHECK: pinsrd
+; CHECK-NEXT: pshufb
+; CHECK: ret
+}
diff --git a/test/CodeGen/X86/cmov-fp.ll b/test/CodeGen/X86/cmov-fp.ll
new file mode 100644
index 0000000..ca91f9e
--- /dev/null
+++ b/test/CodeGen/X86/cmov-fp.ll
@@ -0,0 +1,451 @@
+; RUN: llc -march x86 -mcpu pentium4 < %s | FileCheck %s -check-prefix=SSE
+; RUN: llc -march x86 -mcpu pentium3 < %s | FileCheck %s -check-prefix=NOSSE2
+; RUN: llc -march x86 -mcpu pentium2 < %s | FileCheck %s -check-prefix=NOSSE1
+; RUN: llc -march x86 -mcpu pentium < %s | FileCheck %s -check-prefix=NOCMOV
+; PR14035
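+;
+; A reading of the expectations below: x87 fcmov only tests the CF/ZF/PF
+; (unsigned) conditions, so unsigned integer compares map straight onto fcmov
+; variants, while signed compares either materialize the predicate first
+; (testb + fcmovne, as for fp80) or fall back to a store-and-branch sequence
+; (fstp) when no suitable cmov is available.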
+
+define double @test1(i32 %a, i32 %b, double %x) nounwind {
+ %cmp = icmp ugt i32 %a, %b
+ %sel = select i1 %cmp, double 99.0, double %x
+ ret double %sel
+
+; SSE: test1:
+; SSE: movsd
+
+; NOSSE2: test1:
+; NOSSE2: fcmovnbe
+
+; NOSSE1: test1:
+; NOSSE1: fcmovnbe
+
+; NOCMOV: test1:
+; NOCMOV: fstp
+
+}
+
+define double @test2(i32 %a, i32 %b, double %x) nounwind {
+ %cmp = icmp uge i32 %a, %b
+ %sel = select i1 %cmp, double 99.0, double %x
+ ret double %sel
+
+; SSE: test2:
+; SSE: movsd
+
+; NOSSE2: test2:
+; NOSSE2: fcmovnb
+
+; NOSSE1: test2:
+; NOSSE1: fcmovnb
+
+; NOCMOV: test2:
+; NOCMOV: fstp
+}
+
+define double @test3(i32 %a, i32 %b, double %x) nounwind {
+ %cmp = icmp ult i32 %a, %b
+ %sel = select i1 %cmp, double 99.0, double %x
+ ret double %sel
+
+; SSE: test3:
+; SSE: movsd
+
+; NOSSE2: test3:
+; NOSSE2: fcmovb
+
+; NOSSE1: test3:
+; NOSSE1: fcmovb
+
+; NOCMOV: test3:
+; NOCMOV: fstp
+}
+
+define double @test4(i32 %a, i32 %b, double %x) nounwind {
+ %cmp = icmp ule i32 %a, %b
+ %sel = select i1 %cmp, double 99.0, double %x
+ ret double %sel
+
+; SSE: test4:
+; SSE: movsd
+
+; NOSSE2: test4:
+; NOSSE2: fcmovbe
+
+; NOSSE1: test4:
+; NOSSE1: fcmovbe
+
+; NOCMOV: test4:
+; NOCMOV: fstp
+}
+
+define double @test5(i32 %a, i32 %b, double %x) nounwind {
+ %cmp = icmp sgt i32 %a, %b
+ %sel = select i1 %cmp, double 99.0, double %x
+ ret double %sel
+
+; SSE: test5:
+; SSE: movsd
+
+; NOSSE2: test5:
+; NOSSE2: fstp
+
+; NOSSE1: test5:
+; NOSSE1: fstp
+
+; NOCMOV: test5:
+; NOCMOV: fstp
+}
+
+define double @test6(i32 %a, i32 %b, double %x) nounwind {
+ %cmp = icmp sge i32 %a, %b
+ %sel = select i1 %cmp, double 99.0, double %x
+ ret double %sel
+
+; SSE: test6:
+; SSE: movsd
+
+; NOSSE2: test6:
+; NOSSE2: fstp
+
+; NOSSE1: test6:
+; NOSSE1: fstp
+
+; NOCMOV: test6:
+; NOCMOV: fstp
+}
+
+define double @test7(i32 %a, i32 %b, double %x) nounwind {
+ %cmp = icmp slt i32 %a, %b
+ %sel = select i1 %cmp, double 99.0, double %x
+ ret double %sel
+
+; SSE: test7:
+; SSE: movsd
+
+; NOSSE2: test7:
+; NOSSE2: fstp
+
+; NOSSE1: test7:
+; NOSSE1: fstp
+
+; NOCMOV: test7:
+; NOCMOV: fstp
+}
+
+define double @test8(i32 %a, i32 %b, double %x) nounwind {
+ %cmp = icmp sle i32 %a, %b
+ %sel = select i1 %cmp, double 99.0, double %x
+ ret double %sel
+
+; SSE: test8:
+; SSE: movsd
+
+; NOSSE2: test8:
+; NOSSE2: fstp
+
+; NOSSE1: test8:
+; NOSSE1: fstp
+
+; NOCMOV: test8:
+; NOCMOV: fstp
+}
+
+define float @test9(i32 %a, i32 %b, float %x) nounwind {
+ %cmp = icmp ugt i32 %a, %b
+ %sel = select i1 %cmp, float 99.0, float %x
+ ret float %sel
+
+; SSE: test9:
+; SSE: movss
+
+; NOSSE2: test9:
+; NOSSE2: movss
+
+; NOSSE1: test9:
+; NOSSE1: fcmovnbe
+
+; NOCMOV: test9:
+; NOCMOV: fstp
+}
+
+define float @test10(i32 %a, i32 %b, float %x) nounwind {
+ %cmp = icmp uge i32 %a, %b
+ %sel = select i1 %cmp, float 99.0, float %x
+ ret float %sel
+
+; SSE: test10:
+; SSE: movss
+
+; NOSSE2: test10:
+; NOSSE2: movss
+
+; NOSSE1: test10:
+; NOSSE1: fcmovnb
+
+; NOCMOV: test10:
+; NOCMOV: fstp
+}
+
+define float @test11(i32 %a, i32 %b, float %x) nounwind {
+ %cmp = icmp ult i32 %a, %b
+ %sel = select i1 %cmp, float 99.0, float %x
+ ret float %sel
+
+; SSE: test11:
+; SSE: movss
+
+; NOSSE2: test11:
+; NOSSE2: movss
+
+; NOSSE1: test11:
+; NOSSE1: fcmovb
+
+; NOCMOV: test11:
+; NOCMOV: fstp
+}
+
+define float @test12(i32 %a, i32 %b, float %x) nounwind {
+ %cmp = icmp ule i32 %a, %b
+ %sel = select i1 %cmp, float 99.0, float %x
+ ret float %sel
+
+; SSE: test12:
+; SSE: movss
+
+; NOSSE2: test12:
+; NOSSE2: movss
+
+; NOSSE1: test12:
+; NOSSE1: fcmovbe
+
+; NOCMOV: test12:
+; NOCMOV: fstp
+}
+
+define float @test13(i32 %a, i32 %b, float %x) nounwind {
+ %cmp = icmp sgt i32 %a, %b
+ %sel = select i1 %cmp, float 99.0, float %x
+ ret float %sel
+
+; SSE: test13:
+; SSE: movss
+
+; NOSSE2: test13:
+; NOSSE2: movss
+
+; NOSSE1: test13:
+; NOSSE1: fstp
+
+; NOCMOV: test13:
+; NOCMOV: fstp
+}
+
+define float @test14(i32 %a, i32 %b, float %x) nounwind {
+ %cmp = icmp sge i32 %a, %b
+ %sel = select i1 %cmp, float 99.0, float %x
+ ret float %sel
+
+; SSE: test14:
+; SSE: movss
+
+; NOSSE2: test14:
+; NOSSE2: movss
+
+; NOSSE1: test14:
+; NOSSE1: fstp
+
+; NOCMOV: test14:
+; NOCMOV: fstp
+}
+
+define float @test15(i32 %a, i32 %b, float %x) nounwind {
+ %cmp = icmp slt i32 %a, %b
+ %sel = select i1 %cmp, float 99.0, float %x
+ ret float %sel
+
+; SSE: test15:
+; SSE: movss
+
+; NOSSE2: test15:
+; NOSSE2: movss
+
+; NOSSE1: test15:
+; NOSSE1: fstp
+
+; NOCMOV: test15:
+; NOCMOV: fstp
+}
+
+define float @test16(i32 %a, i32 %b, float %x) nounwind {
+ %cmp = icmp sle i32 %a, %b
+ %sel = select i1 %cmp, float 99.0, float %x
+ ret float %sel
+
+; SSE: test16:
+; SSE: movss
+
+; NOSSE2: test16:
+; NOSSE2: movss
+
+; NOSSE1: test16:
+; NOSSE1: fstp
+
+; NOCMOV: test16:
+; NOCMOV: fstp
+}
+
+define x86_fp80 @test17(i32 %a, i32 %b, x86_fp80 %x) nounwind {
+ %cmp = icmp ugt i32 %a, %b
+ %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x
+ ret x86_fp80 %sel
+
+; SSE: test17:
+; SSE: fcmovnbe
+
+; NOSSE2: test17:
+; NOSSE2: fcmovnbe
+
+; NOSSE1: test17:
+; NOSSE1: fcmovnbe
+
+; NOCMOV: test17:
+; NOCMOV: fstp
+}
+
+define x86_fp80 @test18(i32 %a, i32 %b, x86_fp80 %x) nounwind {
+ %cmp = icmp uge i32 %a, %b
+ %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x
+ ret x86_fp80 %sel
+
+; SSE: test18:
+; SSE: fcmovnb
+
+; NOSSE2: test18:
+; NOSSE2: fcmovnb
+
+; NOSSE1: test18:
+; NOSSE1: fcmovnb
+
+; NOCMOV: test18:
+; NOCMOV: fstp
+}
+
+define x86_fp80 @test19(i32 %a, i32 %b, x86_fp80 %x) nounwind {
+ %cmp = icmp ult i32 %a, %b
+ %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x
+ ret x86_fp80 %sel
+
+; SSE: test19:
+; SSE: fcmovb
+
+; NOSSE2: test19:
+; NOSSE2: fcmovb
+
+; NOSSE1: test19:
+; NOSSE1: fcmovb
+
+; NOCMOV: test19:
+; NOCMOV: fstp
+}
+
+define x86_fp80 @test20(i32 %a, i32 %b, x86_fp80 %x) nounwind {
+ %cmp = icmp ule i32 %a, %b
+ %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x
+ ret x86_fp80 %sel
+
+; SSE: test20:
+; SSE: fcmovbe
+
+; NOSSE2: test20:
+; NOSSE2: fcmovbe
+
+; NOSSE1: test20:
+; NOSSE1: fcmovbe
+
+; NOCMOV: test20:
+; NOCMOV: fstp
+}
+
+define x86_fp80 @test21(i32 %a, i32 %b, x86_fp80 %x) nounwind {
+ %cmp = icmp sgt i32 %a, %b
+ %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x
+ ret x86_fp80 %sel
+
+; Note that no branch is emitted for fp80 on a signed compare: the predicate
+; is materialized with testb and the select uses fcmovne instead.
+; SSE: test21:
+; SSE: testb
+; SSE: fcmovne
+
+; NOSSE2: test21:
+; NOSSE2: testb
+; NOSSE2: fcmovne
+
+; NOSSE1: test21:
+; NOSSE1: testb
+; NOSSE1: fcmovne
+
+; NOCMOV: test21:
+; NOCMOV: fstp
+}
+
+define x86_fp80 @test22(i32 %a, i32 %b, x86_fp80 %x) nounwind {
+ %cmp = icmp sge i32 %a, %b
+ %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x
+ ret x86_fp80 %sel
+
+; SSE: test22:
+; SSE: testb
+; SSE: fcmovne
+
+; NOSSE2: test22:
+; NOSSE2: testb
+; NOSSE2: fcmovne
+
+; NOSSE1: test22:
+; NOSSE1: testb
+; NOSSE1: fcmovne
+
+; NOCMOV: test22:
+; NOCMOV: fstp
+}
+
+define x86_fp80 @test23(i32 %a, i32 %b, x86_fp80 %x) nounwind {
+ %cmp = icmp slt i32 %a, %b
+ %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x
+ ret x86_fp80 %sel
+
+; SSE: test23:
+; SSE: testb
+; SSE: fcmovne
+
+; NOSSE2: test23:
+; NOSSE2: testb
+; NOSSE2: fcmovne
+
+; NOSSE1: test23:
+; NOSSE1: testb
+; NOSSE1: fcmovne
+
+; NOCMOV: test23:
+; NOCMOV: fstp
+}
+
+define x86_fp80 @test24(i32 %a, i32 %b, x86_fp80 %x) nounwind {
+ %cmp = icmp sle i32 %a, %b
+ %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x
+ ret x86_fp80 %sel
+
+; SSE: test24:
+; SSE: testb
+; SSE: fcmovne
+
+; NOSSE2: test24:
+; NOSSE2: testb
+; NOSSE2: fcmovne
+
+; NOSSE1: test24:
+; NOSSE1: testb
+; NOSSE1: fcmovne
+
+; NOCMOV: test24:
+; NOCMOV: fstp
+}
diff --git a/test/CodeGen/X86/crash.ll b/test/CodeGen/X86/crash.ll
index 9badfc8..276d0db 100644
--- a/test/CodeGen/X86/crash.ll
+++ b/test/CodeGen/X86/crash.ll
@@ -442,3 +442,150 @@ entry:
ret void
}
declare void @_Z6PrintFz(...)
+
+@a = external global i32, align 4
+@fn1.g = private unnamed_addr constant [9 x i32*] [i32* null, i32* @a, i32* null, i32* null, i32* null, i32* null, i32* null, i32* null, i32* null], align 16
+@e = external global i32, align 4
+
+define void @pr13943() nounwind uwtable ssp {
+entry:
+ %srcval = load i576* bitcast ([9 x i32*]* @fn1.g to i576*), align 16
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %g.0 = phi i576 [ %srcval, %entry ], [ %ins, %for.inc ]
+ %0 = load i32* @e, align 4
+ %1 = lshr i576 %g.0, 64
+ %2 = trunc i576 %1 to i64
+ %3 = inttoptr i64 %2 to i32*
+ %cmp = icmp eq i32* undef, %3
+ %conv2 = zext i1 %cmp to i32
+ %and = and i32 %conv2, %0
+ tail call void (...)* @fn3(i32 %and) nounwind
+ %tobool = icmp eq i32 undef, 0
+ br i1 %tobool, label %for.inc, label %if.then
+
+if.then: ; preds = %for.cond
+ ret void
+
+for.inc: ; preds = %for.cond
+ %4 = shl i576 %1, 384
+ %mask = and i576 %g.0, -726838724295606890509921801691610055141362320587174446476410459910173841445449629921945328942266354949348255351381262292727973638307841
+ %5 = and i576 %4, 726838724295606890509921801691610055141362320587174446476410459910173841445449629921945328942266354949348255351381262292727973638307840
+ %ins = or i576 %5, %mask
+ br label %for.cond
+}
+
+declare void @fn3(...)
+
+; Check coalescing of IMPLICIT_DEF instructions:
+;
+; %vreg1 = IMPLICIT_DEF
+; %vreg2 = MOV32r0
+;
+; When coalescing %vreg1 and %vreg2, the IMPLICIT_DEF instruction should be
+; erased along with its value number.
+;
+define void @rdar12474033() nounwind ssp {
+bb:
+ br i1 undef, label %bb21, label %bb1
+
+bb1: ; preds = %bb
+ switch i32 undef, label %bb10 [
+ i32 4, label %bb2
+ i32 1, label %bb9
+ i32 5, label %bb3
+ i32 6, label %bb3
+ i32 2, label %bb9
+ ]
+
+bb2: ; preds = %bb1
+ unreachable
+
+bb3: ; preds = %bb1, %bb1
+ br i1 undef, label %bb4, label %bb5
+
+bb4: ; preds = %bb3
+ unreachable
+
+bb5: ; preds = %bb3
+ %tmp = load <4 x float>* undef, align 1
+ %tmp6 = bitcast <4 x float> %tmp to i128
+ %tmp7 = load <4 x float>* undef, align 1
+ %tmp8 = bitcast <4 x float> %tmp7 to i128
+ br label %bb10
+
+bb9: ; preds = %bb1, %bb1
+ unreachable
+
+bb10: ; preds = %bb5, %bb1
+ %tmp11 = phi i128 [ undef, %bb1 ], [ %tmp6, %bb5 ]
+ %tmp12 = phi i128 [ 0, %bb1 ], [ %tmp8, %bb5 ]
+ switch i32 undef, label %bb21 [
+ i32 2, label %bb18
+ i32 3, label %bb13
+ i32 5, label %bb16
+ i32 6, label %bb17
+ i32 1, label %bb18
+ ]
+
+bb13: ; preds = %bb10
+ br i1 undef, label %bb15, label %bb14
+
+bb14: ; preds = %bb13
+ br label %bb21
+
+bb15: ; preds = %bb13
+ unreachable
+
+bb16: ; preds = %bb10
+ unreachable
+
+bb17: ; preds = %bb10
+ unreachable
+
+bb18: ; preds = %bb10, %bb10
+ %tmp19 = bitcast i128 %tmp11 to <4 x float>
+ %tmp20 = bitcast i128 %tmp12 to <4 x float>
+ br label %bb21
+
+bb21: ; preds = %bb18, %bb14, %bb10, %bb
+ %tmp22 = phi <4 x float> [ undef, %bb ], [ undef, %bb10 ], [ undef, %bb14 ], [ %tmp20, %bb18 ]
+ %tmp23 = phi <4 x float> [ undef, %bb ], [ undef, %bb10 ], [ undef, %bb14 ], [ %tmp19, %bb18 ]
+ store <4 x float> %tmp23, <4 x float>* undef, align 16
+ store <4 x float> %tmp22, <4 x float>* undef, align 16
+ switch i32 undef, label %bb29 [
+ i32 5, label %bb27
+ i32 1, label %bb24
+ i32 2, label %bb25
+ i32 14, label %bb28
+ i32 4, label %bb26
+ ]
+
+bb24: ; preds = %bb21
+ unreachable
+
+bb25: ; preds = %bb21
+ br label %bb29
+
+bb26: ; preds = %bb21
+ br label %bb29
+
+bb27: ; preds = %bb21
+ unreachable
+
+bb28: ; preds = %bb21
+ br label %bb29
+
+bb29: ; preds = %bb28, %bb26, %bb25, %bb21
+ unreachable
+}
+
+define void @pr14194() nounwind uwtable {
+ %tmp = load i64* undef, align 16
+ %tmp1 = trunc i64 %tmp to i32
+ %tmp2 = lshr i64 %tmp, 32
+ %tmp3 = trunc i64 %tmp2 to i32
+ %tmp4 = call { i32, i32 } asm sideeffect "", "=&r,=&r,r,r,0,1,~{dirflag},~{fpsr},~{flags}"(i32 %tmp3, i32 undef, i32 %tmp3, i32 %tmp1) nounwind
+ ret void
+}
diff --git a/test/CodeGen/X86/cvtv2f32.ll b/test/CodeGen/X86/cvtv2f32.ll
new file mode 100644
index 0000000..466b096
--- /dev/null
+++ b/test/CodeGen/X86/cvtv2f32.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -mtriple=i686-linux-pc -mcpu=corei7 | FileCheck %s
+
+define <2 x float> @foo(i32 %x, i32 %y, <2 x float> %v) {
+ %t1 = uitofp i32 %x to float
+ %t2 = insertelement <2 x float> undef, float %t1, i32 0
+ %t3 = uitofp i32 %y to float
+ %t4 = insertelement <2 x float> %t2, float %t3, i32 1
+ %t5 = fmul <2 x float> %v, %t4
+ ret <2 x float> %t5
+; CHECK: foo
+; CHECK: or
+; CHECK: subpd
+; CHECK: cvtpd2ps
+; CHECK: ret
+}
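+
+; The or/subpd/cvtpd2ps sequence matches the usual unsigned-to-float trick:
+; OR the 32-bit value into the mantissa of a magic double with exponent 2^52,
+; subtract that constant to recover the value exactly as a double, then
+; narrow with cvtpd2ps; x86 has no direct unsigned int-to-fp conversion here.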
+
+define <2 x float> @bar(<2 x i32> %in) {
+ %r = uitofp <2 x i32> %in to <2 x float>
+ ret <2 x float> %r
+; CHECK: bar
+; CHECK: or
+; CHECK: subpd
+; CHECK: cvtpd2ps
+; CHECK: ret
+}
diff --git a/test/CodeGen/X86/early-ifcvt-crash.ll b/test/CodeGen/X86/early-ifcvt-crash.ll
new file mode 100644
index 0000000..c828026
--- /dev/null
+++ b/test/CodeGen/X86/early-ifcvt-crash.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -x86-early-ifcvt -verify-machineinstrs
+; RUN: llc < %s -x86-early-ifcvt -stress-early-ifcvt -verify-machineinstrs
+;
+; Run these tests with and without -stress-early-ifcvt to exercise heuristics.
+;
+target triple = "x86_64-apple-macosx10.8.0"
+
+; MachineTraceMetrics::Ensemble::addLiveIns crashes because the first operand
+; on an inline asm instruction is not a vreg def.
+; <rdar://problem/12472811>
+define void @f1() nounwind {
+entry:
+ br i1 undef, label %if.then6.i, label %if.end.i
+
+if.then6.i:
+ br label %if.end.i
+
+if.end.i:
+ br i1 undef, label %if.end25.i, label %if.else17.i
+
+if.else17.i:
+ %shl24.i = shl i32 undef, undef
+ br label %if.end25.i
+
+if.end25.i:
+ %storemerge31.i = phi i32 [ %shl24.i, %if.else17.i ], [ 0, %if.end.i ]
+ store i32 %storemerge31.i, i32* undef, align 4
+ %0 = tail call i32 asm sideeffect "", "=r,r,i,i"(i32 undef, i32 15, i32 1) nounwind
+ %conv = trunc i32 %0 to i8
+ store i8 %conv, i8* undef, align 1
+ unreachable
+}
diff --git a/test/CodeGen/X86/early-ifcvt.ll b/test/CodeGen/X86/early-ifcvt.ll
index 7883ffa..2e1852d 100644
--- a/test/CodeGen/X86/early-ifcvt.ll
+++ b/test/CodeGen/X86/early-ifcvt.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -enable-early-ifcvt -stress-early-ifcvt | FileCheck %s
+; RUN: llc < %s -x86-early-ifcvt -stress-early-ifcvt | FileCheck %s
target triple = "x86_64-apple-macosx10.8.0"
; CHECK: mm2
@@ -67,3 +67,78 @@ if.end41:
}
declare void @fprintf(...) nounwind
+
+; CHECK: BZ2_decompress
+; This test case contains irreducible control flow, so MachineLoopInfo doesn't
+; recognize the cycle in the CFG. This would confuse MachineTraceMetrics.
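+; (The cycle through %while.body2038 and %sw.bb2050 has two distinct entry
+; edges, from %if.then1990 and from %if.end.sw.bb2050_crit_edge, which is
+; what makes it irreducible.)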
+define void @BZ2_decompress(i8* %s) nounwind ssp {
+entry:
+ switch i32 undef, label %sw.default [
+ i32 39, label %if.end.sw.bb2050_crit_edge
+ i32 36, label %sw.bb1788
+ i32 37, label %if.end.sw.bb1855_crit_edge
+ i32 40, label %sw.bb2409
+ i32 38, label %sw.bb1983
+ i32 44, label %if.end.sw.bb3058_crit_edge
+ ]
+
+if.end.sw.bb3058_crit_edge: ; preds = %entry
+ br label %save_state_and_return
+
+if.end.sw.bb1855_crit_edge: ; preds = %entry
+ br label %save_state_and_return
+
+if.end.sw.bb2050_crit_edge: ; preds = %entry
+ br label %sw.bb2050
+
+sw.bb1788: ; preds = %entry
+ br label %save_state_and_return
+
+sw.bb1983: ; preds = %entry
+ br i1 undef, label %save_state_and_return, label %if.then1990
+
+if.then1990: ; preds = %sw.bb1983
+ br label %while.body2038
+
+while.body2038: ; preds = %sw.bb2050, %if.then1990
+ %groupPos.8 = phi i32 [ 0, %if.then1990 ], [ %groupPos.9, %sw.bb2050 ]
+ br i1 undef, label %save_state_and_return, label %if.end2042
+
+if.end2042: ; preds = %while.body2038
+ br i1 undef, label %if.end2048, label %while.end2104
+
+if.end2048: ; preds = %if.end2042
+ %bsLive2054.pre = getelementptr inbounds i8* %s, i32 8
+ br label %sw.bb2050
+
+sw.bb2050: ; preds = %if.end2048, %if.end.sw.bb2050_crit_edge
+ %groupPos.9 = phi i32 [ 0, %if.end.sw.bb2050_crit_edge ], [ %groupPos.8, %if.end2048 ]
+ %and2064 = and i32 undef, 1
+ br label %while.body2038
+
+while.end2104: ; preds = %if.end2042
+ br i1 undef, label %save_state_and_return, label %if.end2117
+
+if.end2117: ; preds = %while.end2104
+ br i1 undef, label %while.body2161.lr.ph, label %while.body2145.lr.ph
+
+while.body2145.lr.ph: ; preds = %if.end2117
+ br label %save_state_and_return
+
+while.body2161.lr.ph: ; preds = %if.end2117
+ br label %save_state_and_return
+
+sw.bb2409: ; preds = %entry
+ br label %save_state_and_return
+
+sw.default: ; preds = %entry
+ call void @BZ2_bz__AssertH__fail() nounwind
+ br label %save_state_and_return
+
+save_state_and_return:
+ %groupPos.14 = phi i32 [ 0, %sw.default ], [ %groupPos.8, %while.body2038 ], [ %groupPos.8, %while.end2104 ], [ 0, %if.end.sw.bb3058_crit_edge ], [ 0, %if.end.sw.bb1855_crit_edge ], [ %groupPos.8, %while.body2161.lr.ph ], [ %groupPos.8, %while.body2145.lr.ph ], [ 0, %sw.bb2409 ], [ 0, %sw.bb1788 ], [ 0, %sw.bb1983 ]
+ store i32 %groupPos.14, i32* undef, align 4
+ ret void
+}
+
+declare void @BZ2_bz__AssertH__fail()
diff --git a/test/CodeGen/X86/extract-concat.ll b/test/CodeGen/X86/extract-concat.ll
new file mode 100644
index 0000000..704309e
--- /dev/null
+++ b/test/CodeGen/X86/extract-concat.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
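+; The fptosi plus two-step truncation and -1 insertion should select
+; cvttps2dq followed by a single pshufb byte pack, with no scalar pextrd
+; extractions, as the patterns below verify.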
+define void @foo(<4 x float> %in, <4 x i8>* %out) {
+ %t0 = fptosi <4 x float> %in to <4 x i32>
+ %t1 = trunc <4 x i32> %t0 to <4 x i16>
+ %t2 = shufflevector <4 x i16> %t1, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %t3 = trunc <8 x i16> %t2 to <8 x i8>
+ %t4 = shufflevector <8 x i8> %t3, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %t5 = insertelement <4 x i8> %t4, i8 -1, i32 3
+ store <4 x i8> %t5, <4 x i8>* %out
+ ret void
+; CHECK: foo
+; CHECK: cvttps2dq
+; CHECK-NOT: pextrd
+; CHECK: pshufb
+; CHECK: ret
+}
diff --git a/test/CodeGen/X86/fast-cc-callee-pops.ll b/test/CodeGen/X86/fast-cc-callee-pops.ll
index ea10897..2c5b80a 100644
--- a/test/CodeGen/X86/fast-cc-callee-pops.ll
+++ b/test/CodeGen/X86/fast-cc-callee-pops.ll
@@ -2,12 +2,12 @@
; Check that a fastcc function pops its stack variables before returning.
-define x86_fastcallcc void @func(i64 %X, i64 %Y, float %G, double %Z) nounwind {
+define x86_fastcallcc void @func(i64 inreg %X, i64 %Y, float %G, double %Z) nounwind {
ret void
; CHECK: ret{{.*}}20
}
-define x86_thiscallcc void @func2(i32 %X, i64 %Y, float %G, double %Z) nounwind {
+define x86_thiscallcc void @func2(i32 inreg %X, i64 %Y, float %G, double %Z) nounwind {
ret void
; CHECK: ret{{.*}}20
}
diff --git a/test/CodeGen/X86/fast-cc-merge-stack-adj.ll b/test/CodeGen/X86/fast-cc-merge-stack-adj.ll
index 14cb136..d591f94 100644
--- a/test/CodeGen/X86/fast-cc-merge-stack-adj.ll
+++ b/test/CodeGen/X86/fast-cc-merge-stack-adj.ll
@@ -3,7 +3,7 @@
target triple = "i686-pc-linux-gnu"
-declare x86_fastcallcc void @func(i32*, i64)
+declare x86_fastcallcc void @func(i32*, i64 inreg)
define x86_fastcallcc void @caller(i32, i64) {
%X = alloca i32 ; <i32*> [#uses=1]
diff --git a/test/CodeGen/X86/fast-cc-pass-in-regs.ll b/test/CodeGen/X86/fast-cc-pass-in-regs.ll
index a96e504..b60b68b 100644
--- a/test/CodeGen/X86/fast-cc-pass-in-regs.ll
+++ b/test/CodeGen/X86/fast-cc-pass-in-regs.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | FileCheck %s
; Check that fastcc passes arguments in registers.
-declare x86_fastcallcc i64 @callee(i64)
+declare x86_fastcallcc i64 @callee(i64 inreg)
define i64 @caller() {
%X = call x86_fastcallcc i64 @callee( i64 4294967299 ) ; <i64> [#uses=1]
@@ -9,7 +9,7 @@ define i64 @caller() {
ret i64 %X
}
-define x86_fastcallcc i64 @caller2(i64 %X) {
+define x86_fastcallcc i64 @caller2(i64 inreg %X) {
ret i64 %X
; CHECK: mov{{.*}}EAX, ECX
}
diff --git a/test/CodeGen/X86/fast-isel-x86-64.ll b/test/CodeGen/X86/fast-isel-x86-64.ll
index d8f4663..cdfaf7f 100644
--- a/test/CodeGen/X86/fast-isel-x86-64.ll
+++ b/test/CodeGen/X86/fast-isel-x86-64.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s
+; RUN: llc < %s -mattr=-avx -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s
+; RUN: llc < %s -mattr=+avx -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s --check-prefix=AVX
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-darwin10.0.0"
@@ -197,6 +198,11 @@ block2:
; CHECK: cvtsi2sdq {{.*}} %xmm0
; CHECK: movb $1, %al
; CHECK: callq _test16callee
+
+; AVX: movabsq $1
+; AVX: vmovsd LCP{{.*}}_{{.*}}(%rip), %xmm0
+; AVX: movb $1, %al
+; AVX: callq _test16callee
call void (...)* @test16callee(double 1.000000e+00)
ret void
}
@@ -285,3 +291,16 @@ entry:
}
declare void @foo22(i32)
+
+; PR13563
+define void @test23(i8* noalias sret %result) {
+ %a = alloca i8
+ %b = call i8* @foo23()
+ ret void
+; CHECK: test23:
+; CHECK: call
+; CHECK: movq %rdi, %rax
+; CHECK: ret
+}
+
+declare i8* @foo23()
diff --git a/test/CodeGen/X86/fma.ll b/test/CodeGen/X86/fma.ll
index b0c1d0a..bd3514c 100644
--- a/test/CodeGen/X86/fma.ll
+++ b/test/CodeGen/X86/fma.ll
@@ -1,11 +1,13 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+fma | FileCheck %s --check-prefix=CHECK-FMA-INST
-; RUN: llc < %s -mtriple=i386-apple-darwin10 | FileCheck %s --check-prefix=CHECK-FMA-CALL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+fma | FileCheck %s --check-prefix=CHECK-FMA-INST
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefix=CHECK-FMA-CALL
+; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-CALL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-CALL
+; RUN: llc < %s -march=x86 -mcpu=bdver2 -mattr=-fma4 | FileCheck %s --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -march=x86 -mcpu=bdver2 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-CALL
; CHECK: test_f32
; CHECK-FMA-INST: vfmadd213ss
-; CHECK-FMA-CALL: _fmaf
+; CHECK-FMA-CALL: fmaf
define float @test_f32(float %a, float %b, float %c) nounwind readnone ssp {
entry:
@@ -15,7 +17,7 @@ entry:
; CHECK: test_f64
; CHECK-FMA-INST: vfmadd213sd
-; CHECK-FMA-CALL: _fma
+; CHECK-FMA-CALL: fma
define double @test_f64(double %a, double %b, double %c) nounwind readnone ssp {
entry:
@@ -24,7 +26,7 @@ entry:
}
; CHECK: test_f80
-; CHECK: _fmal
+; CHECK: fmal
define x86_fp80 @test_f80(x86_fp80 %a, x86_fp80 %b, x86_fp80 %c) nounwind readnone ssp {
entry:
diff --git a/test/CodeGen/X86/fma3-intrinsics.ll b/test/CodeGen/X86/fma3-intrinsics.ll
index 90529e0..e3910a6 100755
--- a/test/CodeGen/X86/fma3-intrinsics.ll
+++ b/test/CodeGen/X86/fma3-intrinsics.ll
@@ -1,4 +1,6 @@
-; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 -mattr=avx2,+fma | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-win32 -mattr=+fma,+fma4 | FileCheck %s
+; RUN: llc < %s -mcpu=bdver2 -mtriple=x86_64-pc-win32 -mattr=-fma4 | FileCheck %s
define <4 x float> @test_x86_fmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; CHECK: fmadd213ss %xmm
diff --git a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
index fd414b3..2fe1ecd 100644
--- a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mattr=+avx,+fma4 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s
; VFMADD
define < 4 x float > @test_x86_fma_vfmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll
index 5d97a87..6d98d59 100644
--- a/test/CodeGen/X86/fma_patterns.ll
+++ b/test/CodeGen/X86/fma_patterns.ll
@@ -1,8 +1,13 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=avx2,+fma -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=-fma4 -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver1 -fp-contract=fast | FileCheck %s --check-prefix=CHECK_FMA4
; CHECK: test_x86_fmadd_ps
-; CHECK: vfmadd213ps %xmm2, %xmm0, %xmm1
+; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_ps
+; CHECK_FMA4: vfmaddps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
%x = fmul <4 x float> %a0, %a1
%res = fadd <4 x float> %x, %a2
@@ -10,8 +15,11 @@ define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x flo
}
; CHECK: test_x86_fmsub_ps
-; CHECK: fmsub213ps %xmm2, %xmm0, %xmm1
+; CHECK: fmsub213ps %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_ps
+; CHECK_FMA4: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
%x = fmul <4 x float> %a0, %a1
%res = fsub <4 x float> %x, %a2
@@ -19,8 +27,11 @@ define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x flo
}
; CHECK: test_x86_fnmadd_ps
-; CHECK: fnmadd213ps %xmm2, %xmm0, %xmm1
+; CHECK: fnmadd213ps %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fnmadd_ps
+; CHECK_FMA4: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
%x = fmul <4 x float> %a0, %a1
%res = fsub <4 x float> %a2, %x
@@ -28,8 +39,11 @@ define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x fl
}
; CHECK: test_x86_fnmsub_ps
-; CHECK: fnmsub213ps %xmm2, %xmm0, %xmm1
+; CHECK: fnmsub213ps %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fnmsub_ps
+; CHECK_FMA4: fnmsubps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
%x = fmul <4 x float> %a0, %a1
%y = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x
@@ -38,8 +52,11 @@ define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x fl
}
; CHECK: test_x86_fmadd_ps_y
-; CHECK: vfmadd213ps %ymm2, %ymm0, %ymm1
+; CHECK: vfmadd213ps %ymm2, %ymm1, %ymm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_ps_y
+; CHECK_FMA4: vfmaddps %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
%x = fmul <8 x float> %a0, %a1
%res = fadd <8 x float> %x, %a2
@@ -47,8 +64,11 @@ define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x f
}
; CHECK: test_x86_fmsub_ps_y
-; CHECK: vfmsub213ps %ymm2, %ymm0, %ymm1
+; CHECK: vfmsub213ps %ymm2, %ymm1, %ymm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_ps_y
+; CHECK_FMA4: vfmsubps %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
define <8 x float> @test_x86_fmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
%x = fmul <8 x float> %a0, %a1
%res = fsub <8 x float> %x, %a2
@@ -56,8 +76,11 @@ define <8 x float> @test_x86_fmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x f
}
; CHECK: test_x86_fnmadd_ps_y
-; CHECK: vfnmadd213ps %ymm2, %ymm0, %ymm1
+; CHECK: vfnmadd213ps %ymm2, %ymm1, %ymm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fnmadd_ps_y
+; CHECK_FMA4: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
%x = fmul <8 x float> %a0, %a1
%res = fsub <8 x float> %a2, %x
@@ -65,7 +88,7 @@ define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x
}
; CHECK: test_x86_fnmsub_ps_y
-; CHECK: vfnmsub213ps %ymm2, %ymm0, %ymm1
+; CHECK: vfnmsub213ps %ymm2, %ymm1, %ymm0
; CHECK: ret
define <8 x float> @test_x86_fnmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
%x = fmul <8 x float> %a0, %a1
@@ -75,8 +98,11 @@ define <8 x float> @test_x86_fnmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x
}
; CHECK: test_x86_fmadd_pd_y
-; CHECK: vfmadd213pd %ymm2, %ymm0, %ymm1
+; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_pd_y
+; CHECK_FMA4: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
define <4 x double> @test_x86_fmadd_pd_y(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
%x = fmul <4 x double> %a0, %a1
%res = fadd <4 x double> %x, %a2
@@ -84,8 +110,11 @@ define <4 x double> @test_x86_fmadd_pd_y(<4 x double> %a0, <4 x double> %a1, <4
}
; CHECK: test_x86_fmsub_pd_y
-; CHECK: vfmsub213pd %ymm2, %ymm0, %ymm1
+; CHECK: vfmsub213pd %ymm2, %ymm1, %ymm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_pd_y
+; CHECK_FMA4: vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
define <4 x double> @test_x86_fmsub_pd_y(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
%x = fmul <4 x double> %a0, %a1
%res = fsub <4 x double> %x, %a2
@@ -93,8 +122,11 @@ define <4 x double> @test_x86_fmsub_pd_y(<4 x double> %a0, <4 x double> %a1, <4
}
; CHECK: test_x86_fmsub_pd
-; CHECK: vfmsub213pd %xmm2, %xmm0, %xmm1
+; CHECK: vfmsub213pd %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_pd
+; CHECK_FMA4: vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
%x = fmul <2 x double> %a0, %a1
%res = fsub <2 x double> %x, %a2
@@ -102,8 +134,11 @@ define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x
}
; CHECK: test_x86_fnmadd_ss
-; CHECK: vfnmadd213ss %xmm2, %xmm0, %xmm1
+; CHECK: vfnmadd213ss %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fnmadd_ss
+; CHECK_FMA4: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define float @test_x86_fnmadd_ss(float %a0, float %a1, float %a2) {
%x = fmul float %a0, %a1
%res = fsub float %a2, %x
@@ -111,8 +146,11 @@ define float @test_x86_fnmadd_ss(float %a0, float %a1, float %a2) {
}
; CHECK: test_x86_fnmadd_sd
-; CHECK: vfnmadd213sd %xmm2, %xmm0, %xmm1
+; CHECK: vfnmadd213sd %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fnmadd_sd
+; CHECK_FMA4: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define double @test_x86_fnmadd_sd(double %a0, double %a1, double %a2) {
%x = fmul double %a0, %a1
%res = fsub double %a2, %x
@@ -120,8 +158,11 @@ define double @test_x86_fnmadd_sd(double %a0, double %a1, double %a2) {
}
; CHECK: test_x86_fmsub_sd
-; CHECK: vfmsub213sd %xmm2, %xmm0, %xmm1
+; CHECK: vfmsub213sd %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_sd
+; CHECK_FMA4: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define double @test_x86_fmsub_sd(double %a0, double %a1, double %a2) {
%x = fmul double %a0, %a1
%res = fsub double %x, %a2
@@ -129,11 +170,43 @@ define double @test_x86_fmsub_sd(double %a0, double %a1, double %a2) {
}
; CHECK: test_x86_fnmsub_ss
-; CHECK: vfnmsub213ss %xmm2, %xmm0, %xmm1
+; CHECK: vfnmsub213ss %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fnmsub_ss
+; CHECK_FMA4: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define float @test_x86_fnmsub_ss(float %a0, float %a1, float %a2) {
%x = fsub float -0.000000e+00, %a0
%y = fmul float %x, %a1
%res = fsub float %y, %a2
ret float %res
}
+
+; CHECK: test_x86_fmadd_ps_load
+; CHECK: vmovaps (%rdi), %xmm2
+; CHECK: vfmadd213ps %xmm1, %xmm0, %xmm2
+; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_ps_load
+; CHECK_FMA4: vfmaddps %xmm1, (%rdi), %xmm0, %xmm0
+; CHECK_FMA4: ret
+define <4 x float> @test_x86_fmadd_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) {
+ %x = load <4 x float>* %a0
+ %y = fmul <4 x float> %x, %a1
+ %res = fadd <4 x float> %y, %a2
+ ret <4 x float> %res
+}
+
+; CHECK: test_x86_fmsub_ps_load
+; CHECK: vmovaps (%rdi), %xmm2
+; CHECK: fmsub213ps %xmm1, %xmm0, %xmm2
+; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_ps_load
+; CHECK_FMA4: vfmsubps %xmm1, (%rdi), %xmm0, %xmm0
+; CHECK_FMA4: ret
+define <4 x float> @test_x86_fmsub_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) {
+ %x = load <4 x float>* %a0
+ %y = fmul <4 x float> %x, %a1
+ %res = fsub <4 x float> %y, %a2
+ ret <4 x float> %res
+}
+
diff --git a/test/CodeGen/X86/fold-load.ll b/test/CodeGen/X86/fold-load.ll
index c961f75..d836665 100644
--- a/test/CodeGen/X86/fold-load.ll
+++ b/test/CodeGen/X86/fold-load.ll
@@ -57,13 +57,13 @@ entry:
%0 = load i32* %P, align 4
%1 = load i32* %Q, align 4
%2 = xor i32 %0, %1
- %3 = and i32 %2, 65535
+ %3 = and i32 %2, 89947
%4 = icmp eq i32 %3, 0
br i1 %4, label %exit, label %land.end
exit:
%shr.i.i19 = xor i32 %1, %0
- %5 = and i32 %shr.i.i19, 2147418112
+ %5 = and i32 %shr.i.i19, 3456789123
%6 = icmp eq i32 %5, 0
br label %land.end
diff --git a/test/CodeGen/X86/fp-fast.ll b/test/CodeGen/X86/fp-fast.ll
new file mode 100644
index 0000000..d70aa7d
--- /dev/null
+++ b/test/CodeGen/X86/fp-fast.ll
@@ -0,0 +1,57 @@
+; RUN: llc -march=x86-64 -mattr=+avx,-fma4 -mtriple=x86_64-apple-darwin -enable-unsafe-fp-math < %s | FileCheck %s
+
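+; With unsafe FP math enabled, reassociation should fold chains of fadds
+; into a single multiply (e.g. (a+a)+(a+a) becomes 4*a), cancel matching
+; terms to zero, and drop multiplies by 0.0, as the tests below expect.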
+; CHECK: test1
+define float @test1(float %a) {
+; CHECK-NOT: addss
+; CHECK: mulss
+; CHECK-NOT: addss
+; CHECK: ret
+ %t1 = fadd float %a, %a
+ %r = fadd float %t1, %t1
+ ret float %r
+}
+
+; CHECK: test2
+define float @test2(float %a) {
+; CHECK-NOT: addss
+; CHECK: mulss
+; CHECK-NOT: addss
+; CHECK: ret
+ %t1 = fmul float 4.0, %a
+ %t2 = fadd float %a, %a
+ %r = fadd float %t1, %t2
+ ret float %r
+}
+
+; CHECK: test3
+define float @test3(float %a) {
+; CHECK-NOT: addss
+; CHECK: xorps
+; CHECK-NOT: addss
+; CHECK: ret
+ %t1 = fmul float 2.0, %a
+ %t2 = fadd float %a, %a
+ %r = fsub float %t1, %t2
+ ret float %r
+}
+
+; CHECK: test4
+define float @test4(float %a) {
+; CHECK-NOT: fma
+; CHECK-NOT: mul
+; CHECK-NOT: add
+; CHECK: ret
+ %t1 = fmul float %a, 0.0
+ %t2 = fadd float %a, %t1
+ ret float %t2
+}
+
+; CHECK: test5
+define float @test5(float %a) {
+; CHECK-NOT: add
+; CHECK: vxorps
+; CHECK: ret
+ %t1 = fsub float -0.0, %a
+ %t2 = fadd float %a, %t1
+ ret float %t2
+}
diff --git a/test/CodeGen/X86/fp-load-trunc.ll b/test/CodeGen/X86/fp-load-trunc.ll
new file mode 100644
index 0000000..2ae65c9
--- /dev/null
+++ b/test/CodeGen/X86/fp-load-trunc.ll
@@ -0,0 +1,61 @@
+; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=core-avx-i | FileCheck %s --check-prefix=AVX
+
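+; Like fp-trunc.ll, but the double vectors are loaded from memory, so each
+; conversion is expected to fold its load into cvtpd2ps / vcvtpd2psy.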
+define <1 x float> @test1(<1 x double>* %p) nounwind {
+; CHECK: test1
+; CHECK: cvtsd2ss
+; CHECK: ret
+; AVX: test1
+; AVX: vcvtsd2ss
+; AVX: ret
+ %x = load <1 x double>* %p
+ %y = fptrunc <1 x double> %x to <1 x float>
+ ret <1 x float> %y
+}
+
+define <2 x float> @test2(<2 x double>* %p) nounwind {
+; CHECK: test2
+; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
+; CHECK: ret
+; AVX: test2
+; AVX: vcvtpd2psx {{[0-9]*}}(%{{.*}})
+; AVX: ret
+ %x = load <2 x double>* %p
+ %y = fptrunc <2 x double> %x to <2 x float>
+ ret <2 x float> %y
+}
+
+define <4 x float> @test3(<4 x double>* %p) nounwind {
+; CHECK: test3
+; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
+; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
+; CHECK: movlhps
+; CHECK: ret
+; AVX: test3
+; AVX: vcvtpd2psy {{[0-9]*}}(%{{.*}})
+; AVX: ret
+ %x = load <4 x double>* %p
+ %y = fptrunc <4 x double> %x to <4 x float>
+ ret <4 x float> %y
+}
+
+define <8 x float> @test4(<8 x double>* %p) nounwind {
+; CHECK: test4
+; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
+; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
+; CHECK: movlhps
+; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
+; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
+; CHECK: movlhps
+; CHECK: ret
+; AVX: test4
+; AVX: vcvtpd2psy {{[0-9]*}}(%{{.*}})
+; AVX: vcvtpd2psy {{[0-9]*}}(%{{.*}})
+; AVX: vinsertf128
+; AVX: ret
+ %x = load <8 x double>* %p
+ %y = fptrunc <8 x double> %x to <8 x float>
+ ret <8 x float> %y
+}
+
diff --git a/test/CodeGen/X86/fp-trunc.ll b/test/CodeGen/X86/fp-trunc.ll
index 170637a..25442fc 100644
--- a/test/CodeGen/X86/fp-trunc.ll
+++ b/test/CodeGen/X86/fp-trunc.ll
@@ -1,33 +1,56 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2,-avx | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=core-avx-i | FileCheck %s --check-prefix=AVX
define <1 x float> @test1(<1 x double> %x) nounwind {
+; CHECK: test1
; CHECK: cvtsd2ss
; CHECK: ret
+; AVX: test1
+; AVX: vcvtsd2ss
+; AVX: ret
%y = fptrunc <1 x double> %x to <1 x float>
ret <1 x float> %y
}
-
define <2 x float> @test2(<2 x double> %x) nounwind {
-; FIXME: It would be nice if this compiled down to a cvtpd2ps
-; CHECK: cvtsd2ss
-; CHECK: cvtsd2ss
+; CHECK: test2
+; CHECK: cvtpd2ps
; CHECK: ret
+; AVX: test2
+; AVX-NOT: vcvtpd2psy
+; AVX: vcvtpd2ps
+; AVX: ret
%y = fptrunc <2 x double> %x to <2 x float>
ret <2 x float> %y
}
-define <8 x float> @test3(<8 x double> %x) nounwind {
-; FIXME: It would be nice if this compiled down to a series of cvtpd2ps
-; CHECK: cvtsd2ss
-; CHECK: cvtsd2ss
-; CHECK: cvtsd2ss
-; CHECK: cvtsd2ss
-; CHECK: cvtsd2ss
-; CHECK: cvtsd2ss
-; CHECK: cvtsd2ss
-; CHECK: cvtsd2ss
+define <4 x float> @test3(<4 x double> %x) nounwind {
+; CHECK: test3
+; CHECK: cvtpd2ps
+; CHECK: cvtpd2ps
+; CHECK: movlhps
+; CHECK: ret
+; AVX: test3
+; AVX: vcvtpd2psy
+; AVX: ret
+ %y = fptrunc <4 x double> %x to <4 x float>
+ ret <4 x float> %y
+}
+
+define <8 x float> @test4(<8 x double> %x) nounwind {
+; CHECK: test4
+; CHECK: cvtpd2ps
+; CHECK: cvtpd2ps
+; CHECK: movlhps
+; CHECK: cvtpd2ps
+; CHECK: cvtpd2ps
+; CHECK: movlhps
; CHECK: ret
+; AVX: test4
+; AVX: vcvtpd2psy
+; AVX: vcvtpd2psy
+; AVX: vinsertf128
+; AVX: ret
%y = fptrunc <8 x double> %x to <8 x float>
ret <8 x float> %y
}
diff --git a/test/CodeGen/X86/handle-move.ll b/test/CodeGen/X86/handle-move.ll
new file mode 100644
index 0000000..e9f7a96
--- /dev/null
+++ b/test/CodeGen/X86/handle-move.ll
@@ -0,0 +1,74 @@
+; RUN: llc -march=x86-64 -mcpu=core2 -fast-isel -enable-misched -misched=shuffle -misched-bottomup -verify-machineinstrs < %s
+; RUN: llc -march=x86-64 -mcpu=core2 -fast-isel -enable-misched -misched=shuffle -misched-topdown -verify-machineinstrs < %s
+; REQUIRES: asserts
+;
+; Test the LiveIntervals::handleMove() function.
+;
+; Moving the DIV32r instruction exercises the regunit update code because
+; %EDX has a live range into the function and is used by the DIV32r.
+;
+; Here we sink a kill + dead def:
+; 144B -> 180B: DIV32r %vreg4, %EAX<imp-def>, %EDX<imp-def,dead>, %EFLAGS<imp-def,dead>, %EAX<imp-use,kill>, %EDX<imp-use>
+; %vreg4: [48r,144r:0) 0@48r
+; --> [48r,180r:0) 0@48r
+; DH: [0B,16r:0)[128r,144r:2)[144r,144d:1) 0@0B-phi 1@144r 2@128r
+; --> [0B,16r:0)[128r,180r:2)[180r,180d:1) 0@0B-phi 1@180r 2@128r
+; DL: [0B,16r:0)[128r,144r:2)[144r,144d:1) 0@0B-phi 1@144r 2@128r
+; --> [0B,16r:0)[128r,180r:2)[180r,180d:1) 0@0B-phi 1@180r 2@128r
+;
+define i32 @f1(i32 %a, i32 %b, i32 %c, i32 %d) nounwind uwtable readnone ssp {
+entry:
+ %y = add i32 %c, 1
+ %x = udiv i32 %b, %a
+ %add = add nsw i32 %y, %x
+ ret i32 %add
+}
+
+; Same as above, but moving a kill + live def:
+; 144B -> 180B: DIV32r %vreg4, %EAX<imp-def,dead>, %EDX<imp-def>, %EFLAGS<imp-def,dead>, %EAX<imp-use,kill>, %EDX<imp-use>
+; %vreg4: [48r,144r:0) 0@48r
+; --> [48r,180r:0) 0@48r
+; DH: [0B,16r:0)[128r,144r:2)[144r,184r:1) 0@0B-phi 1@144r 2@128r
+; --> [0B,16r:0)[128r,180r:2)[180r,184r:1) 0@0B-phi 1@180r 2@128r
+; DL: [0B,16r:0)[128r,144r:2)[144r,184r:1) 0@0B-phi 1@144r 2@128r
+; --> [0B,16r:0)[128r,180r:2)[180r,184r:1) 0@0B-phi 1@180r 2@128r
+;
+define i32 @f2(i32 %a, i32 %b, i32 %c, i32 %d) nounwind uwtable readnone ssp {
+entry:
+ %y = sub i32 %c, %d
+ %x = urem i32 %b, %a
+ %add = add nsw i32 %x, %y
+ ret i32 %add
+}
+
+; Moving a use below the existing kill (%vreg5):
+; Moving a tied virtual register def (%vreg11):
+;
+; 96B -> 120B: %vreg11<def,tied1> = SUB32rr %vreg11<tied0>, %vreg5
+; %vreg11: [80r,96r:1)[96r,144r:0) 0@96r 1@80r
+; --> [80r,120r:1)[120r,144r:0) 0@120r 1@80r
+; %vreg5: [16r,112r:0) 0@16r
+; --> [16r,120r:0) 0@16r
+;
+define i32 @f3(i32 %a, i32 %b, i32 %c, i32 %d) nounwind uwtable readnone ssp {
+entry:
+ %y = sub i32 %a, %b
+ %x = add i32 %a, %b
+ %r = mul i32 %x, %y
+ ret i32 %r
+}
+
+; Move EFLAGS dead def across another def:
+; handleMove 208B -> 36B: %EDX<def> = MOV32r0 %EFLAGS<imp-def,dead>
+; EFLAGS: [20r,20d:4)[160r,160d:3)[208r,208d:0)[224r,224d:1)[272r,272d:2)[304r,304d:5) 0@208r 1@224r 2@272r 3@160r 4@20r 5@304r
+; --> [20r,20d:4)[36r,36d:0)[160r,160d:3)[224r,224d:1)[272r,272d:2)[304r,304d:5) 0@36r 1@224r 2@272r 3@160r 4@20r 5@304r
+;
+define i32 @f4(i32 %a, i32 %b, i32 %c, i32 %d) nounwind uwtable readnone ssp {
+entry:
+ %x = sub i32 %a, %b
+ %y = sub i32 %b, %c
+ %z = sub i32 %c, %d
+ %r1 = udiv i32 %x, %y
+ %r2 = mul i32 %z, %r1
+ ret i32 %r2
+}
diff --git a/test/CodeGen/X86/inline-asm-tied.ll b/test/CodeGen/X86/inline-asm-tied.ll
index 91576fb..597236e 100644
--- a/test/CodeGen/X86/inline-asm-tied.ll
+++ b/test/CodeGen/X86/inline-asm-tied.ll
@@ -19,3 +19,12 @@ entry:
%1 = load i64* %retval ; <i64> [#uses=1]
ret i64 %1
}
+
+; The tied operands are not necessarily in the same order as the defs.
+; PR13742
+define i64 @swapped(i64 %x, i64 %y) nounwind {
+entry:
+ %x0 = call { i64, i64 } asm "foo", "=r,=r,1,0,~{dirflag},~{fpsr},~{flags}"(i64 %x, i64 %y) nounwind
+ %x1 = extractvalue { i64, i64 } %x0, 0
+ ret i64 %x1
+}
diff --git a/test/CodeGen/X86/inline-asm.ll b/test/CodeGen/X86/inline-asm.ll
index e6eb9ef..d201ebd 100644
--- a/test/CodeGen/X86/inline-asm.ll
+++ b/test/CodeGen/X86/inline-asm.ll
@@ -52,3 +52,10 @@ entry:
%0 = call { i32, i32, i32, i32, i32 } asm sideeffect "", "=&r,=&r,=&r,=&r,=&q,r,~{ecx},~{memory},~{dirflag},~{fpsr},~{flags}"(i8* %h) nounwind
ret void
}
+
+; Mix normal and early-clobber (EC) defs of the same register.
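+; The output is pinned to {ax} while eax/rax are also listed as clobbers, so
+; the backend must tolerate overlapping fixed-register defs.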
+define i32 @pr14376() nounwind noinline {
+entry:
+ %asm = tail call i32 asm sideeffect "", "={ax},i,~{eax},~{flags},~{rax}"(i64 61) nounwind
+ ret i32 %asm
+}
diff --git a/test/CodeGen/X86/inlineasm-sched-bug.ll b/test/CodeGen/X86/inlineasm-sched-bug.ll
new file mode 100644
index 0000000..08de0c0
--- /dev/null
+++ b/test/CodeGen/X86/inlineasm-sched-bug.ll
@@ -0,0 +1,13 @@
+; PR13504
+; RUN: llc -march=x86 -mcpu=atom <%s | FileCheck %s
+; CHECK: bsfl
+; CHECK-NOT: movl
+
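+; The Atom scheduler should leave the bsfl result in its output register
+; rather than introducing an extra movl copy around the inline asm.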
+define i32 @foo(i32 %treemap) nounwind uwtable {
+entry:
+ %sub = sub i32 0, %treemap
+ %and = and i32 %treemap, %sub
+ %0 = tail call i32 asm "bsfl $1,$0\0A\09", "=r,rm,~{dirflag},~{fpsr},~{flags}"(i32 %and) nounwind
+ ret i32 %0
+}
+
diff --git a/test/CodeGen/X86/jump_sign.ll b/test/CodeGen/X86/jump_sign.ll
index 48e2106..0e34222 100644
--- a/test/CodeGen/X86/jump_sign.ll
+++ b/test/CodeGen/X86/jump_sign.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=pentiumpro | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=pentiumpro -verify-machineinstrs | FileCheck %s
define i32 @f(i32 %X) {
entry:
@@ -219,7 +219,6 @@ entry:
; by sbb, we should not optimize cmp away.
define i32 @q(i32 %j.4, i32 %w, i32 %el) {
; CHECK: q:
-; CHECK: sub
; CHECK: cmp
; CHECK-NEXT: sbb
%tmp532 = add i32 %j.4, %w
@@ -253,3 +252,56 @@ return:
%retval.0 = phi i8* [ %add.ptr, %if.end ], [ null, %entry ]
ret i8* %retval.0
}
+
+; Test optimizations of dec/inc.
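+; decl/incl already set the sign flag, so the compare feeding the cmov is
+; expected to be deleted and cmovsl to consume the dec/inc flags directly.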
+define i32 @dec(i32 %a) nounwind {
+entry:
+; CHECK: dec:
+; CHECK: decl
+; CHECK-NOT: test
+; CHECK: cmovsl
+ %sub = sub nsw i32 %a, 1
+ %cmp = icmp sgt i32 %sub, 0
+ %cond = select i1 %cmp, i32 %sub, i32 0
+ ret i32 %cond
+}
+
+define i32 @inc(i32 %a) nounwind {
+entry:
+; CHECK: inc:
+; CHECK: incl
+; CHECK-NOT: test
+; CHECK: cmovsl
+ %add = add nsw i32 %a, 1
+ %cmp = icmp sgt i32 %add, 0
+ %cond = select i1 %cmp, i32 %add, i32 0
+ ret i32 %cond
+}
+
+; PR13966
+@b = common global i32 0, align 4
+@a = common global i32 0, align 4
+define i32 @test1(i32 %p1) nounwind uwtable {
+entry:
+; CHECK: test1:
+; CHECK: testb
+; CHECK: j
+; CHECK: ret
+ %0 = load i32* @b, align 4
+ %cmp = icmp ult i32 %0, %p1
+ %conv = zext i1 %cmp to i32
+ %1 = load i32* @a, align 4
+ %and = and i32 %conv, %1
+ %conv1 = trunc i32 %and to i8
+ %2 = urem i8 %conv1, 3
+ %tobool = icmp eq i8 %2, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then:
+ %dec = add nsw i32 %1, -1
+ store i32 %dec, i32* @a, align 4
+ br label %if.end
+
+if.end:
+ ret i32 undef
+}
diff --git a/test/CodeGen/X86/misched-balance.ll b/test/CodeGen/X86/misched-balance.ll
new file mode 100644
index 0000000..2184d9e
--- /dev/null
+++ b/test/CodeGen/X86/misched-balance.ll
@@ -0,0 +1,230 @@
+; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \
+; RUN: -verify-machineinstrs | FileCheck %s
+;
+; Verify that the misched resource/latency balancing heuristics are sane.
+
+define void @unrolled_mmult1(i32* %tmp55, i32* %tmp56, i32* %pre, i32* %pre94,
+ i32* %pre95, i32* %pre96, i32* %pre97, i32* %pre98, i32* %pre99,
+ i32* %pre100, i32* %pre101, i32* %pre102, i32* %pre103, i32* %pre104)
+ nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+; imull folded loads should be in order and interleaved with addl, never
+; adjacent. Also check that we have no spilling.
+;
+; Since mmult1 IR is already in good order, this effectively ensures that
+; the scheduler maintains source order.
+;
+; CHECK: %for.body
+; CHECK-NOT: %rsp
+; CHECK: imull 4
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 8
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 12
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 16
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 20
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 24
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 28
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 32
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 36
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: %end
+for.body:
+ %indvars.iv42.i = phi i64 [ %indvars.iv.next43.i, %for.body ], [ 0, %entry ]
+ %tmp57 = load i32* %tmp56, align 4
+ %arrayidx12.us.i61 = getelementptr inbounds i32* %pre, i64 %indvars.iv42.i
+ %tmp58 = load i32* %arrayidx12.us.i61, align 4
+ %mul.us.i = mul nsw i32 %tmp58, %tmp57
+ %arrayidx8.us.i.1 = getelementptr inbounds i32* %tmp56, i64 1
+ %tmp59 = load i32* %arrayidx8.us.i.1, align 4
+ %arrayidx12.us.i61.1 = getelementptr inbounds i32* %pre94, i64 %indvars.iv42.i
+ %tmp60 = load i32* %arrayidx12.us.i61.1, align 4
+ %mul.us.i.1 = mul nsw i32 %tmp60, %tmp59
+ %add.us.i.1 = add nsw i32 %mul.us.i.1, %mul.us.i
+ %arrayidx8.us.i.2 = getelementptr inbounds i32* %tmp56, i64 2
+ %tmp61 = load i32* %arrayidx8.us.i.2, align 4
+ %arrayidx12.us.i61.2 = getelementptr inbounds i32* %pre95, i64 %indvars.iv42.i
+ %tmp62 = load i32* %arrayidx12.us.i61.2, align 4
+ %mul.us.i.2 = mul nsw i32 %tmp62, %tmp61
+ %add.us.i.2 = add nsw i32 %mul.us.i.2, %add.us.i.1
+ %arrayidx8.us.i.3 = getelementptr inbounds i32* %tmp56, i64 3
+ %tmp63 = load i32* %arrayidx8.us.i.3, align 4
+ %arrayidx12.us.i61.3 = getelementptr inbounds i32* %pre96, i64 %indvars.iv42.i
+ %tmp64 = load i32* %arrayidx12.us.i61.3, align 4
+ %mul.us.i.3 = mul nsw i32 %tmp64, %tmp63
+ %add.us.i.3 = add nsw i32 %mul.us.i.3, %add.us.i.2
+ %arrayidx8.us.i.4 = getelementptr inbounds i32* %tmp56, i64 4
+ %tmp65 = load i32* %arrayidx8.us.i.4, align 4
+ %arrayidx12.us.i61.4 = getelementptr inbounds i32* %pre97, i64 %indvars.iv42.i
+ %tmp66 = load i32* %arrayidx12.us.i61.4, align 4
+ %mul.us.i.4 = mul nsw i32 %tmp66, %tmp65
+ %add.us.i.4 = add nsw i32 %mul.us.i.4, %add.us.i.3
+ %arrayidx8.us.i.5 = getelementptr inbounds i32* %tmp56, i64 5
+ %tmp67 = load i32* %arrayidx8.us.i.5, align 4
+ %arrayidx12.us.i61.5 = getelementptr inbounds i32* %pre98, i64 %indvars.iv42.i
+ %tmp68 = load i32* %arrayidx12.us.i61.5, align 4
+ %mul.us.i.5 = mul nsw i32 %tmp68, %tmp67
+ %add.us.i.5 = add nsw i32 %mul.us.i.5, %add.us.i.4
+ %arrayidx8.us.i.6 = getelementptr inbounds i32* %tmp56, i64 6
+ %tmp69 = load i32* %arrayidx8.us.i.6, align 4
+ %arrayidx12.us.i61.6 = getelementptr inbounds i32* %pre99, i64 %indvars.iv42.i
+ %tmp70 = load i32* %arrayidx12.us.i61.6, align 4
+ %mul.us.i.6 = mul nsw i32 %tmp70, %tmp69
+ %add.us.i.6 = add nsw i32 %mul.us.i.6, %add.us.i.5
+ %arrayidx8.us.i.7 = getelementptr inbounds i32* %tmp56, i64 7
+ %tmp71 = load i32* %arrayidx8.us.i.7, align 4
+ %arrayidx12.us.i61.7 = getelementptr inbounds i32* %pre100, i64 %indvars.iv42.i
+ %tmp72 = load i32* %arrayidx12.us.i61.7, align 4
+ %mul.us.i.7 = mul nsw i32 %tmp72, %tmp71
+ %add.us.i.7 = add nsw i32 %mul.us.i.7, %add.us.i.6
+ %arrayidx8.us.i.8 = getelementptr inbounds i32* %tmp56, i64 8
+ %tmp73 = load i32* %arrayidx8.us.i.8, align 4
+ %arrayidx12.us.i61.8 = getelementptr inbounds i32* %pre101, i64 %indvars.iv42.i
+ %tmp74 = load i32* %arrayidx12.us.i61.8, align 4
+ %mul.us.i.8 = mul nsw i32 %tmp74, %tmp73
+ %add.us.i.8 = add nsw i32 %mul.us.i.8, %add.us.i.7
+ %arrayidx8.us.i.9 = getelementptr inbounds i32* %tmp56, i64 9
+ %tmp75 = load i32* %arrayidx8.us.i.9, align 4
+ %arrayidx12.us.i61.9 = getelementptr inbounds i32* %pre102, i64 %indvars.iv42.i
+ %tmp76 = load i32* %arrayidx12.us.i61.9, align 4
+ %mul.us.i.9 = mul nsw i32 %tmp76, %tmp75
+ %add.us.i.9 = add nsw i32 %mul.us.i.9, %add.us.i.8
+ %arrayidx16.us.i = getelementptr inbounds i32* %tmp55, i64 %indvars.iv42.i
+ store i32 %add.us.i.9, i32* %arrayidx16.us.i, align 4
+ %indvars.iv.next43.i = add i64 %indvars.iv42.i, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next43.i to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 10
+ br i1 %exitcond, label %end, label %for.body
+
+end:
+ ret void
+}
+
+; Unlike the above loop, this IR starts out bad and must be
+; rescheduled.
+;
+; CHECK: %for.body
+; CHECK-NOT: %rsp
+; CHECK: imull 4
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 8
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 12
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 16
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 20
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 24
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 28
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 32
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 36
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: %end
+define void @unrolled_mmult2(i32* %tmp55, i32* %tmp56, i32* %pre, i32* %pre94,
+ i32* %pre95, i32* %pre96, i32* %pre97, i32* %pre98, i32* %pre99,
+ i32* %pre100, i32* %pre101, i32* %pre102, i32* %pre103, i32* %pre104)
+ nounwind uwtable ssp {
+entry:
+ br label %for.body
+for.body:
+ %indvars.iv42.i = phi i64 [ %indvars.iv.next43.i, %for.body ], [ 0, %entry ]
+ %tmp57 = load i32* %tmp56, align 4
+ %arrayidx12.us.i61 = getelementptr inbounds i32* %pre, i64 %indvars.iv42.i
+ %tmp58 = load i32* %arrayidx12.us.i61, align 4
+ %arrayidx8.us.i.1 = getelementptr inbounds i32* %tmp56, i64 1
+ %tmp59 = load i32* %arrayidx8.us.i.1, align 4
+ %arrayidx12.us.i61.1 = getelementptr inbounds i32* %pre94, i64 %indvars.iv42.i
+ %tmp60 = load i32* %arrayidx12.us.i61.1, align 4
+ %arrayidx8.us.i.2 = getelementptr inbounds i32* %tmp56, i64 2
+ %tmp61 = load i32* %arrayidx8.us.i.2, align 4
+ %arrayidx12.us.i61.2 = getelementptr inbounds i32* %pre95, i64 %indvars.iv42.i
+ %tmp62 = load i32* %arrayidx12.us.i61.2, align 4
+ %arrayidx8.us.i.3 = getelementptr inbounds i32* %tmp56, i64 3
+ %tmp63 = load i32* %arrayidx8.us.i.3, align 4
+ %arrayidx12.us.i61.3 = getelementptr inbounds i32* %pre96, i64 %indvars.iv42.i
+ %tmp64 = load i32* %arrayidx12.us.i61.3, align 4
+ %arrayidx8.us.i.4 = getelementptr inbounds i32* %tmp56, i64 4
+ %tmp65 = load i32* %arrayidx8.us.i.4, align 4
+ %arrayidx12.us.i61.4 = getelementptr inbounds i32* %pre97, i64 %indvars.iv42.i
+ %tmp66 = load i32* %arrayidx12.us.i61.4, align 4
+ %arrayidx8.us.i.5 = getelementptr inbounds i32* %tmp56, i64 5
+ %tmp67 = load i32* %arrayidx8.us.i.5, align 4
+ %arrayidx12.us.i61.5 = getelementptr inbounds i32* %pre98, i64 %indvars.iv42.i
+ %tmp68 = load i32* %arrayidx12.us.i61.5, align 4
+ %arrayidx8.us.i.6 = getelementptr inbounds i32* %tmp56, i64 6
+ %tmp69 = load i32* %arrayidx8.us.i.6, align 4
+ %arrayidx12.us.i61.6 = getelementptr inbounds i32* %pre99, i64 %indvars.iv42.i
+ %tmp70 = load i32* %arrayidx12.us.i61.6, align 4
+ %mul.us.i = mul nsw i32 %tmp58, %tmp57
+ %arrayidx8.us.i.7 = getelementptr inbounds i32* %tmp56, i64 7
+ %tmp71 = load i32* %arrayidx8.us.i.7, align 4
+ %arrayidx12.us.i61.7 = getelementptr inbounds i32* %pre100, i64 %indvars.iv42.i
+ %tmp72 = load i32* %arrayidx12.us.i61.7, align 4
+ %arrayidx8.us.i.8 = getelementptr inbounds i32* %tmp56, i64 8
+ %tmp73 = load i32* %arrayidx8.us.i.8, align 4
+ %arrayidx12.us.i61.8 = getelementptr inbounds i32* %pre101, i64 %indvars.iv42.i
+ %tmp74 = load i32* %arrayidx12.us.i61.8, align 4
+ %arrayidx8.us.i.9 = getelementptr inbounds i32* %tmp56, i64 9
+ %tmp75 = load i32* %arrayidx8.us.i.9, align 4
+ %arrayidx12.us.i61.9 = getelementptr inbounds i32* %pre102, i64 %indvars.iv42.i
+ %tmp76 = load i32* %arrayidx12.us.i61.9, align 4
+ %mul.us.i.1 = mul nsw i32 %tmp60, %tmp59
+ %add.us.i.1 = add nsw i32 %mul.us.i.1, %mul.us.i
+ %mul.us.i.2 = mul nsw i32 %tmp62, %tmp61
+ %add.us.i.2 = add nsw i32 %mul.us.i.2, %add.us.i.1
+ %mul.us.i.3 = mul nsw i32 %tmp64, %tmp63
+ %add.us.i.3 = add nsw i32 %mul.us.i.3, %add.us.i.2
+ %mul.us.i.4 = mul nsw i32 %tmp66, %tmp65
+ %add.us.i.4 = add nsw i32 %mul.us.i.4, %add.us.i.3
+ %mul.us.i.5 = mul nsw i32 %tmp68, %tmp67
+ %add.us.i.5 = add nsw i32 %mul.us.i.5, %add.us.i.4
+ %mul.us.i.6 = mul nsw i32 %tmp70, %tmp69
+ %add.us.i.6 = add nsw i32 %mul.us.i.6, %add.us.i.5
+ %mul.us.i.7 = mul nsw i32 %tmp72, %tmp71
+ %add.us.i.7 = add nsw i32 %mul.us.i.7, %add.us.i.6
+ %mul.us.i.8 = mul nsw i32 %tmp74, %tmp73
+ %add.us.i.8 = add nsw i32 %mul.us.i.8, %add.us.i.7
+ %mul.us.i.9 = mul nsw i32 %tmp76, %tmp75
+ %add.us.i.9 = add nsw i32 %mul.us.i.9, %add.us.i.8
+ %arrayidx16.us.i = getelementptr inbounds i32* %tmp55, i64 %indvars.iv42.i
+ store i32 %add.us.i.9, i32* %arrayidx16.us.i, align 4
+ %indvars.iv.next43.i = add i64 %indvars.iv42.i, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next43.i to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 10
+ br i1 %exitcond, label %end, label %for.body
+
+end:
+ ret void
+}
diff --git a/test/CodeGen/X86/misched-ilp.ll b/test/CodeGen/X86/misched-ilp.ll
new file mode 100644
index 0000000..c6cedb7
--- /dev/null
+++ b/test/CodeGen/X86/misched-ilp.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mcpu=core2 -enable-misched -misched=ilpmax | FileCheck -check-prefix=MAX %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mcpu=core2 -enable-misched -misched=ilpmin | FileCheck -check-prefix=MIN %s
+;
+; Basic verification of the ScheduleDAGILP metric.
+;
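+; ilpmax is expected to group the independent adds ahead of the dependent
+; subtract, while ilpmin interleaves them to shorten live ranges; the two
+; orderings below reflect that difference.
+;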
+; MAX: addss
+; MAX: addss
+; MAX: addss
+; MAX: subss
+; MAX: addss
+;
+; MIN: addss
+; MIN: addss
+; MIN: subss
+; MIN: addss
+; MIN: addss
+define float @ilpsched(float %a, float %b, float %c, float %d, float %e, float %f) nounwind uwtable readnone ssp {
+entry:
+ %add = fadd float %a, %b
+ %add1 = fadd float %c, %d
+ %add2 = fadd float %e, %f
+ %add3 = fsub float %add1, %add2
+ %add4 = fadd float %add, %add3
+ ret float %add4
+}
diff --git a/test/CodeGen/X86/misched-new.ll b/test/CodeGen/X86/misched-new.ll
index 8f2f6f7..cec04b5 100644
--- a/test/CodeGen/X86/misched-new.ll
+++ b/test/CodeGen/X86/misched-new.ll
@@ -1,4 +1,6 @@
-; RUN: llc -march=x86-64 -mcpu=core2 -enable-misched -misched=shuffle -misched-bottomup < %s
+; RUN: llc < %s -march=x86-64 -mcpu=core2 -x86-early-ifcvt -enable-misched \
+; RUN: -misched=shuffle -misched-bottomup -verify-machineinstrs \
+; RUN: | FileCheck %s
; REQUIRES: asserts
;
; Interesting MachineScheduler cases.
@@ -25,3 +27,27 @@ for.cond.preheader: ; preds = %entry
if.end: ; preds = %entry
ret void
}
+
+; The machine verifier checks that EFLAGS kill flags are updated when
+; the scheduler reorders cmovel instructions.
+;
+; CHECK: test
+; CHECK: cmovel
+; CHECK: cmovel
+; CHECK: call
+define void @foo(i32 %b) nounwind uwtable ssp {
+entry:
+ %tobool = icmp ne i32 %b, 0
+ br i1 %tobool, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ %v1 = phi i32 [1, %entry], [2, %if.then]
+ %v2 = phi i32 [3, %entry], [4, %if.then]
+ call void @bar(i32 %v1, i32 %v2)
+ ret void
+}
+
+declare void @bar(i32,i32)
diff --git a/test/CodeGen/X86/mmx-builtins.ll b/test/CodeGen/X86/mmx-builtins.ll
index 8b7200d..a8d33f4 100644
--- a/test/CodeGen/X86/mmx-builtins.ll
+++ b/test/CodeGen/X86/mmx-builtins.ll
@@ -1043,6 +1043,20 @@ entry:
ret i64 %5
}
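+; The pshufw result is read back as a 32-bit scalar, so extracting element 0
+; should come out as a single movd.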
+define i32 @test21_2(<1 x i64> %a) nounwind readnone optsize ssp {
+; CHECK: test21_2
+; CHECK: pshufw
+; CHECK: movd
+entry:
+ %0 = bitcast <1 x i64> %a to <4 x i16>
+ %1 = bitcast <4 x i16> %0 to x86_mmx
+ %2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 3) nounwind readnone
+ %3 = bitcast x86_mmx %2 to <4 x i16>
+ %4 = bitcast <4 x i16> %3 to <2 x i32>
+ %5 = extractelement <2 x i32> %4, i32 0
+ ret i32 %5
+}
+
declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx) nounwind readnone
define i64 @test20(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
diff --git a/test/CodeGen/X86/ms-inline-asm.ll b/test/CodeGen/X86/ms-inline-asm.ll
new file mode 100644
index 0000000..24d28ad
--- /dev/null
+++ b/test/CodeGen/X86/ms-inline-asm.ll
@@ -0,0 +1,63 @@
+; RUN: llc < %s -march=x86 | FileCheck %s
+
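+; MS-style (inteldialect) asm blocks are emitted in Intel syntax, so each one
+; should be bracketed by .intel_syntax and .att_syntax mode switches.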
+define i32 @t1() nounwind {
+entry:
+ %0 = tail call i32 asm sideeffect inteldialect "mov eax, $1\0A\09mov $0, eax", "=r,r,~{eax},~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
+ ret i32 %0
+; CHECK: t1
+; CHECK: {{## InlineAsm Start|#APP}}
+; CHECK: .intel_syntax
+; CHECK: mov eax, ecx
+; CHECK: mov ecx, eax
+; CHECK: .att_syntax
+; CHECK: {{## InlineAsm End|#NO_APP}}
+}
+
+define void @t2() nounwind {
+entry:
+ call void asm sideeffect inteldialect "mov eax, $$1", "~{eax},~{dirflag},~{fpsr},~{flags}"() nounwind
+ ret void
+; CHECK: t2
+; CHECK: {{## InlineAsm Start|#APP}}
+; CHECK: .intel_syntax
+; CHECK: mov eax, 1
+; CHECK: .att_syntax
+; CHECK: {{## InlineAsm End|#NO_APP}}
+}
+
+define void @t3(i32 %V) nounwind {
+entry:
+ %V.addr = alloca i32, align 4
+ store i32 %V, i32* %V.addr, align 4
+ call void asm sideeffect inteldialect "mov eax, DWORD PTR [$0]", "*m,~{eax},~{dirflag},~{fpsr},~{flags}"(i32* %V.addr) nounwind
+ ret void
+; CHECK: t3
+; CHECK: {{## InlineAsm Start|#APP}}
+; CHECK: .intel_syntax
+; CHECK: mov eax, DWORD PTR {{[[esp]}}
+; CHECK: .att_syntax
+; CHECK: {{## InlineAsm End|#NO_APP}}
+}
+
+%struct.t18_type = type { i32, i32 }
+
+define i32 @t18() nounwind {
+entry:
+ %foo = alloca %struct.t18_type, align 4
+ %a = getelementptr inbounds %struct.t18_type* %foo, i32 0, i32 0
+ store i32 1, i32* %a, align 4
+ %b = getelementptr inbounds %struct.t18_type* %foo, i32 0, i32 1
+ store i32 2, i32* %b, align 4
+ call void asm sideeffect inteldialect "lea ebx, foo\0A\09mov eax, [ebx].0\0A\09mov [ebx].4, ecx", "~{eax},~{dirflag},~{fpsr},~{flags}"() nounwind
+ %b1 = getelementptr inbounds %struct.t18_type* %foo, i32 0, i32 1
+ %0 = load i32* %b1, align 4
+ ret i32 %0
+; CHECK: t18
+; CHECK: {{## InlineAsm Start|#APP}}
+; CHECK: .intel_syntax
+; CHECK: lea ebx, foo
+; CHECK: mov eax, [ebx].0
+; CHECK: mov [ebx].4, ecx
+; CHECK: .att_syntax
+; CHECK: {{## InlineAsm End|#NO_APP}}
+}
diff --git a/test/CodeGen/X86/mulx32.ll b/test/CodeGen/X86/mulx32.ll
new file mode 100644
index 0000000..b75ac00
--- /dev/null
+++ b/test/CodeGen/X86/mulx32.ll
@@ -0,0 +1,22 @@
+; RUN: llc -mcpu=core-avx2 -march=x86 < %s | FileCheck %s
+
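+; core-avx2 implies BMI2; a zext-widened 32x32->64 multiply is expected to
+; select MULX, which names both destinations explicitly and leaves EFLAGS
+; untouched.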
+define i64 @f1(i32 %a, i32 %b) {
+ %x = zext i32 %a to i64
+ %y = zext i32 %b to i64
+ %r = mul i64 %x, %y
+; CHECK: f1
+; CHECK: mulxl
+; CHECK: ret
+ ret i64 %r
+}
+
+define i64 @f2(i32 %a, i32* %p) {
+ %b = load i32* %p
+ %x = zext i32 %a to i64
+ %y = zext i32 %b to i64
+ %r = mul i64 %x, %y
+; CHECK: f2
+; CHECK: mulxl ({{.+}}), %{{.+}}, %{{.+}}
+; CHECK: ret
+ ret i64 %r
+}
diff --git a/test/CodeGen/X86/mulx64.ll b/test/CodeGen/X86/mulx64.ll
new file mode 100644
index 0000000..d573028
--- /dev/null
+++ b/test/CodeGen/X86/mulx64.ll
@@ -0,0 +1,22 @@
+; RUN: llc -mcpu=core-avx2 -march=x86-64 < %s | FileCheck %s
+
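+; Same pattern at 64 bits: the zext-widened 64x64->128 multiply should select
+; mulxq, folding the load in f2.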
+define i128 @f1(i64 %a, i64 %b) {
+ %x = zext i64 %a to i128
+ %y = zext i64 %b to i128
+ %r = mul i128 %x, %y
+; CHECK: f1
+; CHECK: mulxq
+; CHECK: ret
+ ret i128 %r
+}
+
+define i128 @f2(i64 %a, i64* %p) {
+ %b = load i64* %p
+ %x = zext i64 %a to i128
+ %y = zext i64 %b to i128
+ %r = mul i128 %x, %y
+; CHECK: f2
+; CHECK: mulxq ({{.+}}), %{{.+}}, %{{.+}}
+; CHECK: ret
+ ret i128 %r
+}
diff --git a/test/CodeGen/X86/phys_subreg_coalesce-3.ll b/test/CodeGen/X86/phys_subreg_coalesce-3.ll
index 984d7e5..2a20e7a 100644
--- a/test/CodeGen/X86/phys_subreg_coalesce-3.ll
+++ b/test/CodeGen/X86/phys_subreg_coalesce-3.ll
@@ -1,14 +1,10 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s
-; XFAIL: *
+; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=corei7 | FileCheck %s
; rdar://5571034
; This requires physreg joining; %vreg13 is live everywhere:
; 304L %CL<def> = COPY %vreg13:sub_8bit; GR32_ABCD:%vreg13
; 320L %vreg15<def> = COPY %vreg19; GR32:%vreg15 GR32_NOSP:%vreg19
; 336L %vreg15<def> = SAR32rCL %vreg15, %EFLAGS<imp-def,dead>, %CL<imp-use,kill>; GR32:%vreg15
-;
-; This test is XFAIL until the register allocator understands trivial physreg
-; interference. <rdar://9802098>
define void @foo(i32* nocapture %quadrant, i32* nocapture %ptr, i32 %bbSize, i32 %bbStart, i32 %shifts) nounwind ssp {
; CHECK: foo:
diff --git a/test/CodeGen/X86/pic_jumptable.ll b/test/CodeGen/X86/pic_jumptable.ll
index 8c16dc6..bdd8859 100644
--- a/test/CodeGen/X86/pic_jumptable.ll
+++ b/test/CodeGen/X86/pic_jumptable.ll
@@ -1,5 +1,7 @@
; RUN: llc < %s -relocation-model=pic -mtriple=i386-linux-gnu -asm-verbose=false \
; RUN: | FileCheck %s --check-prefix=CHECK-LINUX
+; RUN: llc < %s -relocation-model=pic -mark-data-regions -mtriple=i686-apple-darwin -asm-verbose=false \
+; RUN: | FileCheck %s --check-prefix=CHECK-DATA
; RUN: llc < %s -relocation-model=pic -mtriple=i686-apple-darwin -asm-verbose=false \
; RUN: | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-apple-darwin | not grep 'lJTI'
@@ -16,6 +18,16 @@ entry:
; CHECK: Ltmp0 = LJTI0_0-L0$pb
; CHECK-NEXT: addl Ltmp0(%eax,%ecx,4)
; CHECK-NEXT: jmpl *%eax
+
+;; When data-in-code markers are enabled, we should see them around the jump
+;; table.
+; CHECK-DATA: .data_region jt32
+; CHECK-DATA: LJTI0_0
+; CHECK-DATA: .end_data_region
+
+;; When they're not enabled, make sure we don't see them at all.
+; CHECK-NOT: .data_region
+; CHECK-LINUX-NOT: .data_region
%Y_addr = alloca i32 ; <i32*> [#uses=2]
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
store i32 %Y, i32* %Y_addr
diff --git a/test/CodeGen/X86/pmovext.ll b/test/CodeGen/X86/pmovext.ll
new file mode 100644
index 0000000..16e9c28
--- /dev/null
+++ b/test/CodeGen/X86/pmovext.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
+
+; rdar://11897677
+
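+; The load feeding the zero-extension intrinsic should fold into the memory
+; form of pmovzxbw instead of a separate load plus extend.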
+;CHECK: intrin_pmov
+;CHECK: pmovzxbw (%{{.*}}), %xmm0
+;CHECK-NEXT: movdqu
+;CHECK-NEXT: ret
+define void @intrin_pmov(i16* noalias %dest, i8* noalias %src) nounwind uwtable ssp {
+ %1 = bitcast i8* %src to <2 x i64>*
+ %2 = load <2 x i64>* %1, align 16
+ %3 = bitcast <2 x i64> %2 to <16 x i8>
+ %4 = tail call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %3) nounwind
+ %5 = bitcast i16* %dest to i8*
+ %6 = bitcast <8 x i16> %4 to <16 x i8>
+ tail call void @llvm.x86.sse2.storeu.dq(i8* %5, <16 x i8> %6) nounwind
+ ret void
+}
+
+declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone
+
+declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
diff --git a/test/CodeGen/X86/pointer-vector.ll b/test/CodeGen/X86/pointer-vector.ll
index 800fbed..58423d1 100644
--- a/test/CodeGen/X86/pointer-vector.ll
+++ b/test/CodeGen/X86/pointer-vector.ll
@@ -81,8 +81,7 @@ define <4 x i32*> @INT2PTR1(<4 x i8>* %p) nounwind {
entry:
%G = load <4 x i8>* %p
;CHECK: movl
-;CHECK: movd
-;CHECK: pshufb
+;CHECK: pmovzxbd
;CHECK: pand
%K = inttoptr <4 x i8> %G to <4 x i32*>
;CHECK: ret
@@ -105,7 +104,7 @@ define <2 x i32*> @BITCAST1(<2 x i8*>* %p) nounwind {
entry:
%G = load <2 x i8*>* %p
;CHECK: movl
-;CHECK: movsd
+;CHECK: pmovzxdq
%T = bitcast <2 x i8*> %G to <2 x i32*>
;CHECK: ret
ret <2 x i32*> %T
diff --git a/test/CodeGen/X86/pr11334.ll b/test/CodeGen/X86/pr11334.ll
index 5b7b5ea..e7e29e0 100644
--- a/test/CodeGen/X86/pr11334.ll
+++ b/test/CodeGen/X86/pr11334.ll
@@ -54,3 +54,11 @@ entry:
%f1 = fpext <8 x float> %v1 to <8 x double>
ret <8 x double> %f1
}
+
+define void @test_vector_creation() nounwind {
+ %1 = insertelement <4 x double> undef, double 0.000000e+00, i32 2
+ %2 = load double addrspace(1)* null
+ %3 = insertelement <4 x double> %1, double %2, i32 3
+ store <4 x double> %3, <4 x double>* undef
+ ret void
+}
diff --git a/test/CodeGen/X86/pr11985.ll b/test/CodeGen/X86/pr11985.ll
new file mode 100644
index 0000000..fa37850
--- /dev/null
+++ b/test/CodeGen/X86/pr11985.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=prescott | FileCheck %s
+
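+; The 22-byte memcpy is expected to be expanded inline into word/dword/qword
+; loads and stores relative to the source label rather than a memcpy call.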
+define float @foo(i8* nocapture %buf, float %a, float %b) nounwind uwtable {
+entry:
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %buf, i8* blockaddress(@foo, %out), i64 22, i32 1, i1 false)
+ br label %out
+
+out: ; preds = %entry
+ %add = fadd float %a, %b
+ ret float %add
+; CHECK: foo
+; CHECK: movw .L{{.*}}+20(%rip), %{{.*}}
+; CHECK: movl .L{{.*}}+16(%rip), %{{.*}}
+; CHECK: movq .L{{.*}}+8(%rip), %{{.*}}
+; CHECK: movq .L{{.*}}(%rip), %{{.*}}
+; CHECK: ret
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
diff --git a/test/CodeGen/X86/pr12312.ll b/test/CodeGen/X86/pr12312.ll
new file mode 100644
index 0000000..087b8d7
--- /dev/null
+++ b/test/CodeGen/X86/pr12312.ll
@@ -0,0 +1,155 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse41,-avx < %s | FileCheck %s --check-prefix SSE41
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx,-avx2 < %s | FileCheck %s --check-prefix AVX
+
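+; Testing whole vectors for zero should lower to ptest, which sets ZF when
+; the logical AND of its operands is all zeroes; 256- and 512-bit inputs are
+; first OR-combined down to a single register.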
+define i32 @veccond128(<4 x i32> %input) {
+entry:
+ %0 = bitcast <4 x i32> %input to i128
+ %1 = icmp ne i128 %0, 0
+ br i1 %1, label %if-true-block, label %endif-block
+
+if-true-block: ; preds = %entry
+ ret i32 0
+endif-block: ; preds = %entry,
+ ret i32 1
+; SSE41: veccond128
+; SSE41: ptest
+; SSE41: ret
+; AVX: veccond128
+; AVX: vptest %xmm{{.*}}, %xmm{{.*}}
+; AVX: ret
+}
+
+define i32 @veccond256(<8 x i32> %input) {
+entry:
+ %0 = bitcast <8 x i32> %input to i256
+ %1 = icmp ne i256 %0, 0
+ br i1 %1, label %if-true-block, label %endif-block
+
+if-true-block: ; preds = %entry
+ ret i32 0
+endif-block: ; preds = %entry,
+ ret i32 1
+; SSE41: veccond256
+; SSE41: por
+; SSE41: ptest
+; SSE41: ret
+; AVX: veccond256
+; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
+; AVX: ret
+}
+
+define i32 @veccond512(<16 x i32> %input) {
+entry:
+ %0 = bitcast <16 x i32> %input to i512
+ %1 = icmp ne i512 %0, 0
+ br i1 %1, label %if-true-block, label %endif-block
+
+if-true-block: ; preds = %entry
+ ret i32 0
+endif-block: ; preds = %entry,
+ ret i32 1
+; SSE41: veccond512
+; SSE41: por
+; SSE41: por
+; SSE41: por
+; SSE41: ptest
+; SSE41: ret
+; AVX: veccond512
+; AVX: vorps
+; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
+; AVX: ret
+}
+
+define i32 @vectest128(<4 x i32> %input) {
+entry:
+ %0 = bitcast <4 x i32> %input to i128
+ %1 = icmp ne i128 %0, 0
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+; SSE41: vectest128
+; SSE41: ptest
+; SSE41: ret
+; AVX: vectest128
+; AVX: vptest %xmm{{.*}}, %xmm{{.*}}
+; AVX: ret
+}
+
+define i32 @vectest256(<8 x i32> %input) {
+entry:
+ %0 = bitcast <8 x i32> %input to i256
+ %1 = icmp ne i256 %0, 0
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+; SSE41: vectest256
+; SSE41: por
+; SSE41: ptest
+; SSE41: ret
+; AVX: vectest256
+; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
+; AVX: ret
+}
+
+define i32 @vectest512(<16 x i32> %input) {
+entry:
+ %0 = bitcast <16 x i32> %input to i512
+ %1 = icmp ne i512 %0, 0
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+; SSE41: vectest512
+; SSE41: por
+; SSE41: por
+; SSE41: por
+; SSE41: ptest
+; SSE41: ret
+; AVX: vectest512
+; AVX: vorps
+; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
+; AVX: ret
+}
+
+define i32 @vecsel128(<4 x i32> %input, i32 %a, i32 %b) {
+entry:
+ %0 = bitcast <4 x i32> %input to i128
+ %1 = icmp ne i128 %0, 0
+ %2 = select i1 %1, i32 %a, i32 %b
+ ret i32 %2
+; SSE41: vecsel128
+; SSE41: ptest
+; SSE41: ret
+; AVX: vecsel128
+; AVX: vptest %xmm{{.*}}, %xmm{{.*}}
+; AVX: ret
+}
+
+define i32 @vecsel256(<8 x i32> %input, i32 %a, i32 %b) {
+entry:
+ %0 = bitcast <8 x i32> %input to i256
+ %1 = icmp ne i256 %0, 0
+ %2 = select i1 %1, i32 %a, i32 %b
+ ret i32 %2
+; SSE41: vecsel256
+; SSE41: por
+; SSE41: ptest
+; SSE41: ret
+; AVX: vecsel256
+; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
+; AVX: ret
+}
+
+define i32 @vecsel512(<16 x i32> %input, i32 %a, i32 %b) {
+entry:
+ %0 = bitcast <16 x i32> %input to i512
+ %1 = icmp ne i512 %0, 0
+ %2 = select i1 %1, i32 %a, i32 %b
+ ret i32 %2
+; SSE41: vecsel512
+; SSE41: por
+; SSE41: por
+; SSE41: por
+; SSE41: ptest
+; SSE41: ret
+; AVX: vecsel512
+; AVX: vorps
+; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
+; AVX: ret
+}
diff --git a/test/CodeGen/X86/pr12359.ll b/test/CodeGen/X86/pr12359.ll
new file mode 100644
index 0000000..024b163
--- /dev/null
+++ b/test/CodeGen/X86/pr12359.ll
@@ -0,0 +1,10 @@
+; RUN: llc -asm-verbose -mtriple=x86_64-unknown-unknown -mcpu=corei7 < %s | FileCheck %s
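+; A shuffle that mixes source lanes with zeroed lanes should match a single
+; pshufb immediately after the prologue.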
+define <16 x i8> @shuf(<16 x i8> %inval1) {
+entry:
+ %0 = shufflevector <16 x i8> %inval1, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 4, i32 3, i32 2, i32 16, i32 16, i32 3, i32 4, i32 0, i32 4, i32 3, i32 2, i32 16, i32 16, i32 3, i32 4>
+ ret <16 x i8> %0
+; CHECK: shuf
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pshufb
+; CHECK-NEXT: ret
+}
diff --git a/test/CodeGen/X86/pr13458.ll b/test/CodeGen/X86/pr13458.ll
new file mode 100644
index 0000000..55548b3
--- /dev/null
+++ b/test/CodeGen/X86/pr13458.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s
+
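+; No FileCheck here; the atomicrmw max on an i64 global below should simply
+; compile without crashing.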
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-darwin11.4.2"
+
+%v8_uniform_Stats.0.2.4.10 = type { i64, i64, i32, i32, i32, i32, i32, i32, i32, i32, i64, i64, i64, i32, i32, i32, i32, i32, i32, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i32, i64, [7 x i32], [7 x i64] }
+
+@globalStats = external global %v8_uniform_Stats.0.2.4.10
+
+define void @MergeStats() nounwind {
+allocas:
+ %r.i.i720 = atomicrmw max i64* getelementptr inbounds (%v8_uniform_Stats.0.2.4.10* @globalStats, i64 0, i32 30), i64 0 seq_cst
+ ret void
+}
diff --git a/test/CodeGen/X86/pr13859.ll b/test/CodeGen/X86/pr13859.ll
new file mode 100644
index 0000000..719721d
--- /dev/null
+++ b/test/CodeGen/X86/pr13859.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s
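+; Compile-only regression test for the store of the MMX-derived <1 x i64>.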
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.7.0"
+
+define void @_Z17FilterYUVRows_MMXi(i32 %af) nounwind ssp {
+entry:
+ %aMyAlloca = alloca i32, align 32
+ %dest = alloca <1 x i64>, align 32
+
+ %a32 = load i32* %aMyAlloca, align 4
+ %aconv = trunc i32 %a32 to i16
+ %a36 = insertelement <4 x i16> undef, i16 %aconv, i32 0
+ %a37 = insertelement <4 x i16> %a36, i16 %aconv, i32 1
+ %a38 = insertelement <4 x i16> %a37, i16 %aconv, i32 2
+ %a39 = insertelement <4 x i16> %a38, i16 %aconv, i32 3
+ %a40 = bitcast <4 x i16> %a39 to x86_mmx
+ %a41 = bitcast x86_mmx %a40 to <1 x i64>
+
+ %a47 = trunc i32 %a32 to i1
+ br i1 %a47, label %a48, label %a49
+
+a48:
+ unreachable
+
+a49:
+ store <1 x i64> %a41, <1 x i64>* %dest, align 8 ; !!!
+ ret void
+}
diff --git a/test/CodeGen/X86/pr13899.ll b/test/CodeGen/X86/pr13899.ll
new file mode 100644
index 0000000..bc81e34
--- /dev/null
+++ b/test/CodeGen/X86/pr13899.ll
@@ -0,0 +1,58 @@
+; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 | FileCheck %s --check-prefix=X64
+
+; ModuleID = 'a.bc'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:128:128-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S32"
+target triple = "i386-pc-win32"
+
+%v4_varying_big_struct = type { [4 x <4 x i32>] }
+
+declare <4 x i32> @"foo"(%v4_varying_big_struct, <4 x i32>) nounwind
+
+define <4 x i32> @"bar"(%v4_varying_big_struct %s, <4 x i32> %__mask) nounwind {
+allocas:
+ %calltmp = call <4 x i32> @"foo"(%v4_varying_big_struct %s, <4 x i32> %__mask)
+ ret <4 x i32> %calltmp
+; CHECK: bar
+; CHECK: andl
+; CHECK: call
+; CHECK: ret
+}
+
+declare <8 x float> @bar64(<8 x float> %i0, <8 x float> %i1,
+ <8 x float> %i2, <8 x float> %i3,
+ <8 x float> %i4, <8 x float> %i5,
+ <8 x float> %i6, <8 x float> %i7,
+ <8 x float> %i8, <8 x float> %i9)
+
+define <8 x float> @foo64(<8 x float>* %p) {
+ %1 = load <8 x float>* %p
+ %idx1 = getelementptr inbounds <8 x float>* %p, i64 1
+ %2 = load <8 x float>* %idx1
+ %idx2 = getelementptr inbounds <8 x float>* %p, i64 2
+ %3 = load <8 x float>* %idx2
+ %idx3 = getelementptr inbounds <8 x float>* %p, i64 3
+ %4 = load <8 x float>* %idx3
+ %idx4 = getelementptr inbounds <8 x float>* %p, i64 4
+ %5 = load <8 x float>* %idx4
+ %idx5 = getelementptr inbounds <8 x float>* %p, i64 5
+ %6 = load <8 x float>* %idx5
+ %idx6 = getelementptr inbounds <8 x float>* %p, i64 6
+ %7 = load <8 x float>* %idx6
+ %idx7 = getelementptr inbounds <8 x float>* %p, i64 7
+ %8 = load <8 x float>* %idx7
+ %idx8 = getelementptr inbounds <8 x float>* %p, i64 8
+ %9 = load <8 x float>* %idx8
+ %idx9 = getelementptr inbounds <8 x float>* %p, i64 9
+ %10 = load <8 x float>* %idx9
+ %r = tail call <8 x float> @bar64(<8 x float> %1, <8 x float> %2,
+ <8 x float> %3, <8 x float> %4,
+ <8 x float> %5, <8 x float> %6,
+ <8 x float> %7, <8 x float> %8,
+ <8 x float> %9, <8 x float> %10)
+ ret <8 x float> %r
+; X64: foo
+; X64: and
+; X64: call
+; X64: ret
+}
diff --git a/test/CodeGen/X86/pr14088.ll b/test/CodeGen/X86/pr14088.ll
new file mode 100644
index 0000000..505e3b5
--- /dev/null
+++ b/test/CodeGen/X86/pr14088.ll
@@ -0,0 +1,25 @@
+; RUN: llc -mtriple x86_64-linux -mcpu core2 -verify-machineinstrs %s -o - | FileCheck %s
+define i32 @f(i1 %foo, i16* %tm_year2, i8* %bar, i16 %zed, i32 %zed2) {
+entry:
+ br i1 %foo, label %return, label %if.end
+
+if.end:
+ %rem = srem i32 %zed2, 100
+ %conv3 = trunc i32 %rem to i16
+ store i16 %conv3, i16* %tm_year2
+ %sext = shl i32 %rem, 16
+ %conv5 = ashr exact i32 %sext, 16
+ %div = sdiv i32 %conv5, 10
+ %conv6 = trunc i32 %div to i8
+ store i8 %conv6, i8* %bar
+ br label %return
+
+return:
+ %retval.0 = phi i32 [ 0, %if.end ], [ -1, %entry ]
+ ret i32 %retval.0
+}
+
+; We were miscompiling this and using %ax instead of %cx in the movw.
+; CHECK: movswl %cx, %ecx
+; CHECK: movw %cx, (%rsi)
+; CHECK: movslq %ecx, %rcx
diff --git a/test/CodeGen/X86/pr14090.ll b/test/CodeGen/X86/pr14090.ll
new file mode 100644
index 0000000..d76b912
--- /dev/null
+++ b/test/CodeGen/X86/pr14090.ll
@@ -0,0 +1,76 @@
+; RUN: llc < %s -march=x86-64 -print-before=stack-coloring -print-after=stack-coloring >%t 2>&1 && FileCheck <%t %s
+
+define void @foo(i64* %retval.i, i32 %call, i32* %.ph.i80, i32 %fourteen, i32* %out.lo, i32* %out.hi) nounwind align 2 {
+entry:
+ %_Tmp.i39 = alloca i64, align 8
+ %retval.i33 = alloca i64, align 8
+ %_Tmp.i = alloca i64, align 8
+ %retval.i.i = alloca i64, align 8
+ %_First.i = alloca i64, align 8
+
+ %0 = load i64* %retval.i, align 8
+
+ %1 = load i64* %retval.i, align 8
+
+ %_Tmp.i39.0.cast73 = bitcast i64* %_Tmp.i39 to i8*
+ call void @llvm.lifetime.start(i64 8, i8* %_Tmp.i39.0.cast73)
+ store i64 %1, i64* %_Tmp.i39, align 8
+ %cmp.i.i.i40 = icmp slt i32 %call, 0
+ %2 = lshr i64 %1, 32
+ %3 = trunc i64 %2 to i32
+ %sub.i.i.i44 = sub i32 0, %call
+ %cmp2.i.i.i45 = icmp ult i32 %3, %sub.i.i.i44
+ %or.cond.i.i.i46 = and i1 %cmp.i.i.i40, %cmp2.i.i.i45
+ %add.i.i.i47 = add i32 %3, %call
+ %sub5.i.i.i48 = lshr i32 %add.i.i.i47, 5
+ %trunc.i50 = trunc i64 %1 to i32
+ %inttoptr.i51 = inttoptr i32 %trunc.i50 to i32*
+ %add61617.i.i.i52 = or i32 %sub5.i.i.i48, -134217728
+ %add61617.i.sub5.i.i.i53 = select i1 %or.cond.i.i.i46, i32 %add61617.i.i.i52, i32 %sub5.i.i.i48
+ %storemerge2.i.i54 = getelementptr inbounds i32* %inttoptr.i51, i32 %add61617.i.sub5.i.i.i53
+ %_Tmp.i39.0.cast74 = bitcast i64* %_Tmp.i39 to i32**
+ store i32* %storemerge2.i.i54, i32** %_Tmp.i39.0.cast74, align 8
+ %storemerge.i.i55 = and i32 %add.i.i.i47, 31
+ %_Tmp.i39.4.raw_idx = getelementptr inbounds i8* %_Tmp.i39.0.cast73, i32 4
+ %_Tmp.i39.4.cast = bitcast i8* %_Tmp.i39.4.raw_idx to i32*
+ store i32 %storemerge.i.i55, i32* %_Tmp.i39.4.cast, align 4
+ %srcval.i56 = load i64* %_Tmp.i39, align 8
+ call void @llvm.lifetime.end(i64 8, i8* %_Tmp.i39.0.cast73)
+
+; CHECK: Before Merge disjoint stack slots
+; CHECK: [[PREFIX15:MOV64mr.*<fi#]]{{[0-9]}}[[SUFFIX15:.*;]] mem:ST8[%fifteen]
+; CHECK: [[PREFIX87:MOV32mr.*;]] mem:ST4[%sunkaddr87]
+
+; CHECK: After Merge disjoint stack slots
+; CHECK: [[PREFIX15]]{{[0-9]}}[[SUFFIX15]] mem:ST8[%_Tmp.i39]
+; CHECK: [[PREFIX87]] mem:ST4[<unknown>]
+
+ %fifteen = bitcast i64* %retval.i.i to i32**
+ %sixteen = bitcast i64* %retval.i.i to i8*
+ call void @llvm.lifetime.start(i64 8, i8* %sixteen)
+ store i32* %.ph.i80, i32** %fifteen, align 8, !tbaa !0
+ %sunkaddr = ptrtoint i64* %retval.i.i to i32
+ %sunkaddr86 = add i32 %sunkaddr, 4
+ %sunkaddr87 = inttoptr i32 %sunkaddr86 to i32*
+ store i32 %fourteen, i32* %sunkaddr87, align 4, !tbaa !3
+ %seventeen = load i64* %retval.i.i, align 8
+ call void @llvm.lifetime.end(i64 8, i8* %sixteen)
+ %eighteen = lshr i64 %seventeen, 32
+ %nineteen = trunc i64 %eighteen to i32
+ %shl.i.i.i = shl i32 1, %nineteen
+
+ store i32 %shl.i.i.i, i32* %out.lo, align 8
+ store i32 %nineteen, i32* %out.hi, align 8
+
+ ret void
+}
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
+
+declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!3 = metadata !{metadata !"any pointer", metadata !1}
+!4 = metadata !{metadata !"vtable pointer", metadata !2}
diff --git a/test/CodeGen/X86/pr14098.ll b/test/CodeGen/X86/pr14098.ll
new file mode 100644
index 0000000..6ce2449
--- /dev/null
+++ b/test/CodeGen/X86/pr14098.ll
@@ -0,0 +1,23 @@
+; RUN: llc -mtriple i386-unknown-linux-gnu -relocation-model=pic -verify-machineinstrs < %s
+; We used to crash on this.
+
+declare void @foo()
+declare void @foo3(i1 %x)
+define void @bar(i1 %a1, i16 %a2) nounwind align 2 {
+bb0:
+ %a3 = trunc i16 %a2 to i8
+ %a4 = lshr i16 %a2, 8
+ %a5 = trunc i16 %a4 to i8
+ br i1 %a1, label %bb1, label %bb2
+bb1:
+ br label %bb2
+bb2:
+ %a6 = phi i8 [ 3, %bb0 ], [ %a5, %bb1 ]
+ %a7 = phi i8 [ 9, %bb0 ], [ %a3, %bb1 ]
+ %a8 = icmp eq i8 %a6, 1
+ call void @foo()
+ %a9 = icmp eq i8 %a7, 0
+ call void @foo3(i1 %a9)
+ call void @foo3(i1 %a8)
+ ret void
+}
diff --git a/test/CodeGen/X86/pr14161.ll b/test/CodeGen/X86/pr14161.ll
new file mode 100644
index 0000000..ff4532e
--- /dev/null
+++ b/test/CodeGen/X86/pr14161.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mtriple=x86_64-linux-pc -mcpu=corei7 | FileCheck %s
+
+declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>)
+
+define <2 x i16> @good(<4 x i32>*, <4 x i8>*) {
+entry:
+ %2 = load <4 x i32>* %0, align 16
+ %3 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %2, <4 x i32> <i32 127, i32 127, i32 127, i32 127>)
+ %4 = extractelement <4 x i32> %3, i32 0
+ %5 = extractelement <4 x i32> %3, i32 1
+ %6 = extractelement <4 x i32> %3, i32 2
+ %7 = extractelement <4 x i32> %3, i32 3
+ %8 = bitcast i32 %4 to <2 x i16>
+ %9 = bitcast i32 %5 to <2 x i16>
+ ret <2 x i16> %8
+; CHECK: good
+; CHECK: pminud
+; CHECK-NEXT: pmovzxwq
+; CHECK: ret
+}
+
+define <2 x i16> @bad(<4 x i32>*, <4 x i8>*) {
+entry:
+ %2 = load <4 x i32>* %0, align 16
+ %3 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %2, <4 x i32> <i32 127, i32 127, i32 127, i32 127>)
+ %4 = extractelement <4 x i32> %3, i32 0
+ %5 = extractelement <4 x i32> %3, i32 1
+ %6 = extractelement <4 x i32> %3, i32 2
+ %7 = extractelement <4 x i32> %3, i32 3
+ %8 = bitcast i32 %4 to <2 x i16>
+ %9 = bitcast i32 %5 to <2 x i16>
+ ret <2 x i16> %9
+; CHECK: bad
+; CHECK: pminud
+; CHECK: pextrd
+; CHECK: pmovzxwq
+; CHECK: ret
+}
diff --git a/test/CodeGen/X86/pr14204.ll b/test/CodeGen/X86/pr14204.ll
new file mode 100644
index 0000000..42e362b
--- /dev/null
+++ b/test/CodeGen/X86/pr14204.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mtriple=x86_64-linux-pc -mcpu=core-avx2 | FileCheck %s
+
+; FIXME: vpmovsxwd should be generated instead of vpmovzxwd followed by
+; SLL/SRA.
+
+define <8 x i32> @foo(<8 x i1> %bar) nounwind readnone {
+entry:
+ %s = sext <8 x i1> %bar to <8 x i32>
+ ret <8 x i32> %s
+; CHECK: foo
+; CHECK: vpmovzxwd
+; CHECK: vpslld
+; CHECK: vpsrad
+; CHECK: ret
+}
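+
+; The checked sequence is still a correct sign extension: in each 32-bit
+; lane, ashr(shl(x, k), k) sign-extends the low 32 - k bits, and a shift
+; amount of k = 31 (the immediate expected here) replicates the i1 payload
+; bit across the whole lane.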
diff --git a/test/CodeGen/X86/pr14314.ll b/test/CodeGen/X86/pr14314.ll
new file mode 100644
index 0000000..5388a4b
--- /dev/null
+++ b/test/CodeGen/X86/pr14314.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -mtriple=i386-pc-linux -mcpu=corei7 | FileCheck %s
+
+define i64 @atomicSub(i64* %a, i64 %b) nounwind {
+entry:
+ %0 = atomicrmw sub i64* %a, i64 %b seq_cst
+ ret i64 %0
+; CHECK: atomicSub
+; movl %eax, %ebx
+; subl {{%[a-z]+}}, %ebx
+; movl %edx, %ecx
+; sbbl {{%[a-z]+}}, %ecx
+; CHECK: ret
+}
diff --git a/test/CodeGen/X86/pr14333.ll b/test/CodeGen/X86/pr14333.ll
new file mode 100644
index 0000000..86c12ef
--- /dev/null
+++ b/test/CodeGen/X86/pr14333.ll
@@ -0,0 +1,12 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s
+%foo = type { i64, i64 }
+define void @bar(%foo* %zed) {
+ %tmp = getelementptr inbounds %foo* %zed, i64 0, i32 0
+ store i64 0, i64* %tmp, align 8
+ %tmp2 = getelementptr inbounds %foo* %zed, i64 0, i32 1
+ store i64 0, i64* %tmp2, align 8
+ %tmp3 = bitcast %foo* %zed to i8*
+ call void @llvm.memset.p0i8.i64(i8* %tmp3, i8 0, i64 16, i32 8, i1 false)
+ ret void
+}
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
diff --git a/test/CodeGen/X86/pr5145.ll b/test/CodeGen/X86/pr5145.ll
new file mode 100644
index 0000000..d048db8
--- /dev/null
+++ b/test/CodeGen/X86/pr5145.ll
@@ -0,0 +1,35 @@
+; RUN: llc -march=x86-64 < %s | FileCheck %s
+@sc8 = external global i8
+
+define void @atomic_maxmin_i8() {
+; CHECK: atomic_maxmin_i8
+ %1 = atomicrmw max i8* @sc8, i8 5 acquire
+; CHECK: [[LABEL1:\.?LBB[0-9]+_[0-9]+]]:
+; CHECK: cmpb
+; CHECK: cmovl
+; CHECK: lock
+; CHECK-NEXT: cmpxchgb
+; CHECK: jne [[LABEL1]]
+ %2 = atomicrmw min i8* @sc8, i8 6 acquire
+; CHECK: [[LABEL3:\.?LBB[0-9]+_[0-9]+]]:
+; CHECK: cmpb
+; CHECK: cmovg
+; CHECK: lock
+; CHECK-NEXT: cmpxchgb
+; CHECK: jne [[LABEL3]]
+ %3 = atomicrmw umax i8* @sc8, i8 7 acquire
+; CHECK: [[LABEL5:\.?LBB[0-9]+_[0-9]+]]:
+; CHECK: cmpb
+; CHECK: cmovb
+; CHECK: lock
+; CHECK-NEXT: cmpxchgb
+; CHECK: jne [[LABEL5]]
+ %4 = atomicrmw umin i8* @sc8, i8 8 acquire
+; CHECK: [[LABEL7:\.?LBB[0-9]+_[0-9]+]]:
+; CHECK: cmpb
+; CHECK: cmova
+; CHECK: lock
+; CHECK-NEXT: cmpxchgb
+; CHECK: jne [[LABEL7]]
+ ret void
+}
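+
+; Each of the four operations above lowers to a compare-exchange loop of
+; the checked shape; sketched for the signed max case (registers
+; illustrative, the compare/select is done in 32 bits):
+;   .LBB0_1:
+;     cmpb   %al, %cl              ; old value vs. the atomicrmw operand
+;     cmovl  %eax, %ecx            ; keep the larger of the two
+;     lock
+;     cmpxchgb %cl, sc8(%rip)      ; commit iff memory still holds %al
+;     jne    .LBB0_1               ; lost a race: retry with the fresh value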
diff --git a/test/CodeGen/X86/promote.ll b/test/CodeGen/X86/promote.ll
index 8b30dc7..283f48c 100644
--- a/test/CodeGen/X86/promote.ll
+++ b/test/CodeGen/X86/promote.ll
@@ -20,7 +20,7 @@ entry:
; CHECK: shuff_f
define i32 @shuff_f(<4 x i8>* %A) {
entry:
-; CHECK: pshufb
+; CHECK: pmovzxbd
; CHECK: paddd
; CHECK: pshufb
%0 = load <4 x i8>* %A, align 8
diff --git a/test/CodeGen/X86/ptr-rotate.ll b/test/CodeGen/X86/ptr-rotate.ll
index 6debd16..fbd13b5 100644
--- a/test/CodeGen/X86/ptr-rotate.ll
+++ b/test/CodeGen/X86/ptr-rotate.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=i386-apple-darwin -o - < %s | FileCheck %s
+; RUN: llc -mtriple=i386-apple-darwin -mcpu=corei7 -o - < %s | FileCheck %s
define i32 @func(i8* %A) nounwind readnone {
entry:
diff --git a/test/CodeGen/X86/red-zone2.ll b/test/CodeGen/X86/red-zone2.ll
index f092163..3e9c790 100644
--- a/test/CodeGen/X86/red-zone2.ll
+++ b/test/CodeGen/X86/red-zone2.ll
@@ -1,6 +1,7 @@
-; RUN: llc < %s -mcpu=generic -march=x86-64 > %t
-; RUN: grep subq %t | count 1
-; RUN: grep addq %t | count 1
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux | FileCheck %s
+; CHECK: f0:
+; CHECK: subq
+; CHECK: addq
define x86_fp80 @f0(float %f) nounwind readnone noredzone {
entry:
diff --git a/test/CodeGen/X86/rot32.ll b/test/CodeGen/X86/rot32.ll
index 99602fd..e95a734 100644
--- a/test/CodeGen/X86/rot32.ll
+++ b/test/CodeGen/X86/rot32.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=core-avx2 | FileCheck %s --check-prefix=BMI2
define i32 @foo(i32 %x, i32 %y, i32 %z) nounwind readnone {
entry:
@@ -48,12 +49,25 @@ define i32 @xfoo(i32 %x, i32 %y, i32 %z) nounwind readnone {
entry:
; CHECK: xfoo:
; CHECK: roll $7
+; BMI2: xfoo:
+; BMI2: rorxl $25
%0 = lshr i32 %x, 25
%1 = shl i32 %x, 7
%2 = or i32 %0, %1
ret i32 %2
}
+define i32 @xfoop(i32* %p) nounwind readnone {
+entry:
+; BMI2: xfoop:
+; BMI2: rorxl $25, ({{.+}}), %{{.+}}
+ %x = load i32* %p
+ %a = lshr i32 %x, 25
+ %b = shl i32 %x, 7
+ %c = or i32 %a, %b
+ ret i32 %c
+}
+
define i32 @xbar(i32 %x, i32 %y, i32 %z) nounwind readnone {
entry:
; CHECK: xbar:
@@ -68,12 +82,25 @@ define i32 @xun(i32 %x, i32 %y, i32 %z) nounwind readnone {
entry:
; CHECK: xun:
; CHECK: roll $25
+; BMI2: xun:
+; BMI2: rorxl $7
%0 = lshr i32 %x, 7
%1 = shl i32 %x, 25
%2 = or i32 %0, %1
ret i32 %2
}
+define i32 @xunp(i32* %p) nounwind readnone {
+entry:
+; BMI2: xunp:
+; BMI2: rorxl $7, ({{.+}}), %{{.+}}
+ %x = load i32* %p
+ %a = lshr i32 %x, 7
+ %b = shl i32 %x, 25
+ %c = or i32 %a, %b
+ ret i32 %c
+}
+
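+; rorx only has a rotate-right-by-immediate encoding, so a rotate left by
+; k is emitted as a rotate right by 32 - k, e.g. (registers illustrative):
+;   rorxl $25, %edi, %eax         ; eax = rotl(edi, 7) = rotr(edi, 25)
+; which is why the BMI2 checks use $25 and $7 where the plain CHECK lines
+; expect roll $7 and roll $25.
+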
define i32 @xbu(i32 %x, i32 %y, i32 %z) nounwind readnone {
entry:
; CHECK: xbu:
diff --git a/test/CodeGen/X86/rot64.ll b/test/CodeGen/X86/rot64.ll
index 4e082bb..7fa982d 100644
--- a/test/CodeGen/X86/rot64.ll
+++ b/test/CodeGen/X86/rot64.ll
@@ -1,8 +1,9 @@
-; RUN: llc < %s -march=x86-64 > %t
-; RUN: grep rol %t | count 3
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 > %t
+; RUN: grep rol %t | count 5
; RUN: grep ror %t | count 1
; RUN: grep shld %t | count 2
; RUN: grep shrd %t | count 2
+; RUN: llc < %s -march=x86-64 -mcpu=core-avx2 | FileCheck %s --check-prefix=BMI2
define i64 @foo(i64 %x, i64 %y, i64 %z) nounwind readnone {
entry:
@@ -42,12 +43,25 @@ entry:
define i64 @xfoo(i64 %x, i64 %y, i64 %z) nounwind readnone {
entry:
+; BMI2: xfoo:
+; BMI2: rorxq $57
%0 = lshr i64 %x, 57
%1 = shl i64 %x, 7
%2 = or i64 %0, %1
ret i64 %2
}
+define i64 @xfoop(i64* %p) nounwind readnone {
+entry:
+; BMI2: xfoop:
+; BMI2: rorxq $57, ({{.+}}), %{{.+}}
+ %x = load i64* %p
+ %a = lshr i64 %x, 57
+ %b = shl i64 %x, 7
+ %c = or i64 %a, %b
+ ret i64 %c
+}
+
define i64 @xbar(i64 %x, i64 %y, i64 %z) nounwind readnone {
entry:
%0 = shl i64 %y, 7
@@ -58,12 +72,25 @@ entry:
define i64 @xun(i64 %x, i64 %y, i64 %z) nounwind readnone {
entry:
+; BMI2: xun:
+; BMI2: rorxq $7
%0 = lshr i64 %x, 7
%1 = shl i64 %x, 57
%2 = or i64 %0, %1
ret i64 %2
}
+define i64 @xunp(i64* %p) nounwind readnone {
+entry:
+; BMI2: xunp:
+; BMI2: rorxq $7, ({{.+}}), %{{.+}}
+ %x = load i64* %p
+ %a = lshr i64 %x, 7
+ %b = shl i64 %x, 57
+ %c = or i64 %a, %b
+ ret i64 %c
+}
+
define i64 @xbu(i64 %x, i64 %y, i64 %z) nounwind readnone {
entry:
%0 = lshr i64 %y, 7
diff --git a/test/CodeGen/X86/rotate2.ll b/test/CodeGen/X86/rotate2.ll
index 2eea399..2316c70 100644
--- a/test/CodeGen/X86/rotate2.ll
+++ b/test/CodeGen/X86/rotate2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep rol | count 2
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 | grep rol | count 2
define i64 @test1(i64 %x) nounwind {
entry:
diff --git a/test/CodeGen/X86/rtm.ll b/test/CodeGen/X86/rtm.ll
new file mode 100644
index 0000000..76eb951
--- /dev/null
+++ b/test/CodeGen/X86/rtm.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -mattr=+rtm -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+declare i32 @llvm.x86.xbegin() nounwind
+declare void @llvm.x86.xend() nounwind
+declare void @llvm.x86.xabort(i8) noreturn nounwind
+
+define i32 @test_xbegin() nounwind uwtable {
+entry:
+ %0 = tail call i32 @llvm.x86.xbegin() nounwind
+ ret i32 %0
+; CHECK: test_xbegin
+; CHECK: xbegin [[LABEL:.*BB.*]]
+; CHECK: [[LABEL]]:
+}
+
+define void @test_xend() nounwind uwtable {
+entry:
+ tail call void @llvm.x86.xend() nounwind
+ ret void
+; CHECK: test_xend
+; CHECK: xend
+}
+
+define void @test_xabort() nounwind uwtable {
+entry:
+ tail call void @llvm.x86.xabort(i8 2)
+ unreachable
+; CHECK: test_xabort
+; CHECK: xabort $2
+}
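+
+; A typical transactional region built from these intrinsics, as a sketch
+; (-1 is Intel's XBEGIN_STARTED status value):
+;   %status = call i32 @llvm.x86.xbegin()
+;   %ok = icmp eq i32 %status, -1
+;   br i1 %ok, label %txn, label %fallback
+; txn:
+;   ...                            ; transactional work
+;   call void @llvm.x86.xend()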
diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll
index 2e39473..3bec3ac 100644
--- a/test/CodeGen/X86/select.ll
+++ b/test/CodeGen/X86/select.ll
@@ -344,3 +344,16 @@ entry:
; ATOM: negw
; ATOM: sbbw
}
+
+define i8 @test18(i32 %x, i8 zeroext %a, i8 zeroext %b) nounwind {
+ %cmp = icmp slt i32 %x, 15
+ %sel = select i1 %cmp, i8 %a, i8 %b
+ ret i8 %sel
+; CHECK: test18:
+; CHECK: cmpl $15, %edi
+; CHECK: cmovgel %edx
+
+; ATOM: test18:
+; ATOM: cmpl $15, %edi
+; ATOM: cmovgel %edx
+}
diff --git a/test/CodeGen/X86/select_const.ll b/test/CodeGen/X86/select_const.ll
new file mode 100644
index 0000000..5b2409d
--- /dev/null
+++ b/test/CodeGen/X86/select_const.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mcpu=corei7 | FileCheck %s
+
+define i64 @test1(i64 %x) nounwind {
+entry:
+ %cmp = icmp eq i64 %x, 2
+ %add = add i64 %x, 1
+ %retval.0 = select i1 %cmp, i64 2, i64 %add
+ ret i64 %retval.0
+
+; CHECK: test1:
+; CHECK: leaq 1(%rdi), %rax
+; CHECK: cmpq $2, %rdi
+; CHECK: cmoveq %rdi, %rax
+; CHECK: ret
+
+}
diff --git a/test/CodeGen/X86/shift-bmi2.ll b/test/CodeGen/X86/shift-bmi2.ll
new file mode 100644
index 0000000..d1f321f
--- /dev/null
+++ b/test/CodeGen/X86/shift-bmi2.ll
@@ -0,0 +1,178 @@
+; RUN: llc -mtriple=i386-unknown-unknown -mcpu=core-avx2 < %s | FileCheck --check-prefix=BMI2 %s
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=core-avx2 < %s | FileCheck --check-prefix=BMI264 %s
+
+define i32 @shl32(i32 %x, i32 %shamt) nounwind uwtable readnone {
+entry:
+ %shl = shl i32 %x, %shamt
+; BMI2: shl32
+; BMI2: shlxl
+; BMI2: ret
+; BMI264: shl32
+; BMI264: shlxl
+; BMI264: ret
+ ret i32 %shl
+}
+
+define i32 @shl32i(i32 %x) nounwind uwtable readnone {
+entry:
+ %shl = shl i32 %x, 5
+; BMI2: shl32i
+; BMI2-NOT: shlxl
+; BMI2: ret
+; BMI264: shl32i
+; BMI264-NOT: shlxl
+; BMI264: ret
+ ret i32 %shl
+}
+
+define i32 @shl32p(i32* %p, i32 %shamt) nounwind uwtable readnone {
+entry:
+ %x = load i32* %p
+ %shl = shl i32 %x, %shamt
+; BMI2: shl32p
+; BMI2: shlxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI2: ret
+; BMI264: shl32p
+; BMI264: shlxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: ret
+ ret i32 %shl
+}
+
+define i32 @shl32pi(i32* %p) nounwind uwtable readnone {
+entry:
+ %x = load i32* %p
+ %shl = shl i32 %x, 5
+; BMI2: shl32pi
+; BMI2-NOT: shlxl
+; BMI2: ret
+; BMI264: shl32pi
+; BMI264-NOT: shlxl
+; BMI264: ret
+ ret i32 %shl
+}
+
+define i64 @shl64(i64 %x, i64 %shamt) nounwind uwtable readnone {
+entry:
+ %shl = shl i64 %x, %shamt
+; BMI264: shl64
+; BMI264: shlxq
+; BMI264: ret
+ ret i64 %shl
+}
+
+define i64 @shl64i(i64 %x) nounwind uwtable readnone {
+entry:
+ %shl = shl i64 %x, 7
+; BMI264: shl64i
+; BMI264-NOT: shlxq
+; BMI264: ret
+ ret i64 %shl
+}
+
+define i64 @shl64p(i64* %p, i64 %shamt) nounwind uwtable readnone {
+entry:
+ %x = load i64* %p
+ %shl = shl i64 %x, %shamt
+; BMI264: shl64p
+; BMI264: shlxq %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: ret
+ ret i64 %shl
+}
+
+define i64 @shl64pi(i64* %p) nounwind uwtable readnone {
+entry:
+ %x = load i64* %p
+ %shl = shl i64 %x, 7
+; BMI264: shl64pi
+; BMI264-NOT: shlxq
+; BMI264: ret
+ ret i64 %shl
+}
+
+define i32 @lshr32(i32 %x, i32 %shamt) nounwind uwtable readnone {
+entry:
+ %shl = lshr i32 %x, %shamt
+; BMI2: lshr32
+; BMI2: shrxl
+; BMI2: ret
+; BMI264: lshr32
+; BMI264: shrxl
+; BMI264: ret
+ ret i32 %shl
+}
+
+define i32 @lshr32p(i32* %p, i32 %shamt) nounwind uwtable readnone {
+entry:
+ %x = load i32* %p
+ %shl = lshr i32 %x, %shamt
+; BMI2: lshr32p
+; BMI2: shrxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI2: ret
+; BMI264: lshr32p
+; BMI264: shrxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: ret
+ ret i32 %shl
+}
+
+define i64 @lshr64(i64 %x, i64 %shamt) nounwind uwtable readnone {
+entry:
+ %shl = lshr i64 %x, %shamt
+; BMI264: lshr64
+; BMI264: shrxq
+; BMI264: ret
+ ret i64 %shl
+}
+
+define i64 @lshr64p(i64* %p, i64 %shamt) nounwind uwtable readnone {
+entry:
+ %x = load i64* %p
+ %shl = lshr i64 %x, %shamt
+; BMI264: lshr64p
+; BMI264: shrxq %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: ret
+ ret i64 %shl
+}
+
+define i32 @ashr32(i32 %x, i32 %shamt) nounwind uwtable readnone {
+entry:
+ %shl = ashr i32 %x, %shamt
+; BMI2: ashr32
+; BMI2: sarxl
+; BMI2: ret
+; BMI264: ashr32
+; BMI264: sarxl
+; BMI264: ret
+ ret i32 %shl
+}
+
+define i32 @ashr32p(i32* %p, i32 %shamt) nounwind uwtable readnone {
+entry:
+ %x = load i32* %p
+ %shl = ashr i32 %x, %shamt
+; BMI2: ashr32p
+; BMI2: sarxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI2: ret
+; BMI264: ashr32p
+; BMI264: sarxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: ret
+ ret i32 %shl
+}
+
+define i64 @ashr64(i64 %x, i64 %shamt) nounwind uwtable readnone {
+entry:
+ %shl = ashr i64 %x, %shamt
+; BMI264: ashr64
+; BMI264: sarxq
+; BMI264: ret
+ ret i64 %shl
+}
+
+define i64 @ashr64p(i64* %p, i64 %shamt) nounwind uwtable readnone {
+entry:
+ %x = load i64* %p
+ %shl = ashr i64 %x, %shamt
+; BMI264: ashr64p
+; BMI264: sarxq %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: ret
+ ret i64 %shl
+}
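+
+; The BMI2 shift forms take the count in any general register rather than
+; implicitly in %cl, e.g. (registers illustrative):
+;   shlxl %esi, %edi, %eax        ; eax = edi << (esi & 31)
+; Immediate shifts gain nothing from this, which is why the *i tests above
+; check that no shlx is emitted for them.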
diff --git a/test/CodeGen/X86/sincos.ll b/test/CodeGen/X86/sincos.ll
index 1479be1..734f48a 100644
--- a/test/CodeGen/X86/sincos.ll
+++ b/test/CodeGen/X86/sincos.ll
@@ -1,6 +1,7 @@
; Make sure this testcase codegens to the sin and cos instructions, not calls
; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math | FileCheck %s --check-prefix=SIN
; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math | FileCheck %s --check-prefix=COS
+; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 | FileCheck %s --check-prefix=SAFE
declare float @sinf(float) readonly
@@ -17,6 +18,9 @@ define float @test1(float %X) {
; SIN-NOT: fsin
+; SAFE: test1
+; SAFE-NOT: fsin
+
; SIN: test2:
define double @test2(double %X) {
%Y = call double @sin(double %X) readonly
@@ -26,6 +30,9 @@ define double @test2(double %X) {
; SIN-NOT: fsin
+; SAFE: test2
+; SAFE-NOT: fsin
+
; SIN: test3:
define x86_fp80 @test3(x86_fp80 %X) {
%Y = call x86_fp80 @sinl(x86_fp80 %X) readonly
@@ -50,12 +57,18 @@ define float @test4(float %X) {
}
; COS: {{^[ \t]*fcos}}
+; SAFE: test4
+; SAFE-NOT: fcos
+
define double @test5(double %X) {
%Y = call double @cos(double %X) readonly
ret double %Y
}
; COS: {{^[ \t]*fcos}}
+; SAFE: test5
+; SAFE-NOT: fcos
+
define x86_fp80 @test6(x86_fp80 %X) {
%Y = call x86_fp80 @cosl(x86_fp80 %X) readonly
ret x86_fp80 %Y
diff --git a/test/CodeGen/X86/sjlj.ll b/test/CodeGen/X86/sjlj.ll
new file mode 100644
index 0000000..681db00
--- /dev/null
+++ b/test/CodeGen/X86/sjlj.ll
@@ -0,0 +1,60 @@
+; RUN: llc < %s -mtriple=i386-pc-linux -mcpu=corei7 -relocation-model=static | FileCheck --check-prefix=X86 %s
+; RUN: llc < %s -mtriple=i386-pc-linux -mcpu=corei7 -relocation-model=pic | FileCheck --check-prefix=PIC86 %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=corei7 -relocation-model=static | FileCheck --check-prefix=X64 %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=corei7 -relocation-model=pic | FileCheck --check-prefix=PIC64 %s
+
+@buf = internal global [5 x i8*] zeroinitializer
+
+declare i8* @llvm.frameaddress(i32) nounwind readnone
+
+declare i8* @llvm.stacksave() nounwind
+
+declare i32 @llvm.eh.sjlj.setjmp(i8*) nounwind
+
+declare void @llvm.eh.sjlj.longjmp(i8*) nounwind
+
+define i32 @sj0() nounwind {
+ %fp = tail call i8* @llvm.frameaddress(i32 0)
+ store i8* %fp, i8** getelementptr inbounds ([5 x i8*]* @buf, i64 0, i64 0), align 16
+ %sp = tail call i8* @llvm.stacksave()
+ store i8* %sp, i8** getelementptr inbounds ([5 x i8*]* @buf, i64 0, i64 2), align 16
+ %r = tail call i32 @llvm.eh.sjlj.setjmp(i8* bitcast ([5 x i8*]* @buf to i8*))
+ ret i32 %r
+; X86: sj0
+; x86: movl %ebp, buf
+; X86: movl %esp, buf+8
+; x86: movl ${{.*LBB.*}}, buf+4
+; X86: ret
+; PIC86: sj0
+; PIC86: movl %ebp, buf@GOTOFF(%[[GOT:.*]])
+; PIC86: movl %esp, buf@GOTOFF+8(%[[GOT]])
+; PIC86: leal {{.*LBB.*}}@GOTOFF(%[[GOT]]), %[[LREG:.*]]
+; PIC86: movl %[[LREG]], buf@GOTOFF+4
+; PIC86: ret
+; X64: sj0
+; x64: movq %rbp, buf(%rip)
+; x64: movq ${{.*LBB.*}}, buf+8(%rip)
+; X64: movq %rsp, buf+16(%rip)
+; X64: ret
+; PIC64: sj0
+; PIC64: movq %rbp, buf(%rip)
+; PIC64: movq %rsp, buf+16(%rip)
+; PIC64: leaq {{.*LBB.*}}(%rip), %[[LREG:.*]]
+; PIC64: movq %[[LREG]], buf+8(%rip)
+; PIC64: ret
+}
+
+define void @lj0() nounwind {
+ tail call void @llvm.eh.sjlj.longjmp(i8* bitcast ([5 x i8*]* @buf to i8*))
+ unreachable
+; X86: lj0
+; X86: movl buf, %ebp
+; X86: movl buf+4, %[[REG32:.*]]
+; X86: movl buf+8, %esp
+; X86: jmpl *%[[REG32]]
+; X64: lj0
+; X64: movq buf(%rip), %rbp
+; X64: movq buf+8(%rip), %[[REG64:.*]]
+; X64: movq buf+16(%rip), %rsp
+; X64: jmpq *%[[REG64]]
+}
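+
+; The buffer layout behind these checks: slot 0 holds the frame pointer,
+; slot 1 the resume address, and slot 2 the stack pointer, each slot one
+; pointer wide; hence buf, buf+4, buf+8 on i386 and buf, buf+8, buf+16 on
+; x86-64.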
diff --git a/test/CodeGen/X86/smul-with-overflow.ll b/test/CodeGen/X86/smul-with-overflow.ll
index 7ac3840..2d0b2f7 100644
--- a/test/CodeGen/X86/smul-with-overflow.ll
+++ b/test/CodeGen/X86/smul-with-overflow.ll
@@ -67,3 +67,17 @@ entry:
; CHECK: mull
; CHECK-NEXT: ret
}
+
+declare { i63, i1 } @llvm.smul.with.overflow.i63(i63, i63) nounwind readnone
+
+define i1 @test5() nounwind {
+entry:
+ %res = call { i63, i1 } @llvm.smul.with.overflow.i63(i63 4, i63 4611686018427387903)
+ %sum = extractvalue { i63, i1 } %res, 0
+ %overflow = extractvalue { i63, i1 } %res, 1
+ ret i1 %overflow
+; This was returning false but should return true (it is not constant folded yet, though).
+; PR13991
+; CHECK: test5:
+; CHECK-NOT: xorb
+}
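+
+; The arithmetic: 4611686018427387903 is 2^62 - 1, the largest positive
+; i63, and 4 * (2^62 - 1) = 2^64 - 4 lies far outside the signed i63 range
+; [-2^62, 2^62 - 1], so %overflow must come back true.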
diff --git a/test/CodeGen/X86/sse-intel-ocl.ll b/test/CodeGen/X86/sse-intel-ocl.ll
new file mode 100644
index 0000000..1885050
--- /dev/null
+++ b/test/CodeGen/X86/sse-intel-ocl.ll
@@ -0,0 +1,93 @@
+; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=nehalem | FileCheck -check-prefix=WIN32 %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=nehalem | FileCheck -check-prefix=WIN64 %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=nehalem | FileCheck -check-prefix=NOT_WIN %s
+
+declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *)
+declare <16 x float> @func_float16(<16 x float>, <16 x float>)
+; WIN64: testf16_inp
+; WIN64: addps {{.*}}, {{%xmm[0-3]}}
+; WIN64: addps {{.*}}, {{%xmm[0-3]}}
+; WIN64: addps {{.*}}, {{%xmm[0-3]}}
+; WIN64: addps {{.*}}, {{%xmm[0-3]}}
+; WIN64: leaq {{.*}}(%rsp), %rcx
+; WIN64: call
+; WIN64: ret
+
+; WIN32: testf16_inp
+; WIN32: movl %eax, (%esp)
+; WIN32: addps {{.*}}, {{%xmm[0-3]}}
+; WIN32: addps {{.*}}, {{%xmm[0-3]}}
+; WIN32: addps {{.*}}, {{%xmm[0-3]}}
+; WIN32: addps {{.*}}, {{%xmm[0-3]}}
+; WIN32: call
+; WIN32: ret
+
+; NOT_WIN: testf16_inp
+; NOT_WIN: addps {{.*}}, {{%xmm[0-3]}}
+; NOT_WIN: addps {{.*}}, {{%xmm[0-3]}}
+; NOT_WIN: addps {{.*}}, {{%xmm[0-3]}}
+; NOT_WIN: addps {{.*}}, {{%xmm[0-3]}}
+; NOT_WIN: leaq {{.*}}(%rsp), %rdi
+; NOT_WIN: call
+; NOT_WIN: ret
+
+; test calling conventions - input parameters
+define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
+ %y = alloca <16 x float>, align 16
+ %x = fadd <16 x float> %a, %b
+ %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
+ %2 = load <16 x float>* %y, align 16
+ %3 = fadd <16 x float> %2, %1
+ ret <16 x float> %3
+}
+
+; test calling conventions - preserved registers
+
+; preserved xmm6-xmm15
+; WIN64: testf16_regs
+; WIN64: call
+; WIN64: addps {{%xmm[6-9]}}, {{.*}}
+; WIN64: addps {{%xmm[6-9]}}, {{.*}}
+; WIN64: ret
+
+; preserved xmm8-xmm15
+; NOT_WIN: testf16_regs
+; NOT_WIN: call
+; NOT_WIN: addps {{%xmm([8-9]|1[0-1])}}, {{.*}}
+; NOT_WIN: addps {{%xmm([8-9]|1[0-1])}}, {{.*}}
+; NOT_WIN: addps {{%xmm([8-9]|1[0-1])}}, {{.*}}
+; NOT_WIN: addps {{%xmm([8-9]|1[0-1])}}, {{.*}}
+; NOT_WIN: ret
+
+define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
+ %y = alloca <16 x float>, align 16
+ %x = fadd <16 x float> %a, %b
+ %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
+ %2 = load <16 x float>* %y, align 16
+ %3 = fadd <16 x float> %1, %b
+ %4 = fadd <16 x float> %2, %3
+ ret <16 x float> %4
+}
+
+; test calling conventions - prolog and epilog
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill
+; NOT_WIN: call
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload
+define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
+ %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
+ ret <16 x float> %c
+}
diff --git a/test/CodeGen/X86/sse-minmax.ll b/test/CodeGen/X86/sse-minmax.ll
index 3839e87..0ba0215 100644
--- a/test/CodeGen/X86/sse-minmax.ll
+++ b/test/CodeGen/X86/sse-minmax.ll
@@ -47,8 +47,7 @@ define double @olt(double %x, double %y) nounwind {
; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
; CHECK-NEXT: ret
; UNSAFE: ogt_inverse:
-; UNSAFE-NEXT: minsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE: ogt_inverse:
; FINITE-NEXT: minsd %xmm0, %xmm1
@@ -65,8 +64,7 @@ define double @ogt_inverse(double %x, double %y) nounwind {
; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
; CHECK-NEXT: ret
; UNSAFE: olt_inverse:
-; UNSAFE-NEXT: maxsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE: olt_inverse:
; FINITE-NEXT: maxsd %xmm0, %xmm1
@@ -107,8 +105,7 @@ define double @ole(double %x, double %y) nounwind {
; CHECK: oge_inverse:
; CHECK-NEXT: ucomisd %xmm1, %xmm0
; UNSAFE: oge_inverse:
-; UNSAFE-NEXT: minsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE: oge_inverse:
; FINITE-NEXT: minsd %xmm0, %xmm1
@@ -123,8 +120,7 @@ define double @oge_inverse(double %x, double %y) nounwind {
; CHECK: ole_inverse:
; CHECK-NEXT: ucomisd %xmm0, %xmm1
; UNSAFE: ole_inverse:
-; UNSAFE-NEXT: maxsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE: ole_inverse:
; FINITE-NEXT: maxsd %xmm0, %xmm1
@@ -142,7 +138,8 @@ define double @ole_inverse(double %x, double %y) nounwind {
; CHECK-NEXT: ret
; UNSAFE: ogt_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd %xmm0, %xmm1
+; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE: ogt_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -160,7 +157,8 @@ define double @ogt_x(double %x) nounwind {
; CHECK-NEXT: ret
; UNSAFE: olt_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: minsd %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm0, %xmm1
+; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE: olt_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -218,7 +216,8 @@ define double @olt_inverse_x(double %x) nounwind {
; CHECK: ucomisd %xmm1, %xmm0
; UNSAFE: oge_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd %xmm0, %xmm1
+; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE: oge_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -234,7 +233,8 @@ define double @oge_x(double %x) nounwind {
; CHECK: ucomisd %xmm0, %xmm1
; UNSAFE: ole_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: minsd %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm0, %xmm1
+; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE: ole_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -313,8 +313,7 @@ define double @ult(double %x, double %y) nounwind {
; CHECK: ugt_inverse:
; CHECK: ucomisd %xmm0, %xmm1
; UNSAFE: ugt_inverse:
-; UNSAFE-NEXT: minsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE: ugt_inverse:
; FINITE-NEXT: minsd %xmm0, %xmm1
@@ -329,8 +328,7 @@ define double @ugt_inverse(double %x, double %y) nounwind {
; CHECK: ult_inverse:
; CHECK: ucomisd %xmm1, %xmm0
; UNSAFE: ult_inverse:
-; UNSAFE-NEXT: maxsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE: ult_inverse:
; FINITE-NEXT: maxsd %xmm0, %xmm1
@@ -378,8 +376,7 @@ define double @ule(double %x, double %y) nounwind {
; CHECK-NEXT: minsd %xmm1, %xmm0
; CHECK-NEXT: ret
; UNSAFE: uge_inverse:
-; UNSAFE-NEXT: minsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE: uge_inverse:
; FINITE-NEXT: minsd %xmm0, %xmm1
@@ -395,8 +392,7 @@ define double @uge_inverse(double %x, double %y) nounwind {
; CHECK-NEXT: maxsd %xmm1, %xmm0
; CHECK-NEXT: ret
; UNSAFE: ule_inverse:
-; UNSAFE-NEXT: maxsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE: ule_inverse:
; FINITE-NEXT: maxsd %xmm0, %xmm1
@@ -412,7 +408,8 @@ define double @ule_inverse(double %x, double %y) nounwind {
; CHECK: ucomisd %xmm0, %xmm1
; UNSAFE: ugt_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd %xmm0, %xmm1
+; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE: ugt_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -428,7 +425,8 @@ define double @ugt_x(double %x) nounwind {
; CHECK: ucomisd %xmm1, %xmm0
; UNSAFE: ult_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: minsd %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm0, %xmm1
+; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE: ult_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -483,7 +481,8 @@ define double @ult_inverse_x(double %x) nounwind {
; CHECK-NEXT: ret
; UNSAFE: uge_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd %xmm0, %xmm1
+; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE: uge_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -502,7 +501,8 @@ define double @uge_x(double %x) nounwind {
; CHECK-NEXT: ret
; UNSAFE: ule_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: minsd %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm0, %xmm1
+; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE: ule_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -590,9 +590,7 @@ define double @olt_y(double %x) nounwind {
; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
; CHECK-NEXT: ret
; UNSAFE: ogt_inverse_y:
-; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
-; UNSAFE-NEXT: minsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd {{[^,]*}}, %xmm0
; UNSAFE-NEXT: ret
; FINITE: ogt_inverse_y:
; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
@@ -611,9 +609,7 @@ define double @ogt_inverse_y(double %x) nounwind {
; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
; CHECK-NEXT: ret
; UNSAFE: olt_inverse_y:
-; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
-; UNSAFE-NEXT: maxsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd {{[^,]*}}, %xmm0
; UNSAFE-NEXT: ret
; FINITE: olt_inverse_y:
; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
@@ -657,9 +653,7 @@ define double @ole_y(double %x) nounwind {
; CHECK: oge_inverse_y:
; CHECK: ucomisd %xmm
; UNSAFE: oge_inverse_y:
-; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
-; UNSAFE-NEXT: minsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd {{[^,]*}}, %xmm0
; UNSAFE-NEXT: ret
; FINITE: oge_inverse_y:
; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
@@ -675,9 +669,7 @@ define double @oge_inverse_y(double %x) nounwind {
; CHECK: ole_inverse_y:
; CHECK: ucomisd %xmm
; UNSAFE: ole_inverse_y:
-; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
-; UNSAFE-NEXT: maxsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd {{[^,]*}}, %xmm0
; UNSAFE-NEXT: ret
; FINITE: ole_inverse_y:
; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
@@ -721,9 +713,7 @@ define double @ult_y(double %x) nounwind {
; CHECK: ugt_inverse_y:
; CHECK: ucomisd %xmm
; UNSAFE: ugt_inverse_y:
-; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
-; UNSAFE-NEXT: minsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd {{[^,]*}}, %xmm0
; UNSAFE-NEXT: ret
; FINITE: ugt_inverse_y:
; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
@@ -739,9 +729,7 @@ define double @ugt_inverse_y(double %x) nounwind {
; CHECK: ult_inverse_y:
; CHECK: ucomisd %xmm
; UNSAFE: ult_inverse_y:
-; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
-; UNSAFE-NEXT: maxsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd {{[^,]*}}, %xmm0
; UNSAFE-NEXT: ret
; FINITE: ult_inverse_y:
; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
@@ -792,9 +780,7 @@ define double @ule_y(double %x) nounwind {
; CHECK-NEXT: minsd {{[^,]*}}, %xmm0
; CHECK-NEXT: ret
; UNSAFE: uge_inverse_y:
-; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
-; UNSAFE-NEXT: minsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd {{[^,]*}}, %xmm0
; UNSAFE-NEXT: ret
; FINITE: uge_inverse_y:
; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
@@ -811,9 +797,7 @@ define double @uge_inverse_y(double %x) nounwind {
; CHECK-NEXT: maxsd {{[^,]*}}, %xmm0
; CHECK-NEXT: ret
; UNSAFE: ule_inverse_y:
-; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
-; UNSAFE-NEXT: maxsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd {{[^,]*}}, %xmm0
; UNSAFE-NEXT: ret
; FINITE: ule_inverse_y:
; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
diff --git a/test/CodeGen/X86/sse_partial_update.ll b/test/CodeGen/X86/sse_partial_update.ll
new file mode 100644
index 0000000..655f758
--- /dev/null
+++ b/test/CodeGen/X86/sse_partial_update.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse2 -mcpu=nehalem | FileCheck %s
+
+; rdar: 12558838
+; PR14221
+; There is a mismatch between the intrinsic and the actual instruction.
+; The actual instruction has a partial update of dest, while the intrinsic
+; passes through the upper FP values. Here, we make sure the source and
+; destination of rsqrtss (and of rcpss below) are the same.
+define void @t1(<4 x float> %a) nounwind uwtable ssp {
+entry:
+; CHECK: t1:
+; CHECK: rsqrtss %xmm0, %xmm0
+ %0 = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a) nounwind
+ %a.addr.0.extract = extractelement <4 x float> %0, i32 0
+ %conv = fpext float %a.addr.0.extract to double
+ %a.addr.4.extract = extractelement <4 x float> %0, i32 1
+ %conv3 = fpext float %a.addr.4.extract to double
+ tail call void @callee(double %conv, double %conv3) nounwind
+ ret void
+}
+declare void @callee(double, double)
+declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
+
+define void @t2(<4 x float> %a) nounwind uwtable ssp {
+entry:
+; CHECK: t2:
+; CHECK: rcpss %xmm0, %xmm0
+ %0 = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a) nounwind
+ %a.addr.0.extract = extractelement <4 x float> %0, i32 0
+ %conv = fpext float %a.addr.0.extract to double
+ %a.addr.4.extract = extractelement <4 x float> %0, i32 1
+ %conv3 = fpext float %a.addr.4.extract to double
+ tail call void @callee(double %conv, double %conv3) nounwind
+ ret void
+}
+declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
diff --git a/test/CodeGen/X86/tailcall-64.ll b/test/CodeGen/X86/tailcall-64.ll
index 7030753..ecc253b 100644
--- a/test/CodeGen/X86/tailcall-64.ll
+++ b/test/CodeGen/X86/tailcall-64.ll
@@ -1,6 +1,4 @@
-; RUN: llc < %s | FileCheck %s
-target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-apple-darwin11.4.0"
+; RUN: llc -mtriple=x86_64-apple-macosx -mcpu=core2 < %s | FileCheck %s
declare i64 @testi()
@@ -93,4 +91,67 @@ define { i64, i64 } @crash(i8* %this) {
ret { i64, i64 } %mrv7
}
+; Check that we can fold an indexed load into a tail call instruction.
+; CHECK: fold_indexed_load
+; CHECK: leaq (%rsi,%rsi,4), %[[RAX:r..]]
+; CHECK: jmpq *16(%{{r..}},%[[RAX]],8) # TAILCALL
+%struct.funcs = type { i32 (i8*, i32*, i32)*, i32 (i8*)*, i32 (i8*)*, i32 (i8*, i32)*, i32 }
+@func_table = external global [0 x %struct.funcs]
+define void @fold_indexed_load(i8* %mbstr, i64 %idxprom) nounwind uwtable ssp {
+entry:
+ %dsplen = getelementptr inbounds [0 x %struct.funcs]* @func_table, i64 0, i64 %idxprom, i32 2
+ %x1 = load i32 (i8*)** %dsplen, align 8
+ %call = tail call i32 %x1(i8* %mbstr) nounwind
+ ret void
+}
+
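+; The address arithmetic the checks encode: %struct.funcs is four pointers
+; plus an i32, padded to 40 bytes, so element %idxprom starts at
+; idxprom * 40 = (idxprom + 4 * idxprom) * 8; the leaq computes the
+; idxprom * 5 factor, and member 2 adds the final 16-byte displacement in
+; the folded 16(%reg,%rax,8) operand.
+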
+; <rdar://problem/12282281> Fold an indexed load into the tail call instruction.
+; Calling a varargs function with 6 arguments requires 7 registers (%al is the
+; vector count for varargs functions). This leaves %r11 as the only available
+; scratch register.
+;
+; It is not possible to fold an indexed load into TCRETURNmi64 in that case.
+;
+; typedef int (*funcptr)(void*, ...);
+; extern const funcptr funcs[];
+; int f(int n) {
+; return funcs[n](0, 0, 0, 0, 0, 0);
+; }
+;
+; CHECK: rdar12282281
+; CHECK: jmpq *%r11 # TAILCALL
+@funcs = external constant [0 x i32 (i8*, ...)*]
+
+define i32 @rdar12282281(i32 %n) nounwind uwtable ssp {
+entry:
+ %idxprom = sext i32 %n to i64
+ %arrayidx = getelementptr inbounds [0 x i32 (i8*, ...)*]* @funcs, i64 0, i64 %idxprom
+ %0 = load i32 (i8*, ...)** %arrayidx, align 8
+ %call = tail call i32 (i8*, ...)* %0(i8* null, i32 0, i32 0, i32 0, i32 0, i32 0) nounwind
+ ret i32 %call
+}
+
+define x86_fp80 @fp80_call(x86_fp80 %x) nounwind {
+entry:
+; CHECK: fp80_call:
+; CHECK: jmp _fp80_callee
+ %call = tail call x86_fp80 @fp80_callee(x86_fp80 %x) nounwind
+ ret x86_fp80 %call
+}
+
+declare x86_fp80 @fp80_callee(x86_fp80)
+
+; rdar://12229511
+define x86_fp80 @trunc_fp80(x86_fp80 %x) nounwind {
+entry:
+; CHECK: trunc_fp80
+; CHECK: callq _trunc
+; CHECK-NOT: jmp _trunc
+; CHECK: ret
+ %conv = fptrunc x86_fp80 %x to double
+ %call = tail call double @trunc(double %conv) nounwind readnone
+ %conv1 = fpext double %call to x86_fp80
+ ret x86_fp80 %conv1
+}
+declare double @trunc(double) nounwind readnone
diff --git a/test/CodeGen/X86/targetLoweringGeneric.ll b/test/CodeGen/X86/targetLoweringGeneric.ll
index ba5f8f8..a773e9d 100644
--- a/test/CodeGen/X86/targetLoweringGeneric.ll
+++ b/test/CodeGen/X86/targetLoweringGeneric.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=i386-apple-darwin9 -fast-isel=false -O0 < %s | FileCheck %s
+; RUN: llc -mtriple=i386-apple-darwin9 -mcpu=corei7 -fast-isel=false -O0 < %s | FileCheck %s
; Gather non-machine specific tests for the transformations in
; CodeGen/SelectionDAG/TargetLowering. Currently, these
diff --git a/test/CodeGen/X86/tls-pic.ll b/test/CodeGen/X86/tls-pic.ll
index 51c3d23..b823f0a 100644
--- a/test/CodeGen/X86/tls-pic.ll
+++ b/test/CodeGen/X86/tls-pic.ll
@@ -76,12 +76,12 @@ entry:
; X32: f5:
; X32: leal {{[jk]}}@TLSLDM(%ebx)
-; X32-NEXT: calll ___tls_get_addr@PLT
-; X32-NEXT: movl {{[jk]}}@DTPOFF(%eax)
-; X32-NEXT: addl {{[jk]}}@DTPOFF(%eax)
+; X32: calll ___tls_get_addr@PLT
+; X32: movl {{[jk]}}@DTPOFF(%e
+; X32: addl {{[jk]}}@DTPOFF(%e
; X64: f5:
; X64: leaq {{[jk]}}@TLSLD(%rip), %rdi
-; X64-NEXT: callq __tls_get_addr@PLT
-; X64-NEXT: movl {{[jk]}}@DTPOFF(%rax)
-; X64-NEXT: addl {{[jk]}}@DTPOFF(%rax)
+; X64: callq __tls_get_addr@PLT
+; X64: movl {{[jk]}}@DTPOFF(%r
+; X64: addl {{[jk]}}@DTPOFF(%r
diff --git a/test/CodeGen/X86/trunc-ext-ld-st.ll b/test/CodeGen/X86/trunc-ext-ld-st.ll
index 9877d7b..1d22a18 100644
--- a/test/CodeGen/X86/trunc-ext-ld-st.ll
+++ b/test/CodeGen/X86/trunc-ext-ld-st.ll
@@ -2,8 +2,7 @@
;CHECK: load_2_i8
; A single 16-bit load
-;CHECK: movzwl
-;CHECK: pshufb
+;CHECK: pmovzxbq
;CHECK: paddq
;CHECK: pshufb
; A single 16-bit store
@@ -19,8 +18,7 @@ define void @load_2_i8(<2 x i8>* %A) {
;CHECK: load_2_i16
; Read 32-bits
-;CHECK: movd
-;CHECK: pshufb
+;CHECK: pmovzxwq
;CHECK: paddq
;CHECK: pshufb
;CHECK: movd
@@ -33,7 +31,7 @@ define void @load_2_i16(<2 x i16>* %A) {
}
;CHECK: load_2_i32
-;CHECK: pshufd
+;CHECK: pmovzxdq
;CHECK: paddq
;CHECK: pshufd
;CHECK: ret
@@ -45,8 +43,7 @@ define void @load_2_i32(<2 x i32>* %A) {
}
;CHECK: load_4_i8
-;CHECK: movd
-;CHECK: pshufb
+;CHECK: pmovzxbd
;CHECK: paddd
;CHECK: pshufb
;CHECK: ret
@@ -58,7 +55,7 @@ define void @load_4_i8(<4 x i8>* %A) {
}
;CHECK: load_4_i16
-;CHECK: punpcklwd
+;CHECK: pmovzxwd
;CHECK: paddd
;CHECK: pshufb
;CHECK: ret
@@ -70,7 +67,7 @@ define void @load_4_i16(<4 x i16>* %A) {
}
;CHECK: load_8_i8
-;CHECK: punpcklbw
+;CHECK: pmovzxbw
;CHECK: paddw
;CHECK: pshufb
;CHECK: ret
diff --git a/test/CodeGen/X86/vec_compare-2.ll b/test/CodeGen/X86/vec_compare-2.ll
index 46d6a23..4da7953 100644
--- a/test/CodeGen/X86/vec_compare-2.ll
+++ b/test/CodeGen/X86/vec_compare-2.ll
@@ -10,8 +10,7 @@ define void @blackDespeckle_wrapper(i8** %args_list, i64* %gtid, i64 %xend) {
entry:
; CHECK: cfi_def_cfa_offset
; CHECK-NOT: set
-; CHECK: punpcklwd
-; CHECK: pshufd
+; CHECK: pmovzxwq
; CHECK: pshufb
%shr.i = ashr <4 x i32> zeroinitializer, <i32 3, i32 3, i32 3, i32 3> ; <<4 x i32>> [#uses=1]
%cmp318.i = sext <4 x i1> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=1]
diff --git a/test/CodeGen/X86/vec_fabs.ll b/test/CodeGen/X86/vec_fabs.ll
new file mode 100644
index 0000000..82517cb
--- /dev/null
+++ b/test/CodeGen/X86/vec_fabs.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s
+
+
+define <2 x double> @fabs_v2f64(<2 x double> %p)
+{
+ ; CHECK: fabs_v2f64
+ ; CHECK: vandps
+ %t = call <2 x double> @llvm.fabs.v2f64(<2 x double> %p)
+ ret <2 x double> %t
+}
+declare <2 x double> @llvm.fabs.v2f64(<2 x double> %p)
+
+define <4 x float> @fabs_v4f32(<4 x float> %p)
+{
+ ; CHECK: fabs_v4f32
+ ; CHECK: vandps
+ %t = call <4 x float> @llvm.fabs.v4f32(<4 x float> %p)
+ ret <4 x float> %t
+}
+declare <4 x float> @llvm.fabs.v4f32(<4 x float> %p)
+
+define <4 x double> @fabs_v4f64(<4 x double> %p)
+{
+ ; CHECK: fabs_v4f64
+ ; CHECK: vandps
+ %t = call <4 x double> @llvm.fabs.v4f64(<4 x double> %p)
+ ret <4 x double> %t
+}
+declare <4 x double> @llvm.fabs.v4f64(<4 x double> %p)
+
+define <8 x float> @fabs_v8f32(<8 x float> %p)
+{
+ ; CHECK: fabs_v8f32
+ ; CHECK: vandps
+ %t = call <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
+ ret <8 x float> %t
+}
+declare <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
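+
+; There is no dedicated fabs instruction on this path; each case lowers to
+; an AND with a constant-pool mask that clears the lane sign bits, roughly:
+;   vandps LCPI0_0(%rip), %xmm0, %xmm0  ; mask = 0x7fffffff per f32 lane
+; (constant-pool label and registers illustrative).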
diff --git a/test/CodeGen/X86/vec_floor.ll b/test/CodeGen/X86/vec_floor.ll
new file mode 100644
index 0000000..5e0160b
--- /dev/null
+++ b/test/CodeGen/X86/vec_floor.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s
+
+
+define <2 x double> @floor_v2f64(<2 x double> %p)
+{
+ ; CHECK: floor_v2f64
+ ; CHECK: vroundpd
+ %t = call <2 x double> @llvm.floor.v2f64(<2 x double> %p)
+ ret <2 x double> %t
+}
+declare <2 x double> @llvm.floor.v2f64(<2 x double> %p)
+
+define <4 x float> @floor_v4f32(<4 x float> %p)
+{
+ ; CHECK: floor_v4f32
+ ; CHECK: vroundps
+ %t = call <4 x float> @llvm.floor.v4f32(<4 x float> %p)
+ ret <4 x float> %t
+}
+declare <4 x float> @llvm.floor.v4f32(<4 x float> %p)
+
+define <4 x double> @floor_v4f64(<4 x double> %p)
+{
+ ; CHECK: floor_v4f64
+ ; CHECK: vroundpd
+ %t = call <4 x double> @llvm.floor.v4f64(<4 x double> %p)
+ ret <4 x double> %t
+}
+declare <4 x double> @llvm.floor.v4f64(<4 x double> %p)
+
+define <8 x float> @floor_v8f32(<8 x float> %p)
+{
+ ; CHECK: floor_v8f32
+ ; CHECK: vroundps
+ %t = call <8 x float> @llvm.floor.v8f32(<8 x float> %p)
+ ret <8 x float> %t
+}
+declare <8 x float> @llvm.floor.v8f32(<8 x float> %p)
diff --git a/test/CodeGen/X86/vec_fpext.ll b/test/CodeGen/X86/vec_fpext.ll
index 05b263e..dc0464f 100644
--- a/test/CodeGen/X86/vec_fpext.ll
+++ b/test/CodeGen/X86/vec_fpext.ll
@@ -1,14 +1,38 @@
; RUN: llc < %s -march=x86 -mattr=+sse41,-avx | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+avx | FileCheck --check-prefix=AVX %s
; PR11674
define void @fpext_frommem(<2 x float>* %in, <2 x double>* %out) {
entry:
-; TODO: We should be able to generate cvtps2pd for the load.
-; For now, just check that we generate something sane.
-; CHECK: cvtss2sd
-; CHECK: cvtss2sd
+; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
+; AVX: vcvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
%0 = load <2 x float>* %in, align 8
%1 = fpext <2 x float> %0 to <2 x double>
store <2 x double> %1, <2 x double>* %out, align 1
ret void
}
+
+define void @fpext_frommem4(<4 x float>* %in, <4 x double>* %out) {
+entry:
+; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
+; CHECK: cvtps2pd 8(%{{.+}}), %xmm{{[0-9]+}}
+; AVX: vcvtps2pd (%{{.+}}), %ymm{{[0-9]+}}
+ %0 = load <4 x float>* %in
+ %1 = fpext <4 x float> %0 to <4 x double>
+ store <4 x double> %1, <4 x double>* %out, align 1
+ ret void
+}
+
+define void @fpext_frommem8(<8 x float>* %in, <8 x double>* %out) {
+entry:
+; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
+; CHECK: cvtps2pd 8(%{{.+}}), %xmm{{[0-9]+}}
+; CHECK: cvtps2pd 16(%{{.+}}), %xmm{{[0-9]+}}
+; CHECK: cvtps2pd 24(%{{.+}}), %xmm{{[0-9]+}}
+; AVX: vcvtps2pd (%{{.+}}), %ymm{{[0-9]+}}
+; AVX: vcvtps2pd 16(%{{.+}}), %ymm{{[0-9]+}}
+ %0 = load <8 x float>* %in
+ %1 = fpext <8 x float> %0 to <8 x double>
+ store <8 x double> %1, <8 x double>* %out, align 1
+ ret void
+}
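+
+; Each 128-bit cvtps2pd converts two floats directly from memory, so the
+; SSE path needs one load-folded convert per eight input bytes (offsets
+; 0, 8, 16, 24 above), while the AVX ymm form handles four floats per
+; instruction (offsets 0 and 16).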
diff --git a/test/CodeGen/X86/vec_shuffle-26.ll b/test/CodeGen/X86/vec_shuffle-26.ll
index 086af6b..4c56f84 100644
--- a/test/CodeGen/X86/vec_shuffle-26.ll
+++ b/test/CodeGen/X86/vec_shuffle-26.ll
@@ -1,6 +1,5 @@
-; RUN: llc < %s -march=x86 -mattr=sse41 -o %t
-; RUN: grep unpcklps %t | count 1
-; RUN: grep unpckhps %t | count 3
+; RUN: llc < %s -march=x86 -mcpu=generic -mattr=sse41 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=atom | FileCheck -check-prefix=ATOM %s
; Transpose example using the more generic vector shuffle. Return float8
; instead of float16
@@ -14,6 +13,17 @@ target triple = "i386-apple-cl.1.0"
define <8 x float> @__transpose2(<4 x float> %p0, <4 x float> %p1, <4 x float> %p2, <4 x float> %p3) nounwind {
entry:
+; CHECK: transpose2
+; CHECK: unpckhps
+; CHECK: unpckhps
+; CHECK: unpcklps
+; CHECK: unpckhps
+; Different instruction order for Atom.
+; ATOM: transpose2
+; ATOM: unpckhps
+; ATOM: unpckhps
+; ATOM: unpckhps
+; ATOM: unpcklps
%unpcklps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=2]
%unpckhps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=2]
%unpcklps8 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=2]
@@ -27,3 +37,32 @@ entry:
; %r3 = shufflevector <8 x float> %r1, <8 x float> %r2, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15 >;
ret <8 x float> %r2
}
+
+define <2 x i64> @lo_hi_shift(float* nocapture %x, float* nocapture %y) nounwind {
+entry:
+; movhps should happen before extractps to ensure it reads the correct value.
+; CHECK: lo_hi_shift
+; CHECK: movhps ([[BASEREG:%[a-z]+]]),
+; CHECK: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
+; CHECK: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
+; ATOM: lo_hi_shift
+; ATOM: movhps ([[BASEREG:%[a-z]+]]),
+; ATOM: movd %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
+; ATOM: movd %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
+ %v.i = bitcast float* %y to <4 x float>*
+ %0 = load <4 x float>* %v.i, align 1
+ %1 = bitcast float* %x to <1 x i64>*
+ %.val = load <1 x i64>* %1, align 1
+ %2 = bitcast <1 x i64> %.val to <2 x float>
+ %shuffle.i = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %shuffle1.i = shufflevector <4 x float> %0, <4 x float> %shuffle.i, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ %cast.i = bitcast <4 x float> %0 to <2 x i64>
+ %extract.i = extractelement <2 x i64> %cast.i, i32 1
+ %3 = bitcast float* %x to i64*
+ store i64 %extract.i, i64* %3, align 4
+ %4 = bitcast <4 x float> %0 to <16 x i8>
+ %5 = bitcast <4 x float> %shuffle1.i to <16 x i8>
+ %palignr = shufflevector <16 x i8> %5, <16 x i8> %4, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ %6 = bitcast <16 x i8> %palignr to <2 x i64>
+ ret <2 x i64> %6
+}
diff --git a/test/CodeGen/X86/vec_shuffle-30.ll b/test/CodeGen/X86/vec_shuffle-30.ll
index 1651c4c..f5f8842 100644
--- a/test/CodeGen/X86/vec_shuffle-30.ll
+++ b/test/CodeGen/X86/vec_shuffle-30.ll
@@ -1,21 +1,25 @@
-; RUN: llc < %s -march=x86 -mattr=sse41 -o %t
-; RUN: grep pshufhw %t | grep -- -95 | count 1
-; RUN: grep shufps %t | count 1
-; RUN: not grep pslldq %t
+; RUN: llc < %s -march=x86 -mattr=+avx | FileCheck %s
+; CHECK: test
; Test case: when creating pshufhw, we incorrectly set the higher-order bit
; for an undef.
define void @test(<8 x i16>* %dest, <8 x i16> %in) nounwind {
entry:
+; CHECK-NOT: vmovaps
+; CHECK: vmovlpd
+; CHECK: vpshufhw $-95
%0 = load <8 x i16>* %dest
%1 = shufflevector <8 x i16> %0, <8 x i16> %in, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 13, i32 undef, i32 14, i32 14>
store <8 x i16> %1, <8 x i16>* %dest
ret void
-}
+}
+; CHECK: test2
; A test case where we shouldn't generate a punpckldq but a pshufd and a pslldq
define void @test2(<4 x i32>* %dest, <4 x i32> %in) nounwind {
entry:
+; CHECK-NOT: pslldq
+; CHECK: shufps
%0 = shufflevector <4 x i32> %in, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> < i32 undef, i32 5, i32 undef, i32 2>
store <4 x i32> %0, <4 x i32>* %dest
ret void
diff --git a/test/CodeGen/X86/widen_cast-1.ll b/test/CodeGen/X86/widen_cast-1.ll
index ebdfea9..56c6364 100644
--- a/test/CodeGen/X86/widen_cast-1.ll
+++ b/test/CodeGen/X86/widen_cast-1.ll
@@ -1,5 +1,5 @@
; RUN: llc -march=x86 -mcpu=generic -mattr=+sse42 < %s | FileCheck %s
-; RUN: llc -march=x86 -mcpu=atom -mattr=+sse42 < %s | FileCheck -check-prefix=ATOM %s
+; RUN: llc -march=x86 -mcpu=atom < %s | FileCheck -check-prefix=ATOM %s
; CHECK: paddd
; CHECK: movl
diff --git a/test/CodeGen/X86/widen_load-1.ll b/test/CodeGen/X86/widen_load-1.ll
index 9705d14..dfaa3d6 100644
--- a/test/CodeGen/X86/widen_load-1.ll
+++ b/test/CodeGen/X86/widen_load-1.ll
@@ -1,12 +1,17 @@
-; RUN: llc %s -o - -march=x86-64 -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+; RUN: llc %s -o - -march=x86-64 -mattr=-avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=SSE
+; RUN: llc %s -o - -march=x86-64 -mattr=+avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=AVX
; PR4891
; PR5626
; This load should be before the call, not after.
-; CHECK: movaps compl+128(%rip), %xmm0
-; CHECK: movaps %xmm0, (%rsp)
-; CHECK: callq killcommon
+; SSE: movaps compl+128(%rip), %xmm0
+; SSE: movaps %xmm0, (%rsp)
+; SSE: callq killcommon
+
+; AVX: vmovapd compl+128(%rip), %xmm0
+; AVX: vmovapd %xmm0, (%rsp)
+; AVX: callq killcommon
@compl = linkonce global [20 x i64] zeroinitializer, align 64 ; <[20 x i64]*> [#uses=1]
diff --git a/test/CodeGen/X86/widen_load-2.ll b/test/CodeGen/X86/widen_load-2.ll
index 79aa000..224898c 100644
--- a/test/CodeGen/X86/widen_load-2.ll
+++ b/test/CodeGen/X86/widen_load-2.ll
@@ -170,7 +170,7 @@ define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp
; CHECK: rot
%i8vec3pack = type { <3 x i8>, i8 }
define %i8vec3pack @rot() nounwind {
-; CHECK: movd {{-?[0-9]+}}(%rsp), {{%xmm[0-9]}}
+; CHECK: pmovzxbd {{-?[0-9]+}}(%rsp), {{%xmm[0-9]}}
entry:
%X = alloca %i8vec3pack, align 4
%rot = alloca %i8vec3pack, align 4
diff --git a/test/CodeGen/X86/xmulo.ll b/test/CodeGen/X86/xmulo.ll
new file mode 100644
index 0000000..486dafe
--- /dev/null
+++ b/test/CodeGen/X86/xmulo.ll
@@ -0,0 +1,50 @@
+; RUN: llc %s -o - | FileCheck %s
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.8.0"
+
+declare {i64, i1} @llvm.umul.with.overflow.i64(i64, i64) nounwind readnone
+declare i32 @printf(i8*, ...)
+
+@.str = private unnamed_addr constant [10 x i8] c"%llx, %d\0A\00", align 1
+
+define i32 @t1() nounwind {
+; CHECK: t1:
+; CHECK: movl $0, 12(%esp)
+; CHECK: movl $0, 8(%esp)
+; CHECK: movl $72, 4(%esp)
+
+ %1 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 9, i64 8)
+ %2 = extractvalue {i64, i1} %1, 0
+ %3 = extractvalue {i64, i1} %1, 1
+ %4 = zext i1 %3 to i32
+ %5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([10 x i8]* @.str, i32 0, i32 0), i64 %2, i32 %4)
+ ret i32 0
+}
+
+define i32 @t2() nounwind {
+; CHECK: t2:
+; CHECK: movl $0, 12(%esp)
+; CHECK: movl $0, 8(%esp)
+; CHECK: movl $0, 4(%esp)
+
+ %1 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 9, i64 0)
+ %2 = extractvalue {i64, i1} %1, 0
+ %3 = extractvalue {i64, i1} %1, 1
+ %4 = zext i1 %3 to i32
+ %5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([10 x i8]* @.str, i32 0, i32 0), i64 %2, i32 %4)
+ ret i32 0
+}
+
+define i32 @t3() nounwind {
+; CHECK: t3:
+; CHECK: movl $1, 12(%esp)
+; CHECK: movl $-1, 8(%esp)
+; CHECK: movl $-9, 4(%esp)
+
+ %1 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 9, i64 -1)
+ %2 = extractvalue {i64, i1} %1, 0
+ %3 = extractvalue {i64, i1} %1, 1
+ %4 = zext i1 %3 to i32
+ %5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([10 x i8]* @.str, i32 0, i32 0), i64 %2, i32 %4)
+ ret i32 0
+}
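+
+; The t3 constants check out: as a 64-bit unsigned multiply,
+; 9 * (2^64 - 1) = 9 * 2^64 - 9, i.e. a low half of -9 with a carry out,
+; so the stores are the i64 result split as $-9/$-1 on i386 plus an
+; overflow flag of $1.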