Diffstat (limited to 'test/CodeGen/X86')
-rw-r--r--  test/CodeGen/X86/2003-08-03-CallArgLiveRanges.ll | 2
-rw-r--r--  test/CodeGen/X86/2003-11-03-GlobalBool.ll | 2
-rw-r--r--  test/CodeGen/X86/2004-02-13-FrameReturnAddress.ll | 6
-rw-r--r--  test/CodeGen/X86/2004-03-30-Select-Max.ll | 3
-rw-r--r--  test/CodeGen/X86/2006-03-01-InstrSchedBug.ll | 2
-rw-r--r--  test/CodeGen/X86/2006-03-02-InstrSchedBug.ll | 2
-rw-r--r--  test/CodeGen/X86/2006-04-27-ISelFoldingBug.ll | 4
-rw-r--r--  test/CodeGen/X86/2006-05-01-SchedCausingSpills.ll | 4
-rw-r--r--  test/CodeGen/X86/2006-05-02-InstrSched1.ll | 2
-rw-r--r--  test/CodeGen/X86/2006-05-02-InstrSched2.ll | 2
-rw-r--r--  test/CodeGen/X86/2006-05-08-InstrSched.ll | 2
-rw-r--r--  test/CodeGen/X86/2006-05-11-InstrSched.ll | 4
-rw-r--r--  test/CodeGen/X86/2006-07-31-SingleRegClass.ll | 4
-rw-r--r--  test/CodeGen/X86/2006-08-21-ExtraMovInst.ll | 2
-rw-r--r--  test/CodeGen/X86/2006-11-12-CSRetCC.ll | 4
-rw-r--r--  test/CodeGen/X86/2006-11-17-IllegalMove.ll | 4
-rw-r--r--  test/CodeGen/X86/2007-01-13-StackPtrIndex.ll | 2
-rw-r--r--  test/CodeGen/X86/2007-03-24-InlineAsmPModifier.ll | 2
-rw-r--r--  test/CodeGen/X86/2007-03-24-InlineAsmVectorOp.ll | 2
-rw-r--r--  test/CodeGen/X86/2007-04-27-InlineAsm-IntMemInput.ll | 2
-rw-r--r--  test/CodeGen/X86/2007-05-07-InvokeSRet.ll | 2
-rw-r--r--  test/CodeGen/X86/2007-08-10-SignExtSubreg.ll | 2
-rw-r--r--  test/CodeGen/X86/2007-09-05-InvalidAsm.ll | 3
-rw-r--r--  test/CodeGen/X86/2007-11-04-rip-immediate-constant.ll | 2
-rw-r--r--  test/CodeGen/X86/2007-12-18-LoadCSEBug.ll | 2
-rw-r--r--  test/CodeGen/X86/2008-01-08-SchedulerCrash.ll | 6
-rw-r--r--  test/CodeGen/X86/2008-01-16-FPStackifierAssert.ll | 2
-rw-r--r--  test/CodeGen/X86/2008-02-18-TailMergingBug.ll | 2
-rw-r--r--  test/CodeGen/X86/2008-02-20-InlineAsmClobber.ll | 4
-rw-r--r--  test/CodeGen/X86/2008-02-22-LocalRegAllocBug.ll | 2
-rw-r--r--  test/CodeGen/X86/2008-03-23-DarwinAsmComments.ll | 2
-rw-r--r--  test/CodeGen/X86/2008-04-16-ReMatBug.ll | 2
-rw-r--r--  test/CodeGen/X86/2008-04-17-CoalescerBug.ll | 2
-rw-r--r--  test/CodeGen/X86/2008-04-28-CoalescerBug.ll | 6
-rw-r--r--  test/CodeGen/X86/2008-05-28-LocalRegAllocBug.ll | 2
-rw-r--r--  test/CodeGen/X86/2008-08-06-CmpStride.ll | 2
-rw-r--r--  test/CodeGen/X86/2008-08-31-EH_RETURN32.ll | 27
-rw-r--r--  test/CodeGen/X86/2008-09-17-inline-asm-1.ll | 2
-rw-r--r--  test/CodeGen/X86/2008-09-18-inline-asm-2.ll | 2
-rw-r--r--  test/CodeGen/X86/2008-10-24-FlippedCompare.ll | 2
-rw-r--r--  test/CodeGen/X86/2008-10-27-CoalescerBug.ll | 5
-rw-r--r--  test/CodeGen/X86/2008-12-23-crazy-address.ll | 2
-rw-r--r--  test/CodeGen/X86/2009-01-31-BigShift2.ll | 2
-rw-r--r--  test/CodeGen/X86/2009-02-25-CommuteBug.ll | 2
-rw-r--r--  test/CodeGen/X86/2009-02-26-MachineLICMBug.ll | 4
-rw-r--r--  test/CodeGen/X86/2009-03-12-CPAlignBug.ll | 2
-rw-r--r--  test/CodeGen/X86/2009-03-23-MultiUseSched.ll | 4
-rw-r--r--  test/CodeGen/X86/2009-04-16-SpillerUnfold.ll | 2
-rw-r--r--  test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll | 2
-rw-r--r--  test/CodeGen/X86/2009-04-24.ll | 6
-rw-r--r--  test/CodeGen/X86/2009-04-29-IndirectDestOperands.ll | 2
-rw-r--r--  test/CodeGen/X86/2009-05-30-ISelBug.ll | 2
-rw-r--r--  test/CodeGen/X86/20090313-signext.ll | 4
-rw-r--r--  test/CodeGen/X86/2010-01-19-OptExtBug.ll | 2
-rw-r--r--  test/CodeGen/X86/2010-05-06-LocalInlineAsmClobber.ll | 2
-rw-r--r--  test/CodeGen/X86/2010-05-12-FastAllocKills.ll | 2
-rw-r--r--  test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll | 2
-rw-r--r--  test/CodeGen/X86/2011-04-13-SchedCmpJmp.ll | 4
-rw-r--r--  test/CodeGen/X86/2011-04-19-sclr-bb.ll | 21
-rw-r--r--  test/CodeGen/X86/2011-06-03-x87chain.ll | 2
-rw-r--r--  test/CodeGen/X86/2011-06-12-FastAllocSpill.ll | 2
-rw-r--r--  test/CodeGen/X86/2011-09-18-sse2cmp.ll | 2
-rw-r--r--  test/CodeGen/X86/2011-09-21-setcc-bug.ll | 2
-rw-r--r--  test/CodeGen/X86/2011-10-11-srl.ll | 2
-rw-r--r--  test/CodeGen/X86/2011-12-15-vec_shift.ll | 4
-rw-r--r--  test/CodeGen/X86/2012-02-20-MachineCPBug.ll | 2
-rw-r--r--  test/CodeGen/X86/2012-03-26-PostRALICMBug.ll | 4
-rw-r--r--  test/CodeGen/X86/2012-04-26-sdglue.ll | 3
-rw-r--r--  test/CodeGen/X86/2012-05-17-TwoAddressBug.ll | 16
-rw-r--r--  test/CodeGen/X86/2012-05-19-CoalescerCrash.ll | 122
-rw-r--r--  test/CodeGen/X86/2012-05-19-avx2-store.ll | 13
-rw-r--r--  test/CodeGen/X86/2012-07-10-extload64.ll | 32
-rw-r--r--  test/CodeGen/X86/2012-07-10-shufnorm.ll | 17
-rw-r--r--  test/CodeGen/X86/2012-07-15-broadcastfold.ll | 23
-rw-r--r--  test/CodeGen/X86/2012-07-15-tconst_shl.ll | 9
-rw-r--r--  test/CodeGen/X86/2012-07-15-vshl.ll | 31
-rw-r--r--  test/CodeGen/X86/2012-07-16-LeaUndef.ll | 16
-rw-r--r--  test/CodeGen/X86/2012-07-16-fp2ui-i1.ll | 12
-rw-r--r--  test/CodeGen/X86/2012-07-17-vtrunc.ll | 16
-rw-r--r--  test/CodeGen/X86/2012-07-23-select_cc.ll | 19
-rw-r--r--  test/CodeGen/X86/2012-08-07-CmpISelBug.ll | 36
-rw-r--r--  test/CodeGen/X86/4char-promote.ll | 9
-rw-r--r--  test/CodeGen/X86/MachineSink-PHIUse.ll | 2
-rw-r--r--  test/CodeGen/X86/add.ll | 10
-rw-r--r--  test/CodeGen/X86/addr-label-difference.ll | 2
-rw-r--r--  test/CodeGen/X86/aligned-comm.ll | 4
-rw-r--r--  test/CodeGen/X86/alignment-2.ll | 4
-rw-r--r--  test/CodeGen/X86/alloca-align-rounding-32.ll | 7
-rw-r--r--  test/CodeGen/X86/alloca-align-rounding.ll | 7
-rw-r--r--  test/CodeGen/X86/andimm8.ll | 2
-rw-r--r--  test/CodeGen/X86/asm-reg-type-mismatch.ll (renamed from test/CodeGen/X86/2008-08-25-AsmRegTypeMismatch.ll) | 19
-rw-r--r--  test/CodeGen/X86/atom-lea-sp.ll | 26
-rw-r--r--  test/CodeGen/X86/atom-sched.ll | 3
-rw-r--r--  test/CodeGen/X86/atomic_op.ll | 2
-rw-r--r--  test/CodeGen/X86/avx-blend.ll | 2
-rw-r--r--  test/CodeGen/X86/avx-intrinsics-x86.ll | 57
-rw-r--r--  test/CodeGen/X86/avx-minmax.ll | 2
-rwxr-xr-x  test/CodeGen/X86/avx-shuffle-x86_32.ll | 2
-rw-r--r--  test/CodeGen/X86/avx-shuffle.ll | 62
-rw-r--r--  test/CodeGen/X86/avx-vbroadcast.ll | 29
-rwxr-xr-x  test/CodeGen/X86/avx2-conversions.ll | 68
-rw-r--r--  test/CodeGen/X86/avx2-intrinsics-x86.ll | 179
-rw-r--r--  test/CodeGen/X86/avx2-shuffle.ll | 28
-rw-r--r--  test/CodeGen/X86/avx2-vbroadcast.ll | 178
-rw-r--r--  test/CodeGen/X86/basic-promote-integers.ll | 4
-rw-r--r--  test/CodeGen/X86/bigstructret.ll | 29
-rw-r--r--  test/CodeGen/X86/blend-msb.ll | 2
-rw-r--r--  test/CodeGen/X86/block-placement.ll | 10
-rw-r--r--  test/CodeGen/X86/bool-simplify.ll | 42
-rw-r--r--  test/CodeGen/X86/br-fold.ll | 2
-rw-r--r--  test/CodeGen/X86/break-anti-dependencies.ll | 14
-rw-r--r--  test/CodeGen/X86/break-sse-dep.ll | 7
-rw-r--r--  test/CodeGen/X86/call-imm.ll | 8
-rw-r--r--  test/CodeGen/X86/cfstring.ll | 2
-rw-r--r--  test/CodeGen/X86/cmov-into-branch.ll | 63
-rw-r--r--  test/CodeGen/X86/cmov.ll | 10
-rw-r--r--  test/CodeGen/X86/cmp.ll | 61
-rw-r--r--  test/CodeGen/X86/coalesce-esp.ll | 2
-rw-r--r--  test/CodeGen/X86/coalescer-commute2.ll | 13
-rw-r--r--  test/CodeGen/X86/coalescer-dce2.ll | 118
-rw-r--r--  test/CodeGen/X86/coalescer-identity.ll | 82
-rw-r--r--  test/CodeGen/X86/constant-pool-sharing.ll | 4
-rw-r--r--  test/CodeGen/X86/constructor.ll | 27
-rw-r--r--  test/CodeGen/X86/convert-2-addr-3-addr-inc64.ll | 4
-rw-r--r--  test/CodeGen/X86/crash.ll | 55
-rw-r--r--  test/CodeGen/X86/ctpop-combine.ll | 2
-rw-r--r--  test/CodeGen/X86/dagcombine-cse.ll | 2
-rw-r--r--  test/CodeGen/X86/dbg-merge-loc-entry.ll | 2
-rw-r--r--  test/CodeGen/X86/dbg-value-range.ll | 1
-rw-r--r--  test/CodeGen/X86/divide-by-constant.ll | 21
-rw-r--r--  test/CodeGen/X86/dynamic-allocas-VLAs.ll | 237
-rw-r--r--  test/CodeGen/X86/early-ifcvt.ll | 69
-rw-r--r--  test/CodeGen/X86/epilogue.ll | 6
-rw-r--r--  test/CodeGen/X86/extractps.ll | 4
-rw-r--r--  test/CodeGen/X86/fabs.ll | 40
-rw-r--r--  test/CodeGen/X86/fast-cc-merge-stack-adj.ll | 2
-rw-r--r--  test/CodeGen/X86/fast-isel-constpool.ll | 2
-rw-r--r--  test/CodeGen/X86/fast-isel-gv.ll | 2
-rw-r--r--  test/CodeGen/X86/fast-isel-mem.ll | 14
-rw-r--r--  test/CodeGen/X86/fast-isel-x86.ll | 14
-rw-r--r--  test/CodeGen/X86/fast-isel.ll | 12
-rw-r--r--  test/CodeGen/X86/fastcc-byval.ll | 2
-rw-r--r--  test/CodeGen/X86/fma.ll | 12
-rwxr-xr-x  test/CodeGen/X86/fma3-intrinsics.ll | 132
-rw-r--r--  test/CodeGen/X86/fma4-intrinsics-x86_64.ll | 224
-rw-r--r--  test/CodeGen/X86/fma_patterns.ll | 139
-rw-r--r--  test/CodeGen/X86/fold-load.ll | 26
-rw-r--r--  test/CodeGen/X86/fold-pcmpeqd-1.ll | 13
-rw-r--r--  test/CodeGen/X86/force-align-stack-alloca.ll | 70
-rw-r--r--  test/CodeGen/X86/fp-immediate-shorten.ll | 2
-rw-r--r--  test/CodeGen/X86/fp-in-intregs.ll | 2
-rw-r--r--  test/CodeGen/X86/fp-stack-compare-cmov.ll | 12
-rw-r--r--  test/CodeGen/X86/fp-stack-compare.ll | 7
-rw-r--r--  test/CodeGen/X86/fp-stack-ret.ll | 2
-rw-r--r--  test/CodeGen/X86/fp_load_fold.ll | 2
-rw-r--r--  test/CodeGen/X86/full-lsr.ll | 16
-rw-r--r--  test/CodeGen/X86/gather-addresses.ll | 4
-rw-r--r--  test/CodeGen/X86/gs-fold.ll | 20
-rw-r--r--  test/CodeGen/X86/h-register-addressing-32.ll | 2
-rw-r--r--  test/CodeGen/X86/h-register-addressing-64.ll | 2
-rw-r--r--  test/CodeGen/X86/h-registers-1.ll | 4
-rw-r--r--  test/CodeGen/X86/hoist-invariant-load.ll | 2
-rw-r--r--  test/CodeGen/X86/iabs.ll | 16
-rw-r--r--  test/CodeGen/X86/illegal-vector-args-return.ll | 8
-rw-r--r--  test/CodeGen/X86/inline-asm-error.ll | 2
-rw-r--r--  test/CodeGen/X86/inline-asm-modifier-n.ll | 2
-rw-r--r--  test/CodeGen/X86/inline-asm.ll | 9
-rw-r--r--  test/CodeGen/X86/inreg.ll | 46
-rw-r--r--  test/CodeGen/X86/isel-sink2.ll | 2
-rw-r--r--  test/CodeGen/X86/ispositive.ll | 2
-rw-r--r--  test/CodeGen/X86/jump_sign.ll | 221
-rw-r--r--  test/CodeGen/X86/label-redefinition.ll | 2
-rw-r--r--  test/CodeGen/X86/large-global.ll | 11
-rw-r--r--  test/CodeGen/X86/lea-2.ll | 2
-rw-r--r--  test/CodeGen/X86/liveness-local-regalloc.ll | 34
-rw-r--r--  test/CodeGen/X86/loop-blocks.ll | 4
-rw-r--r--  test/CodeGen/X86/lsr-loop-exit-cond.ll | 17
-rw-r--r--  test/CodeGen/X86/lsr-reuse-trunc.ll | 4
-rw-r--r--  test/CodeGen/X86/lsr-static-addr.ll | 12
-rw-r--r--  test/CodeGen/X86/machine-cse.ll | 57
-rw-r--r--  test/CodeGen/X86/mem-promote-integers.ll | 4
-rw-r--r--  test/CodeGen/X86/memcmp.ll | 3
-rw-r--r--  test/CodeGen/X86/mmx-punpckhdq.ll | 2
-rw-r--r--  test/CodeGen/X86/movgs.ll | 16
-rw-r--r--  test/CodeGen/X86/multiple-loop-post-inc.ll | 10
-rw-r--r--  test/CodeGen/X86/neg_cmp.ll | 22
-rw-r--r--  test/CodeGen/X86/opt-shuff-tstore.ll | 2
-rw-r--r--  test/CodeGen/X86/overlap-shift.ll | 2
-rw-r--r--  test/CodeGen/X86/pass-three.ll | 16
-rw-r--r--  test/CodeGen/X86/peep-vector-extract-insert.ll | 2
-rw-r--r--  test/CodeGen/X86/phi-immediate-factoring.ll | 2
-rw-r--r--  test/CodeGen/X86/phielim-split.ll | 30
-rw-r--r--  test/CodeGen/X86/phys-reg-local-regalloc.ll | 20
-rw-r--r--  test/CodeGen/X86/phys_subreg_coalesce-3.ll | 6
-rw-r--r--  test/CodeGen/X86/pmul.ll | 4
-rw-r--r--  test/CodeGen/X86/pointer-vector.ll | 3
-rw-r--r--  test/CodeGen/X86/pr11415.ll | 2
-rw-r--r--  test/CodeGen/X86/pr11468.ll | 33
-rw-r--r--  test/CodeGen/X86/pr12889.ll | 18
-rw-r--r--  test/CodeGen/X86/pr13209.ll | 74
-rw-r--r--  test/CodeGen/X86/pr13220.ll | 20
-rw-r--r--  test/CodeGen/X86/pr13577.ll | 8
-rw-r--r--  test/CodeGen/X86/pr2656.ll | 2
-rw-r--r--  test/CodeGen/X86/pr3522.ll | 2
-rw-r--r--  test/CodeGen/X86/promote-trunc.ll | 2
-rw-r--r--  test/CodeGen/X86/rd-mod-wr-eflags.ll | 46
-rw-r--r--  test/CodeGen/X86/rdrand.ll | 85
-rw-r--r--  test/CodeGen/X86/regpressure.ll | 4
-rw-r--r--  test/CodeGen/X86/remat-fold-load.ll | 143
-rw-r--r--  test/CodeGen/X86/remat-scalar-zero.ll | 2
-rw-r--r--  test/CodeGen/X86/reverse_branches.ll | 104
-rw-r--r--  test/CodeGen/X86/rotate.ll | 2
-rw-r--r--  test/CodeGen/X86/rounding-ops.ll | 4
-rw-r--r--  test/CodeGen/X86/segmented-stacks-dynamic.ll | 12
-rw-r--r--  test/CodeGen/X86/select.ll | 132
-rw-r--r--  test/CodeGen/X86/selectiondag-cse.ll | 69
-rw-r--r--  test/CodeGen/X86/sext-setcc-self.ll | 55
-rw-r--r--  test/CodeGen/X86/shift-and.ll | 46
-rw-r--r--  test/CodeGen/X86/shift-coalesce.ll | 4
-rw-r--r--  test/CodeGen/X86/shift-double.ll | 2
-rw-r--r--  test/CodeGen/X86/shift-folding.ll | 2
-rw-r--r--  test/CodeGen/X86/shl_elim.ll | 6
-rw-r--r--  test/CodeGen/X86/sincos.ll | 26
-rw-r--r--  test/CodeGen/X86/sink-hoist.ll | 2
-rw-r--r--  test/CodeGen/X86/sink-out-of-loop.ll | 54
-rw-r--r--  test/CodeGen/X86/splat-scalar-load.ll | 2
-rw-r--r--  test/CodeGen/X86/sse-align-12.ll | 2
-rw-r--r--  test/CodeGen/X86/sse-domains.ll | 2
-rw-r--r--  test/CodeGen/X86/sse-minmax.ll | 285
-rw-r--r--  test/CodeGen/X86/sse3.ll | 7
-rw-r--r--  test/CodeGen/X86/sse41-blend.ll | 2
-rw-r--r--  test/CodeGen/X86/sse41.ll | 4
-rw-r--r--  test/CodeGen/X86/sse4a.ll | 56
-rw-r--r--  test/CodeGen/X86/sse_reload_fold.ll | 2
-rw-r--r--  test/CodeGen/X86/stack-align.ll | 4
-rw-r--r--  test/CodeGen/X86/stack-protector.ll (renamed from test/CodeGen/X86/stack-protector-linux.ll) | 4
-rw-r--r--  test/CodeGen/X86/store_op_load_fold2.ll | 4
-rw-r--r--  test/CodeGen/X86/subreg-to-reg-1.ll | 2
-rw-r--r--  test/CodeGen/X86/subreg-to-reg-4.ll | 2
-rw-r--r--  test/CodeGen/X86/switch-order-weight.ll | 37
-rw-r--r--  test/CodeGen/X86/tailcall-64.ll | 96
-rw-r--r--  test/CodeGen/X86/tailcall-cgp-dup.ll | 87
-rw-r--r--  test/CodeGen/X86/tailcall-i1.ll | 6
-rw-r--r--  test/CodeGen/X86/tailcall-largecode.ll | 10
-rw-r--r--  test/CodeGen/X86/tailcall-void.ll | 6
-rw-r--r--  test/CodeGen/X86/tailcall.ll (renamed from test/CodeGen/X86/tailcall1.ll) | 14
-rw-r--r--  test/CodeGen/X86/tailcallbyval.ll | 2
-rw-r--r--  test/CodeGen/X86/targetLoweringGeneric.ll | 38
-rw-r--r--  test/CodeGen/X86/thiscall-struct-return.ll | 4
-rw-r--r--  test/CodeGen/X86/tls-local-dynamic.ll | 59
-rw-r--r--  test/CodeGen/X86/tls-models.ll | 166
-rw-r--r--  test/CodeGen/X86/tls-pic.ll | 20
-rw-r--r--  test/CodeGen/X86/tls-pie.ll | 20
-rw-r--r--  test/CodeGen/X86/trap.ll | 16
-rw-r--r--  test/CodeGen/X86/trunc-ext-ld-st.ll | 2
-rw-r--r--  test/CodeGen/X86/twoaddr-coalesce-2.ll | 4
-rw-r--r--  test/CodeGen/X86/twoaddr-pass-sink.ll | 2
-rw-r--r--  test/CodeGen/X86/uint_to_fp.ll | 2
-rw-r--r--  test/CodeGen/X86/umul-with-carry.ll | 2
-rw-r--r--  test/CodeGen/X86/unwindraise.ll | 252
-rw-r--r--  test/CodeGen/X86/v-binop-widen2.ll | 9
-rw-r--r--  test/CodeGen/X86/vec_call.ll | 4
-rw-r--r--  test/CodeGen/X86/vec_cast2.ll | 49
-rw-r--r--  test/CodeGen/X86/vec_compare-2.ll | 3
-rw-r--r--  test/CodeGen/X86/vec_compare.ll | 2
-rw-r--r--  test/CodeGen/X86/vec_ins_extract-1.ll | 2
-rw-r--r--  test/CodeGen/X86/vec_insert-6.ll | 4
-rw-r--r--  test/CodeGen/X86/vec_set-3.ll | 2
-rw-r--r--  test/CodeGen/X86/vec_set-9.ll | 2
-rw-r--r--  test/CodeGen/X86/vec_shuffle-16.ll | 4
-rw-r--r--  test/CodeGen/X86/vec_shuffle-19.ll | 2
-rw-r--r--  test/CodeGen/X86/vec_shuffle-27.ll | 4
-rw-r--r--  test/CodeGen/X86/vec_shuffle-35.ll | 4
-rw-r--r--  test/CodeGen/X86/vec_shuffle-36.ll | 2
-rw-r--r--  test/CodeGen/X86/vec_shuffle-37.ll | 4
-rw-r--r--  test/CodeGen/X86/vec_shuffle-38.ll | 2
-rw-r--r--  test/CodeGen/X86/vec_shuffle-39.ll | 2
-rw-r--r--  test/CodeGen/X86/vec_splat-2.ll | 2
-rw-r--r--  test/CodeGen/X86/vec_splat-3.ll | 2
-rw-r--r--  test/CodeGen/X86/vec_splat-4.ll | 2
-rw-r--r--  test/CodeGen/X86/vec_splat.ll | 4
-rw-r--r--  test/CodeGen/X86/vec_ss_load_fold.ll | 14
-rw-r--r--  test/CodeGen/X86/vshift-1.ll | 4
-rw-r--r--  test/CodeGen/X86/vshift-2.ll | 6
-rw-r--r--  test/CodeGen/X86/vshift-3.ll | 4
-rw-r--r--  test/CodeGen/X86/vshift-5.ll | 8
-rw-r--r--  test/CodeGen/X86/widen_arith-3.ll | 1
-rw-r--r--  test/CodeGen/X86/widen_cast-1.ll | 13
-rw-r--r--  test/CodeGen/X86/widen_cast-2.ll | 2
-rw-r--r--  test/CodeGen/X86/widen_cast-5.ll | 3
-rw-r--r--  test/CodeGen/X86/widen_conv-4.ll | 2
-rw-r--r--  test/CodeGen/X86/widen_extract-1.ll | 2
-rw-r--r--  test/CodeGen/X86/widen_load-0.ll | 14
-rw-r--r--  test/CodeGen/X86/win64_alloca_dynalloca.ll | 15
-rw-r--r--  test/CodeGen/X86/x86-64-arg.ll | 2
-rw-r--r--  test/CodeGen/X86/x86-64-dead-stack-adjust.ll | 4
-rw-r--r--  test/CodeGen/X86/x86-64-pic-1.ll | 2
-rw-r--r--  test/CodeGen/X86/x86-64-pic-10.ll | 2
-rw-r--r--  test/CodeGen/X86/x86-64-pic-11.ll | 2
-rw-r--r--  test/CodeGen/X86/x86-64-pic-2.ll | 4
-rw-r--r--  test/CodeGen/X86/x86-64-pic-3.ll | 4
-rw-r--r--  test/CodeGen/X86/x86-64-pic-4.ll | 2
-rw-r--r--  test/CodeGen/X86/x86-64-pic-5.ll | 2
-rw-r--r--  test/CodeGen/X86/x86-64-pic-6.ll | 2
-rw-r--r--  test/CodeGen/X86/x86-64-pic-7.ll | 2
-rw-r--r--  test/CodeGen/X86/x86-64-pic-8.ll | 2
-rw-r--r--  test/CodeGen/X86/x86-64-pic-9.ll | 2
-rw-r--r--  test/CodeGen/X86/xop-intrinsics-x86_64.ll | 80
-rw-r--r--  test/CodeGen/X86/xor.ll | 8
309 files changed, 5337 insertions, 739 deletions
diff --git a/test/CodeGen/X86/2003-08-03-CallArgLiveRanges.ll b/test/CodeGen/X86/2003-08-03-CallArgLiveRanges.ll
index 2484860..0af2445 100644
--- a/test/CodeGen/X86/2003-08-03-CallArgLiveRanges.ll
+++ b/test/CodeGen/X86/2003-08-03-CallArgLiveRanges.ll
@@ -3,7 +3,7 @@
; it makes a ton of annoying overlapping live ranges. This code should not
; cause spills!
;
-; RUN: llc < %s -march=x86 -stats |& not grep spilled
+; RUN: llc < %s -march=x86 -stats 2>&1 | not grep spilled
target datalayout = "e-p:32:32"
diff --git a/test/CodeGen/X86/2003-11-03-GlobalBool.ll b/test/CodeGen/X86/2003-11-03-GlobalBool.ll
index 8b0a185..f201b98 100644
--- a/test/CodeGen/X86/2003-11-03-GlobalBool.ll
+++ b/test/CodeGen/X86/2003-11-03-GlobalBool.ll
@@ -1,4 +1,4 @@
; RUN: llc < %s -march=x86 | \
-; RUN: not grep {.byte\[\[:space:\]\]*true}
+; RUN: not grep ".byte[[:space:]]*true"
@X = global i1 true ; <i1*> [#uses=0]
diff --git a/test/CodeGen/X86/2004-02-13-FrameReturnAddress.ll b/test/CodeGen/X86/2004-02-13-FrameReturnAddress.ll
index fea2b54..dde210b 100644
--- a/test/CodeGen/X86/2004-02-13-FrameReturnAddress.ll
+++ b/test/CodeGen/X86/2004-02-13-FrameReturnAddress.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 | grep {(%esp}
-; RUN: llc < %s -march=x86 | grep {pushl %ebp} | count 1
-; RUN: llc < %s -march=x86 | grep {popl %ebp} | count 1
+; RUN: llc < %s -march=x86 | grep "(%esp"
+; RUN: llc < %s -march=x86 | grep "pushl %ebp" | count 1
+; RUN: llc < %s -march=x86 | grep "popl %ebp" | count 1
declare i8* @llvm.returnaddress(i32)
diff --git a/test/CodeGen/X86/2004-03-30-Select-Max.ll b/test/CodeGen/X86/2004-03-30-Select-Max.ll
index c44d10a..e22aa6a 100644
--- a/test/CodeGen/X86/2004-03-30-Select-Max.ll
+++ b/test/CodeGen/X86/2004-03-30-Select-Max.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | not grep {j\[lgbe\]}
+; RUN: llc < %s -march=x86 -mcpu=yonah | FileCheck %s
+; CHECK-NOT: {{j[lgbe]}}
define i32 @max(i32 %A, i32 %B) nounwind {
%gt = icmp sgt i32 %A, %B ; <i1> [#uses=1]
diff --git a/test/CodeGen/X86/2006-03-01-InstrSchedBug.ll b/test/CodeGen/X86/2006-03-01-InstrSchedBug.ll
index dc69ef8..f8bf099 100644
--- a/test/CodeGen/X86/2006-03-01-InstrSchedBug.ll
+++ b/test/CodeGen/X86/2006-03-01-InstrSchedBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | not grep {subl.*%esp}
+; RUN: llc < %s -march=x86 | not grep "subl.*%esp"
define i32 @f(i32 %a, i32 %b) {
%tmp.2 = mul i32 %a, %a ; <i32> [#uses=1]
diff --git a/test/CodeGen/X86/2006-03-02-InstrSchedBug.ll b/test/CodeGen/X86/2006-03-02-InstrSchedBug.ll
index 0421896..1a3d749 100644
--- a/test/CodeGen/X86/2006-03-02-InstrSchedBug.ll
+++ b/test/CodeGen/X86/2006-03-02-InstrSchedBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -stats |& \
+; RUN: llc < %s -march=x86 -stats 2>&1 | \
; RUN: grep asm-printer | grep 7
define i32 @g(i32 %a, i32 %b) nounwind {
diff --git a/test/CodeGen/X86/2006-04-27-ISelFoldingBug.ll b/test/CodeGen/X86/2006-04-27-ISelFoldingBug.ll
index 8783a11..fb1262a 100644
--- a/test/CodeGen/X86/2006-04-27-ISelFoldingBug.ll
+++ b/test/CodeGen/X86/2006-04-27-ISelFoldingBug.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -march=x86 -mtriple=i686-apple-darwin8 -relocation-model=static > %t
-; RUN: grep {movl _last} %t | count 1
-; RUN: grep {cmpl.*_last} %t | count 1
+; RUN: grep "movl _last" %t | count 1
+; RUN: grep "cmpl.*_last" %t | count 1
@block = external global i8* ; <i8**> [#uses=1]
@last = external global i32 ; <i32*> [#uses=3]
diff --git a/test/CodeGen/X86/2006-05-01-SchedCausingSpills.ll b/test/CodeGen/X86/2006-05-01-SchedCausingSpills.ll
index b045329..5cba3ef 100644
--- a/test/CodeGen/X86/2006-05-01-SchedCausingSpills.ll
+++ b/test/CodeGen/X86/2006-05-01-SchedCausingSpills.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah -stats |& \
-; RUN: not grep {Number of register spills}
+; RUN: llc < %s -march=x86 -mcpu=yonah -stats 2>&1 | \
+; RUN: not grep "Number of register spills"
; END.
diff --git a/test/CodeGen/X86/2006-05-02-InstrSched1.ll b/test/CodeGen/X86/2006-05-02-InstrSched1.ll
index 7d0a6ab..1c75f93 100644
--- a/test/CodeGen/X86/2006-05-02-InstrSched1.ll
+++ b/test/CodeGen/X86/2006-05-02-InstrSched1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -relocation-model=static -stats |& \
+; RUN: llc < %s -march=x86 -relocation-model=static -stats 2>&1 | \
; RUN: grep asm-printer | grep 14
;
@size20 = external global i32 ; <i32*> [#uses=1]
diff --git a/test/CodeGen/X86/2006-05-02-InstrSched2.ll b/test/CodeGen/X86/2006-05-02-InstrSched2.ll
index 23954d7..95eefa1 100644
--- a/test/CodeGen/X86/2006-05-02-InstrSched2.ll
+++ b/test/CodeGen/X86/2006-05-02-InstrSched2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -stats |& \
+; RUN: llc < %s -march=x86 -stats 2>&1 | \
; RUN: grep asm-printer | grep 13
define void @_ZN9__gnu_cxx9hashtableISt4pairIKPKciES3_NS_4hashIS3_EESt10_Select1stIS5_E5eqstrSaIiEE14find_or_insertERKS5__cond_true456.i(i8* %tmp435.i, i32* %tmp449.i.out) nounwind {
diff --git a/test/CodeGen/X86/2006-05-08-InstrSched.ll b/test/CodeGen/X86/2006-05-08-InstrSched.ll
index d58d638..3419d01 100644
--- a/test/CodeGen/X86/2006-05-08-InstrSched.ll
+++ b/test/CodeGen/X86/2006-05-08-InstrSched.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -relocation-model=static | not grep {subl.*%esp}
+; RUN: llc < %s -march=x86 -relocation-model=static | not grep "subl.*%esp"
@A = external global i16* ; <i16**> [#uses=1]
@B = external global i32 ; <i32*> [#uses=1]
diff --git a/test/CodeGen/X86/2006-05-11-InstrSched.ll b/test/CodeGen/X86/2006-05-11-InstrSched.ll
index 38bca28..37c5107 100644
--- a/test/CodeGen/X86/2006-05-11-InstrSched.ll
+++ b/test/CodeGen/X86/2006-05-11-InstrSched.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -mattr=+sse2 -stats -realign-stack=0 |&\
-; RUN: grep {asm-printer} | grep 35
+; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -mcpu=penryn -mattr=+sse2 -stats -realign-stack=0 2>&1 | \
+; RUN: grep "asm-printer" | grep 35
target datalayout = "e-p:32:32"
define void @foo(i32* %mc, i32* %bp, i32* %ms, i32* %xmb, i32* %mpp, i32* %tpmm, i32* %ip, i32* %tpim, i32* %dpp, i32* %tpdm, i32* %bpi, i32 %M) nounwind {
diff --git a/test/CodeGen/X86/2006-07-31-SingleRegClass.ll b/test/CodeGen/X86/2006-07-31-SingleRegClass.ll
index 3159cec..c5c74d1 100644
--- a/test/CodeGen/X86/2006-07-31-SingleRegClass.ll
+++ b/test/CodeGen/X86/2006-07-31-SingleRegClass.ll
@@ -1,7 +1,7 @@
; PR850
; RUN: llc < %s -march=x86 -x86-asm-syntax=att > %t
-; RUN: grep {movl 4(%eax),%ebp} %t
-; RUN: grep {movl 0(%eax), %ebx} %t
+; RUN: grep "movl 4(%eax),%ebp" %t
+; RUN: grep "movl 0(%eax), %ebx" %t
define i32 @foo(i32 %__s.i.i, i32 %tmp5.i.i, i32 %tmp6.i.i, i32 %tmp7.i.i, i32 %tmp8.i.i) {
%tmp9.i.i = call i32 asm sideeffect "push %ebp\0Apush %ebx\0Amovl 4($2),%ebp\0Amovl 0($2), %ebx\0Amovl $1,%eax\0Aint $$0x80\0Apop %ebx\0Apop %ebp", "={ax},i,0,{cx},{dx},{si},{di}"( i32 192, i32 %__s.i.i, i32 %tmp5.i.i, i32 %tmp6.i.i, i32 %tmp7.i.i, i32 %tmp8.i.i ) ; <i32> [#uses=1]
diff --git a/test/CodeGen/X86/2006-08-21-ExtraMovInst.ll b/test/CodeGen/X86/2006-08-21-ExtraMovInst.ll
index a19d8f7..56d5f2f 100644
--- a/test/CodeGen/X86/2006-08-21-ExtraMovInst.ll
+++ b/test/CodeGen/X86/2006-08-21-ExtraMovInst.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=x86 -mcpu=i386 | \
-; RUN: not grep {movl %eax, %edx}
+; RUN: not grep "movl %eax, %edx"
define i32 @foo(i32 %t, i32 %C) {
entry:
diff --git a/test/CodeGen/X86/2006-11-12-CSRetCC.ll b/test/CodeGen/X86/2006-11-12-CSRetCC.ll
index 6ec9a48..a58c9b1 100644
--- a/test/CodeGen/X86/2006-11-12-CSRetCC.ll
+++ b/test/CodeGen/X86/2006-11-12-CSRetCC.ll
@@ -52,8 +52,8 @@ entry:
%tmp21 = load double* %tmp20 ; <double> [#uses=1]
%tmp.upgrd.6 = getelementptr [9 x i8]* @str, i32 0, i64 0 ; <i8*> [#uses=1]
%tmp.upgrd.7 = call i32 (i8*, ...)* @printf( i8* %tmp.upgrd.6, double %tmp21, double %tmp19 ) ; <i32> [#uses=0]
- br label %return
-return: ; preds = %entry
+ br label %finish
+finish:
%retval.upgrd.8 = load i32* %retval ; <i32> [#uses=1]
ret i32 %retval.upgrd.8
}
diff --git a/test/CodeGen/X86/2006-11-17-IllegalMove.ll b/test/CodeGen/X86/2006-11-17-IllegalMove.ll
index affb7af..783d9f9 100644
--- a/test/CodeGen/X86/2006-11-17-IllegalMove.ll
+++ b/test/CodeGen/X86/2006-11-17-IllegalMove.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -march=x86-64 > %t
-; RUN: grep movb %t | count 2
-; RUN: grep {movzb\[wl\]} %t
+; RUN: grep movb %t | count 1
+; RUN: grep "movzb[wl]" %t
define void @handle_vector_size_attribute() nounwind {
diff --git a/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll b/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll
index a228898..04d4b8e 100644
--- a/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll
+++ b/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=x86-64 > %t
-; RUN: not grep {,%rsp)} %t
+; RUN: not grep ",%rsp)" %t
; PR1103
target datalayout = "e-p:64:64"
diff --git a/test/CodeGen/X86/2007-03-24-InlineAsmPModifier.ll b/test/CodeGen/X86/2007-03-24-InlineAsmPModifier.ll
index 3312e01..3b2e443 100644
--- a/test/CodeGen/X86/2007-03-24-InlineAsmPModifier.ll
+++ b/test/CodeGen/X86/2007-03-24-InlineAsmPModifier.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep {mov %gs:72, %eax}
+; RUN: llc < %s -march=x86 | grep "mov %gs:72, %eax"
target datalayout = "e-p:32:32"
target triple = "i686-apple-darwin9"
diff --git a/test/CodeGen/X86/2007-03-24-InlineAsmVectorOp.ll b/test/CodeGen/X86/2007-03-24-InlineAsmVectorOp.ll
index c1b1ad1..18b06dc 100644
--- a/test/CodeGen/X86/2007-03-24-InlineAsmVectorOp.ll
+++ b/test/CodeGen/X86/2007-03-24-InlineAsmVectorOp.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mcpu=yonah -march=x86 | \
-; RUN: grep {cmpltsd %xmm0, %xmm0}
+; RUN: grep "cmpltsd %xmm0, %xmm0"
target datalayout = "e-p:32:32"
target triple = "i686-apple-darwin9"
diff --git a/test/CodeGen/X86/2007-04-27-InlineAsm-IntMemInput.ll b/test/CodeGen/X86/2007-04-27-InlineAsm-IntMemInput.ll
index 85a2ecc..cae68c9 100644
--- a/test/CodeGen/X86/2007-04-27-InlineAsm-IntMemInput.ll
+++ b/test/CodeGen/X86/2007-04-27-InlineAsm-IntMemInput.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | not grep {bsrl.*10}
+; RUN: llc < %s | not grep "bsrl.*10"
; PR1356
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
diff --git a/test/CodeGen/X86/2007-05-07-InvokeSRet.ll b/test/CodeGen/X86/2007-05-07-InvokeSRet.ll
index deb3999..c3d7e8a 100644
--- a/test/CodeGen/X86/2007-05-07-InvokeSRet.ll
+++ b/test/CodeGen/X86/2007-05-07-InvokeSRet.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i686-pc-linux-gnu -disable-fp-elim | not grep {addl .12, %esp}
+; RUN: llc < %s -mtriple=i686-pc-linux-gnu -disable-fp-elim | not grep "addl .12, %esp"
; PR1398
%struct.S = type { i32, i32 }
diff --git a/test/CodeGen/X86/2007-08-10-SignExtSubreg.ll b/test/CodeGen/X86/2007-08-10-SignExtSubreg.ll
index 77291f0..aa0ee5d 100644
--- a/test/CodeGen/X86/2007-08-10-SignExtSubreg.ll
+++ b/test/CodeGen/X86/2007-08-10-SignExtSubreg.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep {movsbl}
+; RUN: llc < %s -march=x86 | grep "movsbl"
@X = global i32 0 ; <i32*> [#uses=1]
diff --git a/test/CodeGen/X86/2007-09-05-InvalidAsm.ll b/test/CodeGen/X86/2007-09-05-InvalidAsm.ll
index 5acb051..e81534b 100644
--- a/test/CodeGen/X86/2007-09-05-InvalidAsm.ll
+++ b/test/CodeGen/X86/2007-09-05-InvalidAsm.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -x86-asm-syntax=intel | not grep {lea\[\[:space:\]\]R}
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -x86-asm-syntax=intel | FileCheck %s
+; CHECK-NOT: lea R
%struct.AGenericCall = type { %struct.AGenericManager*, %struct.ComponentParameters*, i32* }
%struct.AGenericManager = type <{ i8 }>
diff --git a/test/CodeGen/X86/2007-11-04-rip-immediate-constant.ll b/test/CodeGen/X86/2007-11-04-rip-immediate-constant.ll
index 228a915..56a109a 100644
--- a/test/CodeGen/X86/2007-11-04-rip-immediate-constant.ll
+++ b/test/CodeGen/X86/2007-11-04-rip-immediate-constant.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -relocation-model=static | grep {foo str$}
+; RUN: llc < %s -relocation-model=static | grep "foo str$"
; PR1761
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-pc-linux"
diff --git a/test/CodeGen/X86/2007-12-18-LoadCSEBug.ll b/test/CodeGen/X86/2007-12-18-LoadCSEBug.ll
index 2e95082..99df20d 100644
--- a/test/CodeGen/X86/2007-12-18-LoadCSEBug.ll
+++ b/test/CodeGen/X86/2007-12-18-LoadCSEBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=generic | grep {(%esp)} | count 2
+; RUN: llc < %s -march=x86 -mcpu=generic | grep "(%esp)" | count 2
; PR1872
%struct.c34007g__designated___XUB = type { i32, i32, i32, i32 }
diff --git a/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll b/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll
index 266fd7b..39af931 100644
--- a/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll
+++ b/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll
@@ -10,10 +10,10 @@
%struct.indexentry = type { i32, i8*, i8*, i8*, i8*, i8* }
-define i32 @_bfd_stab_section_find_nearest_line(i32 %offset) nounwind {
+define i32 @_bfd_stab_section_find_nearest_line(i32 %offset, i1 %cond) nounwind {
entry:
%tmp910 = add i32 0, %offset ; <i32> [#uses=1]
- br i1 true, label %bb951, label %bb917
+ br i1 %cond, label %bb951, label %bb917
bb917: ; preds = %entry
ret i32 0
@@ -21,7 +21,7 @@ bb917: ; preds = %entry
bb951: ; preds = %bb986, %entry
%tmp955 = sdiv i32 0, 2 ; <i32> [#uses=3]
%tmp961 = getelementptr %struct.indexentry* null, i32 %tmp955, i32 0 ; <i32*> [#uses=1]
- br i1 true, label %bb986, label %bb967
+ br i1 %cond, label %bb986, label %bb967
bb967: ; preds = %bb951
ret i32 0
diff --git a/test/CodeGen/X86/2008-01-16-FPStackifierAssert.ll b/test/CodeGen/X86/2008-01-16-FPStackifierAssert.ll
index 0091397..9584b71 100644
--- a/test/CodeGen/X86/2008-01-16-FPStackifierAssert.ll
+++ b/test/CodeGen/X86/2008-01-16-FPStackifierAssert.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -regalloc=fast
+; RUN: llc < %s -march=x86 -mattr=+sse2 -regalloc=fast -optimize-regalloc=0
define void @SolveCubic(double %a, double %b, double %c, double %d, i32* %solutions, double* %x) {
entry:
diff --git a/test/CodeGen/X86/2008-02-18-TailMergingBug.ll b/test/CodeGen/X86/2008-02-18-TailMergingBug.ll
index bdacf50..a1b973d 100644
--- a/test/CodeGen/X86/2008-02-18-TailMergingBug.ll
+++ b/test/CodeGen/X86/2008-02-18-TailMergingBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah -stats |& grep {Number of block tails merged} | grep 16
+; RUN: llc < %s -march=x86 -mcpu=yonah -stats 2>&1 | grep "Number of block tails merged" | grep 16
; PR1909
@.str = internal constant [48 x i8] c"transformed bounds: (%.2f, %.2f), (%.2f, %.2f)\0A\00" ; <[48 x i8]*> [#uses=1]
diff --git a/test/CodeGen/X86/2008-02-20-InlineAsmClobber.ll b/test/CodeGen/X86/2008-02-20-InlineAsmClobber.ll
index 5115e48..a52b365 100644
--- a/test/CodeGen/X86/2008-02-20-InlineAsmClobber.ll
+++ b/test/CodeGen/X86/2008-02-20-InlineAsmClobber.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s | grep {a:} | not grep ax
-; RUN: llc < %s | grep {b:} | not grep ax
+; RUN: llc < %s | grep "a:" | not grep ax
+; RUN: llc < %s | grep "b:" | not grep ax
; PR2078
; The clobber list says that "ax" is clobbered. Make sure that eax isn't
; allocated to the input/output register.
diff --git a/test/CodeGen/X86/2008-02-22-LocalRegAllocBug.ll b/test/CodeGen/X86/2008-02-22-LocalRegAllocBug.ll
index da02907..9185a36 100644
--- a/test/CodeGen/X86/2008-02-22-LocalRegAllocBug.ll
+++ b/test/CodeGen/X86/2008-02-22-LocalRegAllocBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -regalloc=fast -march=x86 -mattr=+mmx | grep esi
+; RUN: llc < %s -regalloc=fast -optimize-regalloc=0 -march=x86 -mattr=+mmx | grep esi
; PR2082
; Local register allocator was refusing to use ESI, EDI, and EBP so it ran out of
; registers.
diff --git a/test/CodeGen/X86/2008-03-23-DarwinAsmComments.ll b/test/CodeGen/X86/2008-03-23-DarwinAsmComments.ll
index 4dc3a10..5ca7e3e 100644
--- a/test/CodeGen/X86/2008-03-23-DarwinAsmComments.ll
+++ b/test/CodeGen/X86/2008-03-23-DarwinAsmComments.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -asm-verbose | grep {#} | not grep -v {##}
+; RUN: llc < %s -mtriple=i386-apple-darwin -asm-verbose | grep "#" | not grep -v "##"
%struct.AGenericCall = type { %struct.AGenericManager*, %struct.ComponentParameters*, i32* }
%struct.AGenericManager = type <{ i8 }>
diff --git a/test/CodeGen/X86/2008-04-16-ReMatBug.ll b/test/CodeGen/X86/2008-04-16-ReMatBug.ll
index 109069e..3a1de11 100644
--- a/test/CodeGen/X86/2008-04-16-ReMatBug.ll
+++ b/test/CodeGen/X86/2008-04-16-ReMatBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -disable-cgp-branch-opts | grep movw | not grep {, %e}
+; RUN: llc < %s -mtriple=i386-apple-darwin -disable-cgp-branch-opts | grep movw | not grep ", %e"
%struct.DBC_t = type { i32, i8*, i16, %struct.DBC_t*, i8*, i8*, i8*, i8*, i8*, %struct.DBC_t*, i32, i32, i32, i32, i8*, i8*, i8*, i8*, i8*, i32, i32, i32, i32, i32, i32, i32, i32, i16, i16, i32*, i8, i16, %struct.DRVOPT*, i16 }
%struct.DRVOPT = type { i16, i32, i8, %struct.DRVOPT* }
diff --git a/test/CodeGen/X86/2008-04-17-CoalescerBug.ll b/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
index 859041e..f244793 100644
--- a/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
+++ b/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin | grep xorl | grep {%e}
+; RUN: llc < %s -mtriple=i386-apple-darwin | grep xorl | grep "%e"
; Make sure xorl operands are 32-bit registers.
%struct.tm = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i8* }
diff --git a/test/CodeGen/X86/2008-04-28-CoalescerBug.ll b/test/CodeGen/X86/2008-04-28-CoalescerBug.ll
index 5b97eb7..7c04206 100644
--- a/test/CodeGen/X86/2008-04-28-CoalescerBug.ll
+++ b/test/CodeGen/X86/2008-04-28-CoalescerBug.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin | grep movl > %t
-; RUN: not grep {r\[abcd\]x} %t
-; RUN: not grep {r\[ds\]i} %t
-; RUN: not grep {r\[bs\]p} %t
+; RUN: not grep "r[abcd]x" %t
+; RUN: not grep "r[ds]i" %t
+; RUN: not grep "r[bs]p" %t
%struct.BITMAP = type { i16, i16, i32, i32, i32, i32, i32, i32, i8*, i8* }
%struct.BltData = type { float, float, float, float }
diff --git a/test/CodeGen/X86/2008-05-28-LocalRegAllocBug.ll b/test/CodeGen/X86/2008-05-28-LocalRegAllocBug.ll
index c068f8a..4e73b5a 100644
--- a/test/CodeGen/X86/2008-05-28-LocalRegAllocBug.ll
+++ b/test/CodeGen/X86/2008-05-28-LocalRegAllocBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -regalloc=fast
+; RUN: llc < %s -mtriple=i386-apple-darwin -regalloc=fast -optimize-regalloc=0
@_ZTVN10Evaluation10GridOutputILi3EEE = external constant [5 x i32 (...)*] ; <[5 x i32 (...)*]*> [#uses=1]
diff --git a/test/CodeGen/X86/2008-08-06-CmpStride.ll b/test/CodeGen/X86/2008-08-06-CmpStride.ll
index 99cb856..bdac8fd 100644
--- a/test/CodeGen/X86/2008-08-06-CmpStride.ll
+++ b/test/CodeGen/X86/2008-08-06-CmpStride.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 < %s -o - | grep {cmpl \\$\[1\], %}
+; RUN: llc -march=x86-64 < %s -o - | grep "cmpl \$[1], %"
@.str = internal constant [4 x i8] c"%d\0A\00"
diff --git a/test/CodeGen/X86/2008-08-31-EH_RETURN32.ll b/test/CodeGen/X86/2008-08-31-EH_RETURN32.ll
index 1d27fc5..c63c890 100644
--- a/test/CodeGen/X86/2008-08-31-EH_RETURN32.ll
+++ b/test/CodeGen/X86/2008-08-31-EH_RETURN32.ll
@@ -1,15 +1,36 @@
; Check that eh_return & unwind_init were properly lowered
-; RUN: llc < %s | grep %ebp | count 9
-; RUN: llc < %s | grep %ecx | count 5
+; RUN: llc < %s -verify-machineinstrs | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
target triple = "i386-pc-linux"
-define i8* @test(i32 %a, i8* %b) {
+; CHECK: test1
+; CHECK: pushl %ebp
+define i8* @test1(i32 %a, i8* %b) {
entry:
call void @llvm.eh.unwind.init()
%foo = alloca i32
call void @llvm.eh.return.i32(i32 %a, i8* %b)
+; CHECK: movl 12(%ebp), %[[ECX:e..]]
+; CHECK: movl 8(%ebp), %[[EAX:e..]]
+; CHECK: movl %[[ECX]], 4(%ebp,%[[EAX]])
+; CHECK: leal 4(%ebp,%[[EAX]]), %[[ECX2:e..]]
+; CHECK: movl %[[ECX2]], %esp
+; CHECK: ret
+ unreachable
+}
+
+; CHECK: test2
+; CHECK: pushl %ebp
+define i8* @test2(i32 %a, i8* %b) {
+entry:
+ call void @llvm.eh.return.i32(i32 %a, i8* %b)
+; CHECK: movl 12(%ebp), %[[ECX:e..]]
+; CHECK: movl 8(%ebp), %[[EAX:e..]]
+; CHECK: movl %[[ECX]], 4(%ebp,%[[EAX]])
+; CHECK: leal 4(%ebp,%[[EAX]]), %[[ECX2:e..]]
+; CHECK: movl %[[ECX2]], %esp
+; CHECK: ret
unreachable
}
diff --git a/test/CodeGen/X86/2008-09-17-inline-asm-1.ll b/test/CodeGen/X86/2008-09-17-inline-asm-1.ll
index 86e50c9..4b2774b 100644
--- a/test/CodeGen/X86/2008-09-17-inline-asm-1.ll
+++ b/test/CodeGen/X86/2008-09-17-inline-asm-1.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=x86 | FileCheck %s
-; RUN: llc < %s -march=x86 -regalloc=fast | FileCheck %s
+; RUN: llc < %s -march=x86 -regalloc=fast -optimize-regalloc=0 | FileCheck %s
; %0 must not be put in EAX or EDX.
; In the first asm, $0 and $2 must not be put in EAX.
diff --git a/test/CodeGen/X86/2008-09-18-inline-asm-2.ll b/test/CodeGen/X86/2008-09-18-inline-asm-2.ll
index 6867ae7..5c2fbee 100644
--- a/test/CodeGen/X86/2008-09-18-inline-asm-2.ll
+++ b/test/CodeGen/X86/2008-09-18-inline-asm-2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -regalloc=fast | FileCheck %s
+; RUN: llc < %s -march=x86 -regalloc=fast -optimize-regalloc=0 | FileCheck %s
; RUN: llc < %s -march=x86 -regalloc=basic | FileCheck %s
; RUN: llc < %s -march=x86 -regalloc=greedy | FileCheck %s
diff --git a/test/CodeGen/X86/2008-10-24-FlippedCompare.ll b/test/CodeGen/X86/2008-10-24-FlippedCompare.ll
index 421b931..e504bc3 100644
--- a/test/CodeGen/X86/2008-10-24-FlippedCompare.ll
+++ b/test/CodeGen/X86/2008-10-24-FlippedCompare.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -o - | not grep {ucomiss\[^,\]*esp}
+; RUN: llc < %s -march=x86 -mattr=+sse2 -o - | not grep "ucomiss[^,]*esp"
define void @f(float %wt) {
entry:
diff --git a/test/CodeGen/X86/2008-10-27-CoalescerBug.ll b/test/CodeGen/X86/2008-10-27-CoalescerBug.ll
index 9d144a4..b2cf34cd 100644
--- a/test/CodeGen/X86/2008-10-27-CoalescerBug.ll
+++ b/test/CodeGen/X86/2008-10-27-CoalescerBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse2 -stats |& FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse2 -stats 2>&1 | FileCheck %s
; Now this test spills one register. But a reload in the loop is cheaper than
; the divsd so it's a win.
@@ -17,8 +17,7 @@ bb: ; preds = %bb, %entry
; CHECK: %bb30.loopexit
; CHECK: divsd %xmm0
; CHECK: movsd %xmm0, 16(%esp)
-; CHECK: .align
-; CHECK-NEXT: %bb3
+; CHECK: %bb3
bb3: ; preds = %bb30.loopexit, %bb25, %bb3
%2 = load i32* null, align 4 ; <i32> [#uses=1]
%3 = mul i32 %2, 0 ; <i32> [#uses=1]
diff --git a/test/CodeGen/X86/2008-12-23-crazy-address.ll b/test/CodeGen/X86/2008-12-23-crazy-address.ll
index 2edcaea..0e95c9e 100644
--- a/test/CodeGen/X86/2008-12-23-crazy-address.ll
+++ b/test/CodeGen/X86/2008-12-23-crazy-address.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -relocation-model=static | grep {lea.*X.*esp} | count 2
+; RUN: llc < %s -march=x86 -relocation-model=static | grep "lea.*X.*esp" | count 2
@X = external global [0 x i32]
diff --git a/test/CodeGen/X86/2009-01-31-BigShift2.ll b/test/CodeGen/X86/2009-01-31-BigShift2.ll
index 3e42553..b478f27 100644
--- a/test/CodeGen/X86/2009-01-31-BigShift2.ll
+++ b/test/CodeGen/X86/2009-01-31-BigShift2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep {mov.*56}
+; RUN: llc < %s -march=x86 | grep "mov.*56"
; PR3449
define void @test(<8 x double>* %P, i64* %Q) nounwind {
diff --git a/test/CodeGen/X86/2009-02-25-CommuteBug.ll b/test/CodeGen/X86/2009-02-25-CommuteBug.ll
index 7ea6998..9cbf350 100644
--- a/test/CodeGen/X86/2009-02-25-CommuteBug.ll
+++ b/test/CodeGen/X86/2009-02-25-CommuteBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -stats |& not grep commuted
+; RUN: llc < %s -march=x86 -mattr=+sse2 -stats 2>&1 | not grep commuted
; rdar://6608609
define <2 x double> @t(<2 x double> %A, <2 x double> %B, <2 x double> %C) nounwind readnone {
diff --git a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
index 0b5b7bd..d50fe6f 100644
--- a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
+++ b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse41 -stats |& grep {8 machine-licm}
-; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse41 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse41 -mcpu=penryn -stats 2>&1 | grep "5 machine-licm"
+; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse41 -mcpu=penryn | FileCheck %s
; rdar://6627786
; rdar://7792037
diff --git a/test/CodeGen/X86/2009-03-12-CPAlignBug.ll b/test/CodeGen/X86/2009-03-12-CPAlignBug.ll
index 3564f01..847a43f 100644
--- a/test/CodeGen/X86/2009-03-12-CPAlignBug.ll
+++ b/test/CodeGen/X86/2009-03-12-CPAlignBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse2 | not grep {.space}
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse2 | not grep ".space"
; rdar://6668548
declare double @llvm.sqrt.f64(double) nounwind readonly
diff --git a/test/CodeGen/X86/2009-03-23-MultiUseSched.ll b/test/CodeGen/X86/2009-03-23-MultiUseSched.ll
index 8bbdb0e..d934ec9 100644
--- a/test/CodeGen/X86/2009-03-23-MultiUseSched.ll
+++ b/test/CodeGen/X86/2009-03-23-MultiUseSched.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -mtriple=x86_64-linux -relocation-model=static -o /dev/null -stats -info-output-file - > %t
; RUN: not grep spill %t
-; RUN: not grep {%rsp} %t
-; RUN: not grep {%rbp} %t
+; RUN: not grep "%rsp" %t
+; RUN: not grep "%rbp" %t
; The register-pressure scheduler should be able to schedule this in a
; way that does not require spills.
diff --git a/test/CodeGen/X86/2009-04-16-SpillerUnfold.ll b/test/CodeGen/X86/2009-04-16-SpillerUnfold.ll
index f46eed4..ad18a0c 100644
--- a/test/CodeGen/X86/2009-04-16-SpillerUnfold.ll
+++ b/test/CodeGen/X86/2009-04-16-SpillerUnfold.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -relocation-model=pic -disable-fp-elim -stats |& grep {Number of modref unfolded}
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -relocation-model=pic -disable-fp-elim -stats 2>&1 | grep "Number of modref unfolded"
; XFAIL: *
; 69408 removed the opportunity for this optimization to work
diff --git a/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll b/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll
index 9f5a8c5..5cb05e8 100644
--- a/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll
+++ b/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll
@@ -1,5 +1,5 @@
; RUN: llc -mtriple=i386-apple-darwin10.0 -relocation-model=pic -asm-verbose=false \
-; RUN: -disable-fp-elim -mattr=-sse41,-sse3,+sse2 -post-RA-scheduler=false -regalloc=basic < %s | \
+; RUN: -mcpu=generic -disable-fp-elim -mattr=-sse41,-sse3,+sse2 -post-RA-scheduler=false -regalloc=basic < %s | \
; RUN: FileCheck %s
; rdar://6808032
diff --git a/test/CodeGen/X86/2009-04-24.ll b/test/CodeGen/X86/2009-04-24.ll
index d6ed0c4..08bf9e3 100644
--- a/test/CodeGen/X86/2009-04-24.ll
+++ b/test/CodeGen/X86/2009-04-24.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu -regalloc=fast -relocation-model=pic > %t2
-; RUN: grep {leaq.*TLSGD} %t2
-; RUN: grep {__tls_get_addr} %t2
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu -regalloc=fast -optimize-regalloc=0 -relocation-model=pic > %t2
+; RUN: grep "leaq.*TLSGD" %t2
+; RUN: grep "__tls_get_addr" %t2
; PR4004
@i = thread_local global i32 15
diff --git a/test/CodeGen/X86/2009-04-29-IndirectDestOperands.ll b/test/CodeGen/X86/2009-04-29-IndirectDestOperands.ll
index a2fd2e4..a6ed74b 100644
--- a/test/CodeGen/X86/2009-04-29-IndirectDestOperands.ll
+++ b/test/CodeGen/X86/2009-04-29-IndirectDestOperands.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | grep {movl.*%ebx, 8(%esi)}
+; RUN: llc < %s | grep "movl.*%ebx, 8(%esi)"
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin9.0"
diff --git a/test/CodeGen/X86/2009-05-30-ISelBug.ll b/test/CodeGen/X86/2009-05-30-ISelBug.ll
index af552d4..fe04272 100644
--- a/test/CodeGen/X86/2009-05-30-ISelBug.ll
+++ b/test/CodeGen/X86/2009-05-30-ISelBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | not grep {movzbl %\[abcd\]h,}
+; RUN: llc < %s -march=x86-64 | not grep "movzbl %[abcd]h,"
define void @BZ2_bzDecompress_bb5_2E_outer_bb35_2E_i_bb54_2E_i(i32*, i32 %c_nblock_used.2.i, i32 %.reload51, i32* %.out, i32* %.out1, i32* %.out2, i32* %.out3) nounwind {
newFuncRoot:
diff --git a/test/CodeGen/X86/20090313-signext.ll b/test/CodeGen/X86/20090313-signext.ll
index de930d5..b8effa6 100644
--- a/test/CodeGen/X86/20090313-signext.ll
+++ b/test/CodeGen/X86/20090313-signext.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -march=x86-64 -relocation-model=pic > %t
-; RUN: grep {movswl %ax, %edi} %t
-; RUN: grep {movw (%rax), %ax} %t
+; RUN: grep "movswl %ax, %edi" %t
+; RUN: grep "movw (%rax), %ax" %t
; XFAIL: *
@x = common global i16 0
diff --git a/test/CodeGen/X86/2010-01-19-OptExtBug.ll b/test/CodeGen/X86/2010-01-19-OptExtBug.ll
index cd8960b..eb4a5c0 100644
--- a/test/CodeGen/X86/2010-01-19-OptExtBug.ll
+++ b/test/CodeGen/X86/2010-01-19-OptExtBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin11 -relocation-model=pic -disable-fp-elim -stats |& not grep ext-opt
+; RUN: llc < %s -mtriple=x86_64-apple-darwin11 -relocation-model=pic -disable-fp-elim -stats 2>&1 | not grep ext-opt
define fastcc i8* @S_scan_str(i8* %start, i32 %keep_quoted, i32 %keep_delims) nounwind ssp {
entry:
diff --git a/test/CodeGen/X86/2010-05-06-LocalInlineAsmClobber.ll b/test/CodeGen/X86/2010-05-06-LocalInlineAsmClobber.ll
index 90eb84d..35f2339 100644
--- a/test/CodeGen/X86/2010-05-06-LocalInlineAsmClobber.ll
+++ b/test/CodeGen/X86/2010-05-06-LocalInlineAsmClobber.ll
@@ -1,4 +1,4 @@
-; RUN: llc -regalloc=fast %s -o %t
+; RUN: llc -regalloc=fast -optimize-regalloc=0 %s -o %t
; PR7066
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/test/CodeGen/X86/2010-05-12-FastAllocKills.ll b/test/CodeGen/X86/2010-05-12-FastAllocKills.ll
index 36a99d6..eb0b150 100644
--- a/test/CodeGen/X86/2010-05-12-FastAllocKills.ll
+++ b/test/CodeGen/X86/2010-05-12-FastAllocKills.ll
@@ -1,4 +1,4 @@
-; RUN: llc -regalloc=fast -verify-machineinstrs < %s
+; RUN: llc -regalloc=fast -optimize-regalloc=0 -verify-machineinstrs < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-darwin"
diff --git a/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll b/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll
index 4639866..9b47bb7 100644
--- a/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll
+++ b/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll
@@ -1,4 +1,4 @@
-; RUN: llc -regalloc=fast < %s | FileCheck %s
+; RUN: llc -regalloc=fast -optimize-regalloc=0 < %s | FileCheck %s
; PR7382
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/CodeGen/X86/2011-04-13-SchedCmpJmp.ll b/test/CodeGen/X86/2011-04-13-SchedCmpJmp.ll
index c6f4b49..be10ad5 100644
--- a/test/CodeGen/X86/2011-04-13-SchedCmpJmp.ll
+++ b/test/CodeGen/X86/2011-04-13-SchedCmpJmp.ll
@@ -12,9 +12,9 @@ declare hidden fastcc void @_ZN3JSCL23returnToThrowTrampolineEPNS_12JSGlobalData
; Avoid hoisting the test above loads or copies
; CHECK: %entry
-; CHECK: cmpq
+; CHECK: test
; CHECK-NOT: mov
-; CHECK: jb
+; CHECK: je
define i32 @cti_op_eq(i8** nocapture %args) nounwind ssp {
entry:
%0 = load i8** null, align 8
diff --git a/test/CodeGen/X86/2011-04-19-sclr-bb.ll b/test/CodeGen/X86/2011-04-19-sclr-bb.ll
new file mode 100644
index 0000000..771e4b3
--- /dev/null
+++ b/test/CodeGen/X86/2011-04-19-sclr-bb.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s
+
+; Make sure that values of illegal types are not scalarized between basic blocks.
+;CHECK: test
+;CHECK-NOT: pinsrw
+;CHECK-NOT: pextrb
+;CHECK: ret
+define void @test(i1 %cond) {
+ENTRY:
+ br label %LOOP
+LOOP:
+ %vec1 = phi <4 x i1> [ %vec1_or_2, %LOOP ], [ zeroinitializer, %ENTRY ]
+ %vec2 = phi <4 x i1> [ %vec2_and_1, %LOOP ], [ zeroinitializer, %ENTRY ]
+ %vec1_or_2 = or <4 x i1> %vec1, %vec2
+ %vec2_and_1 = and <4 x i1> %vec2, %vec1
+ br i1 %cond, label %LOOP, label %EXIT
+
+EXIT:
+ ret void
+}
+
diff --git a/test/CodeGen/X86/2011-06-03-x87chain.ll b/test/CodeGen/X86/2011-06-03-x87chain.ll
index bf7f583..ce63c74 100644
--- a/test/CodeGen/X86/2011-06-03-x87chain.ll
+++ b/test/CodeGen/X86/2011-06-03-x87chain.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse | FileCheck %s
+; RUN: llc < %s -mcpu=generic -march=x86 -mattr=+sse | FileCheck %s
define float @chainfail1(i64* nocapture %a, i64* nocapture %b, i32 %x, i32 %y, float* nocapture %f) nounwind uwtable noinline ssp {
entry:
diff --git a/test/CodeGen/X86/2011-06-12-FastAllocSpill.ll b/test/CodeGen/X86/2011-06-12-FastAllocSpill.ll
index a51dad0..47ef693 100644
--- a/test/CodeGen/X86/2011-06-12-FastAllocSpill.ll
+++ b/test/CodeGen/X86/2011-06-12-FastAllocSpill.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -disable-fp-elim -relocation-model=pic -stats |& FileCheck %s
+; RUN: llc < %s -O0 -disable-fp-elim -relocation-model=pic -stats 2>&1 | FileCheck %s
;
; This test should not cause any spilling with RAFast.
;
diff --git a/test/CodeGen/X86/2011-09-18-sse2cmp.ll b/test/CodeGen/X86/2011-09-18-sse2cmp.ll
index 844d674..a6f428f 100644
--- a/test/CodeGen/X86/2011-09-18-sse2cmp.ll
+++ b/test/CodeGen/X86/2011-09-18-sse2cmp.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -march=x86 -mcpu=yonah -promote-elements -mattr=+sse2,-sse41 | FileCheck %s
+;RUN: llc < %s -march=x86 -mcpu=yonah -mattr=+sse2,-sse41 | FileCheck %s
;CHECK: @max
;CHECK: cmplepd
diff --git a/test/CodeGen/X86/2011-09-21-setcc-bug.ll b/test/CodeGen/X86/2011-09-21-setcc-bug.ll
index ed5649c..4daf678 100644
--- a/test/CodeGen/X86/2011-09-21-setcc-bug.ll
+++ b/test/CodeGen/X86/2011-09-21-setcc-bug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 -promote-elements -mattr=+sse41
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+sse41
; Make sure we are not crashing on this code.
diff --git a/test/CodeGen/X86/2011-10-11-srl.ll b/test/CodeGen/X86/2011-10-11-srl.ll
index cf9d36f..6c6d340 100644
--- a/test/CodeGen/X86/2011-10-11-srl.ll
+++ b/test/CodeGen/X86/2011-10-11-srl.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -promote-elements -mattr=-sse41
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=-sse41
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/CodeGen/X86/2011-12-15-vec_shift.ll b/test/CodeGen/X86/2011-12-15-vec_shift.ll
index 6f9188c..dc3a08b 100644
--- a/test/CodeGen/X86/2011-12-15-vec_shift.ll
+++ b/test/CodeGen/X86/2011-12-15-vec_shift.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=x86-64 -mattr=+sse41 < %s | FileCheck %s -check-prefix=CHECK-W-SSE4
-; RUN: llc -march=x86-64 -mattr=-sse41 < %s | FileCheck %s -check-prefix=CHECK-WO-SSE4
+; RUN: llc -march=x86-64 -mattr=+sse41 -mcpu=penryn < %s | FileCheck %s -check-prefix=CHECK-W-SSE4
+; RUN: llc -march=x86-64 -mattr=-sse41 -mcpu=penryn < %s | FileCheck %s -check-prefix=CHECK-WO-SSE4
; Test case for r146671
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.7"
diff --git a/test/CodeGen/X86/2012-02-20-MachineCPBug.ll b/test/CodeGen/X86/2012-02-20-MachineCPBug.ll
index 557d49d..477b4de 100644
--- a/test/CodeGen/X86/2012-02-20-MachineCPBug.ll
+++ b/test/CodeGen/X86/2012-02-20-MachineCPBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-macosx -mattr=+sse | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-macosx -mcpu=core2 -mattr=+sse | FileCheck %s
; PR11940: Do not optimize away movb %al, %ch
%struct.APInt = type { i64* }
diff --git a/test/CodeGen/X86/2012-03-26-PostRALICMBug.ll b/test/CodeGen/X86/2012-03-26-PostRALICMBug.ll
index 101ecca..18a3313 100644
--- a/test/CodeGen/X86/2012-03-26-PostRALICMBug.ll
+++ b/test/CodeGen/X86/2012-03-26-PostRALICMBug.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -stats |& \
-; RUN: not grep {Number of machine instructions hoisted out of loops post regalloc}
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -stats 2>&1 | \
+; RUN: not grep "Number of machine instructions hoisted out of loops post regalloc"
; rdar://11095580
diff --git a/test/CodeGen/X86/2012-04-26-sdglue.ll b/test/CodeGen/X86/2012-04-26-sdglue.ll
index 9543587..9a66b67 100644
--- a/test/CodeGen/X86/2012-04-26-sdglue.ll
+++ b/test/CodeGen/X86/2012-04-26-sdglue.ll
@@ -5,7 +5,8 @@
; It's hard to test for the ISEL condition because CodeGen optimizes
; away the bugpointed code. Just ensure the basics are still there.
;CHECK: func:
-;CHECK: vmovups
+;CHECK: vpxor
+;CHECK: vinsertf128
;CHECK: vpshufd
;CHECK: vpshufd
;CHECK: vmulps
diff --git a/test/CodeGen/X86/2012-05-17-TwoAddressBug.ll b/test/CodeGen/X86/2012-05-17-TwoAddressBug.ll
new file mode 100644
index 0000000..171c3f1
--- /dev/null
+++ b/test/CodeGen/X86/2012-05-17-TwoAddressBug.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -pre-RA-sched=source | FileCheck %s
+
+; Teach two-address pass to update the "source" map so it doesn't perform a
+; non-profitable commute using outdated info. The test case would still fail
+; because of poor pre-RA schedule. That will be fixed by MI scheduler.
+; rdar://11472010
+define i32 @t(i32 %mask) nounwind readnone ssp {
+entry:
+; CHECK: t:
+; CHECK-NOT: mov
+ %sub = add i32 %mask, -65535
+ %shr = lshr i32 %sub, 23
+ %and = and i32 %mask, 1
+ %add = add i32 %shr, %and
+ ret i32 %add
+}
diff --git a/test/CodeGen/X86/2012-05-19-CoalescerCrash.ll b/test/CodeGen/X86/2012-05-19-CoalescerCrash.ll
new file mode 100644
index 0000000..837fbc0
--- /dev/null
+++ b/test/CodeGen/X86/2012-05-19-CoalescerCrash.ll
@@ -0,0 +1,122 @@
+; RUN: llc < %s -verify-coalescing
+; PR12892
+;
+; Dead code elimination during coalescing causes a live range to split into two
+; virtual registers. Stale identity copies that had already been joined were
+; interfering with the liveness computations.
+
+target triple = "i386-pc-linux-gnu"
+
+define void @_ZN4llvm17AsmMatcherEmitter3runERNS_11raw_ostreamE() align 2 {
+ invoke void @_ZNK4llvm13CodeGenTarget12getAsmParserEv()
+ to label %1 unwind label %5
+
+; <label>:1 ; preds = %0
+ invoke void @_ZNK4llvm6Record16getValueAsStringENS_9StringRefE()
+ to label %4 unwind label %2
+
+; <label>:2 ; preds = %1
+ %3 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ cleanup
+ unreachable
+
+; <label>:4 ; preds = %1
+ invoke void @_ZN4llvm18isCurrentDebugTypeEPKc()
+ to label %12 unwind label %7
+
+; <label>:5 ; preds = %0
+ %6 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ cleanup
+ br label %33
+
+; <label>:7 ; preds = %4
+ %8 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ cleanup
+ br label %9
+
+; <label>:9 ; preds = %28, %7
+ %10 = phi { i8*, i32 } [ %29, %28 ], [ %8, %7 ]
+ %11 = extractvalue { i8*, i32 } %10, 1
+ invoke fastcc void @_ZN12_GLOBAL__N_114AsmMatcherInfoD2Ev()
+ to label %32 unwind label %35
+
+; <label>:12 ; preds = %4
+ invoke void @_ZNK4llvm13CodeGenTarget10getRegBankEv()
+ to label %13 unwind label %16
+
+; <label>:13 ; preds = %12
+ br label %14
+
+; <label>:14 ; preds = %20, %13
+ %15 = icmp eq i32 undef, 0
+ br i1 %15, label %20, label %18
+
+; <label>:16 ; preds = %12
+ %17 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ cleanup
+ br label %26
+
+; <label>:18 ; preds = %14
+ invoke void @_ZNSs4_Rep9_S_createEjjRKSaIcE()
+ to label %19 unwind label %21
+
+; <label>:19 ; preds = %18
+ unreachable
+
+; <label>:20 ; preds = %14
+ br label %14
+
+; <label>:21 ; preds = %18
+ %22 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ cleanup
+ %23 = extractvalue { i8*, i32 } %22, 1
+ br i1 undef, label %26, label %24
+
+; <label>:24 ; preds = %21
+ br i1 undef, label %25, label %26
+
+; <label>:25 ; preds = %24
+ unreachable
+
+; <label>:26 ; preds = %24, %21, %16
+ %27 = phi i32 [ 0, %16 ], [ %23, %21 ], [ %23, %24 ]
+ invoke void @_ZNSt6vectorISt4pairISsSsESaIS1_EED1Ev()
+ to label %28 unwind label %30
+
+; <label>:28 ; preds = %26
+ %29 = insertvalue { i8*, i32 } undef, i32 %27, 1
+ br label %9
+
+; <label>:30 ; preds = %26
+ %31 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ catch i8* null
+ unreachable
+
+; <label>:32 ; preds = %9
+ br label %33
+
+; <label>:33 ; preds = %32, %5
+ %34 = phi i32 [ undef, %5 ], [ %11, %32 ]
+ unreachable
+
+; <label>:35 ; preds = %9
+ %36 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ catch i8* null
+ unreachable
+}
+
+declare void @_ZNK4llvm13CodeGenTarget12getAsmParserEv()
+
+declare i32 @__gxx_personality_v0(...)
+
+declare void @_ZNK4llvm6Record16getValueAsStringENS_9StringRefE()
+
+declare void @_ZN4llvm18isCurrentDebugTypeEPKc()
+
+declare fastcc void @_ZN12_GLOBAL__N_114AsmMatcherInfoD2Ev() unnamed_addr inlinehint align 2
+
+declare hidden void @_ZNSt6vectorISt4pairISsSsESaIS1_EED1Ev() unnamed_addr align 2
+
+declare void @_ZNSs4_Rep9_S_createEjjRKSaIcE()
+
+declare void @_ZNK4llvm13CodeGenTarget10getRegBankEv()
diff --git a/test/CodeGen/X86/2012-05-19-avx2-store.ll b/test/CodeGen/X86/2012-05-19-avx2-store.ll
new file mode 100644
index 0000000..1c1e8e2
--- /dev/null
+++ b/test/CodeGen/X86/2012-05-19-avx2-store.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx2 | FileCheck %s
+
+define void @double_save(<4 x i32>* %Ap, <4 x i32>* %Bp, <8 x i32>* %P) nounwind ssp {
+entry:
+ ; CHECK: vmovaps
+ ; CHECK: vinsertf128 $1, ([[A0:%rdi|%rsi]]),
+ ; CHECK: vmovups
+ %A = load <4 x i32>* %Ap
+ %B = load <4 x i32>* %Bp
+ %Z = shufflevector <4 x i32>%A, <4 x i32>%B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ store <8 x i32> %Z, <8 x i32>* %P, align 16
+ ret void
+}
diff --git a/test/CodeGen/X86/2012-07-10-extload64.ll b/test/CodeGen/X86/2012-07-10-extload64.ll
new file mode 100644
index 0000000..906b748
--- /dev/null
+++ b/test/CodeGen/X86/2012-07-10-extload64.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -march=x86 -mcpu=corei7 -mtriple=i686-pc-win32 | FileCheck %s
+
+; CHECK: load_store
+define void @load_store(<4 x i16>* %in) {
+entry:
+; CHECK: movsd
+ %A27 = load <4 x i16>* %in, align 4
+ %A28 = add <4 x i16> %A27, %A27
+; CHECK: movlpd
+ store <4 x i16> %A28, <4 x i16>* %in, align 4
+ ret void
+; CHECK: ret
+}
+
+; Make sure that we store a 64-bit value, even on 32-bit systems.
+;CHECK: store_64
+define void @store_64(<2 x i32>* %ptr) {
+BB:
+ store <2 x i32> zeroinitializer, <2 x i32>* %ptr
+ ret void
+;CHECK: movlpd
+;CHECK: ret
+}
+
+;CHECK: load_64
+define <2 x i32> @load_64(<2 x i32>* %ptr) {
+BB:
+ %t = load <2 x i32>* %ptr
+ ret <2 x i32> %t
+;CHECK: movsd
+;CHECK: ret
+}
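A hedged aside on store_64 above (the sketch below is hypothetical and not part of the patch): on 32-bit x86 a <2 x i32> value is widened into an XMM register, yet only the low 64 bits may reach memory, which is what the movlpd in the CHECK lines guarantees; the effect is roughly that of storing through a 64-bit view of the pointer.

; Minimal sketch, assuming the usual widening of <2 x i32>: the vector is
; held in a 128-bit register, but exactly 64 bits are written to memory.
define void @store_64_sketch(<2 x i32>* %ptr) {
BB:
  %as64 = bitcast <2 x i32>* %ptr to i64*
  store i64 0, i64* %as64
  ret void
}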
diff --git a/test/CodeGen/X86/2012-07-10-shufnorm.ll b/test/CodeGen/X86/2012-07-10-shufnorm.ll
new file mode 100644
index 0000000..e39df58
--- /dev/null
+++ b/test/CodeGen/X86/2012-07-10-shufnorm.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -march=x86 -mcpu=corei7 -mattr=+avx | FileCheck %s
+
+; CHECK: ocl
+define void @ocl() {
+entry:
+ %vext = shufflevector <2 x double> zeroinitializer, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %vecinit = shufflevector <8 x double> %vext, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %vecinit1 = insertelement <8 x double> %vecinit, double undef, i32 2
+ %vecinit3 = insertelement <8 x double> %vecinit1, double undef, i32 3
+ %vecinit5 = insertelement <8 x double> %vecinit3, double 0.000000e+00, i32 4
+ %vecinit9 = shufflevector <8 x double> %vecinit5, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 9, i32 10>
+ store <8 x double> %vecinit9, <8 x double>* undef
+ ret void
+; CHECK: vxorps
+; CHECK: ret
+}
+
diff --git a/test/CodeGen/X86/2012-07-15-broadcastfold.ll b/test/CodeGen/X86/2012-07-15-broadcastfold.ll
new file mode 100644
index 0000000..3b7a8a7
--- /dev/null
+++ b/test/CodeGen/X86/2012-07-15-broadcastfold.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -march=x86 -mcpu=corei7 -mattr=+avx2 | FileCheck %s
+
+declare x86_fastcallcc i64 @barrier()
+
+;CHECK: bcast_fold
+;CHECK: vmovaps %xmm{{[0-9]+}}, [[SPILLED:[^\)]+\)]]
+;CHECK: barrier
+;CHECK: vbroadcastss [[SPILLED]], %ymm0
+;CHECK: ret
+define <8 x float> @bcast_fold( float* %A) {
+BB:
+ %A0 = load float* %A
+ %tt3 = call x86_fastcallcc i64 @barrier()
+ br i1 undef, label %work, label %exit
+
+work:
+ %A1 = insertelement <8 x float> undef, float %A0, i32 0
+ %A2 = shufflevector <8 x float> %A1, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x float> %A2
+
+exit:
+ ret <8 x float> undef
+}
diff --git a/test/CodeGen/X86/2012-07-15-tconst_shl.ll b/test/CodeGen/X86/2012-07-15-tconst_shl.ll
new file mode 100644
index 0000000..46eca76
--- /dev/null
+++ b/test/CodeGen/X86/2012-07-15-tconst_shl.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+avx2
+; Make sure that we do not crash.
+
+define <16 x i32> @autogen_SD34717() {
+BB:
+ %Shuff7 = shufflevector <16 x i32> zeroinitializer, <16 x i32> zeroinitializer, <16 x i32> <i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 undef, i32 22, i32 24, i32 26, i32 28, i32 30, i32 undef>
+ %B9 = lshr <16 x i32> zeroinitializer, %Shuff7
+ ret <16 x i32> %B9
+}
diff --git a/test/CodeGen/X86/2012-07-15-vshl.ll b/test/CodeGen/X86/2012-07-15-vshl.ll
new file mode 100644
index 0000000..cd0fef4
--- /dev/null
+++ b/test/CodeGen/X86/2012-07-15-vshl.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -march=x86 -mcpu=corei7 -mattr=+avx
+; PR13352
+
+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
+
+define void @f_f() nounwind {
+allocas:
+ br label %for_loop29
+
+for_loop29: ; preds = %safe_if_after_true, %allocas
+ %indvars.iv596 = phi i64 [ %indvars.iv.next597, %safe_if_after_true ], [ 0, %allocas ]
+ %0 = trunc i64 %indvars.iv596 to i32
+ %smear.15 = insertelement <16 x i32> undef, i32 %0, i32 15
+ %bitop = lshr <16 x i32> zeroinitializer, %smear.15
+ %bitop35 = and <16 x i32> %bitop, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %bitop35_to_bool = icmp ne <16 x i32> %bitop35, zeroinitializer
+ %val_to_boolvec32 = sext <16 x i1> %bitop35_to_bool to <16 x i32>
+ %floatmask.i526 = bitcast <16 x i32> %val_to_boolvec32 to <16 x float>
+ %mask1.i529 = shufflevector <16 x float> %floatmask.i526, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %"internal_mask&function_mask41_any" = icmp eq i32 undef, 0
+ br i1 %"internal_mask&function_mask41_any", label %safe_if_after_true, label %safe_if_run_true
+
+safe_if_after_true: ; preds = %for_loop29
+ %indvars.iv.next597 = add i64 %indvars.iv596, 1
+ br label %for_loop29
+
+safe_if_run_true: ; preds = %for_loop29
+ %blend1.i583 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> undef, <8 x float> undef, <8 x float> %mask1.i529) nounwind
+ unreachable
+}
+
diff --git a/test/CodeGen/X86/2012-07-16-LeaUndef.ll b/test/CodeGen/X86/2012-07-16-LeaUndef.ll
new file mode 100644
index 0000000..9e5cbd2
--- /dev/null
+++ b/test/CodeGen/X86/2012-07-16-LeaUndef.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7
+
+define void @autogen_SD2543() {
+A:
+ %E83 = add i32 0, 1
+ %E820 = add i32 0, undef
+ br label %C
+C:
+ %B908 = add i32 %E83, %E820
+ store i32 %B908, i32* undef
+ %Sl2391 = select i1 undef, i32 undef, i32 %E83
+ %Cmp3114 = icmp ne i32 %Sl2391, undef
+ br i1 %Cmp3114, label %C, label %G
+G:
+ ret void
+}
diff --git a/test/CodeGen/X86/2012-07-16-fp2ui-i1.ll b/test/CodeGen/X86/2012-07-16-fp2ui-i1.ll
new file mode 100644
index 0000000..17533a1
--- /dev/null
+++ b/test/CodeGen/X86/2012-07-16-fp2ui-i1.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7
+
+define void @autogen_SD3100() {
+BB:
+ %FC123 = fptoui float 0x40693F5D00000000 to i1
+ br i1 %FC123, label %V, label %W
+
+V:
+ ret void
+W:
+ ret void
+}
diff --git a/test/CodeGen/X86/2012-07-17-vtrunc.ll b/test/CodeGen/X86/2012-07-17-vtrunc.ll
new file mode 100644
index 0000000..2de2f97
--- /dev/null
+++ b/test/CodeGen/X86/2012-07-17-vtrunc.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7
+
+define void @autogen_SD33189483() {
+BB:
+ br label %CF76
+
+CF76: ; preds = %CF76, %BB
+ %Shuff13 = shufflevector <4 x i64> zeroinitializer, <4 x i64> undef, <4 x i32> zeroinitializer
+ %Tr16 = trunc <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1> to <8 x i1>
+ %E19 = extractelement <8 x i1> %Tr16, i32 2
+ br i1 %E19, label %CF76, label %CF78
+
+CF78: ; preds = %CF78, %CF76
+ %BC = bitcast <4 x i64> %Shuff13 to <4 x double>
+ br label %CF78
+}
diff --git a/test/CodeGen/X86/2012-07-23-select_cc.ll b/test/CodeGen/X86/2012-07-23-select_cc.ll
new file mode 100644
index 0000000..33fcb12
--- /dev/null
+++ b/test/CodeGen/X86/2012-07-23-select_cc.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; PR 13428
+
+declare void @use(double)
+
+define void @test() {
+entry:
+ call void @use(double 1.000000e+00)
+ %A = icmp eq i64 undef, 2
+ %B = zext i1 %A to i32
+ %C = sitofp i32 %B to double
+ call void @use(double %C)
+ call void @use(double 0.000000e+00)
+ unreachable
+}
diff --git a/test/CodeGen/X86/2012-08-07-CmpISelBug.ll b/test/CodeGen/X86/2012-08-07-CmpISelBug.ll
new file mode 100644
index 0000000..000b853
--- /dev/null
+++ b/test/CodeGen/X86/2012-08-07-CmpISelBug.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx | FileCheck %s
+
+; Cmp lowering should not look past the truncate unless the high bits are known
+; zero.
+; rdar://12027825
+
+define void @foo(i8 %arg4, i32 %arg5, i32* %arg14) nounwind {
+bb:
+; CHECK: foo:
+; CHECK-NOT: testl
+; CHECK: testb
+ %tmp48 = zext i8 %arg4 to i32
+ %tmp49 = and i32 %tmp48, 32
+ %tmp50 = add i32 %tmp49, 1593371643
+ %tmp55 = sub i32 %tmp50, 0
+ %tmp56 = add i32 %tmp55, 7787538
+ %tmp57 = xor i32 %tmp56, 1601159181
+ %tmp58 = xor i32 %arg5, 1601159181
+ %tmp59 = and i32 %tmp57, %tmp58
+ %tmp60 = add i32 %tmp59, -1263900958
+ %tmp67 = sub i32 %tmp60, 0
+ %tmp103 = xor i32 %tmp56, 13
+ %tmp104 = trunc i32 %tmp103 to i8
+ %tmp105 = sub i8 0, %tmp104
+ %tmp106 = add i8 %tmp105, -103
+ %tmp113 = sub i8 %tmp106, 0
+ %tmp114 = add i8 %tmp113, -72
+ %tmp141 = icmp ne i32 %tmp67, -1263900958
+ %tmp142 = select i1 %tmp141, i8 %tmp114, i8 undef
+ %tmp143 = xor i8 %tmp142, 81
+ %tmp144 = zext i8 %tmp143 to i32
+ %tmp145 = add i32 %tmp144, 2062143348
+ %tmp152 = sub i32 %tmp145, 0
+ store i32 %tmp152, i32* %arg14
+ ret void
+}
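As a hedged illustration of the rule above (the function below is hypothetical and not part of the patch): looking past a truncate is only sound when the dropped high bits are known zero, since otherwise the wide and narrow compares can disagree.

; Minimal sketch: %t may carry arbitrary high bits, so testing all 32 bits
; of %t is not equivalent to testing the low 8 bits of %lo; the compare has
; to stay at the truncated width (testb rather than testl).
define i1 @trunc_cmp_sketch(i32 %t) nounwind {
  %lo = trunc i32 %t to i8
  %c = icmp eq i8 %lo, 0
  ret i1 %c
}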
diff --git a/test/CodeGen/X86/4char-promote.ll b/test/CodeGen/X86/4char-promote.ll
index 386057f..4f1a859 100644
--- a/test/CodeGen/X86/4char-promote.ll
+++ b/test/CodeGen/X86/4char-promote.ll
@@ -1,11 +1,12 @@
; A test for checking PR 9623
-;RUN: llc -march=x86-64 -mcpu=corei7 -promote-elements < %s | FileCheck %s
+; RUN: llc -march=x86-64 -mcpu=corei7 < %s | FileCheck %s
target triple = "x86_64-apple-darwin"
-; CHECK: pmulld
-; CHECK: paddd
-; CHECK: movdqa
+; CHECK: pmulld
+; CHECK: paddd
+; CHECK-NOT: movdqa
+; CHECK: ret
define <4 x i8> @foo(<4 x i8> %x, <4 x i8> %y) {
entry:
diff --git a/test/CodeGen/X86/MachineSink-PHIUse.ll b/test/CodeGen/X86/MachineSink-PHIUse.ll
index 3758fd8..3314168 100644
--- a/test/CodeGen/X86/MachineSink-PHIUse.ll
+++ b/test/CodeGen/X86/MachineSink-PHIUse.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-appel-darwin -disable-cgp-branch-opts -stats |& grep {machine-sink}
+; RUN: llc < %s -mtriple=x86_64-appel-darwin -disable-cgp-branch-opts -stats 2>&1 | grep "machine-sink"
define fastcc void @t() nounwind ssp {
entry:
diff --git a/test/CodeGen/X86/add.ll b/test/CodeGen/X86/add.ll
index 8e871f4..03d2e47 100644
--- a/test/CodeGen/X86/add.ll
+++ b/test/CodeGen/X86/add.ll
@@ -1,8 +1,6 @@
; RUN: llc < %s -mcpu=generic -march=x86 | FileCheck %s -check-prefix=X32
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -join-physregs | FileCheck %s -check-prefix=X64
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-win32 -join-physregs | FileCheck %s -check-prefix=X64
-
-; Some of these tests depend on -join-physregs to commute instructions.
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-win32 | FileCheck %s -check-prefix=X64
; The immediate can be encoded in a smaller way if the
; instruction is a sub instead of an add.
@@ -101,9 +99,9 @@ define {i32, i1} @test7(i32 %v1, i32 %v2) nounwind {
}
; X64: test7:
-; X64: addl %e[[A1]], %eax
+; X64: addl %e[[A1]], %e
; X64-NEXT: setb %dl
-; X64-NEXT: ret
+; X64: ret
; PR5443
define {i64, i1} @test8(i64 %left, i64 %right) nounwind {
diff --git a/test/CodeGen/X86/addr-label-difference.ll b/test/CodeGen/X86/addr-label-difference.ll
index 49abd8a..15fbec5 100644
--- a/test/CodeGen/X86/addr-label-difference.ll
+++ b/test/CodeGen/X86/addr-label-difference.ll
@@ -1,4 +1,4 @@
-; RUN: llc %s -o - | grep {__TEXT,__const}
+; RUN: llc %s -o - | grep "__TEXT,__const"
; PR5929
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
target triple = "i386-apple-darwin10.0"
diff --git a/test/CodeGen/X86/aligned-comm.ll b/test/CodeGen/X86/aligned-comm.ll
index 7715869..eab02cc 100644
--- a/test/CodeGen/X86/aligned-comm.ll
+++ b/test/CodeGen/X86/aligned-comm.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -march=x86
-; RUN: llc < %s -mtriple=i386-apple-darwin10 | grep {array,16512,7}
-; RUN: llc < %s -mtriple=i386-apple-darwin9 | grep {array,16512,7}
+; RUN: llc < %s -mtriple=i386-apple-darwin10 | grep "array,16512,7"
+; RUN: llc < %s -mtriple=i386-apple-darwin9 | grep "array,16512,7"
; Darwin 9+ should get alignment on common symbols.
@array = common global [4128 x i32] zeroinitializer, align 128
diff --git a/test/CodeGen/X86/alignment-2.ll b/test/CodeGen/X86/alignment-2.ll
index cc709b5..1f9e85c 100644
--- a/test/CodeGen/X86/alignment-2.ll
+++ b/test/CodeGen/X86/alignment-2.ll
@@ -18,7 +18,9 @@
define signext i8 @do_lo_list() nounwind optsize ssp {
bb:
; CHECK: do_lo_list
-; CHECK-NOT: movaps
+; Make sure we do not use movaps for the global variable.
+; It is okay to use movaps for writing the local variable on the stack.
+; CHECK-NOT: movaps {{[0-9]*}}(%{{[a-z]*}}), {{%xmm[0-9]}}
%myopt = alloca %struct.printQueryOpt, align 4
%tmp = bitcast %struct.printQueryOpt* %myopt to i8*
call void @llvm.memcpy.p0i8.p0i8.i32(i8* %tmp, i8* bitcast (%struct.printQueryOpt* getelementptr inbounds (%struct._psqlSettings* @pset, i32 0, i32 4) to i8*), i32 76, i32 4, i1 false)
diff --git a/test/CodeGen/X86/alloca-align-rounding-32.ll b/test/CodeGen/X86/alloca-align-rounding-32.ll
index c0f1a18..a45284e 100644
--- a/test/CodeGen/X86/alloca-align-rounding-32.ll
+++ b/test/CodeGen/X86/alloca-align-rounding-32.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mtriple=i686-apple-darwin | grep and | count 1
+; RUN: llc < %s -march=x86 -mtriple=i686-apple-darwin | FileCheck %s
declare void @bar(<2 x i64>* %n)
@@ -6,10 +6,15 @@ define void @foo(i32 %h) {
%p = alloca <2 x i64>, i32 %h
call void @bar(<2 x i64>* %p)
ret void
+; CHECK: foo
+; CHECK-NOT: andl $-32, %eax
}
define void @foo2(i32 %h) {
%p = alloca <2 x i64>, i32 %h, align 32
call void @bar(<2 x i64>* %p)
ret void
+; CHECK: foo2
+; CHECK: andl $-32, %esp
+; CHECK: andl $-32, %eax
}
diff --git a/test/CodeGen/X86/alloca-align-rounding.ll b/test/CodeGen/X86/alloca-align-rounding.ll
index 3c87dbf..3d76fb0 100644
--- a/test/CodeGen/X86/alloca-align-rounding.ll
+++ b/test/CodeGen/X86/alloca-align-rounding.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mtriple=i686-pc-linux | grep and | count 1
+; RUN: llc < %s -march=x86-64 -mtriple=i686-pc-linux | FileCheck %s
declare void @bar(<2 x i64>* %n)
@@ -6,10 +6,15 @@ define void @foo(i64 %h) {
%p = alloca <2 x i64>, i64 %h
call void @bar(<2 x i64>* %p)
ret void
+; CHECK: foo
+; CHECK-NOT: andq $-32, %rax
}
define void @foo2(i64 %h) {
%p = alloca <2 x i64>, i64 %h, align 32
call void @bar(<2 x i64>* %p)
ret void
+; CHECK: foo2
+; CHECK: andq $-32, %rsp
+; CHECK: andq $-32, %rax
}
diff --git a/test/CodeGen/X86/andimm8.ll b/test/CodeGen/X86/andimm8.ll
index a3dc85f..640237d 100644
--- a/test/CodeGen/X86/andimm8.ll
+++ b/test/CodeGen/X86/andimm8.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-pc-linux-gnu -show-mc-encoding -join-physregs | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-pc-linux-gnu -show-mc-encoding | FileCheck %s
; PR8365
; CHECK: andl $-64, %edi # encoding: [0x83,0xe7,0xc0]
diff --git a/test/CodeGen/X86/2008-08-25-AsmRegTypeMismatch.ll b/test/CodeGen/X86/asm-reg-type-mismatch.ll
index f0d46a0..47accdb 100644
--- a/test/CodeGen/X86/2008-08-25-AsmRegTypeMismatch.ll
+++ b/test/CodeGen/X86/asm-reg-type-mismatch.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -mcpu=core2 | grep xorps | count 2
-; RUN: llc < %s -mcpu=core2 | not grep movap
+; RUN: llc < %s -mcpu=core2 | FileCheck %s
; PR2715
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
@@ -11,8 +10,22 @@ target triple = "x86_64-unknown-linux-gnu"
%struct.nsXPTCVariant = type { %struct.nsXPTCMiniVariant, i8*, %struct.nsXPTType, i8 }
%struct.nsXPTType = type { %struct.XPTTypeDescriptorPrefix }
-define i32 @XPTC_InvokeByIndex(%struct.nsISupports* %that, i32 %methodIndex, i32 %paramCount, %struct.nsXPTCVariant* %params) nounwind {
+define i32 @test1(%struct.nsISupports* %that, i32 %methodIndex, i32 %paramCount, %struct.nsXPTCVariant* %params) nounwind {
entry:
call void asm sideeffect "", "{xmm0},{xmm1},{xmm2},{xmm3},{xmm4},{xmm5},{xmm6},{xmm7},~{dirflag},~{fpsr},~{flags}"( double undef, double undef, double undef, double 1.0, double undef, double 0.0, double undef, double 0.0 ) nounwind
ret i32 0
+ ; CHECK: test1
+ ; CHECK-NOT: movap
+ ; CHECK: xorps
+ ; CHECK: xorps
+ ; CHECK-NOT: movap
+}
+
+define i64 @test2() nounwind {
+entry:
+ %0 = tail call i64 asm sideeffect "movq $1, $0", "={xmm7},*m,~{dirflag},~{fpsr},~{flags}"(i64* null) nounwind
+ ret i64 %0
+ ; CHECK: test2
+ ; CHECK: movq {{.*}}, %xmm7
+ ; CHECK: movd %xmm7, %rax
}
diff --git a/test/CodeGen/X86/atom-lea-sp.ll b/test/CodeGen/X86/atom-lea-sp.ll
index 5942788..19482e1 100644
--- a/test/CodeGen/X86/atom-lea-sp.ll
+++ b/test/CodeGen/X86/atom-lea-sp.ll
@@ -1,15 +1,15 @@
-; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck -check-prefix=atom %s
+; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck -check-prefix=ATOM %s
; RUN: llc < %s -mcpu=core2 -mtriple=i686-linux | FileCheck %s
declare void @use_arr(i8*)
declare void @many_params(i32, i32, i32, i32, i32, i32)
define void @test1() nounwind {
-; atom: test1:
-; atom: leal -1052(%esp), %esp
-; atom-NOT: sub
-; atom: call
-; atom: leal 1052(%esp), %esp
+; ATOM: test1:
+; ATOM: leal -1052(%esp), %esp
+; ATOM-NOT: sub
+; ATOM: call
+; ATOM: leal 1052(%esp), %esp
; CHECK: test1:
; CHECK: subl
@@ -22,10 +22,10 @@ define void @test1() nounwind {
}
define void @test2() nounwind {
-; atom: test2:
-; atom: leal -28(%esp), %esp
-; atom: call
-; atom: leal 28(%esp), %esp
+; ATOM: test2:
+; ATOM: leal -28(%esp), %esp
+; ATOM: call
+; ATOM: leal 28(%esp), %esp
; CHECK: test2:
; CHECK-NOT: lea
@@ -34,9 +34,9 @@ define void @test2() nounwind {
}
define void @test3() nounwind {
-; atom: test3:
-; atom: leal -8(%esp), %esp
-; atom: leal 8(%esp), %esp
+; ATOM: test3:
+; ATOM: leal -8(%esp), %esp
+; ATOM: leal 8(%esp), %esp
; CHECK: test3:
; CHECK-NOT: lea
diff --git a/test/CodeGen/X86/atom-sched.ll b/test/CodeGen/X86/atom-sched.ll
index 4dd9a9e..0d97e85 100644
--- a/test/CodeGen/X86/atom-sched.ll
+++ b/test/CodeGen/X86/atom-sched.ll
@@ -1,9 +1,6 @@
-; XFAIL: *
; RUN: llc <%s -O2 -mcpu=atom -march=x86 -relocation-model=static | FileCheck -check-prefix=atom %s
; RUN: llc <%s -O2 -mcpu=core2 -march=x86 -relocation-model=static | FileCheck %s
;
-; FIXME: Atom's scheduler is temporarily disabled.
-; XFAIL: *
@a = common global i32 0, align 4
@b = common global i32 0, align 4
diff --git a/test/CodeGen/X86/atomic_op.ll b/test/CodeGen/X86/atomic_op.ll
index 7c5abe2..152bece 100644
--- a/test/CodeGen/X86/atomic_op.ll
+++ b/test/CodeGen/X86/atomic_op.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mcpu=generic -march=x86 | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
diff --git a/test/CodeGen/X86/avx-blend.ll b/test/CodeGen/X86/avx-blend.ll
index 7729491..188efe2 100644
--- a/test/CodeGen/X86/avx-blend.ll
+++ b/test/CodeGen/X86/avx-blend.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -promote-elements -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
; AVX128 tests:
diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll
index b334932..c44beb4 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7 -mattr=avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s
define <2 x i64> @test_x86_aesni_aesdec(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK: vaesdec
@@ -1154,7 +1154,7 @@ define i32 @test_x86_sse42_pcmpestria128(<16 x i8> %a0, <16 x i8> %a2) {
; CHECK: movl
; CHECK: movl
; CHECK: vpcmpestri
- ; CHECK: movl
+ ; CHECK: seta
%res = call i32 @llvm.x86.sse42.pcmpestria128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1165,7 +1165,7 @@ define i32 @test_x86_sse42_pcmpestric128(<16 x i8> %a0, <16 x i8> %a2) {
; CHECK: movl
; CHECK: movl
; CHECK: vpcmpestri
- ; CHECK: movl
+ ; CHECK: sbbl
%res = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1176,7 +1176,7 @@ define i32 @test_x86_sse42_pcmpestrio128(<16 x i8> %a0, <16 x i8> %a2) {
; CHECK: movl
; CHECK: movl
; CHECK: vpcmpestri
- ; CHECK: movl
+ ; CHECK: seto
%res = call i32 @llvm.x86.sse42.pcmpestrio128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1187,7 +1187,7 @@ define i32 @test_x86_sse42_pcmpestris128(<16 x i8> %a0, <16 x i8> %a2) {
; CHECK: movl
; CHECK: movl
; CHECK: vpcmpestri
- ; CHECK: movl
+ ; CHECK: sets
%res = call i32 @llvm.x86.sse42.pcmpestris128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1198,7 +1198,7 @@ define i32 @test_x86_sse42_pcmpestriz128(<16 x i8> %a0, <16 x i8> %a2) {
; CHECK: movl
; CHECK: movl
; CHECK: vpcmpestri
- ; CHECK: movl
+ ; CHECK: sete
%res = call i32 @llvm.x86.sse42.pcmpestriz128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1209,6 +1209,7 @@ define <16 x i8> @test_x86_sse42_pcmpestrm128(<16 x i8> %a0, <16 x i8> %a2) {
; CHECK: movl
; CHECK: movl
; CHECK: vpcmpestrm
+ ; CHECK-NOT: vmov
%res = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -1226,7 +1227,7 @@ declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind read
define i32 @test_x86_sse42_pcmpistria128(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK: vpcmpistri
- ; CHECK: movl
+ ; CHECK: seta
%res = call i32 @llvm.x86.sse42.pcmpistria128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1235,7 +1236,7 @@ declare i32 @llvm.x86.sse42.pcmpistria128(<16 x i8>, <16 x i8>, i8) nounwind rea
define i32 @test_x86_sse42_pcmpistric128(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK: vpcmpistri
- ; CHECK: movl
+ ; CHECK: sbbl
%res = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1244,7 +1245,7 @@ declare i32 @llvm.x86.sse42.pcmpistric128(<16 x i8>, <16 x i8>, i8) nounwind rea
define i32 @test_x86_sse42_pcmpistrio128(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK: vpcmpistri
- ; CHECK: movl
+ ; CHECK: seto
%res = call i32 @llvm.x86.sse42.pcmpistrio128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1253,7 +1254,7 @@ declare i32 @llvm.x86.sse42.pcmpistrio128(<16 x i8>, <16 x i8>, i8) nounwind rea
define i32 @test_x86_sse42_pcmpistris128(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK: vpcmpistri
- ; CHECK: movl
+ ; CHECK: sets
%res = call i32 @llvm.x86.sse42.pcmpistris128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1262,7 +1263,7 @@ declare i32 @llvm.x86.sse42.pcmpistris128(<16 x i8>, <16 x i8>, i8) nounwind rea
define i32 @test_x86_sse42_pcmpistriz128(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK: vpcmpistri
- ; CHECK: movl
+ ; CHECK: sete
%res = call i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
}
@@ -1271,6 +1272,7 @@ declare i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8>, <16 x i8>, i8) nounwind rea
define <16 x i8> @test_x86_sse42_pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK: vpcmpistrm
+ ; CHECK-NOT: vmov
%res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
@@ -2555,3 +2557,36 @@ define i32 @crc32_32_32(i32 %a, i32 %b) nounwind {
ret i32 %tmp
}
declare i32 @llvm.x86.sse42.crc32.32.32(i32, i32) nounwind
+
+; CHECK: movntdq
+define void @movnt_dq(i8* %p, <4 x i64> %a1) nounwind {
+ %a2 = add <4 x i64> %a1, <i64 1, i64 1, i64 1, i64 1>
+ tail call void @llvm.x86.avx.movnt.dq.256(i8* %p, <4 x i64> %a2) nounwind
+ ret void
+}
+declare void @llvm.x86.avx.movnt.dq.256(i8*, <4 x i64>) nounwind
+
+; CHECK: movntps
+define void @movnt_ps(i8* %p, <8 x float> %a) nounwind {
+ tail call void @llvm.x86.avx.movnt.ps.256(i8* %p, <8 x float> %a) nounwind
+ ret void
+}
+declare void @llvm.x86.avx.movnt.ps.256(i8*, <8 x float>) nounwind
+
+; CHECK: movntpd
+define void @movnt_pd(i8* %p, <4 x double> %a1) nounwind {
+  ; The add operation forces the execution domain.
+ %a2 = fadd <4 x double> %a1, <double 0x0, double 0x0, double 0x0, double 0x0>
+ tail call void @llvm.x86.avx.movnt.pd.256(i8* %p, <4 x double> %a2) nounwind
+ ret void
+}
+declare void @llvm.x86.avx.movnt.pd.256(i8*, <4 x double>) nounwind
+
+
+; Check for pclmulqdq
+define <2 x i64> @test_x86_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK: vpclmulqdq
+ %res = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, i8 0) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8) nounwind readnone
diff --git a/test/CodeGen/X86/avx-minmax.ll b/test/CodeGen/X86/avx-minmax.ll
index 7c58820..eff9251 100644
--- a/test/CodeGen/X86/avx-minmax.ll
+++ b/test/CodeGen/X86/avx-minmax.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+avx -asm-verbose=false -join-physregs -enable-unsafe-fp-math -enable-no-nans-fp-math -promote-elements | FileCheck -check-prefix=UNSAFE %s
+; RUN: llc < %s -march=x86-64 -mattr=+avx -asm-verbose=false -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck -check-prefix=UNSAFE %s
; UNSAFE: maxpd:
; UNSAFE: vmaxpd {{.+}}, %xmm
diff --git a/test/CodeGen/X86/avx-shuffle-x86_32.ll b/test/CodeGen/X86/avx-shuffle-x86_32.ll
index 5268ec3a..e203c4e 100755
--- a/test/CodeGen/X86/avx-shuffle-x86_32.ll
+++ b/test/CodeGen/X86/avx-shuffle-x86_32.ll
@@ -4,5 +4,5 @@ define <4 x i64> @test1(<4 x i64> %a) nounwind {
%b = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
ret <4 x i64>%b
; CHECK: test1:
- ; CHECK: vinsertf128
+ ; CHECK-NOT: vinsertf128
}
diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll
index 16c447b..9b41709 100644
--- a/test/CodeGen/X86/avx-shuffle.ll
+++ b/test/CodeGen/X86/avx-shuffle.ll
@@ -90,8 +90,8 @@ define i32 @test9(<4 x i32> %a) nounwind {
; Extract a value which is the result of an undef mask.
define i32 @test10(<4 x i32> %a) nounwind {
; CHECK: @test10
-; CHECK-NEXT: #
-; CHECK-NEXT: ret
+; CHECK-NOT: {{^[^#]*[a-z]}}
+; CHECK: ret
%b = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%r = extractelement <8 x i32> %b, i32 2
ret i32 %r
@@ -149,17 +149,26 @@ entry:
}
; PR12413
+; CHECK: shuf1
+; CHECK: vpshufb
+; CHECK: vpshufb
; CHECK: vpshufb
; CHECK: vpshufb
+define <32 x i8> @shuf1(<32 x i8> %inval1, <32 x i8> %inval2) {
+entry:
+ %0 = shufflevector <32 x i8> %inval1, <32 x i8> %inval2, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
+ ret <32 x i8> %0
+}
+
+; Handle the case where only half of the 256 bits is splittable.
+; CHECK: shuf2
; CHECK: vpshufb
; CHECK: vpshufb
-define <32 x i8> @shuf(<32 x i8> %inval1, <32 x i8> %inval2) {
+; CHECK: vpextrb
+; CHECK: vpextrb
+define <32 x i8> @shuf2(<32 x i8> %inval1, <32 x i8> %inval2) {
entry:
- %0 = shufflevector <32 x i8> %inval1, <32 x i8> %inval2, <32 x i32> <i32 0,
-i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32
-22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32
-42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32
-62>
+ %0 = shufflevector <32 x i8> %inval1, <32 x i8> %inval2, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 31, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
ret <32 x i8> %0
}
@@ -202,3 +211,40 @@ define <4 x i64> @blend4(<4 x i64> %a, <4 x i64> %b) nounwind alwaysinline {
%t = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
ret <4 x i64> %t
}
+
+; CHECK: narrow
+; CHECK: vpermilps
+; CHECK: ret
+define <16 x i16> @narrow(<16 x i16> %a) nounwind alwaysinline {
+ %t = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 1, i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 8, i32 undef, i32 14, i32 15, i32 undef, i32 undef>
+ ret <16 x i16> %t
+}
+
+;CHECK: test17
+;CHECK-NOT: vinsertf128
+;CHECK: ret
+define <8 x float> @test17(<4 x float> %y) {
+ %x = shufflevector <4 x float> %y, <4 x float> undef, <8 x i32> <i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x float> %x
+}
+
+; CHECK: test18
+; CHECK: vshufps
+; CHECK: vshufps
+; CHECK: vunpcklps
+; CHECK: ret
+define <8 x float> @test18(<8 x float> %A, <8 x float>%B) nounwind {
+ %S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x float>%S
+}
+
+; CHECK: test19
+; CHECK: vshufps
+; CHECK: vshufps
+; CHECK: vunpcklps
+; CHECK: ret
+define <8 x float> @test19(<8 x float> %A, <8 x float>%B) nounwind {
+ %S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x float>%S
+}
+
diff --git a/test/CodeGen/X86/avx-vbroadcast.ll b/test/CodeGen/X86/avx-vbroadcast.ll
index 148ae73..0d403d4 100644
--- a/test/CodeGen/X86/avx-vbroadcast.ll
+++ b/test/CodeGen/X86/avx-vbroadcast.ll
@@ -112,3 +112,32 @@ entry:
%vecinit2.i = insertelement <2 x double> %vecinit.i, double %q, i32 1
ret <2 x double> %vecinit2.i
}
+
+; CHECK: _RR
+; CHECK: vbroadcastss (%
+; CHECK: ret
+define <4 x float> @_RR(float* %ptr, i32* %k) nounwind uwtable readnone ssp {
+entry:
+ %q = load float* %ptr, align 4
+ %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+ %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
+ %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
+ %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
+ ; force a chain
+ %j = load i32* %k, align 4
+ store i32 %j, i32* undef
+ ret <4 x float> %vecinit6.i
+}
+
+
+; CHECK: _RR2
+; CHECK: vbroadcastss (%
+; CHECK: ret
+define <4 x float> @_RR2(float* %ptr, i32* %k) nounwind uwtable readnone ssp {
+entry:
+ %q = load float* %ptr, align 4
+ %v = insertelement <4 x float> undef, float %q, i32 0
+ %t = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %t
+}
+
diff --git a/test/CodeGen/X86/avx2-conversions.ll b/test/CodeGen/X86/avx2-conversions.ll
new file mode 100755
index 0000000..b474913
--- /dev/null
+++ b/test/CodeGen/X86/avx2-conversions.ll
@@ -0,0 +1,68 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
+
+; CHECK: trunc4
+; CHECK: vpermd
+; CHECK-NOT: vinsert
+; CHECK: ret
+define <4 x i32> @trunc4(<4 x i64> %A) nounwind {
+ %B = trunc <4 x i64> %A to <4 x i32>
+ ret <4 x i32>%B
+}
+
+; CHECK: trunc8
+; CHECK: vpshufb
+; CHECK-NOT: vinsert
+; CHECK: ret
+
+define <8 x i16> @trunc8(<8 x i32> %A) nounwind {
+ %B = trunc <8 x i32> %A to <8 x i16>
+ ret <8 x i16>%B
+}
+
+; CHECK: sext4
+; CHECK: vpmovsxdq
+; CHECK-NOT: vinsert
+; CHECK: ret
+define <4 x i64> @sext4(<4 x i32> %A) nounwind {
+ %B = sext <4 x i32> %A to <4 x i64>
+ ret <4 x i64>%B
+}
+
+; CHECK: sext8
+; CHECK: vpmovsxwd
+; CHECK-NOT: vinsert
+; CHECK: ret
+define <8 x i32> @sext8(<8 x i16> %A) nounwind {
+ %B = sext <8 x i16> %A to <8 x i32>
+ ret <8 x i32>%B
+}
+
+; CHECK: zext4
+; CHECK: vpmovzxdq
+; CHECK-NOT: vinsert
+; CHECK: ret
+define <4 x i64> @zext4(<4 x i32> %A) nounwind {
+ %B = zext <4 x i32> %A to <4 x i64>
+ ret <4 x i64>%B
+}
+
+; CHECK: zext8
+; CHECK: vpmovzxwd
+; CHECK-NOT: vinsert
+; CHECK: ret
+define <8 x i32> @zext8(<8 x i16> %A) nounwind {
+ %B = zext <8 x i16> %A to <8 x i32>
+ ret <8 x i32>%B
+}
+; CHECK: zext_8i8_8i32
+; CHECK: vpmovzxwd
+; CHECK: vpand
+; CHECK: ret
+define <8 x i32> @zext_8i8_8i32(<8 x i8> %A) nounwind {
+ %B = zext <8 x i8> %A to <8 x i32>
+ ret <8 x i32>%B
+}
+
+
+
+
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll
index 3f27a02..a6141b0 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -976,3 +976,182 @@ define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
ret void
}
declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
+
+define <2 x double> @test_x86_avx2_gather_d_pd(<2 x double> %a0, i8* %a1,
+ <4 x i32> %idx, <2 x double> %mask) {
+ ; CHECK: vgatherdpd
+ %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0,
+ i8* %a1, <4 x i32> %idx, <2 x double> %mask, i8 2) ;
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*,
+ <4 x i32>, <2 x double>, i8) nounwind readonly
+
+define <4 x double> @test_x86_avx2_gather_d_pd_256(<4 x double> %a0, i8* %a1,
+ <4 x i32> %idx, <4 x double> %mask) {
+ ; CHECK: vgatherdpd
+ %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0,
+ i8* %a1, <4 x i32> %idx, <4 x double> %mask, i8 2) ;
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*,
+ <4 x i32>, <4 x double>, i8) nounwind readonly
+
+define <2 x double> @test_x86_avx2_gather_q_pd(<2 x double> %a0, i8* %a1,
+ <2 x i64> %idx, <2 x double> %mask) {
+ ; CHECK: vgatherqpd
+ %res = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0,
+ i8* %a1, <2 x i64> %idx, <2 x double> %mask, i8 2) ;
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, i8*,
+ <2 x i64>, <2 x double>, i8) nounwind readonly
+
+define <4 x double> @test_x86_avx2_gather_q_pd_256(<4 x double> %a0, i8* %a1,
+ <4 x i64> %idx, <4 x double> %mask) {
+ ; CHECK: vgatherqpd
+ %res = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0,
+ i8* %a1, <4 x i64> %idx, <4 x double> %mask, i8 2) ;
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, i8*,
+ <4 x i64>, <4 x double>, i8) nounwind readonly
+
+define <4 x float> @test_x86_avx2_gather_d_ps(<4 x float> %a0, i8* %a1,
+ <4 x i32> %idx, <4 x float> %mask) {
+ ; CHECK: vgatherdps
+ %res = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0,
+ i8* %a1, <4 x i32> %idx, <4 x float> %mask, i8 2) ;
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*,
+ <4 x i32>, <4 x float>, i8) nounwind readonly
+
+define <8 x float> @test_x86_avx2_gather_d_ps_256(<8 x float> %a0, i8* %a1,
+ <8 x i32> %idx, <8 x float> %mask) {
+ ; CHECK: vgatherdps
+ %res = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0,
+ i8* %a1, <8 x i32> %idx, <8 x float> %mask, i8 2) ;
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*,
+ <8 x i32>, <8 x float>, i8) nounwind readonly
+
+define <4 x float> @test_x86_avx2_gather_q_ps(<4 x float> %a0, i8* %a1,
+ <2 x i64> %idx, <4 x float> %mask) {
+ ; CHECK: vgatherqps
+ %res = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0,
+ i8* %a1, <2 x i64> %idx, <4 x float> %mask, i8 2) ;
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, i8*,
+ <2 x i64>, <4 x float>, i8) nounwind readonly
+
+define <4 x float> @test_x86_avx2_gather_q_ps_256(<4 x float> %a0, i8* %a1,
+ <4 x i64> %idx, <4 x float> %mask) {
+ ; CHECK: vgatherqps
+ %res = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0,
+ i8* %a1, <4 x i64> %idx, <4 x float> %mask, i8 2) ;
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, i8*,
+ <4 x i64>, <4 x float>, i8) nounwind readonly
+
+define <2 x i64> @test_x86_avx2_gather_d_q(<2 x i64> %a0, i8* %a1,
+ <4 x i32> %idx, <2 x i64> %mask) {
+ ; CHECK: vpgatherdq
+ %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0,
+ i8* %a1, <4 x i32> %idx, <2 x i64> %mask, i8 2) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, i8*,
+ <4 x i32>, <2 x i64>, i8) nounwind readonly
+
+define <4 x i64> @test_x86_avx2_gather_d_q_256(<4 x i64> %a0, i8* %a1,
+ <4 x i32> %idx, <4 x i64> %mask) {
+ ; CHECK: vpgatherdq
+ %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0,
+ i8* %a1, <4 x i32> %idx, <4 x i64> %mask, i8 2) ;
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, i8*,
+ <4 x i32>, <4 x i64>, i8) nounwind readonly
+
+define <2 x i64> @test_x86_avx2_gather_q_q(<2 x i64> %a0, i8* %a1,
+ <2 x i64> %idx, <2 x i64> %mask) {
+ ; CHECK: vpgatherqq
+ %res = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0,
+ i8* %a1, <2 x i64> %idx, <2 x i64> %mask, i8 2) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, i8*,
+ <2 x i64>, <2 x i64>, i8) nounwind readonly
+
+define <4 x i64> @test_x86_avx2_gather_q_q_256(<4 x i64> %a0, i8* %a1,
+ <4 x i64> %idx, <4 x i64> %mask) {
+ ; CHECK: vpgatherqq
+ %res = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0,
+ i8* %a1, <4 x i64> %idx, <4 x i64> %mask, i8 2) ;
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, i8*,
+ <4 x i64>, <4 x i64>, i8) nounwind readonly
+
+define <4 x i32> @test_x86_avx2_gather_d_d(<4 x i32> %a0, i8* %a1,
+ <4 x i32> %idx, <4 x i32> %mask) {
+ ; CHECK: vpgatherdd
+ %res = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %a0,
+ i8* %a1, <4 x i32> %idx, <4 x i32> %mask, i8 2) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*,
+ <4 x i32>, <4 x i32>, i8) nounwind readonly
+
+define <8 x i32> @test_x86_avx2_gather_d_d_256(<8 x i32> %a0, i8* %a1,
+ <8 x i32> %idx, <8 x i32> %mask) {
+ ; CHECK: vpgatherdd
+ %res = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %a0,
+ i8* %a1, <8 x i32> %idx, <8 x i32> %mask, i8 2) ;
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8*,
+ <8 x i32>, <8 x i32>, i8) nounwind readonly
+
+define <4 x i32> @test_x86_avx2_gather_q_d(<4 x i32> %a0, i8* %a1,
+ <2 x i64> %idx, <4 x i32> %mask) {
+ ; CHECK: vpgatherqd
+ %res = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %a0,
+ i8* %a1, <2 x i64> %idx, <4 x i32> %mask, i8 2) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, i8*,
+ <2 x i64>, <4 x i32>, i8) nounwind readonly
+
+define <4 x i32> @test_x86_avx2_gather_q_d_256(<4 x i32> %a0, i8* %a1,
+ <4 x i64> %idx, <4 x i32> %mask) {
+ ; CHECK: vpgatherqd
+ %res = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %a0,
+ i8* %a1, <4 x i64> %idx, <4 x i32> %mask, i8 2) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, i8*,
+ <4 x i64>, <4 x i32>, i8) nounwind readonly
+
+; PR13298
+define <8 x float> @test_gather_mask(<8 x float> %a0, float* %a,
+ <8 x i32> %idx, <8 x float> %mask,
+ float* nocapture %out) {
+; CHECK: test_gather_mask
+; CHECK: vmovdqa %ymm2, [[DEST:%.*]]
+; CHECK: vgatherdps [[DEST]]
+;; gather with mask
+ %a_i8 = bitcast float* %a to i8*
+ %res = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0,
+ i8* %a_i8, <8 x i32> %idx, <8 x float> %mask, i8 4) ;
+
+;; for debugging, we'll just dump out the mask
+ %out_ptr = bitcast float * %out to <8 x float> *
+ store <8 x float> %mask, <8 x float> * %out_ptr, align 4
+
+ ret <8 x float> %res
+}
diff --git a/test/CodeGen/X86/avx2-shuffle.ll b/test/CodeGen/X86/avx2-shuffle.ll
new file mode 100644
index 0000000..c5899fa
--- /dev/null
+++ b/test/CodeGen/X86/avx2-shuffle.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
+
+; Make sure that we don't match this shuffle using the vpblendw YMM instruction.
+; The mask for the vpblendw instruction needs to be identical for both halves
+; of the YMM register, so two vpblendw instructions are needed.
+
+; CHECK: blendw1
+; CHECK: vpblendw
+; CHECK: vpblendw
+; CHECK: ret
+define <16 x i16> @blendw1(<16 x i16> %a, <16 x i16> %b) nounwind alwaysinline {
+ %t = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 18, i32 3, i32 20, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31>
+ ret <16 x i16> %t
+}
+
+; CHECK: vpshufhw $27, %ymm
+define <16 x i16> @vpshufhw(<16 x i16> %src1) nounwind uwtable readnone ssp {
+entry:
+ %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12>
+ ret <16 x i16> %shuffle.i
+}
+
+; CHECK: vpshuflw $27, %ymm
+define <16 x i16> @vpshuflw(<16 x i16> %src1) nounwind uwtable readnone ssp {
+entry:
+ %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 3, i32 undef, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i16> %shuffle.i
+}
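For contrast with blendw1, a hedged counter-example (hypothetical and not part of the patch): when the blend pattern is identical in both 128-bit halves, a single vpblendw with one immediate covers the whole YMM register.

; Minimal sketch: element 0 comes from %b and element 8 (the same lane of
; the upper half) also comes from %b, so both halves share the blend mask
; 0x01 and one vpblendw would suffice.
define <16 x i16> @blendw_same_halves(<16 x i16> %a, <16 x i16> %b) nounwind {
  %t = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i16> %t
}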
diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll
index 1a78414..b804233 100644
--- a/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -160,6 +160,15 @@ entry:
ret <8 x i32> %g
}
+; CHECK: V113
+; CHECK: vbroadcastss
+; CHECK: ret
+define <8 x float> @V113(<8 x float> %in) nounwind uwtable readnone ssp {
+entry:
+ %g = fadd <8 x float> %in, <float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000, float 0xbf80000000000000>
+ ret <8 x float> %g
+}
+
; CHECK: _e2
; CHECK: vbroadcastss
; CHECK: ret
@@ -179,9 +188,170 @@ define <8 x i8> @_e4(i8* %ptr) nounwind uwtable readnone ssp {
%vecinit1.i = insertelement <8 x i8> %vecinit0.i, i8 52, i32 1
%vecinit2.i = insertelement <8 x i8> %vecinit1.i, i8 52, i32 2
%vecinit3.i = insertelement <8 x i8> %vecinit2.i, i8 52, i32 3
- %vecinit4.i = insertelement <8 x i8> %vecinit3.i, i8 52, i32 3
- %vecinit5.i = insertelement <8 x i8> %vecinit4.i, i8 52, i32 3
- %vecinit6.i = insertelement <8 x i8> %vecinit5.i, i8 52, i32 3
- %vecinit7.i = insertelement <8 x i8> %vecinit6.i, i8 52, i32 3
+ %vecinit4.i = insertelement <8 x i8> %vecinit3.i, i8 52, i32 4
+ %vecinit5.i = insertelement <8 x i8> %vecinit4.i, i8 52, i32 5
+ %vecinit6.i = insertelement <8 x i8> %vecinit5.i, i8 52, i32 6
+ %vecinit7.i = insertelement <8 x i8> %vecinit6.i, i8 52, i32 7
ret <8 x i8> %vecinit7.i
}
+
+
+define void @crash() nounwind alwaysinline {
+WGLoopsEntry:
+ br i1 undef, label %ret, label %footer329VF
+
+footer329VF:
+ %A.0.inVF = fmul float undef, 6.553600e+04
+ %B.0.in407VF = fmul <8 x float> undef, <float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04, float 6.553600e+04>
+ %A.0VF = fptosi float %A.0.inVF to i32
+ %B.0408VF = fptosi <8 x float> %B.0.in407VF to <8 x i32>
+ %0 = and <8 x i32> %B.0408VF, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+ %1 = and i32 %A.0VF, 65535
+ %temp1098VF = insertelement <8 x i32> undef, i32 %1, i32 0
+ %vector1099VF = shufflevector <8 x i32> %temp1098VF, <8 x i32> undef, <8 x i32> zeroinitializer
+ br i1 undef, label %preload1201VF, label %footer349VF
+
+preload1201VF:
+ br label %footer349VF
+
+footer349VF:
+ %2 = mul nsw <8 x i32> undef, %0
+ %3 = mul nsw <8 x i32> undef, %vector1099VF
+ br label %footer329VF
+
+ret:
+ ret void
+}
+
+; CHECK: _inreg0
+; CHECK: broadcastss
+; CHECK: ret
+define <8 x i32> @_inreg0(i32 %scalar) nounwind uwtable readnone ssp {
+ %in = insertelement <8 x i32> undef, i32 %scalar, i32 0
+ %wide = shufflevector <8 x i32> %in, <8 x i32> undef, <8 x i32> zeroinitializer
+ ret <8 x i32> %wide
+}
+
+; CHECK: _inreg1
+; CHECK: broadcastss
+; CHECK: ret
+define <8 x float> @_inreg1(float %scalar) nounwind uwtable readnone ssp {
+ %in = insertelement <8 x float> undef, float %scalar, i32 0
+ %wide = shufflevector <8 x float> %in, <8 x float> undef, <8 x i32> zeroinitializer
+ ret <8 x float> %wide
+}
+
+; CHECK: _inreg2
+; CHECK: broadcastss
+; CHECK: ret
+define <4 x float> @_inreg2(float %scalar) nounwind uwtable readnone ssp {
+ %in = insertelement <4 x float> undef, float %scalar, i32 0
+ %wide = shufflevector <4 x float> %in, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %wide
+}
+
+; CHECK: _inreg3
+; CHECK: broadcastsd
+; CHECK: ret
+define <4 x double> @_inreg3(double %scalar) nounwind uwtable readnone ssp {
+ %in = insertelement <4 x double> undef, double %scalar, i32 0
+ %wide = shufflevector <4 x double> %in, <4 x double> undef, <4 x i32> zeroinitializer
+ ret <4 x double> %wide
+}
+
+;CHECK: _inreg8xfloat
+;CHECK: vbroadcastss
+;CHECK: ret
+define <8 x float> @_inreg8xfloat(<8 x float> %a) {
+ %b = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> zeroinitializer
+ ret <8 x float> %b
+}
+
+;CHECK: _inreg4xfloat
+;CHECK: vbroadcastss
+;CHECK: ret
+define <4 x float> @_inreg4xfloat(<4 x float> %a) {
+ %b = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %b
+}
+
+;CHECK: _inreg16xi16
+;CHECK: vpbroadcastw
+;CHECK: ret
+define <16 x i16> @_inreg16xi16(<16 x i16> %a) {
+ %b = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> zeroinitializer
+ ret <16 x i16> %b
+}
+
+;CHECK: _inreg8xi16
+;CHECK: vpbroadcastw
+;CHECK: ret
+define <8 x i16> @_inreg8xi16(<8 x i16> %a) {
+ %b = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> zeroinitializer
+ ret <8 x i16> %b
+}
+
+
+;CHECK: _inreg4xi64
+;CHECK: vpbroadcastq
+;CHECK: ret
+define <4 x i64> @_inreg4xi64(<4 x i64> %a) {
+ %b = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> zeroinitializer
+ ret <4 x i64> %b
+}
+
+;CHECK: _inreg2xi64
+;CHECK: vpbroadcastq
+;CHECK: ret
+define <2 x i64> @_inreg2xi64(<2 x i64> %a) {
+ %b = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> zeroinitializer
+ ret <2 x i64> %b
+}
+
+;CHECK: _inreg4xdouble
+;CHECK: vbroadcastsd
+;CHECK: ret
+define <4 x double> @_inreg4xdouble(<4 x double> %a) {
+ %b = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> zeroinitializer
+ ret <4 x double> %b
+}
+
+;CHECK: _inreg2xdouble
+;CHECK: vpbroadcastq
+;CHECK: ret
+define <2 x double> @_inreg2xdouble(<2 x double> %a) {
+ %b = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> zeroinitializer
+ ret <2 x double> %b
+}
+
+;CHECK: _inreg8xi32
+;CHECK: vpbroadcastd
+;CHECK: ret
+define <8 x i32> @_inreg8xi32(<8 x i32> %a) {
+ %b = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> zeroinitializer
+ ret <8 x i32> %b
+}
+
+;CHECK: _inreg4xi32
+;CHECK: vpbroadcastd
+;CHECK: ret
+define <4 x i32> @_inreg4xi32(<4 x i32> %a) {
+ %b = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %b
+}
+
+;CHECK: _inreg32xi8
+;CHECK: vpbroadcastb
+;CHECK: ret
+define <32 x i8> @_inreg32xi8(<32 x i8> %a) {
+ %b = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> zeroinitializer
+ ret <32 x i8> %b
+}
+
+;CHECK: _inreg16xi8
+;CHECK: vpbroadcastb
+;CHECK: ret
+define <16 x i8> @_inreg16xi8(<16 x i8> %a) {
+ %b = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> zeroinitializer
+ ret <16 x i8> %b
+}
diff --git a/test/CodeGen/X86/basic-promote-integers.ll b/test/CodeGen/X86/basic-promote-integers.ll
index c80f2b0..fce6b7f 100644
--- a/test/CodeGen/X86/basic-promote-integers.ll
+++ b/test/CodeGen/X86/basic-promote-integers.ll
@@ -1,7 +1,7 @@
; Test that vectors are scalarized/lowered correctly
; (with both legalization methods).
-; RUN: llc -march=x86 -promote-elements < %s
-; RUN: llc -march=x86 < %s
+; RUN: llc -march=x86 < %s
+; RUN: llc -march=x86 < %s
; A simple test to check copyToParts and copyFromParts.
diff --git a/test/CodeGen/X86/bigstructret.ll b/test/CodeGen/X86/bigstructret.ll
index 633995d..3c499fa 100644
--- a/test/CodeGen/X86/bigstructret.ll
+++ b/test/CodeGen/X86/bigstructret.ll
@@ -1,12 +1,15 @@
-; RUN: llc < %s -march=x86 -o %t
-; RUN: grep "movl .24601, 12(%ecx)" %t
-; RUN: grep "movl .48, 8(%ecx)" %t
-; RUN: grep "movl .24, 4(%ecx)" %t
-; RUN: grep "movl .12, (%ecx)" %t
+; RUN: llc < %s -march=x86 | FileCheck %s
%0 = type { i32, i32, i32, i32 }
+%1 = type { i1, i1, i1, i32 }
-define internal fastcc %0 @ReturnBigStruct() nounwind readnone {
+; CHECK: ReturnBigStruct
+; CHECK: movl $24601, 12(%ecx)
+; CHECK: movl $48, 8(%ecx)
+; CHECK: movl $24, 4(%ecx)
+; CHECK: movl $12, (%ecx)
+
+define fastcc %0 @ReturnBigStruct() nounwind readnone {
entry:
%0 = insertvalue %0 zeroinitializer, i32 12, 0
%1 = insertvalue %0 %0, i32 24, 1
@@ -15,3 +18,17 @@ entry:
ret %0 %3
}
+; CHECK: ReturnBigStruct2
+; CHECK: movl $48, 4(%ecx)
+; CHECK: movb $1, 2(%ecx)
+; CHECK: movb $1, 1(%ecx)
+; CHECK: movb $0, (%ecx)
+
+define fastcc %1 @ReturnBigStruct2() nounwind readnone {
+entry:
+ %0 = insertvalue %1 zeroinitializer, i1 false, 0
+ %1 = insertvalue %1 %0, i1 true, 1
+ %2 = insertvalue %1 %1, i1 true, 2
+ %3 = insertvalue %1 %2, i32 48, 3
+ ret %1 %3
+}
diff --git a/test/CodeGen/X86/blend-msb.ll b/test/CodeGen/X86/blend-msb.ll
index 3a10c70..11f811f 100644
--- a/test/CodeGen/X86/blend-msb.ll
+++ b/test/CodeGen/X86/blend-msb.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -promote-elements -mattr=+sse41 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse41 | FileCheck %s
; In this test we check that sign-extend of the mask bit is performed by
diff --git a/test/CodeGen/X86/block-placement.ll b/test/CodeGen/X86/block-placement.ll
index fc7b638..5534712 100644
--- a/test/CodeGen/X86/block-placement.ll
+++ b/test/CodeGen/X86/block-placement.ll
@@ -7,10 +7,15 @@ define i32 @test_ifchains(i32 %i, i32* %a, i32 %b) {
; that is not expected to run.
; CHECK: test_ifchains:
; CHECK: %entry
+; CHECK-NOT: .align
; CHECK: %else1
+; CHECK-NOT: .align
; CHECK: %else2
+; CHECK-NOT: .align
; CHECK: %else3
+; CHECK-NOT: .align
; CHECK: %else4
+; CHECK-NOT: .align
; CHECK: %exit
; CHECK: %then1
; CHECK: %then2
@@ -76,8 +81,11 @@ define i32 @test_loop_cold_blocks(i32 %i, i32* %a) {
; Check that we sink cold loop blocks after the hot loop body.
; CHECK: test_loop_cold_blocks:
; CHECK: %entry
+; CHECK-NOT: .align
; CHECK: %unlikely1
+; CHECK-NOT: .align
; CHECK: %unlikely2
+; CHECK: .align
; CHECK: %body1
; CHECK: %body2
; CHECK: %body3
@@ -634,7 +642,7 @@ define void @test_unnatural_cfg_backwards_inner_loop() {
;
; CHECK: test_unnatural_cfg_backwards_inner_loop
; CHECK: %entry
-; CHECK: %body
+; CHECK: [[BODY:# BB#[0-9]+]]:
; CHECK: %loop2b
; CHECK: %loop1
; CHECK: %loop2a
diff --git a/test/CodeGen/X86/bool-simplify.ll b/test/CodeGen/X86/bool-simplify.ll
new file mode 100644
index 0000000..0cb9fd9
--- /dev/null
+++ b/test/CodeGen/X86/bool-simplify.ll
@@ -0,0 +1,42 @@
+; RUN: llc < %s -march=x86-64 -mattr=+sse41,-avx | FileCheck %s
+
+define i32 @foo(<2 x i64> %c, i32 %a, i32 %b) {
+ %t1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> %c)
+ %t2 = icmp ne i32 %t1, 0
+ %t3 = select i1 %t2, i32 %a, i32 %b
+ ret i32 %t3
+; CHECK: foo
+; CHECK: ptest
+; CHECK-NOT: testl
+; CHECK: cmov
+; CHECK: ret
+}
+
+define i32 @bar(<2 x i64> %c) {
+entry:
+ %0 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> %c)
+ %1 = icmp ne i32 %0, 0
+ br i1 %1, label %if-true-block, label %endif-block
+if-true-block: ; preds = %entry
+ ret i32 0
+endif-block: ; preds = %entry,
+ ret i32 1
+; CHECK: bar
+; CHECK: ptest
+; CHECK-NOT: testl
+; CHECK: jne
+; CHECK: ret
+}
+
+define i32 @bax(<2 x i64> %c) {
+ %t1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> %c)
+ %t2 = icmp eq i32 %t1, 1
+ %t3 = zext i1 %t2 to i32
+ ret i32 %t3
+; CHECK: bax
+; CHECK: ptest
+; CHECK-NOT: cmpl
+; CHECK: ret
+}
+
+declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/X86/br-fold.ll b/test/CodeGen/X86/br-fold.ll
index 2c37194..5223463 100644
--- a/test/CodeGen/X86/br-fold.ll
+++ b/test/CodeGen/X86/br-fold.ll
@@ -1,7 +1,7 @@
; RUN: llc -march=x86-64 < %s | FileCheck %s
; CHECK: orq
-; CHECK-NEXT: LBB0_1
+; CHECK-NEXT: %bb8.i329
@_ZN11xercesc_2_513SchemaSymbols21fgURI_SCHEMAFORSCHEMAE = external constant [33 x i16], align 32 ; <[33 x i16]*> [#uses=1]
@_ZN11xercesc_2_56XMLUni16fgNotationStringE = external constant [9 x i16], align 16 ; <[9 x i16]*> [#uses=1]
diff --git a/test/CodeGen/X86/break-anti-dependencies.ll b/test/CodeGen/X86/break-anti-dependencies.ll
index 93b2043..c942614 100644
--- a/test/CodeGen/X86/break-anti-dependencies.ll
+++ b/test/CodeGen/X86/break-anti-dependencies.ll
@@ -1,10 +1,12 @@
; Without list-burr scheduling we may not see the difference in codegen here.
-; RUN: llc < %s -march=x86-64 -post-RA-scheduler -pre-RA-sched=list-burr -break-anti-dependencies=none > %t
-; RUN: grep {%xmm0} %t | count 14
-; RUN: not grep {%xmm1} %t
-; RUN: llc < %s -march=x86-64 -post-RA-scheduler -break-anti-dependencies=critical > %t
-; RUN: grep {%xmm0} %t | count 7
-; RUN: grep {%xmm1} %t | count 7
+; Use a subtarget that has post-RA scheduling enabled because the anti-dependency
+; breaker requires liveness information to be kept.
+; RUN: llc < %s -march=x86-64 -mcpu=atom -post-RA-scheduler -pre-RA-sched=list-burr -break-anti-dependencies=none > %t
+; RUN: grep "%xmm0" %t | count 14
+; RUN: not grep "%xmm1" %t
+; RUN: llc < %s -march=x86-64 -mcpu=atom -post-RA-scheduler -break-anti-dependencies=critical > %t
+; RUN: grep "%xmm0" %t | count 7
+; RUN: grep "%xmm1" %t | count 7
define void @goo(double* %r, double* %p, double* %q) nounwind {
entry:
diff --git a/test/CodeGen/X86/break-sse-dep.ll b/test/CodeGen/X86/break-sse-dep.ll
index 2dee575..4d80189 100644
--- a/test/CodeGen/X86/break-sse-dep.ll
+++ b/test/CodeGen/X86/break-sse-dep.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse2 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse2 -mcpu=nehalem | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse2 -mcpu=nehalem | FileCheck %s
define double @t1(float* nocapture %x) nounwind readonly ssp {
entry:
@@ -34,8 +34,7 @@ entry:
define double @squirt(double* %x) nounwind {
entry:
; CHECK: squirt:
-; CHECK: movsd ([[A0]]), %xmm0
-; CHECK: sqrtsd %xmm0, %xmm0
+; CHECK: sqrtsd ([[A0]]), %xmm0
%z = load double* %x
%t = call double @llvm.sqrt.f64(double %z)
ret double %t
diff --git a/test/CodeGen/X86/call-imm.ll b/test/CodeGen/X86/call-imm.ll
index 3857fb1..38cda4d 100644
--- a/test/CodeGen/X86/call-imm.ll
+++ b/test/CodeGen/X86/call-imm.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -relocation-model=static | grep {call.*12345678}
-; RUN: llc < %s -mtriple=i386-apple-darwin -relocation-model=pic | not grep {call.*12345678}
-; RUN: llc < %s -mtriple=i386-pc-linux -relocation-model=dynamic-no-pic | grep {call.*12345678}
+; RUN: llc < %s -mtriple=i386-apple-darwin -relocation-model=static | grep "call.*12345678"
+; RUN: llc < %s -mtriple=i386-apple-darwin -relocation-model=pic | not grep "call.*12345678"
+; RUN: llc < %s -mtriple=i386-pc-linux -relocation-model=dynamic-no-pic | grep "call.*12345678"
; Call to immediate is not safe on x86-64 unless we *know* that the
; call will be within 32-bit pcrel range of the dest immediate.
-; RUN: llc < %s -march=x86-64 | grep {call.*\\*%rax}
+; RUN: llc < %s -march=x86-64 | grep "call.*\*%rax"
; PR3666
; PR3773
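
A rough C equivalent of what these RUN lines exercise (sketch only; the constant address is purely illustrative):

    /* A call through a constant address: a direct `call 12345678`
       under -relocation-model=static on i386, but an indirect
       `call *%rax` on x86-64, where the destination may be outside
       32-bit pcrel range. */
    typedef void (*fnptr)(void);

    void call_imm_like(void) {
        ((fnptr)12345678)();
    }
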
diff --git a/test/CodeGen/X86/cfstring.ll b/test/CodeGen/X86/cfstring.ll
index 7420ce7..8cdd59e 100644
--- a/test/CodeGen/X86/cfstring.ll
+++ b/test/CodeGen/X86/cfstring.ll
@@ -4,7 +4,7 @@
%0 = type opaque
%struct.NSConstantString = type { i32*, i32, i8*, i32 }
-; Make sure that the string ends up the the correct section.
+; Make sure that the string ends up in the correct section.
; CHECK: .section __TEXT,__cstring
; CHECK-NEXT: l_.str3:
diff --git a/test/CodeGen/X86/cmov-into-branch.ll b/test/CodeGen/X86/cmov-into-branch.ll
new file mode 100644
index 0000000..780746a
--- /dev/null
+++ b/test/CodeGen/X86/cmov-into-branch.ll
@@ -0,0 +1,63 @@
+; RUN: llc -march=x86-64 -mcpu=core2 < %s | FileCheck %s
+
+; cmp with single-use load, should not form cmov.
+define i32 @test1(double %a, double* nocapture %b, i32 %x, i32 %y) {
+ %load = load double* %b, align 8
+ %cmp = fcmp olt double %load, %a
+ %cond = select i1 %cmp, i32 %x, i32 %y
+ ret i32 %cond
+; CHECK: test1:
+; CHECK: ucomisd
+; CHECK-NOT: cmov
+; CHECK: j
+; CHECK-NOT: cmov
+}
+
+; Sanity check: no load.
+define i32 @test2(double %a, double %b, i32 %x, i32 %y) {
+ %cmp = fcmp ogt double %a, %b
+ %cond = select i1 %cmp, i32 %x, i32 %y
+ ret i32 %cond
+; CHECK: test2:
+; CHECK: ucomisd
+; CHECK: cmov
+}
+
+; Multiple uses of %a, should not form cmov.
+define i32 @test3(i32 %a, i32* nocapture %b, i32 %x) {
+ %load = load i32* %b, align 4
+ %cmp = icmp ult i32 %load, %a
+ %cond = select i1 %cmp, i32 %a, i32 %x
+ ret i32 %cond
+; CHECK: test3:
+; CHECK: cmpl
+; CHECK-NOT: cmov
+; CHECK: j
+; CHECK-NOT: cmov
+}
+
+; Multiple uses of the load.
+define i32 @test4(i32 %a, i32* nocapture %b, i32 %x, i32 %y) {
+ %load = load i32* %b, align 4
+ %cmp = icmp ult i32 %load, %a
+ %cond = select i1 %cmp, i32 %x, i32 %y
+ %add = add i32 %cond, %load
+ ret i32 %add
+; CHECK: test4:
+; CHECK: cmpl
+; CHECK: cmov
+}
+
+; Multiple uses of the cmp.
+define i32 @test5(i32 %a, i32* nocapture %b, i32 %x, i32 %y) {
+ %load = load i32* %b, align 4
+ %cmp = icmp ult i32 %load, %a
+ %cmp1 = icmp ugt i32 %load, %a
+ %cond = select i1 %cmp1, i32 %a, i32 %y
+ %cond5 = select i1 %cmp, i32 %cond, i32 %x
+ ret i32 %cond5
+; CHECK: test5:
+; CHECK: cmpl
+; CHECK: cmov
+; CHECK: cmov
+}
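
The heuristic these tests pin down: when the compare's load has a single use (test1, test3), the select would have to wait for the load, while a predicted branch hides its latency, so no cmov; when the load or compare has extra uses (test4, test5), the value is needed anyway and cmov wins. A rough C shape of test1 (an assumed source-level equivalent, not the actual origin of the IR):

    /* test1: the load feeds only the compare, so a branch (whose
       prediction hides the load latency) beats a cmov. */
    int test1_like(double a, const double *b, int x, int y) {
        return *b < a ? x : y;
    }
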
diff --git a/test/CodeGen/X86/cmov.ll b/test/CodeGen/X86/cmov.ll
index 2e7ffbf..ed25c82 100644
--- a/test/CodeGen/X86/cmov.ll
+++ b/test/CodeGen/X86/cmov.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -disable-cgp-select2branch | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
define i32 @test1(i32 %x, i32 %n, i32 %w, i32* %vp) nounwind readnone {
entry:
; CHECK: test1:
-; CHECK: btl
-; CHECK-NEXT: movl $12, %eax
+; CHECK: movl $12, %eax
+; CHECK-NEXT: btl
; CHECK-NEXT: cmovael (%rcx), %eax
; CHECK-NEXT: ret
@@ -19,8 +19,8 @@ entry:
define i32 @test2(i32 %x, i32 %n, i32 %w, i32* %vp) nounwind readnone {
entry:
; CHECK: test2:
-; CHECK: btl
-; CHECK-NEXT: movl $12, %eax
+; CHECK: movl $12, %eax
+; CHECK-NEXT: btl
; CHECK-NEXT: cmovbl (%rcx), %eax
; CHECK-NEXT: ret
diff --git a/test/CodeGen/X86/cmp.ll b/test/CodeGen/X86/cmp.ll
index ef5e353..eb06327 100644
--- a/test/CodeGen/X86/cmp.ll
+++ b/test/CodeGen/X86/cmp.ll
@@ -90,3 +90,64 @@ F:
; CHECK: encoding: [0x48,0x83,0x7c,0x24,0xf8,0x00]
}
+; rdar://11866926
+define i32 @test7(i64 %res) nounwind {
+entry:
+; CHECK: test7:
+; CHECK-NOT: movabsq
+; CHECK: shrq $32, %rdi
+; CHECK: testq %rdi, %rdi
+; CHECK: sete
+ %lnot = icmp ult i64 %res, 4294967296
+ %lnot.ext = zext i1 %lnot to i32
+ ret i32 %lnot.ext
+}
+
+define i32 @test8(i64 %res) nounwind {
+entry:
+; CHECK: test8:
+; CHECK-NOT: movabsq
+; CHECK: shrq $32, %rdi
+; CHECK: cmpq $3, %rdi
+ %lnot = icmp ult i64 %res, 12884901888
+ %lnot.ext = zext i1 %lnot to i32
+ ret i32 %lnot.ext
+}
+
+define i32 @test9(i64 %res) nounwind {
+entry:
+; CHECK: test9:
+; CHECK-NOT: movabsq
+; CHECK: shrq $33, %rdi
+; CHECK: testq %rdi, %rdi
+; CHECK: sete
+ %lnot = icmp ult i64 %res, 8589934592
+ %lnot.ext = zext i1 %lnot to i32
+ ret i32 %lnot.ext
+}
+
+define i32 @test10(i64 %res) nounwind {
+entry:
+; CHECK: test10:
+; CHECK-NOT: movabsq
+; CHECK: shrq $32, %rdi
+; CHECK: testq %rdi, %rdi
+; CHECK: setne
+ %lnot = icmp uge i64 %res, 4294967296
+ %lnot.ext = zext i1 %lnot to i32
+ ret i32 %lnot.ext
+}
+
+; rdar://9758774
+define i32 @test11(i64 %l) nounwind {
+entry:
+; CHECK: test11:
+; CHECK-NOT: movabsq
+; CHECK-NOT: andq
+; CHECK: shrq $47, %rdi
+; CHECK: cmpq $1, %rdi
+ %shr.mask = and i64 %l, -140737488355328
+ %cmp = icmp eq i64 %shr.mask, 140737488355328
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
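
The constants decode as 4294967296 = 2^32, 12884901888 = 3*2^32, 8589934592 = 2^33, and 140737488355328 = 2^47, so each compare reduces to a shift of the high bits. A small C check of the identities the CHECK lines rely on (sketch; the function name is illustrative):

    #include <assert.h>
    #include <stdint.h>

    /* Identities behind test7..test11; check() can be run over any
       sample of 64-bit values. */
    static void check(uint64_t res) {
        assert((res < (1ULL << 32)) == ((res >> 32) == 0));     /* test7  */
        assert((res < 3 * (1ULL << 32)) == ((res >> 32) < 3));  /* test8  */
        assert((res < (1ULL << 33)) == ((res >> 33) == 0));     /* test9  */
        assert((res >= (1ULL << 32)) == ((res >> 32) != 0));    /* test10 */
        /* test11: the mask keeps bits 47..63, so equality with 2^47
           means exactly those bits are 0...01. */
        assert(((res & ~((1ULL << 47) - 1)) == (1ULL << 47))
               == ((res >> 47) == 1));
    }
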
diff --git a/test/CodeGen/X86/coalesce-esp.ll b/test/CodeGen/X86/coalesce-esp.ll
index a584876..4004379 100644
--- a/test/CodeGen/X86/coalesce-esp.ll
+++ b/test/CodeGen/X86/coalesce-esp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | grep {movl %esp, %ebp}
+; RUN: llc < %s | grep "movl %esp, %ebp"
; PR4572
; Don't coalesce with %esp if it would end up putting %esp in
diff --git a/test/CodeGen/X86/coalescer-commute2.ll b/test/CodeGen/X86/coalescer-commute2.ll
index 6e5c1cf..e45437c 100644
--- a/test/CodeGen/X86/coalescer-commute2.ll
+++ b/test/CodeGen/X86/coalescer-commute2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-linux -join-physregs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=nehalem | FileCheck %s
; CHECK-NOT: mov
; CHECK: paddw
; CHECK-NOT: mov
@@ -26,14 +26,3 @@ entry:
%tmp10 = bitcast <8 x i16> %tmp9 to <2 x i64> ; <<2 x i64>> [#uses=1]
ret <2 x i64> %tmp10
}
-
-
-; The coalescer should commute the add to avoid a copy.
-define <4 x float> @test3(<4 x float> %V) {
-entry:
- %tmp8 = shufflevector <4 x float> %V, <4 x float> undef,
- <4 x i32> < i32 3, i32 2, i32 1, i32 0 >
- %add = fadd <4 x float> %tmp8, %V
- ret <4 x float> %add
-}
-
diff --git a/test/CodeGen/X86/coalescer-dce2.ll b/test/CodeGen/X86/coalescer-dce2.ll
new file mode 100644
index 0000000..bbbf09b
--- /dev/null
+++ b/test/CodeGen/X86/coalescer-dce2.ll
@@ -0,0 +1,118 @@
+; RUN: llc < %s -verify-coalescing
+; PR12911
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-macosx10.7.0"
+
+@d = common global i32 0, align 4
+@c = common global i32 0, align 4
+@b = common global i32 0, align 4
+@h = common global i32 0, align 4
+@f = common global i32 0, align 4
+@g = common global i32 0, align 4
+@a = common global i16 0, align 2
+@e = common global i32 0, align 4
+
+define void @fn1() nounwind uwtable ssp {
+entry:
+ %0 = load i32* @d, align 4
+ %tobool72 = icmp eq i32 %0, 0
+ br i1 %tobool72, label %for.end32, label %for.cond1.preheader.lr.ph
+
+for.cond1.preheader.lr.ph: ; preds = %entry
+ %1 = load i32* @c, align 4
+ %tobool2 = icmp eq i32 %1, 0
+ %2 = load i32* @b, align 4
+ %cmp = icmp sgt i32 %2, 0
+ %conv = zext i1 %cmp to i32
+ %3 = load i32* @g, align 4
+ %tobool4 = icmp eq i32 %3, 0
+ %4 = load i16* @a, align 2
+ %tobool9 = icmp eq i16 %4, 0
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.cond25.loopexit.us-lcssa.us-lcssa, %if.end.us50, %if.end.us, %if.end.us.us, %for.cond1.preheader.lr.ph
+ %j.073 = phi i32 [ undef, %for.cond1.preheader.lr.ph ], [ %j.1.us.us, %if.end.us.us ], [ %j.1.us, %if.end.us ], [ %j.073, %for.cond25.loopexit.us-lcssa.us-lcssa ], [ %j.1.us36, %if.end.us50 ]
+ br i1 %tobool2, label %for.cond1.preheader.split.us, label %for.cond1.preheader.for.cond1.preheader.split_crit_edge
+
+for.cond1.preheader.for.cond1.preheader.split_crit_edge: ; preds = %for.cond1.preheader
+ br i1 %tobool9, label %if.end.us50, label %for.cond1.preheader.split.for.cond1.preheader.split.split_crit_edge
+
+for.cond1.preheader.split.us: ; preds = %for.cond1.preheader
+ br i1 %tobool9, label %cond.end.us.us, label %cond.end.us
+
+cond.false18.us.us: ; preds = %if.end.us.us
+ %5 = load i32* @f, align 4
+ %sext76 = shl i32 %5, 16
+ %phitmp75 = ashr exact i32 %sext76, 16
+ br label %cond.end.us.us
+
+if.end.us.us: ; preds = %cond.end.us.us, %if.then.us.us
+ br i1 %tobool4, label %cond.false18.us.us, label %for.cond1.preheader
+
+if.then.us.us: ; preds = %cond.end.us.us
+ store i32 0, i32* @f, align 4
+ br label %if.end.us.us
+
+cond.end.us.us: ; preds = %cond.false18.us.us, %for.cond1.preheader.split.us
+ %j.1.us.us = phi i32 [ %j.073, %for.cond1.preheader.split.us ], [ %phitmp75, %cond.false18.us.us ]
+ store i32 %conv, i32* @h, align 4
+ br i1 %cmp, label %if.then.us.us, label %if.end.us.us
+
+cond.end21.us: ; preds = %land.lhs.true12.us, %cond.false18.us
+ %cond22.us = phi i16 [ %add.us, %cond.false18.us ], [ %4, %land.lhs.true12.us ]
+ %conv24.us = sext i16 %cond22.us to i32
+ br label %cond.end.us
+
+cond.false18.us: ; preds = %if.end6.us, %land.lhs.true12.us
+ %add.us = add i16 %4, %conv7.us
+ br label %cond.end21.us
+
+land.lhs.true12.us: ; preds = %if.end6.us
+ %conv10.us = sext i16 %conv7.us to i32
+ %sub.us = sub nsw i32 0, %conv10.us
+ %cmp14.us = icmp slt i32 %sub.us, 1
+ br i1 %cmp14.us, label %cond.end21.us, label %cond.false18.us
+
+if.end6.us: ; preds = %if.end.us
+ %6 = load i32* @f, align 4
+ %conv7.us = trunc i32 %6 to i16
+ %tobool11.us = icmp eq i16 %conv7.us, 0
+ br i1 %tobool11.us, label %cond.false18.us, label %land.lhs.true12.us
+
+if.end.us: ; preds = %cond.end.us, %if.then.us
+ br i1 %tobool4, label %if.end6.us, label %for.cond1.preheader
+
+if.then.us: ; preds = %cond.end.us
+ store i32 0, i32* @f, align 4
+ br label %if.end.us
+
+cond.end.us: ; preds = %cond.end21.us, %for.cond1.preheader.split.us
+ %j.1.us = phi i32 [ %conv24.us, %cond.end21.us ], [ %j.073, %for.cond1.preheader.split.us ]
+ store i32 %conv, i32* @h, align 4
+ br i1 %cmp, label %if.then.us, label %if.end.us
+
+for.cond1.preheader.split.for.cond1.preheader.split.split_crit_edge: ; preds = %for.cond1.preheader.for.cond1.preheader.split_crit_edge
+ br i1 %tobool4, label %if.end6.us65, label %for.cond25.loopexit.us-lcssa.us-lcssa
+
+cond.false18.us40: ; preds = %if.end.us50
+ %7 = load i32* @f, align 4
+ %sext = shl i32 %7, 16
+ %phitmp = ashr exact i32 %sext, 16
+ br label %if.end.us50
+
+if.end.us50: ; preds = %cond.false18.us40, %for.cond1.preheader.for.cond1.preheader.split_crit_edge
+ %j.1.us36 = phi i32 [ %j.073, %for.cond1.preheader.for.cond1.preheader.split_crit_edge ], [ %phitmp, %cond.false18.us40 ]
+ store i32 0, i32* @h, align 4
+ br i1 %tobool4, label %cond.false18.us40, label %for.cond1.preheader
+
+if.end6.us65: ; preds = %if.end6.us65, %for.cond1.preheader.split.for.cond1.preheader.split.split_crit_edge
+ store i32 0, i32* @h, align 4
+ br label %if.end6.us65
+
+for.cond25.loopexit.us-lcssa.us-lcssa: ; preds = %for.cond1.preheader.split.for.cond1.preheader.split.split_crit_edge
+ store i32 0, i32* @h, align 4
+ br label %for.cond1.preheader
+
+for.end32: ; preds = %entry
+ ret void
+}
diff --git a/test/CodeGen/X86/coalescer-identity.ll b/test/CodeGen/X86/coalescer-identity.ll
new file mode 100644
index 0000000..9c72ee6
--- /dev/null
+++ b/test/CodeGen/X86/coalescer-identity.ll
@@ -0,0 +1,82 @@
+; RUN: llc < %s -verify-coalescing
+; PR12927
+target triple = "x86_64-apple-macosx10.8.0"
+
+; This is a case where removeCopyByCommutingDef() creates an identity copy that
+; joinCopy must then deal with correctly.
+
+@s = common global i16 0, align 2
+@g1 = common global i32 0, align 4
+@g2 = common global i32 0, align 4
+@g0 = common global i32 0, align 4
+
+define void @func() nounwind uwtable ssp {
+for.body.lr.ph:
+ %0 = load i32* @g2, align 4, !tbaa !0
+ %tobool6 = icmp eq i32 %0, 0
+ %s.promoted = load i16* @s, align 2
+ %.pre = load i32* @g1, align 4, !tbaa !0
+ br i1 %tobool6, label %for.body.us, label %for.body
+
+for.body.us: ; preds = %for.body.lr.ph, %for.inc.us
+ %1 = phi i32 [ %3, %for.inc.us ], [ %.pre, %for.body.lr.ph ]
+ %dec13.us = phi i16 [ %dec12.us, %for.inc.us ], [ %s.promoted, %for.body.lr.ph ]
+ %i.011.us = phi i32 [ %inc.us, %for.inc.us ], [ undef, %for.body.lr.ph ]
+ %v.010.us = phi i32 [ %phitmp.us, %for.inc.us ], [ 1, %for.body.lr.ph ]
+ %tobool1.us = icmp ne i32 %v.010.us, 0
+ %2 = zext i1 %tobool1.us to i16
+ %lnot.ext.us = xor i16 %2, 1
+ %add.us = add i16 %dec13.us, %lnot.ext.us
+ %conv3.us = zext i16 %add.us to i32
+ %add4.us = sub i32 0, %1
+ %tobool5.us = icmp eq i32 %conv3.us, %add4.us
+ br i1 %tobool5.us, label %for.inc.us, label %if.then7.us
+
+for.inc.us: ; preds = %cond.end.us, %for.body.us
+ %3 = phi i32 [ %1, %for.body.us ], [ %4, %cond.end.us ]
+ %dec12.us = phi i16 [ %add.us, %for.body.us ], [ %dec.us, %cond.end.us ]
+ %inc.us = add i32 %i.011.us, 1
+ %phitmp.us = udiv i32 %v.010.us, 12
+ %tobool.us = icmp eq i32 %inc.us, 0
+ br i1 %tobool.us, label %for.end, label %for.body.us
+
+cond.end.us: ; preds = %if.then7.us, %cond.false.us
+ %4 = phi i32 [ 0, %cond.false.us ], [ %1, %if.then7.us ]
+ %cond.us = phi i32 [ 0, %cond.false.us ], [ %v.010.us, %if.then7.us ]
+ store i32 %cond.us, i32* @g0, align 4, !tbaa !0
+ br label %for.inc.us
+
+cond.false.us: ; preds = %if.then7.us
+ store i32 0, i32* @g1, align 4, !tbaa !0
+ br label %cond.end.us
+
+if.then7.us: ; preds = %for.body.us
+ %dec.us = add i16 %add.us, -1
+ br i1 %tobool1.us, label %cond.end.us, label %cond.false.us
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+ %dec13 = phi i16 [ %dec12, %for.body ], [ %s.promoted, %for.body.lr.ph ]
+ %i.011 = phi i32 [ %inc, %for.body ], [ undef, %for.body.lr.ph ]
+ %v.010 = phi i32 [ %phitmp, %for.body ], [ 1, %for.body.lr.ph ]
+ %tobool1 = icmp eq i32 %v.010, 0
+ %lnot.ext = zext i1 %tobool1 to i16
+ %add = add i16 %dec13, %lnot.ext
+ %conv3 = zext i16 %add to i32
+ %add4 = sub i32 0, %.pre
+ %not.tobool5 = icmp ne i32 %conv3, %add4
+ %dec = sext i1 %not.tobool5 to i16
+ %dec12 = add i16 %add, %dec
+ %inc = add i32 %i.011, 1
+ %phitmp = udiv i32 %v.010, 12
+ %tobool = icmp eq i32 %inc, 0
+ br i1 %tobool, label %for.end, label %for.body
+
+for.end: ; preds = %for.inc.us, %for.body
+ %dec12.lcssa = phi i16 [ %dec12.us, %for.inc.us ], [ %dec12, %for.body ]
+ store i16 %dec12.lcssa, i16* @s, align 2
+ ret void
+}
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/X86/constant-pool-sharing.ll b/test/CodeGen/X86/constant-pool-sharing.ll
index f979945..26318dd 100644
--- a/test/CodeGen/X86/constant-pool-sharing.ll
+++ b/test/CodeGen/X86/constant-pool-sharing.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=corei7 | FileCheck %s
; llc should share constant pool entries between this integer vector
; and this floating-point vector since they have the same encoding.
diff --git a/test/CodeGen/X86/constructor.ll b/test/CodeGen/X86/constructor.ll
new file mode 100644
index 0000000..b578896
--- /dev/null
+++ b/test/CodeGen/X86/constructor.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mtriple x86_64-pc-linux < %s | FileCheck --check-prefix=CTOR %s
+; RUN: llc -mtriple x86_64-pc-linux -use-init-array < %s | FileCheck --check-prefix=INIT-ARRAY %s
+@llvm.global_ctors = appending global [2 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @f }, { i32, void ()* } { i32 15, void ()* @g }]
+
+define void @f() {
+entry:
+ ret void
+}
+
+define void @g() {
+entry:
+ ret void
+}
+
+; CTOR: .section .ctors.65520,"aw",@progbits
+; CTOR-NEXT: .align 8
+; CTOR-NEXT: .quad g
+; CTOR-NEXT: .section .ctors,"aw",@progbits
+; CTOR-NEXT: .align 8
+; CTOR-NEXT: .quad f
+
+; INIT-ARRAY: .section .init_array.15,"aw",@init_array
+; INIT-ARRAY-NEXT: .align 8
+; INIT-ARRAY-NEXT: .quad g
+; INIT-ARRAY-NEXT: .section .init_array,"aw",@init_array
+; INIT-ARRAY-NEXT: .align 8
+; INIT-ARRAY-NEXT: .quad f
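
The suffixes encode the priority: .ctors sections are numbered 65535 minus the priority (65535 - 15 = 65520), while .init_array carries it directly (.init_array.15). A C-level way to get the same priority-15 constructor (sketch, using the GCC/Clang constructor attribute):

    /* Priority-15 constructor (the plain attribute gets the default
       priority, 65535 in the IR). GCC/Clang reserve priorities 0-100
       for the implementation. */
    __attribute__((constructor(15))) static void g_like(void) {}
    __attribute__((constructor))     static void f_like(void) {}
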
diff --git a/test/CodeGen/X86/convert-2-addr-3-addr-inc64.ll b/test/CodeGen/X86/convert-2-addr-3-addr-inc64.ll
index b82348b..064ee36 100644
--- a/test/CodeGen/X86/convert-2-addr-3-addr-inc64.ll
+++ b/test/CodeGen/X86/convert-2-addr-3-addr-inc64.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-linux -o /dev/null -stats |& FileCheck %s -check-prefix=STATS
-; RUN: llc < %s -mtriple=x86_64-win32 -o /dev/null -stats |& FileCheck %s -check-prefix=STATS
+; RUN: llc < %s -mtriple=x86_64-linux -o /dev/null -stats 2>&1 | FileCheck %s -check-prefix=STATS
+; RUN: llc < %s -mtriple=x86_64-win32 -o /dev/null -stats 2>&1 | FileCheck %s -check-prefix=STATS
; STATS: 9 asm-printer
; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
diff --git a/test/CodeGen/X86/crash.ll b/test/CodeGen/X86/crash.ll
index cf6e27d..9badfc8 100644
--- a/test/CodeGen/X86/crash.ll
+++ b/test/CodeGen/X86/crash.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=x86 %s -o -
-; RUN: llc -march=x86-64 %s -o -
+; RUN: llc -march=x86 < %s -verify-machineinstrs
+; RUN: llc -march=x86-64 < %s -verify-machineinstrs
; PR6497
@@ -391,3 +391,54 @@ if.end:
%t11 = tail call i64 asm sideeffect "foo", "=*m,=A,{bx},{cx},1,~{memory},~{dirflag},~{fpsr},~{flags}"(i64* %t6, i32 0, i32 0, i64 0) nounwind
ret void
}
+
+; Avoid emitting wrong kill flags from InstrEmitter.
+; InstrEmitter::EmitSubregNode() may steal virtual registers from already
+; emitted blocks when isCoalescableExtInstr points out the opportunity.
+; Make sure kill flags are cleared on the newly global virtual register.
+define i64 @ov_read(i8* %vf, i8* nocapture %buffer, i32 %length, i32 %bigendianp, i32 %word, i32 %sgned, i32* %bitstream) nounwind uwtable ssp {
+entry:
+ br i1 undef, label %return, label %while.body.preheader
+
+while.body.preheader: ; preds = %entry
+ br i1 undef, label %if.then3, label %if.end7
+
+if.then3: ; preds = %while.body.preheader
+ %0 = load i32* undef, align 4
+ br i1 undef, label %land.lhs.true.i255, label %if.end7
+
+land.lhs.true.i255: ; preds = %if.then3
+ br i1 undef, label %if.then.i256, label %if.end7
+
+if.then.i256: ; preds = %land.lhs.true.i255
+ %sub.i = sub i32 0, %0
+ %conv = sext i32 %sub.i to i64
+ br i1 undef, label %if.end7, label %while.end
+
+if.end7: ; preds = %if.then.i256, %land.lhs.true.i255, %if.then3, %while.body.preheader
+ unreachable
+
+while.end: ; preds = %if.then.i256
+ %cmp18 = icmp sgt i32 %sub.i, 0
+ %.conv = select i1 %cmp18, i64 -131, i64 %conv
+ ret i64 %.conv
+
+return: ; preds = %entry
+ ret i64 -131
+}
+
+; The tail call to a varargs function sets %AL.
+; uitofp expands to an FCMOV instruction which splits the basic block.
+; Make sure the live range of %AL isn't split.
+@.str = private unnamed_addr constant { [1 x i8], [63 x i8] } zeroinitializer, align 32
+define void @pr13188(i64* nocapture %this) uwtable ssp address_safety align 2 {
+entry:
+ %x7 = load i64* %this, align 8
+ %sub = add i64 %x7, -1
+ %conv = uitofp i64 %sub to float
+ %div = fmul float %conv, 5.000000e-01
+ %conv2 = fpext float %div to double
+ tail call void (...)* @_Z6PrintFz(i8* getelementptr inbounds ({ [1 x i8], [63 x i8] }* @.str, i64 0, i32 0, i64 0), double %conv2)
+ ret void
+}
+declare void @_Z6PrintFz(...)
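
The %AL note refers to the SysV x86-64 varargs convention: %al carries the number of SSE register arguments into a variadic call, so splitting its live range is fatal here. A rough C shape of pr13188 (names are hypothetical; the original was address-sanitized C++):

    void PrintF(const char *fmt, ...);   /* stands in for _Z6PrintFz */

    void pr13188_like(unsigned long long *p) {
        /* The u64 -> float conversion expands with an FCMOV-style
           sequence; the variadic call then needs %al = 1 (one SSE
           register argument). */
        PrintF("", (double)((float)(*p - 1) * 0.5f));
    }
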
diff --git a/test/CodeGen/X86/ctpop-combine.ll b/test/CodeGen/X86/ctpop-combine.ll
index 6406cc7..0a3dfca 100644
--- a/test/CodeGen/X86/ctpop-combine.ll
+++ b/test/CodeGen/X86/ctpop-combine.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 < %s | FileCheck %s
+; RUN: llc -march=x86-64 -mcpu=corei7 < %s | FileCheck %s
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
diff --git a/test/CodeGen/X86/dagcombine-cse.ll b/test/CodeGen/X86/dagcombine-cse.ll
index c3c7990..af69531 100644
--- a/test/CodeGen/X86/dagcombine-cse.ll
+++ b/test/CodeGen/X86/dagcombine-cse.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mtriple=i386-apple-darwin -stats |& grep asm-printer | grep 14
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mtriple=i386-apple-darwin -stats 2>&1 | grep asm-printer | grep 14
define i32 @t(i8* %ref_frame_ptr, i32 %ref_frame_stride, i32 %idxX, i32 %idxY) nounwind {
entry:
diff --git a/test/CodeGen/X86/dbg-merge-loc-entry.ll b/test/CodeGen/X86/dbg-merge-loc-entry.ll
index c35935f..d1e349f 100644
--- a/test/CodeGen/X86/dbg-merge-loc-entry.ll
+++ b/test/CodeGen/X86/dbg-merge-loc-entry.ll
@@ -10,7 +10,7 @@ target triple = "x86_64-apple-darwin8"
;CHECK-NEXT: .short Lset
;CHECK-NEXT: Ltmp
;CHECK-NEXT: .byte 85 ## DW_OP_reg5
-;CHECK-NEXT: Ltmp5
+;CHECK-NEXT: Ltmp
;CHECK-NEXT: .quad 0
;CHECK-NEXT: .quad 0
diff --git a/test/CodeGen/X86/dbg-value-range.ll b/test/CodeGen/X86/dbg-value-range.ll
index 28d873b..6b16865 100644
--- a/test/CodeGen/X86/dbg-value-range.ll
+++ b/test/CodeGen/X86/dbg-value-range.ll
@@ -1,5 +1,4 @@
; RUN: llc -mtriple=x86_64-apple-darwin10 < %s | FileCheck %s
-; RUN: llc -mtriple=x86_64-apple-darwin10 -regalloc=basic -join-physregs < %s | FileCheck %s
%struct.a = type { i32 }
diff --git a/test/CodeGen/X86/divide-by-constant.ll b/test/CodeGen/X86/divide-by-constant.ll
index e577ecb..8e7c13d 100644
--- a/test/CodeGen/X86/divide-by-constant.ll
+++ b/test/CodeGen/X86/divide-by-constant.ll
@@ -71,3 +71,24 @@ define i32 @test7(i32 %x) nounwind {
; CHECK-NOT: shrl
; CHECK: ret
}
+
+; PR13326
+define i8 @test8(i8 %x) nounwind {
+ %div = udiv i8 %x, 78
+ ret i8 %div
+; CHECK: test8:
+; CHECK: shrb %
+; CHECK: imull $211
+; CHECK: shrl $13
+; CHECK: ret
+}
+
+define i8 @test9(i8 %x) nounwind {
+ %div = udiv i8 %x, 116
+ ret i8 %div
+; CHECK: test9:
+; CHECK: shrb $2
+; CHECK: imull $71
+; CHECK: shrl $11
+; CHECK: ret
+}
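
These are the standard magic-number lowerings: 78 = 2*39 and 116 = 4*29, so shift out the even factor, then multiply by ceil(2^13/39) = 211 (resp. ceil(2^11/29) = 71) and shift again. A brute-force C check of the identities over all 256 inputs (sketch):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        for (uint32_t x = 0; x < 256; ++x) {
            uint32_t d78  = ((x >> 1) * 211) >> 13; /* test8: shrb, imull $211, shrl $13 */
            uint32_t d116 = ((x >> 2) * 71)  >> 11; /* test9: shrb $2, imull $71, shrl $11 */
            if (d78 != x / 78 || d116 != x / 116) {
                printf("mismatch at %u\n", x);
                return 1;
            }
        }
        return 0;
    }
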
diff --git a/test/CodeGen/X86/dynamic-allocas-VLAs.ll b/test/CodeGen/X86/dynamic-allocas-VLAs.ll
new file mode 100644
index 0000000..c5e47fa
--- /dev/null
+++ b/test/CodeGen/X86/dynamic-allocas-VLAs.ll
@@ -0,0 +1,237 @@
+; RUN: llc < %s -mcpu=generic -march=x86-64 -mattr=+avx -mtriple=i686-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -mcpu=generic -force-align-stack -stack-alignment=32 -march=x86-64 -mattr=+avx -mtriple=i686-apple-darwin10 | FileCheck %s -check-prefix=FORCE-ALIGN
+; rdar://11496434
+
+; no VLAs or dynamic alignment
+define i32 @t1() nounwind uwtable ssp {
+entry:
+ %a = alloca i32, align 4
+ call void @t1_helper(i32* %a) nounwind
+ %0 = load i32* %a, align 4
+ %add = add nsw i32 %0, 13
+ ret i32 %add
+
+; CHECK: _t1
+; CHECK-NOT: andq $-{{[0-9]+}}, %rsp
+; CHECK: leaq [[OFFSET:[0-9]*]](%rsp), %rdi
+; CHECK: callq _t1_helper
+; CHECK: movl [[OFFSET]](%rsp), %eax
+; CHECK: addl $13, %eax
+}
+
+declare void @t1_helper(i32*)
+
+; dynamic realignment
+define i32 @t2() nounwind uwtable ssp {
+entry:
+ %a = alloca i32, align 4
+ %v = alloca <8 x float>, align 32
+ call void @t2_helper(i32* %a, <8 x float>* %v) nounwind
+ %0 = load i32* %a, align 4
+ %add = add nsw i32 %0, 13
+ ret i32 %add
+
+; CHECK: _t2
+; CHECK: pushq %rbp
+; CHECK: movq %rsp, %rbp
+; CHECK: andq $-32, %rsp
+; CHECK: subq ${{[0-9]+}}, %rsp
+;
+; CHECK: leaq {{[0-9]*}}(%rsp), %rdi
+; CHECK: leaq {{[0-9]*}}(%rsp), %rsi
+; CHECK: callq _t2_helper
+;
+; CHECK: movq %rbp, %rsp
+; CHECK: popq %rbp
+}
+
+declare void @t2_helper(i32*, <8 x float>*)
+
+; VLAs
+define i32 @t3(i64 %sz) nounwind uwtable ssp {
+entry:
+ %a = alloca i32, align 4
+ %vla = alloca i32, i64 %sz, align 16
+ call void @t3_helper(i32* %a, i32* %vla) nounwind
+ %0 = load i32* %a, align 4
+ %add = add nsw i32 %0, 13
+ ret i32 %add
+
+; CHECK: _t3
+; CHECK: pushq %rbp
+; CHECK: movq %rsp, %rbp
+; CHECK: pushq %rbx
+; CHECK-NOT: andq $-{{[0-9]+}}, %rsp
+; CHECK: subq ${{[0-9]+}}, %rsp
+;
+; CHECK: leaq -{{[0-9]+}}(%rbp), %rsp
+; CHECK: popq %rbx
+; CHECK: popq %rbp
+}
+
+declare void @t3_helper(i32*, i32*)
+
+; VLAs + Dynamic realignment
+define i32 @t4(i64 %sz) nounwind uwtable ssp {
+entry:
+ %a = alloca i32, align 4
+ %v = alloca <8 x float>, align 32
+ %vla = alloca i32, i64 %sz, align 16
+ call void @t4_helper(i32* %a, i32* %vla, <8 x float>* %v) nounwind
+ %0 = load i32* %a, align 4
+ %add = add nsw i32 %0, 13
+ ret i32 %add
+
+; CHECK: _t4
+; CHECK: pushq %rbp
+; CHECK: movq %rsp, %rbp
+; CHECK: pushq %r14
+; CHECK: pushq %rbx
+; CHECK: andq $-32, %rsp
+; CHECK: subq ${{[0-9]+}}, %rsp
+; CHECK: movq %rsp, %rbx
+;
+; CHECK: leaq {{[0-9]*}}(%rbx), %rdi
+; CHECK: leaq {{[0-9]*}}(%rbx), %rdx
+; CHECK: callq _t4_helper
+;
+; CHECK: leaq -16(%rbp), %rsp
+; CHECK: popq %rbx
+; CHECK: popq %r14
+; CHECK: popq %rbp
+}
+
+declare void @t4_helper(i32*, i32*, <8 x float>*)
+
+; Dynamic realignment + Spill
+define i32 @t5(float* nocapture %f) nounwind uwtable ssp {
+entry:
+ %a = alloca i32, align 4
+ %0 = bitcast float* %f to <8 x float>*
+ %1 = load <8 x float>* %0, align 32
+ call void @t5_helper1(i32* %a) nounwind
+ call void @t5_helper2(<8 x float> %1) nounwind
+ %2 = load i32* %a, align 4
+ %add = add nsw i32 %2, 13
+ ret i32 %add
+
+; CHECK: _t5
+; CHECK: pushq %rbp
+; CHECK: movq %rsp, %rbp
+; CHECK: andq $-32, %rsp
+; CHECK: subq ${{[0-9]+}}, %rsp
+;
+; CHECK: vmovaps (%rdi), [[AVXREG:%ymm[0-9]+]]
+; CHECK: vmovaps [[AVXREG]], (%rsp)
+; CHECK: leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK: callq _t5_helper1
+; CHECK: vmovaps (%rsp), %ymm0
+; CHECK: callq _t5_helper2
+; CHECK: movl {{[0-9]+}}(%rsp), %eax
+;
+; CHECK: movq %rbp, %rsp
+; CHECK: popq %rbp
+}
+
+declare void @t5_helper1(i32*)
+
+declare void @t5_helper2(<8 x float>)
+
+; VLAs + Dynamic realignment + Spill
+; FIXME: RA has already reserved RBX, so we can't do dynamic realignment.
+define i32 @t6(i64 %sz, float* nocapture %f) nounwind uwtable ssp {
+entry:
+; CHECK: _t6
+ %a = alloca i32, align 4
+ %0 = bitcast float* %f to <8 x float>*
+ %1 = load <8 x float>* %0, align 32
+ %vla = alloca i32, i64 %sz, align 16
+ call void @t6_helper1(i32* %a, i32* %vla) nounwind
+ call void @t6_helper2(<8 x float> %1) nounwind
+ %2 = load i32* %a, align 4
+ %add = add nsw i32 %2, 13
+ ret i32 %add
+}
+
+declare void @t6_helper1(i32*, i32*)
+
+declare void @t6_helper2(<8 x float>)
+
+; VLAs + Dynamic realignment + byval
+; The byval adjusts the sp after the prolog, but if we're restoring the sp from
+; the base pointer we use the original adjustment.
+%struct.struct_t = type { [5 x i32] }
+
+define void @t7(i32 %size, %struct.struct_t* byval align 8 %arg1) nounwind uwtable {
+entry:
+ %x = alloca i32, align 32
+ store i32 0, i32* %x, align 32
+ %0 = zext i32 %size to i64
+ %vla = alloca i32, i64 %0, align 16
+ %1 = load i32* %x, align 32
+ call void @bar(i32 %1, i32* %vla, %struct.struct_t* byval align 8 %arg1)
+ ret void
+
+; CHECK: _t7
+; CHECK: pushq %rbp
+; CHECK: movq %rsp, %rbp
+; CHECK: pushq %rbx
+; CHECK: andq $-32, %rsp
+; CHECK: subq ${{[0-9]+}}, %rsp
+; CHECK: movq %rsp, %rbx
+
+; Stack adjustment for byval
+; CHECK: subq {{.*}}, %rsp
+; CHECK: callq _bar
+; CHECK-NOT: addq {{.*}}, %rsp
+; CHECK: leaq -8(%rbp), %rsp
+; CHECK: popq %rbx
+; CHECK: popq %rbp
+}
+
+declare i8* @llvm.stacksave() nounwind
+
+declare void @bar(i32, i32*, %struct.struct_t* byval align 8)
+
+declare void @llvm.stackrestore(i8*) nounwind
+
+
+; Test when forcing stack alignment
+define i32 @t8() nounwind uwtable {
+entry:
+ %a = alloca i32, align 4
+ call void @t1_helper(i32* %a) nounwind
+ %0 = load i32* %a, align 4
+ %add = add nsw i32 %0, 13
+ ret i32 %add
+
+; FORCE-ALIGN: _t8
+; FORCE-ALIGN: movq %rsp, %rbp
+; FORCE-ALIGN: andq $-32, %rsp
+; FORCE-ALIGN-NEXT: subq $32, %rsp
+; FORCE-ALIGN: movq %rbp, %rsp
+; FORCE-ALIGN: popq %rbp
+}
+
+; VLAs
+define i32 @t9(i64 %sz) nounwind uwtable {
+entry:
+ %a = alloca i32, align 4
+ %vla = alloca i32, i64 %sz, align 16
+ call void @t3_helper(i32* %a, i32* %vla) nounwind
+ %0 = load i32* %a, align 4
+ %add = add nsw i32 %0, 13
+ ret i32 %add
+
+; FORCE-ALIGN: _t9
+; FORCE-ALIGN: pushq %rbp
+; FORCE-ALIGN: movq %rsp, %rbp
+; FORCE-ALIGN: pushq %rbx
+; FORCE-ALIGN: andq $-32, %rsp
+; FORCE-ALIGN: subq $32, %rsp
+; FORCE-ALIGN: movq %rsp, %rbx
+
+; FORCE-ALIGN: leaq -8(%rbp), %rsp
+; FORCE-ALIGN: popq %rbx
+; FORCE-ALIGN: popq %rbp
+}
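
The frames checked above map to three source shapes: plain locals (no realignment), a 32-byte-aligned local (realign through %rbp with andq $-32), and a VLA (which moves %rsp, forcing %rbx as a base pointer). Rough C equivalents (sketch; the use_* helpers stand in for the t*_helper calls):

    void use_i(int *);
    void use_v(float (*)[8]);

    int t2_like(void) {              /* dynamic realignment */
        int a = 0;
        _Alignas(32) float v[8];
        use_i(&a);
        use_v(&v);
        return a + 13;
    }

    int t3_like(unsigned long n) {   /* VLA: base pointer in %rbx */
        int a = 0;
        int vla[n];
        use_i(&a);
        use_i(vla);
        return a + 13;
    }
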
diff --git a/test/CodeGen/X86/early-ifcvt.ll b/test/CodeGen/X86/early-ifcvt.ll
new file mode 100644
index 0000000..7883ffa
--- /dev/null
+++ b/test/CodeGen/X86/early-ifcvt.ll
@@ -0,0 +1,69 @@
+; RUN: llc < %s -enable-early-ifcvt -stress-early-ifcvt | FileCheck %s
+target triple = "x86_64-apple-macosx10.8.0"
+
+; CHECK: mm2
+define i32 @mm2(i32* nocapture %p, i32 %n) nounwind uwtable readonly ssp {
+entry:
+ br label %do.body
+
+; CHECK: do.body
+; Loop body has no branches before the backedge.
+; CHECK-NOT: LBB
+do.body:
+ %max.0 = phi i32 [ 0, %entry ], [ %max.1, %do.cond ]
+ %min.0 = phi i32 [ 0, %entry ], [ %min.1, %do.cond ]
+ %n.addr.0 = phi i32 [ %n, %entry ], [ %dec, %do.cond ]
+ %p.addr.0 = phi i32* [ %p, %entry ], [ %incdec.ptr, %do.cond ]
+ %incdec.ptr = getelementptr inbounds i32* %p.addr.0, i64 1
+ %0 = load i32* %p.addr.0, align 4
+ %cmp = icmp sgt i32 %0, %max.0
+ br i1 %cmp, label %do.cond, label %if.else
+
+if.else:
+ %cmp1 = icmp slt i32 %0, %min.0
+ %.min.0 = select i1 %cmp1, i32 %0, i32 %min.0
+ br label %do.cond
+
+do.cond:
+ %max.1 = phi i32 [ %0, %do.body ], [ %max.0, %if.else ]
+ %min.1 = phi i32 [ %min.0, %do.body ], [ %.min.0, %if.else ]
+; CHECK: decl %esi
+; CHECK: jne LBB
+ %dec = add i32 %n.addr.0, -1
+ %tobool = icmp eq i32 %dec, 0
+ br i1 %tobool, label %do.end, label %do.body
+
+do.end:
+ %sub = sub nsw i32 %max.1, %min.1
+ ret i32 %sub
+}
+
+; CHECK: multipreds
+; Deal with alternative tail predecessors
+; CHECK-NOT: LBB
+; CHECK: cmov
+; CHECK-NOT: LBB
+; CHECK: cmov
+; CHECK-NOT: LBB
+; CHECK: fprintf
+
+define void @multipreds(i32 %sw) nounwind uwtable ssp {
+entry:
+ switch i32 %sw, label %if.then29 [
+ i32 0, label %if.then37
+ i32 127, label %if.end41
+ ]
+
+if.then29:
+ br label %if.end41
+
+if.then37:
+ br label %if.end41
+
+if.end41:
+ %exit_status.0 = phi i32 [ 2, %if.then29 ], [ 0, %if.then37 ], [ 66, %entry ]
+ call void (...)* @fprintf(i32 %exit_status.0) nounwind
+ unreachable
+}
+
+declare void @fprintf(...) nounwind
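
mm2 is a running min/max scan; under early if-conversion the inner if/else collapses to cmovs, leaving only the backedge branch. A C sketch of the assumed source shape:

    int mm2_like(const int *p, int n) {
        int max = 0, min = 0;
        do {
            int v = *p++;
            if (v > max)          /* becomes a cmov */
                max = v;
            else if (v < min)     /* becomes a cmov */
                min = v;
        } while (--n);
        return max - min;
    }
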
diff --git a/test/CodeGen/X86/epilogue.ll b/test/CodeGen/X86/epilogue.ll
index 0f16a64..090680e 100644
--- a/test/CodeGen/X86/epilogue.ll
+++ b/test/CodeGen/X86/epilogue.ll
@@ -1,5 +1,7 @@
-; RUN: llc < %s -mcpu=generic -march=x86 | not grep lea
-; RUN: llc < %s -mcpu=generic -march=x86 | grep {movl %ebp}
+; RUN: llc < %s -mcpu=generic -march=x86 | FileCheck %s
+
+; CHECK-NOT: lea{{.*}}(%esp)
+; CHECK: {{(mov.* %ebp, %esp)|(lea.*\(%ebp\), %esp)}}
declare void @bar(<2 x i64>* %n)
diff --git a/test/CodeGen/X86/extractps.ll b/test/CodeGen/X86/extractps.ll
index 14778f0..9e1a375 100644
--- a/test/CodeGen/X86/extractps.ll
+++ b/test/CodeGen/X86/extractps.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -march=x86 -mcpu=penryn > %t
; RUN: not grep movd %t
-; RUN: grep {movss %xmm} %t | count 1
-; RUN: grep {extractps \\\$1, %xmm0, } %t | count 1
+; RUN: grep "movss %xmm" %t | count 1
+; RUN: grep "extractps \$1, %xmm0, " %t | count 1
; PR2647
external global float, align 16 ; <float*>:0 [#uses=2]
diff --git a/test/CodeGen/X86/fabs.ll b/test/CodeGen/X86/fabs.ll
index 9ded7e0..af1867f 100644
--- a/test/CodeGen/X86/fabs.ll
+++ b/test/CodeGen/X86/fabs.ll
@@ -1,28 +1,54 @@
; Make sure this testcase codegens to the fabs instruction, not a call to fabsf
-; RUN: llc < %s -march=x86 -mattr=-sse2,-sse3,-sse | grep fabs\$ | \
-; RUN: count 2
-; RUN: llc < %s -march=x86 -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math -enable-no-nans-fp-math | \
-; RUN: grep fabs\$ | count 3
+; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse2,-sse3,-sse | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck %s --check-prefix=UNSAFE
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -O0 | FileCheck %s --check-prefix=NOOPT
declare float @fabsf(float)
declare x86_fp80 @fabsl(x86_fp80)
+; CHECK: test1:
+; UNSAFE: test1:
+; NOOPT: test1:
define float @test1(float %X) {
- %Y = call float @fabsf(float %X)
+ %Y = call float @fabsf(float %X) readnone
ret float %Y
}
+; CHECK: {{^[ \t]+fabs$}}
+; UNSAFE: {{^[ \t]+fabs$}}
+; CHECK-NOT: fabs
+; UNSAFE-NOT: fabs
+; NOOPT-NOT: fabsf
+
+; CHECK: test2:
+; UNSAFE: test2:
+; NOOPT: test2:
define double @test2(double %X) {
%Y = fcmp oge double %X, -0.0
%Z = fsub double -0.0, %X
%Q = select i1 %Y, double %X, double %Z
ret double %Q
}
+; fabs is not used here.
+; CHECK-NOT: fabs
+; NOOPT-NOT: fabs
+
+; UNSAFE: {{^[ \t]+fabs$}}
+; UNSAFE-NOT: fabs
+
+; CHECK: test3:
+; UNSAFE: test3:
+; NOOPT: test3:
define x86_fp80 @test3(x86_fp80 %X) {
- %Y = call x86_fp80 @fabsl(x86_fp80 %X)
+ %Y = call x86_fp80 @fabsl(x86_fp80 %X) readnone
ret x86_fp80 %Y
}
+; CHECK: {{^[ \t]+fabs$}}
+; UNSAFE: {{^[ \t]+fabs$}}
+; NOOPT: {{^[ \t]+fabs$}}
-
+; CHECK-NOT: fabs
+; UNSAFE-NOT: fabs
+; NOOPT-NOT: fabs
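
Why test2 only folds under UNSAFE: for a NaN input the compare is false and the select returns the negated NaN, and for -0.0 it returns -0.0, while fabs would clear the sign bit in both cases. A C sketch of the pattern:

    #include <math.h>

    /* Folds to a bare fabs only when NaNs and signed zeros are
       assumed away (-enable-unsafe-fp-math -enable-no-nans-fp-math). */
    double test2_like(double x) {
        return x >= -0.0 ? x : -x;
    }
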
diff --git a/test/CodeGen/X86/fast-cc-merge-stack-adj.ll b/test/CodeGen/X86/fast-cc-merge-stack-adj.ll
index e4982f0..14cb136 100644
--- a/test/CodeGen/X86/fast-cc-merge-stack-adj.ll
+++ b/test/CodeGen/X86/fast-cc-merge-stack-adj.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mcpu=generic -march=x86 -x86-asm-syntax=intel | \
-; RUN: grep {add ESP, 8}
+; RUN: grep "add ESP, 8"
target triple = "i686-pc-linux-gnu"
diff --git a/test/CodeGen/X86/fast-isel-constpool.ll b/test/CodeGen/X86/fast-isel-constpool.ll
index 323c853..b3adb80 100644
--- a/test/CodeGen/X86/fast-isel-constpool.ll
+++ b/test/CodeGen/X86/fast-isel-constpool.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -fast-isel | grep {LCPI0_0(%rip)}
+; RUN: llc < %s -fast-isel | grep "LCPI0_0(%rip)"
; Make sure fast isel uses rip-relative addressing when required.
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-apple-darwin9.0"
diff --git a/test/CodeGen/X86/fast-isel-gv.ll b/test/CodeGen/X86/fast-isel-gv.ll
index 34f8b38..cb2464e 100644
--- a/test/CodeGen/X86/fast-isel-gv.ll
+++ b/test/CodeGen/X86/fast-isel-gv.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -fast-isel | grep {_kill@GOTPCREL(%rip)}
+; RUN: llc < %s -fast-isel | grep "_kill@GOTPCREL(%rip)"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-apple-darwin10.0"
@f = global i8 (...)* @kill ; <i8 (...)**> [#uses=1]
diff --git a/test/CodeGen/X86/fast-isel-mem.ll b/test/CodeGen/X86/fast-isel-mem.ll
index 8db1936..52b1e85 100644
--- a/test/CodeGen/X86/fast-isel-mem.ll
+++ b/test/CodeGen/X86/fast-isel-mem.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -fast-isel -mtriple=i386-apple-darwin | FileCheck %s
+; RUN: llc < %s -fast-isel -mtriple=i386-apple-darwin -mcpu=generic | FileCheck %s
+; RUN: llc < %s -fast-isel -mtriple=i386-apple-darwin -mcpu=atom | FileCheck -check-prefix=ATOM %s
@src = external global i32
@@ -18,6 +19,13 @@ entry:
; CHECK: movl %eax, (%ecx)
; CHECK: ret
+; ATOM: loadgv:
+; ATOM: movl L_src$non_lazy_ptr, %ecx
+; ATOM: movl (%ecx), %eax
+; ATOM: addl (%ecx), %eax
+; ATOM: movl %eax, (%ecx)
+; ATOM: ret
+
}
%stuff = type { i32 (...)** }
@@ -31,4 +39,8 @@ entry:
; CHECK: movl $0, %eax
; CHECK: movl L_LotsStuff$non_lazy_ptr, %ecx
+; ATOM: _t:
+; ATOM: movl L_LotsStuff$non_lazy_ptr, %ecx
+; ATOM: movl $0, %eax
+
}
diff --git a/test/CodeGen/X86/fast-isel-x86.ll b/test/CodeGen/X86/fast-isel-x86.ll
index b9598bb..19f3888 100644
--- a/test/CodeGen/X86/fast-isel-x86.ll
+++ b/test/CodeGen/X86/fast-isel-x86.ll
@@ -46,3 +46,17 @@ entry:
; CHECK: addl $40
}
declare void @test3sret(%struct.a* sret)
+
+; Check that fast-isel sret works with fastcc (and does not callee-pop)
+define void @test4() nounwind ssp {
+entry:
+ %tmp = alloca %struct.a, align 8
+ call fastcc void @test4fastccsret(%struct.a* sret %tmp)
+ ret void
+; CHECK: test4:
+; CHECK: subl $28
+; CHECK: leal (%esp), %ecx
+; CHECK: calll _test4fastccsret
+; CHECK: addl $28
+}
+declare fastcc void @test4fastccsret(%struct.a* sret)
diff --git a/test/CodeGen/X86/fast-isel.ll b/test/CodeGen/X86/fast-isel.ll
index c88d529..132df2b 100644
--- a/test/CodeGen/X86/fast-isel.ll
+++ b/test/CodeGen/X86/fast-isel.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -fast-isel -fast-isel-abort -march=x86 -mattr=sse2
-; RUN: llc < %s -fast-isel -fast-isel-abort -mtriple=x86_64-apple-darwin10
+; RUN: llc < %s -fast-isel -fast-isel-abort -verify-machineinstrs -march=x86 -mattr=sse2
+; RUN: llc < %s -fast-isel -fast-isel-abort -verify-machineinstrs -mtriple=x86_64-apple-darwin10
; This tests very minimal fast-isel functionality.
@@ -117,3 +117,11 @@ define i64* @life() nounwind {
ret i64* %a3
}
+declare void @llvm.donothing() readnone
+
+; CHECK: donada
+define void @donada() nounwind {
+entry:
+ call void @llvm.donothing()
+ ret void
+}
diff --git a/test/CodeGen/X86/fastcc-byval.ll b/test/CodeGen/X86/fastcc-byval.ll
index 52b3e57..f1204d6 100644
--- a/test/CodeGen/X86/fastcc-byval.ll
+++ b/test/CodeGen/X86/fastcc-byval.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -tailcallopt=false | grep {movl\[\[:space:\]\]*8(%esp), %eax} | count 2
+; RUN: llc < %s -tailcallopt=false | grep "movl[[:space:]]*8(%esp), %eax" | count 2
; PR3122
; rdar://6400815
diff --git a/test/CodeGen/X86/fma.ll b/test/CodeGen/X86/fma.ll
index 5deedb9..b0c1d0a 100644
--- a/test/CodeGen/X86/fma.ll
+++ b/test/CodeGen/X86/fma.ll
@@ -1,8 +1,11 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin10 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+fma | FileCheck %s --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -mtriple=i386-apple-darwin10 | FileCheck %s --check-prefix=CHECK-FMA-CALL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+fma | FileCheck %s --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefix=CHECK-FMA-CALL
; CHECK: test_f32
-; CHECK: _fmaf
+; CHECK-FMA-INST: vfmadd213ss
+; CHECK-FMA-CALL: _fmaf
define float @test_f32(float %a, float %b, float %c) nounwind readnone ssp {
entry:
@@ -11,7 +14,8 @@ entry:
}
; CHECK: test_f64
-; CHECK: _fma
+; CHECK-FMA-INST: vfmadd213sd
+; CHECK-FMA-CALL: _fma
define double @test_f64(double %a, double %b, double %c) nounwind readnone ssp {
entry:
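
With +fma the libcall contracts to a single fused instruction; without it the _fmaf/_fma call survives, which is exactly the CHECK-FMA-INST/CHECK-FMA-CALL split. A C equivalent (sketch; compile with and without -mfma):

    #include <math.h>

    float test_f32_like(float a, float b, float c) {
        return fmaf(a, b, c);  /* vfmadd213ss with -mfma, else call fmaf */
    }

    double test_f64_like(double a, double b, double c) {
        return fma(a, b, c);   /* vfmadd213sd with -mfma, else call fma */
    }
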
diff --git a/test/CodeGen/X86/fma3-intrinsics.ll b/test/CodeGen/X86/fma3-intrinsics.ll
new file mode 100755
index 0000000..90529e0
--- /dev/null
+++ b/test/CodeGen/X86/fma3-intrinsics.ll
@@ -0,0 +1,132 @@
+; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 -mattr=avx2,+fma | FileCheck %s
+
+define <4 x float> @test_x86_fmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ ; CHECK: fmadd213ss %xmm
+ %res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ ; CHECK: fmadd213ps
+ %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+
+define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+ ; CHECK: fmadd213ps {{.*\(%r.*}}, %ymm
+ %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
+
+define <4 x float> @test_x86_fnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ ; CHECK: fnmadd213ss %xmm
+ %res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ ; CHECK: fnmadd213ps
+ %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+
+define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+ ; CHECK: fnmadd213ps {{.*\(%r.*}}, %ymm
+ %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
+
+
+define <4 x float> @test_x86_fmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ ; CHECK: fmsub213ss
+ %res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ ; CHECK: fmsub213ps
+ %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @test_x86_fnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ ; CHECK: fnmsub213ss
+ %res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ ; CHECK: fnmsub213ps
+ %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+
+;;;;
+
+define <2 x double> @test_x86_fmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+ ; CHECK: fmadd213sd
+ %res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+
+define <2 x double> @test_x86_fmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+ ; CHECK: fmadd213pd
+ %res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+
+define <2 x double> @test_x86_fnmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+ ; CHECK: fnmadd213sd
+ %res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+
+define <2 x double> @test_x86_fnmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+ ; CHECK: fnmadd213pd
+ %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+
+
+
+define <2 x double> @test_x86_fmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+ ; CHECK: fmsub213sd
+ %res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+
+define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+ ; CHECK: fmsub213pd
+ %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+
+define <2 x double> @test_x86_fnmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+ ; CHECK: fnmsub213sd
+ %res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+
+define <2 x double> @test_x86_fnmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+ ; CHECK: fnmsub213pd
+ %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
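
On the mnemonics: FMA3 comes in 132/213/231 forms, where the digits give the operand order in the multiply-add; the 213 form computes op2*op1 + op3 into op1. Register-register operands, as in these tests, select the 213 encoding. A C intrinsics sketch (assuming an FMA-capable target, e.g. -mfma):

    #include <immintrin.h>

    /* Compile with -mfma; register operands select the 213 form. */
    __m128 fmadd_ss_like(__m128 a, __m128 b, __m128 c) {
        return _mm_fmadd_ss(a, b, c);     /* vfmadd213ss */
    }

    __m256 fmadd_ps_y_like(__m256 a, __m256 b, __m256 c) {
        return _mm256_fmadd_ps(a, b, c);  /* vfmadd213ps, %ymm */
    }
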
diff --git a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
index 5ed03ef..fd414b3 100644
--- a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
@@ -1,295 +1,295 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mattr=+avx,+fma4 | FileCheck %s
; VFMADD
-define < 4 x float > @test_x86_fma4_vfmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
; CHECK: vfmaddss
- %res = call < 4 x float > @llvm.x86.fma4.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
ret < 4 x float > %res
}
-define < 4 x float > @test_x86_fma4_vfmadd_ss_load(< 4 x float > %a0, < 4 x float > %a1, float* %a2) {
+define < 4 x float > @test_x86_fma_vfmadd_ss_load(< 4 x float > %a0, < 4 x float > %a1, float* %a2) {
; CHECK: vfmaddss (%{{.*}})
%x = load float *%a2
%y = insertelement <4 x float> undef, float %x, i32 0
- %res = call < 4 x float > @llvm.x86.fma4.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %y) ; <i64> [#uses=1]
+ %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %y) ; <i64> [#uses=1]
ret < 4 x float > %res
}
-define < 4 x float > @test_x86_fma4_vfmadd_ss_load2(< 4 x float > %a0, float* %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfmadd_ss_load2(< 4 x float > %a0, float* %a1, < 4 x float > %a2) {
; CHECK: vfmaddss %{{.*}}, (%{{.*}})
%x = load float *%a1
%y = insertelement <4 x float> undef, float %x, i32 0
- %res = call < 4 x float > @llvm.x86.fma4.vfmadd.ss(< 4 x float > %a0, < 4 x float > %y, < 4 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %y, < 4 x float > %a2) ; <i64> [#uses=1]
ret < 4 x float > %res
}
-declare < 4 x float > @llvm.x86.fma4.vfmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+declare < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-define < 2 x double > @test_x86_fma4_vfmadd_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfmadd_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
; CHECK: vfmaddsd
- %res = call < 2 x double > @llvm.x86.fma4.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
ret < 2 x double > %res
}
-define < 2 x double > @test_x86_fma4_vfmadd_sd_load(< 2 x double > %a0, < 2 x double > %a1, double* %a2) {
+define < 2 x double > @test_x86_fma_vfmadd_sd_load(< 2 x double > %a0, < 2 x double > %a1, double* %a2) {
; CHECK: vfmaddsd (%{{.*}})
%x = load double *%a2
%y = insertelement <2 x double> undef, double %x, i32 0
- %res = call < 2 x double > @llvm.x86.fma4.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %y) ; <i64> [#uses=1]
+ %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %y) ; <i64> [#uses=1]
ret < 2 x double > %res
}
-define < 2 x double > @test_x86_fma4_vfmadd_sd_load2(< 2 x double > %a0, double* %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfmadd_sd_load2(< 2 x double > %a0, double* %a1, < 2 x double > %a2) {
; CHECK: vfmaddsd %{{.*}}, (%{{.*}})
%x = load double *%a1
%y = insertelement <2 x double> undef, double %x, i32 0
- %res = call < 2 x double > @llvm.x86.fma4.vfmadd.sd(< 2 x double > %a0, < 2 x double > %y, < 2 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %y, < 2 x double > %a2) ; <i64> [#uses=1]
ret < 2 x double > %res
}
-declare < 2 x double > @llvm.x86.fma4.vfmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+declare < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-define < 4 x float > @test_x86_fma4_vfmadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfmadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
; CHECK: vfmaddps
- %res = call < 4 x float > @llvm.x86.fma4.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
ret < 4 x float > %res
}
-define < 4 x float > @test_x86_fma4_vfmadd_ps_load(< 4 x float > %a0, < 4 x float > %a1, < 4 x float >* %a2) {
+define < 4 x float > @test_x86_fma_vfmadd_ps_load(< 4 x float > %a0, < 4 x float > %a1, < 4 x float >* %a2) {
; CHECK: vfmaddps (%{{.*}})
%x = load <4 x float>* %a2
- %res = call < 4 x float > @llvm.x86.fma4.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %x) ; <i64> [#uses=1]
+ %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %x) ; <i64> [#uses=1]
ret < 4 x float > %res
}
-define < 4 x float > @test_x86_fma4_vfmadd_ps_load2(< 4 x float > %a0, < 4 x float >* %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfmadd_ps_load2(< 4 x float > %a0, < 4 x float >* %a1, < 4 x float > %a2) {
; CHECK: vfmaddps %{{.*}}, (%{{.*}})
%x = load <4 x float>* %a1
- %res = call < 4 x float > @llvm.x86.fma4.vfmadd.ps(< 4 x float > %a0, < 4 x float > %x, < 4 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %x, < 4 x float > %a2) ; <i64> [#uses=1]
ret < 4 x float > %res
}
-declare < 4 x float > @llvm.x86.fma4.vfmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+declare < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-define < 2 x double > @test_x86_fma4_vfmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
; CHECK: vfmaddpd
- %res = call < 2 x double > @llvm.x86.fma4.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
ret < 2 x double > %res
}
-define < 2 x double > @test_x86_fma4_vfmadd_pd_load(< 2 x double > %a0, < 2 x double > %a1, < 2 x double >* %a2) {
+define < 2 x double > @test_x86_fma_vfmadd_pd_load(< 2 x double > %a0, < 2 x double > %a1, < 2 x double >* %a2) {
; CHECK: vfmaddpd (%{{.*}})
%x = load <2 x double>* %a2
- %res = call < 2 x double > @llvm.x86.fma4.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %x) ; <i64> [#uses=1]
+ %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %x) ; <i64> [#uses=1]
ret < 2 x double > %res
}
-define < 2 x double > @test_x86_fma4_vfmadd_pd_load2(< 2 x double > %a0, < 2 x double >* %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfmadd_pd_load2(< 2 x double > %a0, < 2 x double >* %a1, < 2 x double > %a2) {
; CHECK: vfmaddpd %{{.*}}, (%{{.*}})
%x = load <2 x double>* %a1
- %res = call < 2 x double > @llvm.x86.fma4.vfmadd.pd(< 2 x double > %a0, < 2 x double > %x, < 2 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %x, < 2 x double > %a2) ; <i64> [#uses=1]
ret < 2 x double > %res
}
-declare < 2 x double > @llvm.x86.fma4.vfmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+declare < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-define < 8 x float > @test_x86_fma4_vfmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
+define < 8 x float > @test_x86_fma_vfmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
; CHECK: vfmaddps
; CHECK: ymm
- %res = call < 8 x float > @llvm.x86.fma4.vfmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 8 x float > @llvm.x86.fma.vfmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
ret < 8 x float > %res
}
-declare < 8 x float > @llvm.x86.fma4.vfmadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+declare < 8 x float > @llvm.x86.fma.vfmadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
-define < 4 x double > @test_x86_fma4_vfmadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
+define < 4 x double > @test_x86_fma_vfmadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
; CHECK: vfmaddpd
; CHECK: ymm
- %res = call < 4 x double > @llvm.x86.fma4.vfmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x double > @llvm.x86.fma.vfmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
ret < 4 x double > %res
}
-declare < 4 x double > @llvm.x86.fma4.vfmadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
+declare < 4 x double > @llvm.x86.fma.vfmadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
; VFMSUB
-define < 4 x float > @test_x86_fma4_vfmsub_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfmsub_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
; CHECK: vfmsubss
- %res = call < 4 x float > @llvm.x86.fma4.vfmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x float > @llvm.x86.fma.vfmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
ret < 4 x float > %res
}
-declare < 4 x float > @llvm.x86.fma4.vfmsub.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+declare < 4 x float > @llvm.x86.fma.vfmsub.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-define < 2 x double > @test_x86_fma4_vfmsub_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfmsub_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
; CHECK: vfmsubsd
- %res = call < 2 x double > @llvm.x86.fma4.vfmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 2 x double > @llvm.x86.fma.vfmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
ret < 2 x double > %res
}
-declare < 2 x double > @llvm.x86.fma4.vfmsub.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+declare < 2 x double > @llvm.x86.fma.vfmsub.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-define < 4 x float > @test_x86_fma4_vfmsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfmsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
; CHECK: vfmsubps
- %res = call < 4 x float > @llvm.x86.fma4.vfmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x float > @llvm.x86.fma.vfmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
ret < 4 x float > %res
}
-declare < 4 x float > @llvm.x86.fma4.vfmsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+declare < 4 x float > @llvm.x86.fma.vfmsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-define < 2 x double > @test_x86_fma4_vfmsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfmsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
; CHECK: vfmsubpd
- %res = call < 2 x double > @llvm.x86.fma4.vfmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 2 x double > @llvm.x86.fma.vfmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
ret < 2 x double > %res
}
-declare < 2 x double > @llvm.x86.fma4.vfmsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+declare < 2 x double > @llvm.x86.fma.vfmsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-define < 8 x float > @test_x86_fma4_vfmsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
+define < 8 x float > @test_x86_fma_vfmsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
; CHECK: vfmsubps
; CHECK: ymm
- %res = call < 8 x float > @llvm.x86.fma4.vfmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 8 x float > @llvm.x86.fma.vfmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
ret < 8 x float > %res
}
-declare < 8 x float > @llvm.x86.fma4.vfmsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+declare < 8 x float > @llvm.x86.fma.vfmsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
-define < 4 x double > @test_x86_fma4_vfmsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
+define < 4 x double > @test_x86_fma_vfmsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
; CHECK: vfmsubpd
; CHECK: ymm
- %res = call < 4 x double > @llvm.x86.fma4.vfmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x double > @llvm.x86.fma.vfmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
ret < 4 x double > %res
}
-declare < 4 x double > @llvm.x86.fma4.vfmsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
+declare < 4 x double > @llvm.x86.fma.vfmsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
; VFNMADD
-define < 4 x float > @test_x86_fma4_vfnmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfnmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
; CHECK: vfnmaddss
- %res = call < 4 x float > @llvm.x86.fma4.vfnmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x float > @llvm.x86.fma.vfnmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
ret < 4 x float > %res
}
-declare < 4 x float > @llvm.x86.fma4.vfnmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+declare < 4 x float > @llvm.x86.fma.vfnmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-define < 2 x double > @test_x86_fma4_vfnmadd_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfnmadd_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
; CHECK: vfnmaddsd
- %res = call < 2 x double > @llvm.x86.fma4.vfnmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 2 x double > @llvm.x86.fma.vfnmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
ret < 2 x double > %res
}
-declare < 2 x double > @llvm.x86.fma4.vfnmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+declare < 2 x double > @llvm.x86.fma.vfnmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-define < 4 x float > @test_x86_fma4_vfnmadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfnmadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
; CHECK: vfnmaddps
- %res = call < 4 x float > @llvm.x86.fma4.vfnmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x float > @llvm.x86.fma.vfnmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
ret < 4 x float > %res
}
-declare < 4 x float > @llvm.x86.fma4.vfnmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+declare < 4 x float > @llvm.x86.fma.vfnmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-define < 2 x double > @test_x86_fma4_vfnmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfnmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
; CHECK: vfnmaddpd
- %res = call < 2 x double > @llvm.x86.fma4.vfnmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 2 x double > @llvm.x86.fma.vfnmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
ret < 2 x double > %res
}
-declare < 2 x double > @llvm.x86.fma4.vfnmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+declare < 2 x double > @llvm.x86.fma.vfnmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-define < 8 x float > @test_x86_fma4_vfnmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
+define < 8 x float > @test_x86_fma_vfnmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
; CHECK: vfnmaddps
; CHECK: ymm
- %res = call < 8 x float > @llvm.x86.fma4.vfnmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 8 x float > @llvm.x86.fma.vfnmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
ret < 8 x float > %res
}
-declare < 8 x float > @llvm.x86.fma4.vfnmadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+declare < 8 x float > @llvm.x86.fma.vfnmadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
-define < 4 x double > @test_x86_fma4_vfnmadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
+define < 4 x double > @test_x86_fma_vfnmadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
; CHECK: vfnmaddpd
; CHECK: ymm
- %res = call < 4 x double > @llvm.x86.fma4.vfnmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x double > @llvm.x86.fma.vfnmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
ret < 4 x double > %res
}
-declare < 4 x double > @llvm.x86.fma4.vfnmadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
+declare < 4 x double > @llvm.x86.fma.vfnmadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
; VFNMSUB
-define < 4 x float > @test_x86_fma4_vfnmsub_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfnmsub_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
; CHECK: vfnmsubss
- %res = call < 4 x float > @llvm.x86.fma4.vfnmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x float > @llvm.x86.fma.vfnmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
ret < 4 x float > %res
}
-declare < 4 x float > @llvm.x86.fma4.vfnmsub.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+declare < 4 x float > @llvm.x86.fma.vfnmsub.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-define < 2 x double > @test_x86_fma4_vfnmsub_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfnmsub_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
; CHECK: vfnmsubsd
- %res = call < 2 x double > @llvm.x86.fma4.vfnmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 2 x double > @llvm.x86.fma.vfnmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
ret < 2 x double > %res
}
-declare < 2 x double > @llvm.x86.fma4.vfnmsub.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+declare < 2 x double > @llvm.x86.fma.vfnmsub.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-define < 4 x float > @test_x86_fma4_vfnmsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfnmsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
; CHECK: vfnmsubps
- %res = call < 4 x float > @llvm.x86.fma4.vfnmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x float > @llvm.x86.fma.vfnmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
ret < 4 x float > %res
}
-declare < 4 x float > @llvm.x86.fma4.vfnmsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+declare < 4 x float > @llvm.x86.fma.vfnmsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-define < 2 x double > @test_x86_fma4_vfnmsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfnmsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
; CHECK: vfnmsubpd
- %res = call < 2 x double > @llvm.x86.fma4.vfnmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 2 x double > @llvm.x86.fma.vfnmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
ret < 2 x double > %res
}
-declare < 2 x double > @llvm.x86.fma4.vfnmsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+declare < 2 x double > @llvm.x86.fma.vfnmsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-define < 8 x float > @test_x86_fma4_vfnmsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
+define < 8 x float > @test_x86_fma_vfnmsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
; CHECK: vfnmsubps
; CHECK: ymm
- %res = call < 8 x float > @llvm.x86.fma4.vfnmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 8 x float > @llvm.x86.fma.vfnmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
ret < 8 x float > %res
}
-declare < 8 x float > @llvm.x86.fma4.vfnmsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+declare < 8 x float > @llvm.x86.fma.vfnmsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
-define < 4 x double > @test_x86_fma4_vfnmsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
+define < 4 x double > @test_x86_fma_vfnmsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
; CHECK: vfnmsubpd
; CHECK: ymm
- %res = call < 4 x double > @llvm.x86.fma4.vfnmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x double > @llvm.x86.fma.vfnmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
ret < 4 x double > %res
}
-declare < 4 x double > @llvm.x86.fma4.vfnmsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
+declare < 4 x double > @llvm.x86.fma.vfnmsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
; VFMADDSUB
-define < 4 x float > @test_x86_fma4_vfmaddsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfmaddsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
; CHECK: vfmaddsubps
- %res = call < 4 x float > @llvm.x86.fma4.vfmaddsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x float > @llvm.x86.fma.vfmaddsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
ret < 4 x float > %res
}
-declare < 4 x float > @llvm.x86.fma4.vfmaddsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+declare < 4 x float > @llvm.x86.fma.vfmaddsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-define < 2 x double > @test_x86_fma4_vfmaddsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfmaddsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
; CHECK: vfmaddsubpd
- %res = call < 2 x double > @llvm.x86.fma4.vfmaddsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 2 x double > @llvm.x86.fma.vfmaddsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
ret < 2 x double > %res
}
-declare < 2 x double > @llvm.x86.fma4.vfmaddsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+declare < 2 x double > @llvm.x86.fma.vfmaddsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-define < 8 x float > @test_x86_fma4_vfmaddsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
+define < 8 x float > @test_x86_fma_vfmaddsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
; CHECK: vfmaddsubps
; CHECK: ymm
- %res = call < 8 x float > @llvm.x86.fma4.vfmaddsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 8 x float > @llvm.x86.fma.vfmaddsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
ret < 8 x float > %res
}
-declare < 8 x float > @llvm.x86.fma4.vfmaddsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+declare < 8 x float > @llvm.x86.fma.vfmaddsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
-define < 4 x double > @test_x86_fma4_vfmaddsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
+define < 4 x double > @test_x86_fma_vfmaddsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
; CHECK: vfmaddsubpd
; CHECK: ymm
- %res = call < 4 x double > @llvm.x86.fma4.vfmaddsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x double > @llvm.x86.fma.vfmaddsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
ret < 4 x double > %res
}
-declare < 4 x double > @llvm.x86.fma4.vfmaddsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
+declare < 4 x double > @llvm.x86.fma.vfmaddsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
; VFMSUBADD
-define < 4 x float > @test_x86_fma4_vfmsubadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+define < 4 x float > @test_x86_fma_vfmsubadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
; CHECK: vfmsubaddps
- %res = call < 4 x float > @llvm.x86.fma4.vfmsubadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x float > @llvm.x86.fma.vfmsubadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
ret < 4 x float > %res
}
-declare < 4 x float > @llvm.x86.fma4.vfmsubadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+declare < 4 x float > @llvm.x86.fma.vfmsubadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-define < 2 x double > @test_x86_fma4_vfmsubadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+define < 2 x double > @test_x86_fma_vfmsubadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
; CHECK: vfmsubaddpd
- %res = call < 2 x double > @llvm.x86.fma4.vfmsubadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 2 x double > @llvm.x86.fma.vfmsubadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
ret < 2 x double > %res
}
-declare < 2 x double > @llvm.x86.fma4.vfmsubadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+declare < 2 x double > @llvm.x86.fma.vfmsubadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
-define < 8 x float > @test_x86_fma4_vfmsubadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
+define < 8 x float > @test_x86_fma_vfmsubadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
; CHECK: vfmsubaddps
; CHECK: ymm
- %res = call < 8 x float > @llvm.x86.fma4.vfmsubadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+ %res = call < 8 x float > @llvm.x86.fma.vfmsubadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
ret < 8 x float > %res
}
-declare < 8 x float > @llvm.x86.fma4.vfmsubadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+declare < 8 x float > @llvm.x86.fma.vfmsubadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
-define < 4 x double > @test_x86_fma4_vfmsubadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
+define < 4 x double > @test_x86_fma_vfmsubadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
; CHECK: vfmsubaddpd
; CHECK: ymm
- %res = call < 4 x double > @llvm.x86.fma4.vfmsubadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+ %res = call < 4 x double > @llvm.x86.fma.vfmsubadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
ret < 4 x double > %res
}
-declare < 4 x double > @llvm.x86.fma4.vfmsubadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
+declare < 4 x double > @llvm.x86.fma.vfmsubadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll
new file mode 100644
index 0000000..5d97a87
--- /dev/null
+++ b/test/CodeGen/X86/fma_patterns.ll
@@ -0,0 +1,139 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2,+fma -fp-contract=fast | FileCheck %s
+
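+; With -fp-contract=fast, each fmul feeding an fadd/fsub below should be
+; contracted into a single FMA3 instruction. In AT&T syntax,
+; "vfmadd213ps %xmm2, %xmm0, %xmm1" computes xmm1 = (xmm0 * xmm1) + xmm2.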
+; CHECK: test_x86_fmadd_ps
+; CHECK: vfmadd213ps %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ %x = fmul <4 x float> %a0, %a1
+ %res = fadd <4 x float> %x, %a2
+ ret <4 x float> %res
+}
+
+; CHECK: test_x86_fmsub_ps
+; CHECK: vfmsub213ps %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ %x = fmul <4 x float> %a0, %a1
+ %res = fsub <4 x float> %x, %a2
+ ret <4 x float> %res
+}
+
+; CHECK: test_x86_fnmadd_ps
+; CHECK: vfnmadd213ps %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ %x = fmul <4 x float> %a0, %a1
+ %res = fsub <4 x float> %a2, %x
+ ret <4 x float> %res
+}
+
+; CHECK: test_x86_fnmsub_ps
+; CHECK: vfnmsub213ps %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ %x = fmul <4 x float> %a0, %a1
+ %y = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x
+ %res = fsub <4 x float> %y, %a2
+ ret <4 x float> %res
+}
+
+; CHECK: test_x86_fmadd_ps_y
+; CHECK: vfmadd213ps %ymm2, %ymm0, %ymm1
+; CHECK: ret
+define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+ %x = fmul <8 x float> %a0, %a1
+ %res = fadd <8 x float> %x, %a2
+ ret <8 x float> %res
+}
+
+; CHECK: test_x86_fmsub_ps_y
+; CHECK: vfmsub213ps %ymm2, %ymm0, %ymm1
+; CHECK: ret
+define <8 x float> @test_x86_fmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+ %x = fmul <8 x float> %a0, %a1
+ %res = fsub <8 x float> %x, %a2
+ ret <8 x float> %res
+}
+
+; CHECK: test_x86_fnmadd_ps_y
+; CHECK: vfnmadd213ps %ymm2, %ymm0, %ymm1
+; CHECK: ret
+define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+ %x = fmul <8 x float> %a0, %a1
+ %res = fsub <8 x float> %a2, %x
+ ret <8 x float> %res
+}
+
+; CHECK: test_x86_fnmsub_ps_y
+; CHECK: vfnmsub213ps %ymm2, %ymm0, %ymm1
+; CHECK: ret
+define <8 x float> @test_x86_fnmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+ %x = fmul <8 x float> %a0, %a1
+ %y = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x
+ %res = fsub <8 x float> %y, %a2
+ ret <8 x float> %res
+}
+
+; CHECK: test_x86_fmadd_pd_y
+; CHECK: vfmadd213pd %ymm2, %ymm0, %ymm1
+; CHECK: ret
+define <4 x double> @test_x86_fmadd_pd_y(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
+ %x = fmul <4 x double> %a0, %a1
+ %res = fadd <4 x double> %x, %a2
+ ret <4 x double> %res
+}
+
+; CHECK: test_x86_fmsub_pd_y
+; CHECK: vfmsub213pd %ymm2, %ymm0, %ymm1
+; CHECK: ret
+define <4 x double> @test_x86_fmsub_pd_y(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
+ %x = fmul <4 x double> %a0, %a1
+ %res = fsub <4 x double> %x, %a2
+ ret <4 x double> %res
+}
+
+; CHECK: test_x86_fmsub_pd
+; CHECK: vfmsub213pd %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+ %x = fmul <2 x double> %a0, %a1
+ %res = fsub <2 x double> %x, %a2
+ ret <2 x double> %res
+}
+
+; CHECK: test_x86_fnmadd_ss
+; CHECK: vfnmadd213ss %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define float @test_x86_fnmadd_ss(float %a0, float %a1, float %a2) {
+ %x = fmul float %a0, %a1
+ %res = fsub float %a2, %x
+ ret float %res
+}
+
+; CHECK: test_x86_fnmadd_sd
+; CHECK: vfnmadd213sd %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define double @test_x86_fnmadd_sd(double %a0, double %a1, double %a2) {
+ %x = fmul double %a0, %a1
+ %res = fsub double %a2, %x
+ ret double %res
+}
+
+; CHECK: test_x86_fmsub_sd
+; CHECK: vfmsub213sd %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define double @test_x86_fmsub_sd(double %a0, double %a1, double %a2) {
+ %x = fmul double %a0, %a1
+ %res = fsub double %x, %a2
+ ret double %res
+}
+
+; CHECK: test_x86_fnmsub_ss
+; CHECK: vfnmsub213ss %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define float @test_x86_fnmsub_ss(float %a0, float %a1, float %a2) {
+ %x = fsub float -0.000000e+00, %a0
+ %y = fmul float %x, %a1
+ %res = fsub float %y, %a2
+ ret float %res
+}
diff --git a/test/CodeGen/X86/fold-load.ll b/test/CodeGen/X86/fold-load.ll
index e03cb7e..c961f75 100644
--- a/test/CodeGen/X86/fold-load.ll
+++ b/test/CodeGen/X86/fold-load.ll
@@ -45,3 +45,29 @@ L:
}
+; rdar://10554090
+; The xor in the exit block should be CSE'd with the one in the entry block,
+; and the load should then be folded into that xor in the entry block.
+define i1 @test3(i32* %P, i32* %Q) nounwind {
+; CHECK: test3:
+; CHECK: movl 8(%esp), %eax
+; CHECK: xorl (%eax),
+; CHECK: j
+; CHECK-NOT: xor
+entry:
+ %0 = load i32* %P, align 4
+ %1 = load i32* %Q, align 4
+ %2 = xor i32 %0, %1
+ %3 = and i32 %2, 65535
+ %4 = icmp eq i32 %3, 0
+ br i1 %4, label %exit, label %land.end
+
+exit:
+ %shr.i.i19 = xor i32 %1, %0
+ %5 = and i32 %shr.i.i19, 2147418112
+ %6 = icmp eq i32 %5, 0
+ br label %land.end
+
+land.end:
+ %7 = phi i1 [ %6, %exit ], [ false, %entry ]
+ ret i1 %7
+}
diff --git a/test/CodeGen/X86/fold-pcmpeqd-1.ll b/test/CodeGen/X86/fold-pcmpeqd-1.ll
index cc4198d..d850630 100644
--- a/test/CodeGen/X86/fold-pcmpeqd-1.ll
+++ b/test/CodeGen/X86/fold-pcmpeqd-1.ll
@@ -1,11 +1,16 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 > %t
-; RUN: grep pcmpeqd %t | count 1
-; RUN: grep xor %t | count 1
-; RUN: not grep LCP %t
+; RUN: llc < %s -march=x86 -mattr=+sse2,-avx | FileCheck %s
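+; All-ones and all-zeros vector constants should be materialized inline
+; (pcmpeqd / xorps) rather than loaded from a constant-pool label (LCP).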
define <2 x double> @foo() nounwind {
ret <2 x double> bitcast (<2 x i64><i64 -1, i64 -1> to <2 x double>)
+; CHECK: foo:
+; CHECK: pcmpeqd %xmm0, %xmm0
+; CHECK-NOT: %xmm
+; CHECK: ret
}
define <2 x double> @bar() nounwind {
ret <2 x double> bitcast (<2 x i64><i64 0, i64 0> to <2 x double>)
+; CHECK: bar:
+; CHECK: xorps %xmm0, %xmm0
+; CHECK-NOT: %xmm
+; CHECK: ret
}
diff --git a/test/CodeGen/X86/force-align-stack-alloca.ll b/test/CodeGen/X86/force-align-stack-alloca.ll
new file mode 100644
index 0000000..2ada194
--- /dev/null
+++ b/test/CodeGen/X86/force-align-stack-alloca.ll
@@ -0,0 +1,70 @@
+; This test attempts to detect when we request forced re-alignment of the
+; stack to an alignment greater than the ABI provides. We arbitrarily force
+; alignment up to 32 bytes for i386, hoping that this will exceed any ABI
+; provisions.
+;
+; RUN: llc < %s -mcpu=generic -force-align-stack -stack-alignment=32 | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+define i32 @f(i8* %p) nounwind {
+entry:
+ %0 = load i8* %p
+ %conv = sext i8 %0 to i32
+ ret i32 %conv
+}
+
+define i64 @g(i32 %i) nounwind {
+; CHECK: g:
+; CHECK: pushl %ebp
+; CHECK-NEXT: movl %esp, %ebp
+; CHECK-NEXT: pushl
+; CHECK-NEXT: pushl
+; CHECK-NEXT: andl $-32, %esp
+; CHECK-NEXT: subl $32, %esp
+;
+; Now setup the base pointer (%esi).
+; CHECK-NEXT: movl %esp, %esi
+; CHECK-NOT: {{[^ ,]*}}, %esp
+;
+; The next adjustment of the stack is due to the alloca.
+; CHECK: movl %{{...}}, %esp
+; CHECK-NOT: {{[^ ,]*}}, %esp
+;
+; Next we set up the memset call, and then undo it.
+; CHECK: subl $32, %esp
+; CHECK-NOT: {{[^ ,]*}}, %esp
+; CHECK: calll memset
+; CHECK-NEXT: addl $32, %esp
+; CHECK-NOT: {{[^ ,]*}}, %esp
+;
+; Next we set up the call to 'f'.
+; CHECK: subl $32, %esp
+; CHECK-NOT: {{[^ ,]*}}, %esp
+; CHECK: calll f
+; CHECK-NEXT: addl $32, %esp
+; CHECK-NOT: {{[^ ,]*}}, %esp
+;
+; Restore %esp from %ebp (the frame pointer), deducting the size of the
+; callee-saved-register area so those registers can be popped.
+; This is the state prior to stack realignment and the allocation of VLAs.
+; CHECK-NOT: popl
+; CHECK: leal -8(%ebp), %esp
+; CHECK-NEXT: popl
+; CHECK-NEXT: popl
+; CHECK-NEXT: popl %ebp
+; CHECK-NEXT: ret
+
+entry:
+ br label %if.then
+
+if.then:
+ %0 = alloca i8, i32 %i
+ call void @llvm.memset.p0i8.i32(i8* %0, i8 0, i32 %i, i32 1, i1 false)
+ %call = call i32 @f(i8* %0)
+ %conv = sext i32 %call to i64
+ ret i64 %conv
+}
+
+declare void @llvm.memset.p0i8.i32(i8*, i8, i32, i32, i1) nounwind
diff --git a/test/CodeGen/X86/fp-immediate-shorten.ll b/test/CodeGen/X86/fp-immediate-shorten.ll
index cafc61a..62d8100 100644
--- a/test/CodeGen/X86/fp-immediate-shorten.ll
+++ b/test/CodeGen/X86/fp-immediate-shorten.ll
@@ -1,7 +1,7 @@
;; Test that this FP immediate is stored in the constant pool as a float.
; RUN: llc < %s -march=x86 -mattr=-sse2,-sse3 | \
-; RUN: grep {.long.1123418112}
+; RUN: grep ".long.1123418112"
define double @D() {
ret double 1.230000e+02
diff --git a/test/CodeGen/X86/fp-in-intregs.ll b/test/CodeGen/X86/fp-in-intregs.ll
index 6966cf0..1f5121d 100644
--- a/test/CodeGen/X86/fp-in-intregs.ll
+++ b/test/CodeGen/X86/fp-in-intregs.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-macosx -mcpu=yonah | FileCheck %s
; CHECK-NOT: {{((xor|and)ps|movd)}}
; These operations should be done in integer registers, eliminating constant
diff --git a/test/CodeGen/X86/fp-stack-compare-cmov.ll b/test/CodeGen/X86/fp-stack-compare-cmov.ll
new file mode 100644
index 0000000..b457fbc
--- /dev/null
+++ b/test/CodeGen/X86/fp-stack-compare-cmov.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -march=x86 -mcpu=pentiumpro | FileCheck %s
+; PR1012
+
+define float @foo(float* %col.2.0) {
+; CHECK: fucompi
+; CHECK: fcmov
+ %tmp = load float* %col.2.0
+ %tmp16 = fcmp olt float %tmp, 0.000000e+00
+ %tmp20 = fsub float -0.000000e+00, %tmp
+ %iftmp.2.0 = select i1 %tmp16, float %tmp20, float %tmp
+ ret float %iftmp.2.0
+}
diff --git a/test/CodeGen/X86/fp-stack-compare.ll b/test/CodeGen/X86/fp-stack-compare.ll
index f3998b6..a8557ad 100644
--- a/test/CodeGen/X86/fp-stack-compare.ll
+++ b/test/CodeGen/X86/fp-stack-compare.ll
@@ -1,8 +1,11 @@
; RUN: llc < %s -march=x86 -mcpu=i386 | FileCheck %s
-; PR1012
+; PR6679
define float @foo(float* %col.2.0) {
-; CHECK: fucompi
+; CHECK: fucomp
+; CHECK-NOT: fucompi
+; CHECK: j
+; CHECK-NOT: fcmov
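+; -mcpu=i386 provides neither fucomi nor fcmov (both are PentiumPro
+; features), so the compare must lower to fucomp plus a conditional branch.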
%tmp = load float* %col.2.0
%tmp16 = fcmp olt float %tmp, 0.000000e+00
%tmp20 = fsub float -0.000000e+00, %tmp
diff --git a/test/CodeGen/X86/fp-stack-ret.ll b/test/CodeGen/X86/fp-stack-ret.ll
index 1307f70..2733117 100644
--- a/test/CodeGen/X86/fp-stack-ret.ll
+++ b/test/CodeGen/X86/fp-stack-ret.ll
@@ -22,7 +22,7 @@ define fastcc double @test2(<2 x double> %A) {
; CHECK: test3
; CHECK: sub{{.*}}%esp
-; CHECLK-NOT: xmm
+; CHECK-NOT: xmm
define fastcc double @test3(<4 x float> %A) {
%B = bitcast <4 x float> %A to <2 x double>
%C = call fastcc double @test2(<2 x double> %B)
diff --git a/test/CodeGen/X86/fp_load_fold.ll b/test/CodeGen/X86/fp_load_fold.ll
index 0145069..a2cea5e 100644
--- a/test/CodeGen/X86/fp_load_fold.ll
+++ b/test/CodeGen/X86/fp_load_fold.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | \
-; RUN: grep -i ST | not grep {fadd\\|fsub\\|fdiv\\|fmul}
+; RUN: grep -i ST | not grep "fadd\|fsub\|fdiv\|fmul"
; Test that the load of the memory location is folded into the operation.
diff --git a/test/CodeGen/X86/full-lsr.ll b/test/CodeGen/X86/full-lsr.ll
index ff9b1b0..1344cdc 100644
--- a/test/CodeGen/X86/full-lsr.ll
+++ b/test/CodeGen/X86/full-lsr.ll
@@ -1,9 +1,17 @@
-; RUN: llc < %s -march=x86 >%t
-
-; RUN: grep {addl \\\$4,} %t | count 3
-; RUN: not grep {,%} %t
+; RUN: llc < %s -march=x86 -mcpu=generic | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=atom | FileCheck -check-prefix=ATOM %s
define void @foo(float* nocapture %A, float* nocapture %B, float* nocapture %C, i32 %N) nounwind {
+; ATOM: foo
+; ATOM: addl
+; ATOM: leal
+; ATOM: leal
+
+; CHECK: foo
+; CHECK: addl
+; CHECK: addl
+; CHECK: addl
+
entry:
%0 = icmp sgt i32 %N, 0 ; <i1> [#uses=1]
br i1 %0, label %bb, label %return
diff --git a/test/CodeGen/X86/gather-addresses.ll b/test/CodeGen/X86/gather-addresses.ll
index 4a6927f..72a5096 100644
--- a/test/CodeGen/X86/gather-addresses.ll
+++ b/test/CodeGen/X86/gather-addresses.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=x86_64-linux < %s | FileCheck %s
-; RUN: llc -mtriple=x86_64-win32 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-linux -mcpu=nehalem < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-win32 -mcpu=nehalem < %s | FileCheck %s
; rdar://7398554
; When doing vector gather-scatter index calculation with 32-bit indices,
diff --git a/test/CodeGen/X86/gs-fold.ll b/test/CodeGen/X86/gs-fold.ll
new file mode 100644
index 0000000..dbec76b
--- /dev/null
+++ b/test/CodeGen/X86/gs-fold.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-freebsd | FileCheck %s --check-prefix=CHECK-FBSD
+; RUN: llc < %s -mtriple=x86_64-unknown-linux | FileCheck %s --check-prefix=CHECK-LINUX
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+%struct.thread = type { i32, i32, i32, i32 }
+
+define i32 @test() nounwind uwtable {
+entry:
+ %0 = load volatile %struct.thread* addrspace(256)* null
+ %c = getelementptr inbounds %struct.thread* %0, i64 0, i32 2
+ %1 = load i32* %c, align 4
+ ret i32 %1
+}
+
+; Check that we do not assume that %gs:0 holds the %gs base address when we
+; are not targeting Linux.
+; CHECK-FBSD: movq %gs:0, %rax
+; CHECK-FBSD: movl 8(%rax), %eax
+; Check that we do assume that %gs:0 holds the %gs base address when
+; targeting Linux.
+; CHECK-LINUX: movl %gs:8, %eax
+
diff --git a/test/CodeGen/X86/h-register-addressing-32.ll b/test/CodeGen/X86/h-register-addressing-32.ll
index 76ffd66..968a9e8 100644
--- a/test/CodeGen/X86/h-register-addressing-32.ll
+++ b/test/CodeGen/X86/h-register-addressing-32.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep {movzbl %\[abcd\]h,} | count 7
+; RUN: llc < %s -march=x86 | grep "movzbl %[abcd]h," | count 7
; Use h-register extract and zero-extend.
diff --git a/test/CodeGen/X86/h-register-addressing-64.ll b/test/CodeGen/X86/h-register-addressing-64.ll
index 98817f3..a19fca5 100644
--- a/test/CodeGen/X86/h-register-addressing-64.ll
+++ b/test/CodeGen/X86/h-register-addressing-64.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep {movzbl %\[abcd\]h,} | count 7
+; RUN: llc < %s -march=x86-64 | grep "movzbl %[abcd]h," | count 7
; Use h-register extract and zero-extend.
diff --git a/test/CodeGen/X86/h-registers-1.ll b/test/CodeGen/X86/h-registers-1.ll
index 402cdfe..903c453 100644
--- a/test/CodeGen/X86/h-registers-1.ll
+++ b/test/CodeGen/X86/h-registers-1.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -mtriple=x86_64-linux > %t
-; RUN: grep {movzbl %\[abcd\]h,} %t | count 8
-; RUN: grep {%\[abcd\]h} %t | not grep {%r\[\[:digit:\]\]*d}
+; RUN: grep "movzbl %[abcd]h," %t | count 8
+; RUN: grep "%[abcd]h" %t | not grep "%r[[:digit:]]*d"
; LLVM creates virtual registers for values live across blocks
; based on the type of the value. Make sure that the extracts
diff --git a/test/CodeGen/X86/hoist-invariant-load.ll b/test/CodeGen/X86/hoist-invariant-load.ll
index 4289fa7..74ecd04 100644
--- a/test/CodeGen/X86/hoist-invariant-load.ll
+++ b/test/CodeGen/X86/hoist-invariant-load.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -stats -O2 |& grep "1 machine-licm"
+; RUN: llc < %s -stats -O2 2>&1 | grep "1 machine-licm"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.7.2"
diff --git a/test/CodeGen/X86/iabs.ll b/test/CodeGen/X86/iabs.ll
index a8ba015..9196cce 100644
--- a/test/CodeGen/X86/iabs.ll
+++ b/test/CodeGen/X86/iabs.ll
@@ -1,13 +1,17 @@
-; RUN: llc < %s -march=x86-64 -stats |& \
-; RUN: grep {5 .*Number of machine instrs printed}
+; RUN: llc < %s -march=x86-64 | FileCheck %s
;; Integer absolute value, should produce something at least as good as:
-;; movl %edi, %ecx
-;; sarl $31, %ecx
-;; leal (%rdi,%rcx), %eax
-;; xorl %ecx, %eax
+;; movl %edi, %eax
+;; negl %eax
+;; cmovll %edi, %eax
;; ret
+; rdar://10695237
define i32 @test(i32 %a) nounwind {
+; CHECK: test:
+; CHECK: mov
+; CHECK-NEXT: neg
+; CHECK-NEXT: cmov
+; CHECK-NEXT: ret
%tmp1neg = sub i32 0, %a
%b = icmp sgt i32 %a, -1
%abs = select i1 %b, i32 %a, i32 %tmp1neg
diff --git a/test/CodeGen/X86/illegal-vector-args-return.ll b/test/CodeGen/X86/illegal-vector-args-return.ll
index cecf77a..62a21f4 100644
--- a/test/CodeGen/X86/illegal-vector-args-return.ll
+++ b/test/CodeGen/X86/illegal-vector-args-return.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep {mulpd %xmm3, %xmm1}
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep {mulpd %xmm2, %xmm0}
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep {addps %xmm3, %xmm1}
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep {addps %xmm2, %xmm0}
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=nehalem | grep "mulpd %xmm3, %xmm1"
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=nehalem | grep "mulpd %xmm2, %xmm0"
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=nehalem | grep "addps %xmm3, %xmm1"
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=nehalem | grep "addps %xmm2, %xmm0"
define <4 x double> @foo(<4 x double> %x, <4 x double> %z) {
%y = fmul <4 x double> %x, %z
diff --git a/test/CodeGen/X86/inline-asm-error.ll b/test/CodeGen/X86/inline-asm-error.ll
index 134d6e9..747a589 100644
--- a/test/CodeGen/X86/inline-asm-error.ll
+++ b/test/CodeGen/X86/inline-asm-error.ll
@@ -1,4 +1,4 @@
-; RUN: not llc -march x86 -regalloc=fast < %s 2> %t1
+; RUN: not llc -march x86 -regalloc=fast -optimize-regalloc=0 < %s 2> %t1
; RUN: not llc -march x86 -regalloc=basic < %s 2> %t2
; RUN: not llc -march x86 -regalloc=greedy < %s 2> %t3
; RUN: FileCheck %s < %t1
diff --git a/test/CodeGen/X86/inline-asm-modifier-n.ll b/test/CodeGen/X86/inline-asm-modifier-n.ll
index 5e76b6c..b069c46 100644
--- a/test/CodeGen/X86/inline-asm-modifier-n.ll
+++ b/test/CodeGen/X86/inline-asm-modifier-n.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep { 37}
+; RUN: llc < %s -march=x86 | grep " 37"
; rdar://7008959
define void @bork() nounwind {
diff --git a/test/CodeGen/X86/inline-asm.ll b/test/CodeGen/X86/inline-asm.ll
index eef6c2f..e6eb9ef 100644
--- a/test/CodeGen/X86/inline-asm.ll
+++ b/test/CodeGen/X86/inline-asm.ll
@@ -43,3 +43,12 @@ entry:
%0 = tail call i8 asm sideeffect "xchg $0, $1", "=r,*m,0,~{memory},~{dirflag},~{fpsr},~{flags}"(i32* %p, i1 %desired) nounwind
ret void
}
+
+; <rdar://problem/11542429>
+; The constrained GR32_ABCD register class of the 'q' constraint requires
+; special handling once the preceding outputs have used up eax through edx.
+define void @constrain_abcd(i8* %h) nounwind ssp {
+entry:
+ %0 = call { i32, i32, i32, i32, i32 } asm sideeffect "", "=&r,=&r,=&r,=&r,=&q,r,~{ecx},~{memory},~{dirflag},~{fpsr},~{flags}"(i8* %h) nounwind
+ ret void
+}
diff --git a/test/CodeGen/X86/inreg.ll b/test/CodeGen/X86/inreg.ll
new file mode 100644
index 0000000..6653cfb
--- /dev/null
+++ b/test/CodeGen/X86/inreg.ll
@@ -0,0 +1,46 @@
+; RUN: llc < %s -mtriple=i686-pc-linux -mcpu=corei7 | FileCheck --check-prefix=DAG %s
+; RUN: llc < %s -mtriple=i686-pc-linux -mcpu=corei7 -O0 | FileCheck --check-prefix=FAST %s
+
+%struct.s1 = type { double, float }
+
+define void @g1() nounwind {
+entry:
+ %tmp = alloca %struct.s1, align 4
+ call void @f(%struct.s1* inreg sret %tmp, i32 inreg 41, i32 inreg 42, i32 43)
+ ret void
+ ; DAG: g1:
+ ; DAG: subl $[[AMT:.*]], %esp
+ ; DAG-NEXT: $43, (%esp)
+ ; DAG-NEXT: leal 16(%esp), %eax
+ ; DAG-NEXT: movl $41, %edx
+ ; DAG-NEXT: movl $42, %ecx
+ ; DAG-NEXT: calll f
+ ; DAG-NEXT: addl $[[AMT]], %esp
+ ; DAG-NEXT: ret
+
+ ; FAST: g1:
+ ; FAST: subl $[[AMT:.*]], %esp
+ ; FAST-NEXT: leal 8(%esp), %eax
+ ; FAST-NEXT: movl $41, %edx
+ ; FAST-NEXT: movl $42, %ecx
+ ; FAST: $43, (%esp)
+ ; FAST: calll f
+ ; FAST-NEXT: addl $[[AMT]], %esp
+ ; FAST: ret
+}
+
+declare void @f(%struct.s1* inreg sret, i32 inreg, i32 inreg, i32)
+
+%struct.s2 = type {}
+
+define void @g2(%struct.s2* inreg sret %agg.result) nounwind {
+entry:
+ ret void
+ ; DAG: g2
+ ; DAG-NOT: ret $4
+ ; DAG: .size g2
+
+ ; FAST: g2
+ ; FAST-NOT: ret $4
+ ; FAST: .size g2
+}
diff --git a/test/CodeGen/X86/isel-sink2.ll b/test/CodeGen/X86/isel-sink2.ll
index 5ed0e00..b162666 100644
--- a/test/CodeGen/X86/isel-sink2.ll
+++ b/test/CodeGen/X86/isel-sink2.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=x86 > %t
-; RUN: grep {movb.7(%...)} %t
+; RUN: grep "movb.7(%...)" %t
; RUN: not grep leal %t
define i8 @test(i32 *%P) nounwind {
diff --git a/test/CodeGen/X86/ispositive.ll b/test/CodeGen/X86/ispositive.ll
index 8adf723..b1d1a20 100644
--- a/test/CodeGen/X86/ispositive.ll
+++ b/test/CodeGen/X86/ispositive.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep {shrl.*31}
+; RUN: llc < %s -march=x86 | grep "shrl.*31"
define i32 @test1(i32 %X) {
entry:
diff --git a/test/CodeGen/X86/jump_sign.ll b/test/CodeGen/X86/jump_sign.ll
index dbd133c..48e2106 100644
--- a/test/CodeGen/X86/jump_sign.ll
+++ b/test/CodeGen/X86/jump_sign.ll
@@ -22,6 +22,7 @@ declare i32 @bar(...)
declare i32 @baz(...)
; rdar://10633221
+; rdar://11355268
define i32 @g(i32 %a, i32 %b) nounwind {
entry:
; CHECK: g:
@@ -32,3 +33,223 @@ entry:
%cond = select i1 %cmp, i32 %sub, i32 0
ret i32 %cond
}
+
+; rdar://10734411
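+; For the h/i/j/k tests below, the sub already sets the flags the select
+; needs, so no separate cmp should be emitted; the result comes from a cmov.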
+define i32 @h(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK: h:
+; CHECK-NOT: cmp
+; CHECK: cmov
+; CHECK-NOT: movl
+; CHECK: ret
+ %cmp = icmp slt i32 %b, %a
+ %sub = sub nsw i32 %a, %b
+ %cond = select i1 %cmp, i32 %sub, i32 0
+ ret i32 %cond
+}
+define i32 @i(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK: i:
+; CHECK-NOT: cmp
+; CHECK: cmov
+; CHECK-NOT: movl
+; CHECK: ret
+ %cmp = icmp sgt i32 %a, %b
+ %sub = sub nsw i32 %a, %b
+ %cond = select i1 %cmp, i32 %sub, i32 0
+ ret i32 %cond
+}
+define i32 @j(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK: j:
+; CHECK-NOT: cmp
+; CHECK: cmov
+; CHECK-NOT: movl
+; CHECK: ret
+ %cmp = icmp ugt i32 %a, %b
+ %sub = sub i32 %a, %b
+ %cond = select i1 %cmp, i32 %sub, i32 0
+ ret i32 %cond
+}
+define i32 @k(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK: k:
+; CHECK-NOT: cmp
+; CHECK: cmov
+; CHECK-NOT: movl
+; CHECK: ret
+ %cmp = icmp ult i32 %b, %a
+ %sub = sub i32 %a, %b
+ %cond = select i1 %cmp, i32 %sub, i32 0
+ ret i32 %cond
+}
+; redundant cmp instruction
+define i32 @l(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK: l:
+; CHECK-NOT: cmp
+ %cmp = icmp slt i32 %b, %a
+ %sub = sub nsw i32 %a, %b
+ %cond = select i1 %cmp, i32 %sub, i32 %a
+ ret i32 %cond
+}
+define i32 @m(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK: m:
+; CHECK-NOT: cmp
+ %cmp = icmp sgt i32 %a, %b
+ %sub = sub nsw i32 %a, %b
+ %cond = select i1 %cmp, i32 %b, i32 %sub
+ ret i32 %cond
+}
+; If EFLAGS is live-out, we can't remove the cmp when only a sub with
+; swapped operands exists.
+define i32 @l2(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK: l2:
+; CHECK: cmp
+ %cmp = icmp eq i32 %b, %a
+ %sub = sub nsw i32 %a, %b
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ %cmp2 = icmp sgt i32 %b, %a
+ %sel = select i1 %cmp2, i32 %sub, i32 %a
+ ret i32 %sel
+
+if.else:
+ ret i32 %sub
+}
+define i32 @l3(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK: l3:
+; CHECK: sub
+; CHECK-NOT: cmp
+; CHECK: jge
+ %cmp = icmp sgt i32 %b, %a
+ %sub = sub nsw i32 %a, %b
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ ret i32 %sub
+
+if.else:
+ %add = add nsw i32 %sub, 1
+ ret i32 %add
+}
+; rdar://11830760
+; When the Movr0 (a flag-clobbering xor) sits between the sub and the cmp,
+; it must be moved above the sub so the sub's flags survive.
+define i32 @l4(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK: l4:
+; CHECK: xor
+; CHECK: sub
+; CHECK-NOT: cmp
+ %cmp = icmp sgt i32 %b, %a
+ %sub = sub i32 %a, %b
+ %.sub = select i1 %cmp, i32 0, i32 %sub
+ ret i32 %.sub
+}
+; rdar://11540023
+define i32 @n(i32 %x, i32 %y) nounwind {
+entry:
+; CHECK: n:
+; CHECK-NOT: sub
+; CHECK: cmp
+ %sub = sub nsw i32 %x, %y
+ %cmp = icmp slt i32 %sub, 0
+ %y.x = select i1 %cmp, i32 %y, i32 %x
+ ret i32 %y.x
+}
+; PR13046
+define void @o() nounwind uwtable {
+entry:
+ %0 = load i16* undef, align 2
+ br i1 undef, label %if.then.i, label %if.end.i
+
+if.then.i: ; preds = %entry
+ unreachable
+
+if.end.i: ; preds = %entry
+ br i1 undef, label %sw.bb, label %sw.default
+
+sw.bb: ; preds = %if.end.i
+ br i1 undef, label %if.then44, label %if.end29
+
+if.end29: ; preds = %sw.bb
+; CHECK: o:
+; CHECK: cmp
+ %1 = urem i16 %0, 10
+ %cmp25 = icmp eq i16 %1, 0
+ %. = select i1 %cmp25, i16 2, i16 0
+ br i1 %cmp25, label %if.then44, label %sw.default
+
+sw.default: ; preds = %if.end29, %if.end.i
+ br i1 undef, label %if.then.i96, label %if.else.i97
+
+if.then.i96: ; preds = %sw.default
+ unreachable
+
+if.else.i97: ; preds = %sw.default
+ unreachable
+
+if.then44: ; preds = %if.end29, %sw.bb
+ %aModeRefSel.1.ph = phi i16 [ %., %if.end29 ], [ 3, %sw.bb ]
+ br i1 undef, label %if.then.i103, label %if.else.i104
+
+if.then.i103: ; preds = %if.then44
+ unreachable
+
+if.else.i104: ; preds = %if.then44
+ ret void
+}
+; rdar://11855129
+define i32 @p(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK: p:
+; CHECK-NOT: test
+; CHECK: cmovs
+ %add = add nsw i32 %b, %a
+ %cmp = icmp sgt i32 %add, 0
+ %add. = select i1 %cmp, i32 %add, i32 0
+ ret i32 %add.
+}
+; PR13475
+; If we have "sub a, b" and "cmp b, a" and the result of the cmp is used
+; by an sbb, we must not optimize the cmp away.
+define i32 @q(i32 %j.4, i32 %w, i32 %el) {
+; CHECK: q:
+; CHECK: sub
+; CHECK: cmp
+; CHECK-NEXT: sbb
+ %tmp532 = add i32 %j.4, %w
+ %tmp533 = icmp ugt i32 %tmp532, %el
+ %tmp534 = icmp ult i32 %w, %el
+ %or.cond = and i1 %tmp533, %tmp534
+ %tmp535 = sub i32 %el, %w
+ %j.5 = select i1 %or.cond, i32 %tmp535, i32 %j.4
+ ret i32 %j.5
+}
+; rdar://11873276
+define i8* @r(i8* %base, i32* nocapture %offset, i32 %size) nounwind {
+entry:
+; CHECK: r:
+; CHECK: sub
+; CHECK-NOT: cmp
+; CHECK: j
+; CHECK-NOT: sub
+; CHECK: ret
+ %0 = load i32* %offset, align 8
+ %cmp = icmp slt i32 %0, %size
+ br i1 %cmp, label %return, label %if.end
+
+if.end:
+ %sub = sub nsw i32 %0, %size
+ store i32 %sub, i32* %offset, align 8
+ %add.ptr = getelementptr inbounds i8* %base, i32 %sub
+ br label %return
+
+return:
+ %retval.0 = phi i8* [ %add.ptr, %if.end ], [ null, %entry ]
+ ret i8* %retval.0
+}
diff --git a/test/CodeGen/X86/label-redefinition.ll b/test/CodeGen/X86/label-redefinition.ll
index 9ad33e0..9e88a18 100644
--- a/test/CodeGen/X86/label-redefinition.ll
+++ b/test/CodeGen/X86/label-redefinition.ll
@@ -1,5 +1,5 @@
; PR7054
-; RUN: not llc %s -o - |& grep {'_foo' label emitted multiple times to assembly}
+; RUN: not llc %s -o - 2>&1 | grep "'_foo' label emitted multiple times to assembly"
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
target triple = "i386-apple-darwin10.0.0"
diff --git a/test/CodeGen/X86/large-global.ll b/test/CodeGen/X86/large-global.ll
new file mode 100644
index 0000000..7cb974b
--- /dev/null
+++ b/test/CodeGen/X86/large-global.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx | FileCheck %s
+; rdar://11729134
+
+; EmitZerofill was incorrectly expecting a 32-bit "size" so 26214400000
+; was printed as 444596224
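+; (gArray is 1048576 * 25000 = 26214400000 bytes, and 26214400000 mod 2^32
+; is 444596224.)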
+
+%struct.X = type { [25000 x i8] }
+
+@gArray = global [1048576 x %struct.X] zeroinitializer, align 16
+
+; CHECK: .zerofill __DATA,__common,_gArray,26214400000,4
diff --git a/test/CodeGen/X86/lea-2.ll b/test/CodeGen/X86/lea-2.ll
index 6930350..43f69b0 100644
--- a/test/CodeGen/X86/lea-2.ll
+++ b/test/CodeGen/X86/lea-2.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | \
-; RUN: grep {lea EAX, DWORD PTR \\\[... + 4\\*... - 5\\\]}
+; RUN: grep "lea EAX, DWORD PTR \[... + 4\*... - 5\]"
; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | \
; RUN: not grep add
diff --git a/test/CodeGen/X86/liveness-local-regalloc.ll b/test/CodeGen/X86/liveness-local-regalloc.ll
index b469d083..721f545 100644
--- a/test/CodeGen/X86/liveness-local-regalloc.ll
+++ b/test/CodeGen/X86/liveness-local-regalloc.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -O3 -regalloc=fast -mtriple=x86_64-apple-darwin10
+; RUN: llc < %s -regalloc=fast -optimize-regalloc=0 -verify-machineinstrs -mtriple=x86_64-apple-darwin10
; <rdar://problem/7755473>
+; PR12821
%0 = type { i32, i8*, i8*, %1*, i8*, i64, i64, i32, i32, i32, i32, [1024 x i8] }
%1 = type { i8*, i32, i32, i16, i16, %2, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %2, %3*, i32, [3 x i8], [1 x i8], %2, i32, i64 }
@@ -58,3 +59,34 @@ infloop: ; preds = %infloop, %bb3
infloop1: ; preds = %infloop1, %bb5
br label %infloop1
}
+
+
+; RAFast would forget to add a super-register <imp-def> when rewriting:
+; %vreg10:sub_32bit<def,read-undef> = COPY %R9D<kill>
+; This trips up the machine code verifier.
+define void @autogen_SD24657(i8*, i32*, i64*, i32, i64, i8) {
+BB:
+ %A4 = alloca <16 x i16>
+ %A3 = alloca double
+ %A2 = alloca <2 x i8>
+ %A1 = alloca i1
+ %A = alloca i32
+ %L = load i8* %0
+ store i8 -37, i8* %0
+ %E = extractelement <4 x i64> zeroinitializer, i32 2
+ %Shuff = shufflevector <4 x i64> zeroinitializer, <4 x i64> zeroinitializer, <4 x i32> <i32 5, i32 7, i32 1, i32 3>
+ %I = insertelement <2 x i8> <i8 -1, i8 -1>, i8 %5, i32 1
+ %B = fadd float 0x45CDF5B1C0000000, 0x45CDF5B1C0000000
+ %FC = uitofp i32 275048 to double
+ %Sl = select i1 true, <2 x i8> %I, <2 x i8> <i8 -1, i8 -1>
+ %Cmp = icmp slt i64 0, %E
+ br label %CF
+
+CF: ; preds = %BB
+ store i8 %5, i8* %0
+ store <2 x i8> %I, <2 x i8>* %A2
+ store i8 %5, i8* %0
+ store i8 %5, i8* %0
+ store i8 %5, i8* %0
+ ret void
+}
diff --git a/test/CodeGen/X86/loop-blocks.ll b/test/CodeGen/X86/loop-blocks.ll
index d14102f..4bd162b 100644
--- a/test/CodeGen/X86/loop-blocks.ll
+++ b/test/CodeGen/X86/loop-blocks.ll
@@ -41,7 +41,6 @@ done:
; CHECK-NEXT: align
; CHECK-NEXT: .LBB1_4:
; CHECK-NEXT: callq bar99
-; CHECK-NEXT: align
; CHECK-NEXT: .LBB1_1:
; CHECK-NEXT: callq body
@@ -79,7 +78,6 @@ exit:
; CHECK-NEXT: .LBB2_5:
; CHECK-NEXT: callq block_a_true_func
; CHECK-NEXT: callq block_a_merge_func
-; CHECK-NEXT: align
; CHECK-NEXT: .LBB2_1:
; CHECK-NEXT: callq body
;
@@ -139,13 +137,13 @@ exit:
; CHECK-NEXT: align
; CHECK-NEXT: .LBB3_7:
; CHECK-NEXT: callq bar100
-; CHECK-NEXT: align
; CHECK-NEXT: .LBB3_1:
; CHECK-NEXT: callq loop_header
; CHECK: jl .LBB3_7
; CHECK: jge .LBB3_3
; CHECK-NEXT: callq bar101
; CHECK-NEXT: jmp .LBB3_1
+; CHECK-NEXT: align
; CHECK-NEXT: .LBB3_3:
; CHECK: jge .LBB3_4
; CHECK-NEXT: callq bar102
diff --git a/test/CodeGen/X86/lsr-loop-exit-cond.ll b/test/CodeGen/X86/lsr-loop-exit-cond.ll
index ebda9f2..8a81f70 100644
--- a/test/CodeGen/X86/lsr-loop-exit-cond.ll
+++ b/test/CodeGen/X86/lsr-loop-exit-cond.ll
@@ -1,10 +1,16 @@
-; RUN: llc -mtriple=x86_64-darwin < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-darwin -mcpu=generic < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-darwin -mcpu=atom < %s | FileCheck -check-prefix=ATOM %s
; CHECK: t:
; CHECK: decq
-; CHECK-NEXT: movl (
+; CHECK-NEXT: movl (%r9,%rax,4), %eax
; CHECK-NEXT: jne
+; ATOM: t:
+; ATOM: movl (%r9,%rax,4), %eax
+; ATOM-NEXT: decq
+; ATOM-NEXT: jne
+
@Te0 = external global [256 x i32] ; <[256 x i32]*> [#uses=5]
@Te1 = external global [256 x i32] ; <[256 x i32]*> [#uses=4]
@Te3 = external global [256 x i32] ; <[256 x i32]*> [#uses=2]
@@ -149,6 +155,13 @@ bb2: ; preds = %bb
; CHECK: jne
; CHECK: ret
+; ATOM: f:
+; ATOM: %for.body
+; ATOM: incl [[IV:%e..]]
+; ATOM: cmpl $1, [[IV]]
+; ATOM: jne
+; ATOM: ret
+
define i32 @f(i32 %i, i32* nocapture %a) nounwind uwtable readonly ssp {
entry:
%cmp4 = icmp eq i32 %i, 1
diff --git a/test/CodeGen/X86/lsr-reuse-trunc.ll b/test/CodeGen/X86/lsr-reuse-trunc.ll
index 1f87089..276dab7 100644
--- a/test/CodeGen/X86/lsr-reuse-trunc.ll
+++ b/test/CodeGen/X86/lsr-reuse-trunc.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=nehalem | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=nehalem | FileCheck %s
; Full strength reduction wouldn't reduce register pressure, so LSR should
; stick with indexing here.
diff --git a/test/CodeGen/X86/lsr-static-addr.ll b/test/CodeGen/X86/lsr-static-addr.ll
index c9ed3e5..6566f56 100644
--- a/test/CodeGen/X86/lsr-static-addr.ll
+++ b/test/CodeGen/X86/lsr-static-addr.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -relocation-model=static -asm-verbose=false < %s | FileCheck %s
+; RUN: llc -march=x86-64 -mcpu=generic -mtriple=x86_64-unknown-linux-gnu -relocation-model=static -asm-verbose=false < %s | FileCheck %s
+; RUN: llc -march=x86-64 -mcpu=atom -mtriple=x86_64-unknown-linux-gnu -relocation-model=static -asm-verbose=false < %s | FileCheck -check-prefix=ATOM %s
; CHECK: xorl %eax, %eax
; CHECK: movsd .LCPI0_0(%rip), %xmm0
@@ -9,6 +10,15 @@
; CHECK-NEXT: movsd
; CHECK-NEXT: incq %rax
+; ATOM: movsd .LCPI0_0(%rip), %xmm0
+; ATOM: xorl %eax, %eax
+; ATOM: align
+; ATOM-NEXT: BB0_2:
+; ATOM-NEXT: movsd A(,%rax,8)
+; ATOM-NEXT: mulsd
+; ATOM-NEXT: movsd
+; ATOM-NEXT: incq %rax
+
@A = external global [0 x double]
define void @foo(i64 %n) nounwind {
diff --git a/test/CodeGen/X86/machine-cse.ll b/test/CodeGen/X86/machine-cse.ll
index a757cde..d171fd5 100644
--- a/test/CodeGen/X86/machine-cse.ll
+++ b/test/CodeGen/X86/machine-cse.ll
@@ -99,3 +99,60 @@ return: ; preds = %if.end, %entry
%retval.0 = phi i32 [ 1, %entry ], [ %., %if.end ]
ret i32 %retval.0
}
+
+; rdar://11393714
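+; The zeroing xorl should appear once, in the entry block; machine CSE
+; must not leave additional copies of it in the loop blocks.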
+define i8* @bsd_memchr(i8* %s, i32 %a, i32 %c, i64 %n) nounwind ssp {
+; CHECK: %entry
+; CHECK: xorl
+; CHECK: %preheader
+; CHECK: %do.body
+; CHECK-NOT: xorl
+; CHECK: %do.cond
+; CHECK-NOT: xorl
+; CHECK: %return
+entry:
+ %cmp = icmp eq i64 %n, 0
+ br i1 %cmp, label %return, label %preheader
+
+preheader:
+ %conv2 = and i32 %c, 255
+ br label %do.body
+
+do.body:
+ %n.addr.0 = phi i64 [ %dec, %do.cond ], [ %n, %preheader ]
+ %p.0 = phi i8* [ %incdec.ptr, %do.cond ], [ %s, %preheader ]
+ %cmp3 = icmp eq i32 %a, %conv2
+ br i1 %cmp3, label %return, label %do.cond
+
+do.cond:
+ %incdec.ptr = getelementptr inbounds i8* %p.0, i64 1
+ %dec = add i64 %n.addr.0, -1
+ %cmp6 = icmp eq i64 %dec, 0
+ br i1 %cmp6, label %return, label %do.body
+
+return:
+ %retval.0 = phi i8* [ null, %entry ], [ null, %do.cond ], [ %p.0, %do.body ]
+ ret i8* %retval.0
+}
+
+; PR13578
+@t2_global = external global i32
+
+declare i1 @t2_func()
+
+define i32 @t2() {
+ store i32 42, i32* @t2_global
+ %c = call i1 @t2_func()
+ br i1 %c, label %a, label %b
+
+a:
+ %l = load i32* @t2_global
+ ret i32 %l
+
+b:
+ ret i32 0
+
+; CHECK: t2:
+; CHECK: t2_global@GOTPCREL(%rip)
+; CHECK-NOT: t2_global@GOTPCREL(%rip)
+}
diff --git a/test/CodeGen/X86/mem-promote-integers.ll b/test/CodeGen/X86/mem-promote-integers.ll
index 80103d1..0015df0 100644
--- a/test/CodeGen/X86/mem-promote-integers.ll
+++ b/test/CodeGen/X86/mem-promote-integers.ll
@@ -1,8 +1,8 @@
; Test the basic functionality of integer element promotions of different types.
; This tests checks passing of arguments, loading and storing to memory and
; basic arithmetic.
-; RUN: llc -march=x86 -promote-elements < %s
-; RUN: llc -march=x86-64 -promote-elements < %s
+; RUN: llc -march=x86 < %s
+; RUN: llc -march=x86-64 < %s
define <1 x i8> @test_1xi8(<1 x i8> %x, <1 x i8>* %b) {
%bb = load <1 x i8>* %b
diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll
index f4bc1bb..723d1d8 100644
--- a/test/CodeGen/X86/memcmp.ll
+++ b/test/CodeGen/X86/memcmp.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
+; RUN: llc < %s -disable-simplify-libcalls -mtriple=x86_64-linux | FileCheck %s --check-prefix=NOBUILTIN
; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s
; This tests codegen time inlining/optimization of memcmp
@@ -23,6 +24,8 @@ return: ; preds = %entry
; CHECK: memcmp2:
; CHECK: movw ([[A0:%rdi|%rcx]]), %ax
; CHECK: cmpw ([[A1:%rsi|%rdx]]), %ax
+; NOBUILTIN: memcmp2:
+; NOBUILTIN: callq
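+; (With -disable-simplify-libcalls the memcmp is not treated as a builtin,
+; so it must remain a real libcall rather than being expanded inline.)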
}
define void @memcmp2a(i8* %X, i32* nocapture %P) nounwind {
diff --git a/test/CodeGen/X86/mmx-punpckhdq.ll b/test/CodeGen/X86/mmx-punpckhdq.ll
index 689f7bf..206cb33 100644
--- a/test/CodeGen/X86/mmx-punpckhdq.ll
+++ b/test/CodeGen/X86/mmx-punpckhdq.ll
@@ -3,7 +3,7 @@
define void @bork(<1 x i64>* %x) {
; CHECK: bork
-; CHECK: pextrd
+; CHECK: movlpd
entry:
%tmp2 = load <1 x i64>* %x ; <<1 x i64>> [#uses=1]
%tmp6 = bitcast <1 x i64> %tmp2 to <2 x i32> ; <<2 x i32>> [#uses=1]
diff --git a/test/CodeGen/X86/movgs.ll b/test/CodeGen/X86/movgs.ll
index aeb540f..65ee7b1 100644
--- a/test/CodeGen/X86/movgs.ll
+++ b/test/CodeGen/X86/movgs.ll
@@ -55,4 +55,20 @@ entry:
; X64: ret
}
+; The two loads here look identical to the selection DAG, except for their
+; address spaces. Make sure they aren't CSE'd.
+define i32 @test_no_cse() nounwind readonly {
+entry:
+ %tmp = load i32* addrspace(256)* getelementptr (i32* addrspace(256)* inttoptr (i32 72 to i32* addrspace(256)*), i32 31) ; <i32*> [#uses=1]
+ %tmp1 = load i32* %tmp ; <i32> [#uses=1]
+ %tmp2 = load i32* addrspace(257)* getelementptr (i32* addrspace(257)* inttoptr (i32 72 to i32* addrspace(257)*), i32 31) ; <i32*> [#uses=1]
+ %tmp3 = load i32* %tmp2 ; <i32> [#uses=1]
+ %tmp4 = add i32 %tmp1, %tmp3
+ ret i32 %tmp4
+}
+; X32: test_no_cse:
+; X32: movl %gs:196
+; X32: movl %fs:196
+; X32: ret
+
declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
diff --git a/test/CodeGen/X86/multiple-loop-post-inc.ll b/test/CodeGen/X86/multiple-loop-post-inc.ll
index 4f7e28a..9f7d036 100644
--- a/test/CodeGen/X86/multiple-loop-post-inc.ll
+++ b/test/CodeGen/X86/multiple-loop-post-inc.ll
@@ -1,9 +1,9 @@
-; RUN: llc -asm-verbose=false -disable-branch-fold -disable-code-place -disable-tail-duplicate -march=x86-64 < %s | FileCheck %s
+; RUN: llc -asm-verbose=false -disable-branch-fold -disable-code-place -disable-tail-duplicate -march=x86-64 -mcpu=nehalem < %s | FileCheck %s
; rdar://7236213
-
-; Xfailed now that scheduler 2-address hack is disabled a lea is generated.
-; The code isn't any worse though.
-; XFAIL: *
+;
+; The scheduler's 2-address hack has been disabled, so there is currently
+; no strong guarantee that this test will keep passing until the machine
+; scheduler develops an equivalent heuristic.
; CodeGen shouldn't require any lea instructions inside the marked loop.
; It should properly set up post-increment uses and do coalescing for
diff --git a/test/CodeGen/X86/neg_cmp.ll b/test/CodeGen/X86/neg_cmp.ll
new file mode 100644
index 0000000..866514e
--- /dev/null
+++ b/test/CodeGen/X86/neg_cmp.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -march=x86-64 | FileCheck %s
+
+; rdar://11245199
+; PR12545
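+; "x == -y" should be lowered by adding x and y and testing the flags,
+; rather than negating y and comparing.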
+define void @f(i32 %x, i32 %y) nounwind uwtable ssp {
+entry:
+; CHECK: f:
+; CHECK-NOT: neg
+; CHECK: add
+ %sub = sub i32 0, %y
+ %cmp = icmp eq i32 %x, %sub
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ tail call void @g() nounwind
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+declare void @g()
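+
+; A sketch of the transform under test, assuming %x and %y arrive in the SysV
+; argument registers %edi and %esi: instead of materializing -%y and comparing,
+;   negl %esi
+;   cmpl %esi, %edi
+; the backend can test x == -y as x + y == 0 with a single add:
+;   addl %esi, %edi         ; sets ZF exactly when %x + %y == 0
+;   je <if.then>            ; hypothetical branch target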
diff --git a/test/CodeGen/X86/opt-shuff-tstore.ll b/test/CodeGen/X86/opt-shuff-tstore.ll
index fc24913..3e72084 100644
--- a/test/CodeGen/X86/opt-shuff-tstore.ll
+++ b/test/CodeGen/X86/opt-shuff-tstore.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s -promote-elements -mattr=+sse2,+sse41 | FileCheck %s
+; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s -mattr=+sse2,+sse41 | FileCheck %s
; CHECK: func_4_8
; A single memory write
diff --git a/test/CodeGen/X86/overlap-shift.ll b/test/CodeGen/X86/overlap-shift.ll
index d185af1..e987495 100644
--- a/test/CodeGen/X86/overlap-shift.ll
+++ b/test/CodeGen/X86/overlap-shift.ll
@@ -7,7 +7,7 @@
; Check that the shift gets turned into an LEA.
; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | \
-; RUN: not grep {mov E.X, E.X}
+; RUN: not grep "mov E.X, E.X"
@G = external global i32 ; <i32*> [#uses=1]
diff --git a/test/CodeGen/X86/pass-three.ll b/test/CodeGen/X86/pass-three.ll
new file mode 100644
index 0000000..23005c7
--- /dev/null
+++ b/test/CodeGen/X86/pass-three.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin11.3.0"
+
+
+define { i8*, i64, i64* } @copy_3(i8* %a, i64 %b, i64* %c) nounwind {
+entry:
+ %0 = insertvalue { i8*, i64, i64* } undef, i8* %a, 0
+ %1 = insertvalue { i8*, i64, i64* } %0, i64 %b, 1
+ %2 = insertvalue { i8*, i64, i64* } %1, i64* %c, 2
+ ret { i8*, i64, i64* } %2
+}
+
+; CHECK: copy_3:
+; CHECK-NOT: (%rdi)
+; CHECK: ret
diff --git a/test/CodeGen/X86/peep-vector-extract-insert.ll b/test/CodeGen/X86/peep-vector-extract-insert.ll
index d48a331..f958b6b 100644
--- a/test/CodeGen/X86/peep-vector-extract-insert.ll
+++ b/test/CodeGen/X86/peep-vector-extract-insert.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep {xorps %xmm0, %xmm0} | count 2
+; RUN: llc < %s -march=x86-64 | grep "xorps %xmm0, %xmm0" | count 2
define float @foo(<4 x float> %a) {
%b = insertelement <4 x float> %a, float 0.0, i32 3
diff --git a/test/CodeGen/X86/phi-immediate-factoring.ll b/test/CodeGen/X86/phi-immediate-factoring.ll
index ef02af2..476bb10 100644
--- a/test/CodeGen/X86/phi-immediate-factoring.ll
+++ b/test/CodeGen/X86/phi-immediate-factoring.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -stats |& grep {Number of blocks eliminated} | grep 6
+; RUN: llc < %s -march=x86 -stats 2>&1 | grep "Number of blocks eliminated" | grep 6
; PR1296
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
diff --git a/test/CodeGen/X86/phielim-split.ll b/test/CodeGen/X86/phielim-split.ll
new file mode 100644
index 0000000..aa47735
--- /dev/null
+++ b/test/CodeGen/X86/phielim-split.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -verify-machineinstrs | FileCheck %s
+target triple = "x86_64-apple-macosx10.8.0"
+
+; The critical edge from for.cond to if.end2 should be split to avoid injecting
+; copies into the loop. The use of %b after the loop causes interference that
+; makes a copy necessary.
+; <rdar://problem/11561842>
+;
+; CHECK: split_loop_exit
+; CHECK: %for.cond
+; CHECK-NOT: mov
+; CHECK: je
+
+define i32 @split_loop_exit(i32 %a, i32 %b, i8* nocapture %p) nounwind uwtable readonly ssp {
+entry:
+ %cmp = icmp sgt i32 %a, 10
+ br i1 %cmp, label %for.cond, label %if.end2
+
+for.cond: ; preds = %entry, %for.cond
+ %p.addr.0 = phi i8* [ %incdec.ptr, %for.cond ], [ %p, %entry ]
+ %incdec.ptr = getelementptr inbounds i8* %p.addr.0, i64 1
+ %0 = load i8* %p.addr.0, align 1
+ %tobool = icmp eq i8 %0, 0
+ br i1 %tobool, label %for.cond, label %if.end2
+
+if.end2: ; preds = %for.cond, %entry
+ %r.0 = phi i32 [ %a, %entry ], [ %b, %for.cond ]
+ %add = add nsw i32 %r.0, %b
+ ret i32 %add
+}
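+
+; Without the split, the copy needed for %r.0 would have to be placed inside
+; %for.cond and would execute on every iteration; on the split edge block it
+; executes only once, when the loop actually exits to %if.end2.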
diff --git a/test/CodeGen/X86/phys-reg-local-regalloc.ll b/test/CodeGen/X86/phys-reg-local-regalloc.ll
index 8b9ea17..37eca1c 100644
--- a/test/CodeGen/X86/phys-reg-local-regalloc.ll
+++ b/test/CodeGen/X86/phys-reg-local-regalloc.ll
@@ -1,6 +1,7 @@
-; RUN: llc < %s -march=x86 -mtriple=i386-apple-darwin9 -regalloc=fast | FileCheck %s
-; RUN: llc -O0 < %s -march=x86 -mtriple=i386-apple-darwin9 -regalloc=fast | FileCheck %s
-; CHECKed instructions should be the same with or without -O0.
+; RUN: llc < %s -march=x86 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast -optimize-regalloc=0 | FileCheck %s
+; RUN: llc -O0 < %s -march=x86 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast | FileCheck %s
+; RUN: llc < %s -march=x86 -mtriple=i386-apple-darwin9 -mcpu=atom -regalloc=fast -optimize-regalloc=0 | FileCheck -check-prefix=ATOM %s
+; CHECKed instructions should be the same with or without -O0, except on Intel Atom, where instruction scheduling differs.
@.str = private constant [12 x i8] c"x + y = %i\0A\00", align 1 ; <[12 x i8]*> [#uses=1]
@@ -15,6 +16,19 @@ entry:
; CHECK: movl %ebx, 40(%esp)
; CHECK-NOT: movl
; CHECK: addl %ebx, %eax
+
+; On Intel Atom, the scheduler moves a movl instruction used for the printf
+; call so that it follows "movl 24(%esp), %eax".
+; ATOM: movl 24(%esp), %eax
+; ATOM: movl
+; ATOM: movl %eax, 36(%esp)
+; ATOM-NOT: movl
+; ATOM: movl 28(%esp), %ebx
+; ATOM-NOT: movl
+; ATOM: movl %ebx, 40(%esp)
+; ATOM-NOT: movl
+; ATOM: addl %ebx, %eax
+
%retval = alloca i32 ; <i32*> [#uses=2]
%"%ebx" = alloca i32 ; <i32*> [#uses=1]
%"%eax" = alloca i32 ; <i32*> [#uses=2]
diff --git a/test/CodeGen/X86/phys_subreg_coalesce-3.ll b/test/CodeGen/X86/phys_subreg_coalesce-3.ll
index 4162015..984d7e5 100644
--- a/test/CodeGen/X86/phys_subreg_coalesce-3.ll
+++ b/test/CodeGen/X86/phys_subreg_coalesce-3.ll
@@ -1,10 +1,14 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -join-physregs | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s
+; XFAIL: *
; rdar://5571034
; This requires physreg joining, %vreg13 is live everywhere:
; 304L %CL<def> = COPY %vreg13:sub_8bit; GR32_ABCD:%vreg13
; 320L %vreg15<def> = COPY %vreg19; GR32:%vreg15 GR32_NOSP:%vreg19
; 336L %vreg15<def> = SAR32rCL %vreg15, %EFLAGS<imp-def,dead>, %CL<imp-use,kill>; GR32:%vreg15
+;
+; This test is XFAILed until the register allocator understands trivial
+; physreg interference. <rdar://9802098>
define void @foo(i32* nocapture %quadrant, i32* nocapture %ptr, i32 %bbSize, i32 %bbStart, i32 %shifts) nounwind ssp {
; CHECK: foo:
diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll
index d8ed4c0..da4af81 100644
--- a/test/CodeGen/X86/pmul.ll
+++ b/test/CodeGen/X86/pmul.ll
@@ -1,9 +1,7 @@
-; RUN: llc < %s -march=x86 -mattr=sse41 -stack-alignment=16 -join-physregs > %t
+; RUN: llc < %s -march=x86 -mattr=sse41 -mcpu=nehalem -stack-alignment=16 > %t
; RUN: grep pmul %t | count 12
; RUN: grep mov %t | count 11
-; The f() arguments in %xmm0 and %xmm1 cause an extra movdqa without -join-physregs.
-
define <4 x i32> @a(<4 x i32> %i) nounwind {
%A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 >
ret <4 x i32> %A
diff --git a/test/CodeGen/X86/pointer-vector.ll b/test/CodeGen/X86/pointer-vector.ll
index cc1df2f..800fbed 100644
--- a/test/CodeGen/X86/pointer-vector.ll
+++ b/test/CodeGen/X86/pointer-vector.ll
@@ -105,8 +105,7 @@ define <2 x i32*> @BITCAST1(<2 x i8*>* %p) nounwind {
entry:
%G = load <2 x i8*>* %p
;CHECK: movl
-;CHECK: movd
-;CHECK: pinsrd
+;CHECK: movsd
%T = bitcast <2 x i8*> %G to <2 x i32*>
;CHECK: ret
ret <2 x i32*> %T
diff --git a/test/CodeGen/X86/pr11415.ll b/test/CodeGen/X86/pr11415.ll
index e1fa032..6c32a22 100644
--- a/test/CodeGen/X86/pr11415.ll
+++ b/test/CodeGen/X86/pr11415.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=x86_64-pc-linux %s -o - -regalloc=fast | FileCheck %s
+; RUN: llc -mtriple=x86_64-pc-linux %s -o - -regalloc=fast -optimize-regalloc=0 | FileCheck %s
; We used to consider the early clobber in the second asm statement as
; defining %0 before it was read. This caused us to omit the
diff --git a/test/CodeGen/X86/pr11468.ll b/test/CodeGen/X86/pr11468.ll
new file mode 100644
index 0000000..f7e9adb
--- /dev/null
+++ b/test/CodeGen/X86/pr11468.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -force-align-stack -stack-alignment=32 -march=x86-64 -mattr=+avx -mtriple=i686-apple-darwin10 | FileCheck %s
+; PR11468
+
+define void @f(i64 %sz) uwtable {
+entry:
+ %a = alloca i32, align 32
+ store volatile i32 0, i32* %a, align 32
+ ; force r14 to be pushed on the stack
+ call void asm sideeffect "nop", "~{r14},~{dirflag},~{fpsr},~{flags}"() nounwind, !srcloc !0
+ ret void
+
+; CHECK: _f
+; CHECK: pushq %rbp
+; CHECK: .cfi_offset %rbp, -16
+; CHECK: movq %rsp, %rbp
+; CHECK: .cfi_def_cfa_register %rbp
+
+; We first push the register on the stack and then realign the stack, so that
+; the .cfi_offset value is correct.
+; CHECK: pushq %r14
+; CHECK: andq $-32, %rsp
+; CHECK: .cfi_offset %r14, -24
+
+; Restore %rsp from %rbp and subtract the total size of the saved registers.
+; CHECK: leaq -8(%rbp), %rsp
+
+; Pop saved registers.
+; CHECK: popq %r14
+; CHECK: popq %rbp
+}
+
+!0 = metadata !{i32 125}
+
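+; Worked offsets for the checks above: the CFA is %rbp + 16 (return address at
+; CFA-8, saved %rbp at CFA-16), and the pushq %r14 stores at %rbp - 8, i.e.
+; CFA-24, matching ".cfi_offset %r14, -24". The epilogue's
+; "leaq -8(%rbp), %rsp" therefore points %rsp at that single 8-byte save slot
+; before the pops.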
diff --git a/test/CodeGen/X86/pr12889.ll b/test/CodeGen/X86/pr12889.ll
new file mode 100644
index 0000000..331d8f9
--- /dev/null
+++ b/test/CodeGen/X86/pr12889.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -march=x86
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@c0 = common global i8 0, align 1
+
+define void @func() nounwind uwtable {
+entry:
+ %0 = load i8* @c0, align 1, !tbaa !0
+ %tobool = icmp ne i8 %0, 0
+ %conv = zext i1 %tobool to i8
+ %storemerge = shl nuw nsw i8 %conv, %conv
+ store i8 %storemerge, i8* @c0, align 1
+ ret void
+}
+
+!0 = metadata !{metadata !"omnipotent char", metadata !1}
+!1 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/X86/pr13209.ll b/test/CodeGen/X86/pr13209.ll
new file mode 100644
index 0000000..1c93163
--- /dev/null
+++ b/test/CodeGen/X86/pr13209.ll
@@ -0,0 +1,74 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux | FileCheck %s
+
+; CHECK: pr13209:
+; CHECK-NOT: mov
+; CHECK: .size pr13209
+
+define zeroext i1 @pr13209(i8** %x, i8*** %jumpTable) nounwind {
+if.end51:
+ br label %indirectgoto.preheader
+indirectgoto.preheader:
+ %frombool.i5915.ph = phi i8 [ undef, %if.end51 ], [ %frombool.i5917, %jit_return ]
+ br label %indirectgoto
+do.end165:
+ %tmp92 = load i8** %x, align 8
+ br label %indirectgoto
+do.end209:
+ %tmp104 = load i8** %x, align 8
+ br label %indirectgoto
+do.end220:
+ %tmp107 = load i8** %x, align 8
+ br label %indirectgoto
+do.end231:
+ %tmp110 = load i8** %x, align 8
+ br label %indirectgoto
+do.end242:
+ %tmp113 = load i8** %x, align 8
+ br label %indirectgoto
+do.end253:
+ %tmp116 = load i8** %x, align 8
+ br label %indirectgoto
+do.end286:
+ %tmp125 = load i8** %x, align 8
+ br label %indirectgoto
+do.end297:
+ %tmp128 = load i8** %x, align 8
+ br label %indirectgoto
+do.end308:
+ %tmp131 = load i8** %x, align 8
+ br label %indirectgoto
+do.end429:
+ %tmp164 = load i8** %x, align 8
+ br label %indirectgoto
+do.end440:
+ %tmp167 = load i8** %x, align 8
+ br label %indirectgoto
+do.body482:
+ br i1 false, label %indirectgoto, label %do.body495
+do.body495:
+ br label %indirectgoto
+do.end723:
+ br label %inline_return
+inline_return:
+ %frombool.i5917 = phi i8 [ 0, %if.end5571 ], [ %frombool.i5915, %do.end723 ]
+ br label %jit_return
+jit_return:
+ br label %indirectgoto.preheader
+L_JSOP_UINT24:
+ %tmp864 = load i8** %x, align 8
+ br label %indirectgoto
+L_JSOP_THROWING:
+ %tmp1201 = load i8** %x, align 8
+ br label %indirectgoto
+do.body4936:
+ %tmp1240 = load i8** %x, align 8
+ br label %indirectgoto
+do.body5184:
+ %tmp1340 = load i8** %x, align 8
+ br label %indirectgoto
+if.end5571:
+ br label %inline_return
+indirectgoto:
+ %frombool.i5915 = phi i8 [ 0, %do.body495 ],[ 0, %do.body482 ] , [ %frombool.i5915, %do.body4936 ],[ %frombool.i5915, %do.body5184 ], [ %frombool.i5915, %L_JSOP_UINT24 ], [ %frombool.i5915, %do.end286 ], [ %frombool.i5915, %do.end297 ], [ %frombool.i5915, %do.end308 ], [ %frombool.i5915, %do.end429 ], [ %frombool.i5915, %do.end440 ], [ %frombool.i5915, %L_JSOP_THROWING ], [ %frombool.i5915, %do.end253 ], [ %frombool.i5915, %do.end242 ], [ %frombool.i5915, %do.end231 ], [ %frombool.i5915, %do.end220 ], [ %frombool.i5915, %do.end209 ],[ %frombool.i5915, %do.end165 ], [ %frombool.i5915.ph, %indirectgoto.preheader ]
+ indirectbr i8* null, [ label %if.end5571, label %do.end165, label %do.end209, label %do.end220, label %do.end231, label %do.end242, label %do.end253, label %do.end723, label %L_JSOP_THROWING, label %do.end440, label %do.end429, label %do.end308, label %do.end297, label %do.end286, label %L_JSOP_UINT24, label %do.body5184, label %do.body4936, label %do.body482]
+}
diff --git a/test/CodeGen/X86/pr13220.ll b/test/CodeGen/X86/pr13220.ll
new file mode 100644
index 0000000..b9ac4b6
--- /dev/null
+++ b/test/CodeGen/X86/pr13220.ll
@@ -0,0 +1,20 @@
+; RUN: llc -march=x86 < %s
+; PR13220
+
+define <8 x i32> @foo(<8 x i96> %x) {
+ %a = lshr <8 x i96> %x, <i96 1, i96 1, i96 1, i96 1, i96 1, i96 1, i96 1, i96 1>
+ %b = trunc <8 x i96> %a to <8 x i32>
+ ret <8 x i32> %b
+}
+
+define <8 x i32> @bar(<8 x i97> %x) {
+ %a = lshr <8 x i97> %x, <i97 1, i97 1, i97 1, i97 1, i97 1, i97 1, i97 1, i97 1>
+ %b = trunc <8 x i97> %a to <8 x i32>
+ ret <8 x i32> %b
+}
+
+define <8 x i32> @bax() {
+ %a = lshr <8 x i96> <i96 4, i96 4, i96 4, i96 4, i96 4, i96 4, i96 4, i96 4>, <i96 1, i96 1, i96 1, i96 1, i96 1, i96 1, i96 1, i96 1>
+ %b = trunc <8 x i96> %a to <8 x i32>
+ ret <8 x i32> %b
+}
diff --git a/test/CodeGen/X86/pr13577.ll b/test/CodeGen/X86/pr13577.ll
new file mode 100644
index 0000000..faaec26
--- /dev/null
+++ b/test/CodeGen/X86/pr13577.ll
@@ -0,0 +1,8 @@
+; RUN: llc < %s -march=x86-64
+
+define x86_fp80 @foo(x86_fp80 %a) {
+ %1 = tail call x86_fp80 @copysignl(x86_fp80 0xK7FFF8000000000000000, x86_fp80 %a) nounwind readnone
+ ret x86_fp80 %1
+}
+
+declare x86_fp80 @copysignl(x86_fp80, x86_fp80) nounwind readnone
diff --git a/test/CodeGen/X86/pr2656.ll b/test/CodeGen/X86/pr2656.ll
index afd7114..f0e31f7 100644
--- a/test/CodeGen/X86/pr2656.ll
+++ b/test/CodeGen/X86/pr2656.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep {xorps.\*sp} | count 1
+; RUN: llc < %s -march=x86 -mattr=+sse2 | grep "xorps.*sp" | count 1
; PR2656
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
diff --git a/test/CodeGen/X86/pr3522.ll b/test/CodeGen/X86/pr3522.ll
index 1122530..d8f3778 100644
--- a/test/CodeGen/X86/pr3522.ll
+++ b/test/CodeGen/X86/pr3522.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -stats |& not grep {instructions sunk}
+; RUN: llc < %s -march=x86 -stats 2>&1 | not grep "instructions sunk"
; PR3522
target triple = "i386-pc-linux-gnu"
diff --git a/test/CodeGen/X86/promote-trunc.ll b/test/CodeGen/X86/promote-trunc.ll
index 4211d82..40a58b0 100644
--- a/test/CodeGen/X86/promote-trunc.ll
+++ b/test/CodeGen/X86/promote-trunc.ll
@@ -1,4 +1,4 @@
-; RUN: llc -promote-elements < %s -march=x86-64
+; RUN: llc < %s -march=x86-64
define<4 x i8> @func_8_64() {
%F = load <4 x i64>* undef
diff --git a/test/CodeGen/X86/rd-mod-wr-eflags.ll b/test/CodeGen/X86/rd-mod-wr-eflags.ll
index faca3d7..8ef9b5d 100644
--- a/test/CodeGen/X86/rd-mod-wr-eflags.ll
+++ b/test/CodeGen/X86/rd-mod-wr-eflags.ll
@@ -177,3 +177,49 @@ if.end4:
return:
ret void
}
+
+; Deal with TokenFactor chain
+; rdar://11236106
+@foo = external global i64*, align 8
+
+define void @test3() nounwind ssp {
+entry:
+; CHECK: test3:
+; CHECK: decq 16(%rax)
+ %0 = load i64** @foo, align 8
+ %arrayidx = getelementptr inbounds i64* %0, i64 2
+ %1 = load i64* %arrayidx, align 8
+ %dec = add i64 %1, -1
+ store i64 %dec, i64* %arrayidx, align 8
+ %cmp = icmp eq i64 %dec, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ tail call void @baz() nounwind
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+declare void @baz()
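+
+; The fold being checked in test3, sketched in asm: the IR sequence
+;   load; add -1; store; icmp eq 0; br
+; collapses into one read-modify-write instruction whose flags feed the branch:
+;   decq 16(%rax)           ; decrement in memory, setting ZF
+;   jne .LBB_if_end         ; hypothetical label name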
+
+; Avoid creating a cycle in the DAG which would trigger an assert in the
+; scheduler.
+; PR12565
+; rdar://11451474
+@x = external global i32, align 4
+@y = external global i32, align 4
+@z = external global i32, align 4
+
+define void @test4() nounwind uwtable ssp {
+entry:
+ %0 = load i32* @x, align 4
+ %1 = load i32* @y, align 4
+ %dec = add nsw i32 %1, -1
+ store i32 %dec, i32* @y, align 4
+ %tobool.i = icmp ne i32 %dec, 0
+ %cond.i = select i1 %tobool.i, i32 %0, i32 0
+ store i32 %cond.i, i32* @z, align 4
+ ret void
+}
diff --git a/test/CodeGen/X86/rdrand.ll b/test/CodeGen/X86/rdrand.ll
new file mode 100644
index 0000000..e2224a6
--- /dev/null
+++ b/test/CodeGen/X86/rdrand.ll
@@ -0,0 +1,85 @@
+; RUN: llc < %s -march=x86-64 -mcpu=core-avx-i -mattr=+rdrand | FileCheck %s
+declare {i16, i32} @llvm.x86.rdrand.16()
+declare {i32, i32} @llvm.x86.rdrand.32()
+declare {i64, i32} @llvm.x86.rdrand.64()
+
+define i32 @_rdrand16_step(i16* %random_val) {
+ %call = call {i16, i32} @llvm.x86.rdrand.16()
+ %randval = extractvalue {i16, i32} %call, 0
+ store i16 %randval, i16* %random_val
+ %isvalid = extractvalue {i16, i32} %call, 1
+ ret i32 %isvalid
+; CHECK: _rdrand16_step:
+; CHECK: rdrandw %ax
+; CHECK: movw %ax, (%r[[A0:di|cx]])
+; CHECK: movzwl %ax, %ecx
+; CHECK: movl $1, %eax
+; CHECK: cmovael %ecx, %eax
+; CHECK: ret
+}
+
+define i32 @_rdrand32_step(i32* %random_val) {
+ %call = call {i32, i32} @llvm.x86.rdrand.32()
+ %randval = extractvalue {i32, i32} %call, 0
+ store i32 %randval, i32* %random_val
+ %isvalid = extractvalue {i32, i32} %call, 1
+ ret i32 %isvalid
+; CHECK: _rdrand32_step:
+; CHECK: rdrandl %e[[T0:[a-z]+]]
+; CHECK: movl %e[[T0]], (%r[[A0]])
+; CHECK: movl $1, %eax
+; CHECK: cmovael %e[[T0]], %eax
+; CHECK: ret
+}
+
+define i32 @_rdrand64_step(i64* %random_val) {
+ %call = call {i64, i32} @llvm.x86.rdrand.64()
+ %randval = extractvalue {i64, i32} %call, 0
+ store i64 %randval, i64* %random_val
+ %isvalid = extractvalue {i64, i32} %call, 1
+ ret i32 %isvalid
+; CHECK: _rdrand64_step:
+; CHECK: rdrandq %r[[T1:[a-z]+]]
+; CHECK: movq %r[[T1]], (%r[[A0]])
+; CHECK: movl $1, %eax
+; CHECK: cmovael %e[[T1]], %eax
+; CHECK: ret
+}
+
+; Check that MachineCSE doesn't eliminate duplicate rdrand instructions.
+define i32 @CSE() nounwind {
+ %rand1 = tail call { i32, i32 } @llvm.x86.rdrand.32() nounwind
+ %v1 = extractvalue { i32, i32 } %rand1, 0
+ %rand2 = tail call { i32, i32 } @llvm.x86.rdrand.32() nounwind
+ %v2 = extractvalue { i32, i32 } %rand2, 0
+ %add = add i32 %v2, %v1
+ ret i32 %add
+; CHECK: CSE:
+; CHECK: rdrandl
+; CHECK: rdrandl
+}
+
+; Check that MachineLICM doesn't hoist rdrand instructions.
+define void @loop(i32* %p, i32 %n) nounwind {
+entry:
+ %tobool1 = icmp eq i32 %n, 0
+ br i1 %tobool1, label %while.end, label %while.body
+
+while.body: ; preds = %entry, %while.body
+ %p.addr.03 = phi i32* [ %incdec.ptr, %while.body ], [ %p, %entry ]
+ %n.addr.02 = phi i32 [ %dec, %while.body ], [ %n, %entry ]
+ %dec = add nsw i32 %n.addr.02, -1
+ %incdec.ptr = getelementptr inbounds i32* %p.addr.03, i64 1
+ %rand = tail call { i32, i32 } @llvm.x86.rdrand.32() nounwind
+ %v1 = extractvalue { i32, i32 } %rand, 0
+ store i32 %v1, i32* %p.addr.03, align 4
+ %tobool = icmp eq i32 %dec, 0
+ br i1 %tobool, label %while.end, label %while.body
+
+while.end: ; preds = %while.body, %entry
+ ret void
+; CHECK: loop:
+; CHECK-NOT: rdrandl
+; CHECK: This Inner Loop Header: Depth=1
+; CHECK: rdrandl
+}
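+
+; Each rdrand execution returns a fresh value together with a carry-flag
+; validity bit, so the calls above must be treated as having side effects:
+; CSE'ing two of them or hoisting one out of the loop would silently reuse a
+; single random number.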
diff --git a/test/CodeGen/X86/regpressure.ll b/test/CodeGen/X86/regpressure.ll
index e0b5f7a..52d7b56 100644
--- a/test/CodeGen/X86/regpressure.ll
+++ b/test/CodeGen/X86/regpressure.ll
@@ -1,8 +1,8 @@
;; Both functions in this testcase should codegen to the same function, and
;; neither of them should require spilling anything to the stack.
-; RUN: llc < %s -march=x86 -stats |& \
-; RUN: not grep {Number of register spills}
+; RUN: llc < %s -march=x86 -stats 2>&1 | \
+; RUN: not grep "Number of register spills"
;; This can be compiled to use three registers if the loads are not
;; folded into the multiplies, 2 registers otherwise.
diff --git a/test/CodeGen/X86/remat-fold-load.ll b/test/CodeGen/X86/remat-fold-load.ll
new file mode 100644
index 0000000..de77ad3
--- /dev/null
+++ b/test/CodeGen/X86/remat-fold-load.ll
@@ -0,0 +1,143 @@
+; RUN: llc < %s -disable-fp-elim -verify-coalescing
+; PR13414
+;
+; During coalescing, remat triggers DCE, which deletes the penultimate use of a
+; load. This load should not be folded into the remaining use because it is not
+; safe to move, and folding it would extend the live range of the address.
+;
+; LiveRangeEdit::foldAsLoad() doesn't extend live ranges, so -verify-coalescing
+; catches the problem.
+
+target triple = "i386-unknown-linux-gnu"
+
+%type_a = type { %type_a*, %type_b }
+%type_b = type { %type_c, i32 }
+%type_c = type { i32, %type_d }
+%type_d = type { i64 }
+%type_e = type { %type_c, i64 }
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+
+define linkonce_odr void @test() nounwind {
+entry:
+ br i1 undef, label %while.end.while.end26_crit_edge, label %while.body12.lr.ph
+
+while.end.while.end26_crit_edge: ; preds = %entry
+ br label %while.end26
+
+while.body12.lr.ph: ; preds = %entry
+ br label %while.body12
+
+while.body12: ; preds = %if.end24, %while.body12.lr.ph
+ %tmp = phi %type_a* [ undef, %while.body12.lr.ph ], [ %tmp18, %if.end24 ]
+ %ins151154161 = phi i128 [ 0, %while.body12.lr.ph ], [ %phitmp, %if.end24 ]
+ %ins135156160 = phi i128 [ 0, %while.body12.lr.ph ], [ %phitmp158, %if.end24 ]
+ %ins151 = or i128 0, %ins151154161
+ %cmp.i.i.i.i.i67 = icmp sgt i32 undef, 8
+ br i1 %cmp.i.i.i.i.i67, label %if.then.i.i.i.i71, label %if.else.i.i.i.i74
+
+if.then.i.i.i.i71: ; preds = %while.body12
+ %call4.i.i.i.i68 = call noalias i8* @malloc(i32 undef) nounwind
+ %tmp1 = getelementptr inbounds %type_a* %tmp, i32 0, i32 1, i32 0, i32 1
+ %buf_6.i.i.i.i70 = bitcast %type_d* %tmp1 to i8**
+ %tmp2 = load i8** %buf_6.i.i.i.i70, align 4
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* undef, i8* %tmp2, i32 undef, i32 1, i1 false) nounwind
+ unreachable
+
+if.else.i.i.i.i74: ; preds = %while.body12
+ %i_.i.i.i.i72 = getelementptr inbounds %type_a* %tmp, i32 0, i32 1, i32 0, i32 1, i32 0
+ %tmp3 = load i64* %i_.i.i.i.i72, align 4
+ %tmp4 = zext i64 %tmp3 to i128
+ %tmp5 = shl nuw nsw i128 %tmp4, 32
+ %ins148 = or i128 %tmp5, %ins151
+ %second3.i.i76 = getelementptr inbounds %type_a* %tmp, i32 0, i32 1, i32 1
+ %tmp6 = load i32* %second3.i.i76, align 4
+ %tmp7 = zext i32 %tmp6 to i128
+ %tmp8 = shl nuw i128 %tmp7, 96
+ %mask144 = and i128 %ins148, 79228162495817593519834398720
+ %tmp9 = load %type_e** undef, align 4
+ %len_.i.i.i.i86 = getelementptr inbounds %type_e* %tmp9, i32 0, i32 0, i32 0
+ %tmp10 = load i32* %len_.i.i.i.i86, align 4
+ %tmp11 = zext i32 %tmp10 to i128
+ %ins135 = or i128 %tmp11, %ins135156160
+ %cmp.i.i.i.i.i88 = icmp sgt i32 %tmp10, 8
+ br i1 %cmp.i.i.i.i.i88, label %if.then.i.i.i.i92, label %if.else.i.i.i.i95
+
+if.then.i.i.i.i92: ; preds = %if.else.i.i.i.i74
+ %call4.i.i.i.i89 = call noalias i8* @malloc(i32 %tmp10) nounwind
+ %ins126 = or i128 0, %ins135
+ %tmp12 = getelementptr inbounds %type_e* %tmp9, i32 0, i32 0, i32 1
+ %buf_6.i.i.i.i91 = bitcast %type_d* %tmp12 to i8**
+ %tmp13 = load i8** %buf_6.i.i.i.i91, align 4
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %call4.i.i.i.i89, i8* %tmp13, i32 %tmp10, i32 1, i1 false) nounwind
+ br label %A
+
+if.else.i.i.i.i95: ; preds = %if.else.i.i.i.i74
+ %i_.i.i.i.i93 = getelementptr inbounds %type_e* %tmp9, i32 0, i32 0, i32 1, i32 0
+ br label %A
+
+A: ; preds = %if.else.i.i.i.i95, %if.then.i.i.i.i92
+ %ins135157 = phi i128 [ %ins126, %if.then.i.i.i.i92 ], [ undef, %if.else.i.i.i.i95 ]
+ %second3.i.i97 = getelementptr inbounds %type_e* %tmp9, i32 0, i32 1
+ %tmp14 = load i64* %second3.i.i97, align 4
+ %tmp15 = trunc i64 %tmp14 to i32
+ %cmp.i99 = icmp sgt i32 %tmp6, %tmp15
+ %tmp16 = trunc i128 %ins135157 to i32
+ %cmp.i.i.i.i.i.i101 = icmp sgt i32 %tmp16, 8
+ br i1 %cmp.i.i.i.i.i.i101, label %if.then.i.i.i.i.i103, label %B
+
+if.then.i.i.i.i.i103: ; preds = %A
+ unreachable
+
+B: ; preds = %A
+ %tmp17 = trunc i128 %ins148 to i32
+ %cmp.i.i.i.i.i.i83 = icmp sgt i32 %tmp17, 8
+ br i1 %cmp.i.i.i.i.i.i83, label %if.then.i.i.i.i.i85, label %C
+
+if.then.i.i.i.i.i85: ; preds = %B
+ unreachable
+
+C: ; preds = %B
+ br i1 %cmp.i99, label %if.then17, label %if.end24
+
+if.then17: ; preds = %C
+ br i1 false, label %if.then.i.i.i.i.i43, label %D
+
+if.then.i.i.i.i.i43: ; preds = %if.then17
+ unreachable
+
+D: ; preds = %if.then17
+ br i1 undef, label %if.then.i.i.i.i.i, label %E
+
+if.then.i.i.i.i.i: ; preds = %D
+ unreachable
+
+E: ; preds = %D
+ br label %if.end24
+
+if.end24: ; preds = %E, %C
+ %phitmp = or i128 %tmp8, %mask144
+ %phitmp158 = or i128 undef, undef
+ %tmp18 = load %type_a** undef, align 4
+ %tmp19 = load %type_a** undef, align 4
+ %cmp.i49 = icmp eq %type_a* %tmp18, %tmp19
+ br i1 %cmp.i49, label %while.cond10.while.end26_crit_edge, label %while.body12
+
+while.cond10.while.end26_crit_edge: ; preds = %if.end24
+ %.pre = load %type_e** undef, align 4
+ br label %while.end26
+
+while.end26: ; preds = %while.cond10.while.end26_crit_edge, %while.end.while.end26_crit_edge
+ br i1 undef, label %while.body.lr.ph.i, label %F
+
+while.body.lr.ph.i: ; preds = %while.end26
+ br label %while.body.i
+
+while.body.i: ; preds = %while.body.i, %while.body.lr.ph.i
+ br i1 false, label %while.body.i, label %F
+
+F: ; preds = %while.body.i, %while.end26
+ ret void
+}
+
+declare noalias i8* @malloc(i32) nounwind
diff --git a/test/CodeGen/X86/remat-scalar-zero.ll b/test/CodeGen/X86/remat-scalar-zero.ll
index 75f438d..f6095a7 100644
--- a/test/CodeGen/X86/remat-scalar-zero.ll
+++ b/test/CodeGen/X86/remat-scalar-zero.ll
@@ -3,7 +3,7 @@
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu > %t
; RUN: not grep xor %t
; RUN: not grep movap %t
-; RUN: grep {\\.quad.*0} %t
+; RUN: grep "\.quad.*0" %t
; Remat should be able to fold the zero constant into the div instructions
; as a constant-pool load.
diff --git a/test/CodeGen/X86/reverse_branches.ll b/test/CodeGen/X86/reverse_branches.ll
new file mode 100644
index 0000000..9772125
--- /dev/null
+++ b/test/CodeGen/X86/reverse_branches.ll
@@ -0,0 +1,104 @@
+; RUN: llc -mtriple=x86_64-apple-darwin < %s | FileCheck %s
+
+@.str2 = private unnamed_addr constant [7 x i8] c"memchr\00", align 1
+@.str3 = private unnamed_addr constant [11 x i8] c"bsd_memchr\00", align 1
+@str4 = private unnamed_addr constant [5 x i8] c"Bug!\00"
+
+; Make sure that at the end of do.cond.i we branch back to do.body.i, so the
+; inner loop stays tight.
+define i32 @test_branches_order() uwtable ssp {
+; CHECK: test_branches_order:
+; CHECK: [[L0:LBB0_[0-9]+]]: ## %do.body.i
+; CHECK: je
+; CHECK: %do.cond.i
+; CHECK: jne [[L0]]
+; CHECK: jmp
+; CHECK: %exit
+entry:
+ %strs = alloca [1000 x [1001 x i8]], align 16
+ br label %for.cond
+
+for.cond:
+ %j.0 = phi i32 [ 0, %entry ], [ %inc10, %for.inc9 ]
+ %cmp = icmp slt i32 %j.0, 1000
+ br i1 %cmp, label %for.cond1, label %for.end11
+
+for.cond1:
+ %indvars.iv50 = phi i64 [ %indvars.iv.next51, %for.body3 ], [ 0, %for.cond ]
+ %0 = trunc i64 %indvars.iv50 to i32
+ %cmp2 = icmp slt i32 %0, 1000
+ br i1 %cmp2, label %for.body3, label %for.inc9
+
+for.body3:
+ %arraydecay = getelementptr inbounds [1000 x [1001 x i8]]* %strs, i64 0, i64 %indvars.iv50, i64 0
+ %call = call i8* @memchr(i8* %arraydecay, i32 120, i64 1000)
+ %add.ptr = getelementptr inbounds [1000 x [1001 x i8]]* %strs, i64 0, i64 %indvars.iv50, i64 %indvars.iv50
+ %cmp7 = icmp eq i8* %call, %add.ptr
+ %indvars.iv.next51 = add i64 %indvars.iv50, 1
+ br i1 %cmp7, label %for.cond1, label %if.then
+
+if.then:
+ %puts = call i32 @puts(i8* getelementptr inbounds ([5 x i8]* @str4, i64 0, i64 0))
+ call void @exit(i32 1) noreturn
+ unreachable
+
+for.inc9:
+ %inc10 = add nsw i32 %j.0, 1
+ br label %for.cond
+
+for.end11:
+ %puts42 = call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @.str2, i64 0, i64 0))
+ br label %for.cond14
+
+for.cond14:
+ %j13.0 = phi i32 [ 0, %for.end11 ], [ %inc39, %for.inc38 ]
+ %cmp15 = icmp slt i32 %j13.0, 1000
+ br i1 %cmp15, label %for.cond18, label %for.end40
+
+for.cond18:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %exit ], [ 0, %for.cond14 ]
+ %1 = trunc i64 %indvars.iv to i32
+ %cmp19 = icmp slt i32 %1, 1000
+ br i1 %cmp19, label %for.body20, label %for.inc38
+
+for.body20:
+ %arraydecay24 = getelementptr inbounds [1000 x [1001 x i8]]* %strs, i64 0, i64 %indvars.iv, i64 0
+ br label %do.body.i
+
+do.body.i:
+ %n.addr.0.i = phi i64 [ %dec.i, %do.cond.i ], [ 1000, %for.body20 ]
+ %p.0.i = phi i8* [ %incdec.ptr.i, %do.cond.i ], [ %arraydecay24, %for.body20 ]
+ %2 = load i8* %p.0.i, align 1
+ %cmp3.i = icmp eq i8 %2, 120
+ br i1 %cmp3.i, label %exit, label %do.cond.i
+
+do.cond.i:
+ %incdec.ptr.i = getelementptr inbounds i8* %p.0.i, i64 1
+ %dec.i = add i64 %n.addr.0.i, -1
+ %cmp5.i = icmp eq i64 %dec.i, 0
+ br i1 %cmp5.i, label %if.then32, label %do.body.i
+
+exit:
+ %add.ptr30 = getelementptr inbounds [1000 x [1001 x i8]]* %strs, i64 0, i64 %indvars.iv, i64 %indvars.iv
+ %cmp31 = icmp eq i8* %p.0.i, %add.ptr30
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ br i1 %cmp31, label %for.cond18, label %if.then32
+
+if.then32:
+ %puts43 = call i32 @puts(i8* getelementptr inbounds ([5 x i8]* @str4, i64 0, i64 0))
+ call void @exit(i32 1) noreturn
+ unreachable
+
+for.inc38:
+ %inc39 = add nsw i32 %j13.0, 1
+ br label %for.cond14
+
+for.end40:
+ %puts44 = call i32 @puts(i8* getelementptr inbounds ([11 x i8]* @.str3, i64 0, i64 0))
+ ret i32 0
+}
+
+declare i8* @memchr(i8*, i32, i64) nounwind readonly
+declare void @exit(i32) noreturn
+declare i32 @puts(i8* nocapture) nounwind
+
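+; Roughly, the wanted layout keeps the hot path fall-through:
+;   LBB0_N:                  ## %do.body.i
+;     ...
+;     je <exit>
+;                            ## %do.cond.i
+;     ...
+;     jne LBB0_N             ; one taken branch per iteration
+; instead of ending the loop with an unconditional jmp back to its header.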
diff --git a/test/CodeGen/X86/rotate.ll b/test/CodeGen/X86/rotate.ll
index 1e20273..1173001 100644
--- a/test/CodeGen/X86/rotate.ll
+++ b/test/CodeGen/X86/rotate.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | \
-; RUN: grep {ro\[rl\]} | count 12
+; RUN: grep "ro[rl]" | count 12
define i32 @rotl32(i32 %A, i8 %Amt) {
%shift.upgrd.1 = zext i8 %Amt to i32 ; <i32> [#uses=1]
diff --git a/test/CodeGen/X86/rounding-ops.ll b/test/CodeGen/X86/rounding-ops.ll
index 0dd74ea..51fcf64 100644
--- a/test/CodeGen/X86/rounding-ops.ll
+++ b/test/CodeGen/X86/rounding-ops.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse41 | FileCheck -check-prefix=CHECK-SSE %s
-; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck -check-prefix=CHECK-AVX %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse41 | FileCheck -check-prefix=CHECK-SSE %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+avx | FileCheck -check-prefix=CHECK-AVX %s
define float @test1(float %x) nounwind {
%call = tail call float @floorf(float %x) nounwind readnone
diff --git a/test/CodeGen/X86/segmented-stacks-dynamic.ll b/test/CodeGen/X86/segmented-stacks-dynamic.ll
index 5ce08aa..d68b00b 100644
--- a/test/CodeGen/X86/segmented-stacks-dynamic.ll
+++ b/test/CodeGen/X86/segmented-stacks-dynamic.ll
@@ -51,14 +51,14 @@ false:
; X64-NEXT: callq __morestack
; X64-NEXT: ret
-; X64: movq %rsp, %rdi
-; X64-NEXT: subq %rax, %rdi
-; X64-NEXT: cmpq %rdi, %fs:112
+; X64: movq %rsp, %[[RDI:rdi|rax]]
+; X64-NEXT: subq %{{.*}}, %[[RDI]]
+; X64-NEXT: cmpq %[[RDI]], %fs:112
-; X64: movq %rdi, %rsp
+; X64: movq %[[RDI]], %rsp
-; X64: movq %rax, %rdi
+; X64: movq %{{.*}}, %rdi
; X64-NEXT: callq __morestack_allocate_stack_space
-; X64-NEXT: movq %rax, %rdi
+; X64: movq %rax, %rdi
}
diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll
index f465a4f..2e39473 100644
--- a/test/CodeGen/X86/select.ll
+++ b/test/CodeGen/X86/select.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mcpu=generic | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mcpu=atom | FileCheck -check-prefix=ATOM %s
; PR5757
%0 = type { i64, i32 }
@@ -12,6 +13,10 @@ define i32 @test1(%0* %p, %0* %q, i1 %r) nounwind {
; CHECK: test1:
; CHECK: cmovneq %rdi, %rsi
; CHECK: movl (%rsi), %eax
+
+; ATOM: test1:
+; ATOM: cmovneq %rdi, %rsi
+; ATOM: movl (%rsi), %eax
}
@@ -31,6 +36,10 @@ bb91: ; preds = %bb84
; CHECK: test2:
; CHECK: movnew
; CHECK: movswl
+
+; ATOM: test2:
+; ATOM: movnew
+; ATOM: movswl
}
declare i1 @return_false()
@@ -44,6 +53,9 @@ entry:
ret float %iftmp.0.0
; CHECK: test3:
; CHECK: movss {{.*}},4), %xmm0
+
+; ATOM: test3:
+; ATOM: movss {{.*}},4), %xmm0
}
define signext i8 @test4(i8* nocapture %P, double %F) nounwind readonly {
@@ -55,6 +67,9 @@ entry:
ret i8 %2
; CHECK: test4:
; CHECK: movsbl ({{.*}},4), %eax
+
+; ATOM: test4:
+; ATOM: movsbl ({{.*}},4), %eax
}
define void @test5(i1 %c, <2 x i16> %a, <2 x i16> %b, <2 x i16>* %p) nounwind {
@@ -62,6 +77,8 @@ define void @test5(i1 %c, <2 x i16> %a, <2 x i16> %b, <2 x i16>* %p) nounwind {
store <2 x i16> %x, <2 x i16>* %p
ret void
; CHECK: test5:
+
+; ATOM: test5:
}
define void @test6(i32 %C, <4 x float>* %A, <4 x float>* %B) nounwind {
@@ -79,6 +96,12 @@ define void @test6(i32 %C, <4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK: ret
; CHECK: mulps
; CHECK: ret
+
+; ATOM: test6:
+; ATOM: je
+; ATOM: ret
+; ATOM: mulps
+; ATOM: ret
}
; Select with fp80's
@@ -89,6 +112,10 @@ define x86_fp80 @test7(i32 %tmp8) nounwind {
; CHECK: test7:
; CHECK: leaq
; CHECK: fldt (%r{{.}}x,%r{{.}}x)
+
+; ATOM: test7:
+; ATOM: leaq
+; ATOM: fldt (%r{{.}}x,%r{{.}}x)
}
; widening select v6i32 and then a sub
@@ -97,8 +124,10 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2)
%val = sub <6 x i32> %x, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
store <6 x i32> %val, <6 x i32>* %dst.addr
ret void
-
+
; CHECK: test8:
+
+; ATOM: test8:
}
@@ -113,6 +142,12 @@ define i64 @test9(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; CHECK: sbbq %rax, %rax
; CHECK: orq %rsi, %rax
; CHECK: ret
+
+; ATOM: test9:
+; ATOM: cmpq $1, %rdi
+; ATOM: sbbq %rax, %rax
+; ATOM: orq %rsi, %rax
+; ATOM: ret
}
;; Same as test9
@@ -125,6 +160,12 @@ define i64 @test9a(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; CHECK: sbbq %rax, %rax
; CHECK: orq %rsi, %rax
; CHECK: ret
+
+; ATOM: test9a:
+; ATOM: cmpq $1, %rdi
+; ATOM: sbbq %rax, %rax
+; ATOM: orq %rsi, %rax
+; ATOM: ret
}
define i64 @test9b(i64 %x, i64 %y) nounwind readnone ssp noredzone {
@@ -137,6 +178,12 @@ define i64 @test9b(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; CHECK: sbbq %rax, %rax
; CHECK: orq %rsi, %rax
; CHECK: ret
+
+; ATOM: test9b:
+; ATOM: cmpq $1, %rdi
+; ATOM: sbbq %rax, %rax
+; ATOM: orq %rsi, %rax
+; ATOM: ret
}
;; Select between -1 and 1.
@@ -149,6 +196,12 @@ define i64 @test10(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; CHECK: sbbq %rax, %rax
; CHECK: orq $1, %rax
; CHECK: ret
+
+; ATOM: test10:
+; ATOM: cmpq $1, %rdi
+; ATOM: sbbq %rax, %rax
+; ATOM: orq $1, %rax
+; ATOM: ret
}
@@ -163,6 +216,13 @@ define i64 @test11(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; CHECK: notq %rax
; CHECK: orq %rsi, %rax
; CHECK: ret
+
+; ATOM: test11:
+; ATOM: cmpq $1, %rdi
+; ATOM: sbbq %rax, %rax
+; ATOM: notq %rax
+; ATOM: orq %rsi, %rax
+; ATOM: ret
}
define i64 @test11a(i64 %x, i64 %y) nounwind readnone ssp noredzone {
@@ -175,6 +235,13 @@ define i64 @test11a(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; CHECK: notq %rax
; CHECK: orq %rsi, %rax
; CHECK: ret
+
+; ATOM: test11a:
+; ATOM: cmpq $1, %rdi
+; ATOM: sbbq %rax, %rax
+; ATOM: notq %rax
+; ATOM: orq %rsi, %rax
+; ATOM: ret
}
@@ -189,10 +256,16 @@ entry:
%call = tail call noalias i8* @_Znam(i64 %D) nounwind noredzone
ret i8* %call
; CHECK: test12:
-; CHECK: mulq
; CHECK: movq $-1, %rdi
+; CHECK: mulq
; CHECK: cmovnoq %rax, %rdi
; CHECK: jmp __Znam
+
+; ATOM: test12:
+; ATOM: mulq
+; ATOM: movq $-1, %rdi
+; ATOM: cmovnoq %rax, %rdi
+; ATOM: jmp __Znam
}
declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64) nounwind readnone
@@ -205,6 +278,11 @@ define i32 @test13(i32 %a, i32 %b) nounwind {
; CHECK: cmpl
; CHECK-NEXT: sbbl
; CHECK-NEXT: ret
+
+; ATOM: test13:
+; ATOM: cmpl
+; ATOM-NEXT: sbbl
+; ATOM-NEXT: ret
}
define i32 @test14(i32 %a, i32 %b) nounwind {
@@ -216,5 +294,53 @@ define i32 @test14(i32 %a, i32 %b) nounwind {
; CHECK-NEXT: sbbl
; CHECK-NEXT: notl
; CHECK-NEXT: ret
+
+; ATOM: test14:
+; ATOM: cmpl
+; ATOM-NEXT: sbbl
+; ATOM-NEXT: notl
+; ATOM-NEXT: ret
+}
+
+; rdar://10961709
+define i32 @test15(i32 %x) nounwind {
+entry:
+ %cmp = icmp ne i32 %x, 0
+ %sub = sext i1 %cmp to i32
+ ret i32 %sub
+; CHECK: test15:
+; CHECK: negl
+; CHECK: sbbl
+
+; ATOM: test15:
+; ATOM: negl
+; ATOM: sbbl
}
+define i64 @test16(i64 %x) nounwind uwtable readnone ssp {
+entry:
+ %cmp = icmp ne i64 %x, 0
+ %conv1 = sext i1 %cmp to i64
+ ret i64 %conv1
+; CHECK: test16:
+; CHECK: negq
+; CHECK: sbbq
+
+; ATOM: test16:
+; ATOM: negq
+; ATOM: sbbq
+}
+
+define i16 @test17(i16 %x) nounwind {
+entry:
+ %cmp = icmp ne i16 %x, 0
+ %sub = sext i1 %cmp to i16
+ ret i16 %sub
+; CHECK: test17:
+; CHECK: negw
+; CHECK: sbbw
+
+; ATOM: test17:
+; ATOM: negw
+; ATOM: sbbw
+}
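+
+; Why neg followed by sbb implements tests 15-17: "neg" sets CF exactly when
+; its operand was nonzero, and "sbb %eax, %eax" computes %eax - %eax - CF,
+; i.e. 0 when x == 0 and -1 (all ones) otherwise -- which is
+; sext(i1 (x != 0)).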
diff --git a/test/CodeGen/X86/selectiondag-cse.ll b/test/CodeGen/X86/selectiondag-cse.ll
new file mode 100644
index 0000000..a653a1c
--- /dev/null
+++ b/test/CodeGen/X86/selectiondag-cse.ll
@@ -0,0 +1,69 @@
+; RUN: llc < %s
+; PR12599
+;
+; This bitcode causes the X86 target to make changes to the DAG during
+; selection in MatchAddressRecursively. The edit triggers CSE, which causes both
+; the current node and yet-to-be-selected nodes to be deleted.
+;
+; SelectionDAGISel::DoInstructionSelection must handle that.
+;
+target triple = "x86_64-apple-macosx"
+
+%0 = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i8**, i8**, i32, i32***, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [9 x [16 x [16 x i16]]], [5 x [16 x [16 x i16]]], [9 x [8 x [8 x i16]]], [2 x [4 x [16 x [16 x i16]]]], [16 x [16 x i16]], [16 x [16 x i32]], i32****, i32***, i32***, i32***, i32****, i32****, %1*, %2*, %9*, i32*, i32*, i32, i32, i32, i32, [4 x [4 x i32]], i32, i32, i32, i32, i32, double, i32, i32, i32, i32, i16******, i16******, i16******, i16******, [15 x i16], i32, i32, i32, i32, i32, i32, i32, i32, [6 x [32 x i32]], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [1 x i32], i32, i32, [2 x i32], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, %10*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, double**, double***, i32***, double**, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [3 x [2 x i32]], [2 x i32], i32, i32, i16, i32, i32, i32, i32, i32 }
+%1 = type { i32, i32, [100 x %2*], i32, float, float, float }
+%2 = type { i32, i32, i32, i32, i32, i32, %3*, %6*, %8*, i32, i32*, i32*, i32*, i32, i32*, i32*, i32*, i32 (i32)*, [3 x [2 x i32]] }
+%3 = type { %4*, %5, %5 }
+%4 = type { i32, i32, i8, i32, i32, i8, i8, i32, i32, i8*, i32 }
+%5 = type { i32, i32, i32, i32, i32, i8*, i32*, i32, i32 }
+%6 = type { [3 x [11 x %7]], [2 x [9 x %7]], [2 x [10 x %7]], [2 x [6 x %7]], [4 x %7], [4 x %7], [3 x %7] }
+%7 = type { i16, i8, i64 }
+%8 = type { [2 x %7], [4 x %7], [3 x [4 x %7]], [10 x [4 x %7]], [10 x [15 x %7]], [10 x [15 x %7]], [10 x [5 x %7]], [10 x [5 x %7]], [10 x [15 x %7]], [10 x [15 x %7]] }
+%9 = type { i32, i32, i32, [2 x i32], i32, [8 x i32], %9*, %9*, i32, [2 x [4 x [4 x [2 x i32]]]], [16 x i8], [16 x i8], i32, i64, [4 x i32], [4 x i32], i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i16, double, i32, i32, i32, i32, i32, i32, i32, i32, i32 }
+%10 = type { i32, i32, i32, i32, i32, %10* }
+
+@images = external hidden global %0, align 8
+
+define hidden fastcc void @Mode_Decision_for_4x4IntraBlocks() nounwind uwtable ssp {
+bb4:
+ %tmp = or i208 undef, 0
+ br i1 undef, label %bb35, label %bb5
+
+bb5:
+ %tmp6 = add i32 0, 2
+ %tmp7 = lshr i208 %tmp, 80
+ %tmp8 = trunc i208 %tmp7 to i32
+ %tmp9 = and i32 %tmp8, 65535
+ %tmp10 = shl nuw nsw i32 %tmp9, 1
+ %tmp11 = add i32 0, 2
+ %tmp12 = add i32 %tmp11, 0
+ %tmp13 = add i32 %tmp12, %tmp10
+ %tmp14 = lshr i32 %tmp13, 2
+ %tmp15 = trunc i32 %tmp14 to i16
+ store i16 %tmp15, i16* getelementptr inbounds (%0* @images, i64 0, i32 47, i64 3, i64 0, i64 3), align 2
+ %tmp16 = lshr i208 %tmp, 96
+ %tmp17 = trunc i208 %tmp16 to i32
+ %tmp18 = and i32 %tmp17, 65535
+ %tmp19 = add i32 %tmp18, 2
+ %tmp20 = add i32 %tmp19, 0
+ %tmp21 = add i32 %tmp20, 0
+ %tmp22 = lshr i32 %tmp21, 2
+ %tmp23 = trunc i32 %tmp22 to i16
+ store i16 %tmp23, i16* getelementptr inbounds (%0* @images, i64 0, i32 47, i64 3, i64 2, i64 3), align 2
+ %tmp24 = add i32 %tmp6, %tmp9
+ %tmp25 = add i32 %tmp24, 0
+ %tmp26 = lshr i32 %tmp25, 2
+ %tmp27 = trunc i32 %tmp26 to i16
+ store i16 %tmp27, i16* getelementptr inbounds (%0* @images, i64 0, i32 47, i64 7, i64 1, i64 2), align 4
+ %tmp28 = lshr i208 %tmp, 80
+ %tmp29 = shl nuw nsw i208 %tmp28, 1
+ %tmp30 = trunc i208 %tmp29 to i32
+ %tmp31 = and i32 %tmp30, 131070
+ %tmp32 = add i32 %tmp12, %tmp31
+ %tmp33 = lshr i32 %tmp32, 2
+ %tmp34 = trunc i32 %tmp33 to i16
+ store i16 %tmp34, i16* getelementptr inbounds (%0* @images, i64 0, i32 47, i64 7, i64 1, i64 3), align 2
+ br label %bb35
+
+bb35: ; preds = %bb5, %bb4
+ unreachable
+}
diff --git a/test/CodeGen/X86/sext-setcc-self.ll b/test/CodeGen/X86/sext-setcc-self.ll
new file mode 100644
index 0000000..23d66a2
--- /dev/null
+++ b/test/CodeGen/X86/sext-setcc-self.ll
@@ -0,0 +1,55 @@
+; RUN: llc -march=x86-64 -mcpu=nehalem -asm-verbose=false < %s | FileCheck %s
+
+define <4 x i32> @test_ueq(<4 x float> %in) {
+entry:
+ ; CHECK: pcmpeqd %xmm0, %xmm0
+ ; CHECK-NEXT: ret
+ %0 = fcmp ueq <4 x float> %in, %in
+ %1 = sext <4 x i1> %0 to <4 x i32>
+ ret <4 x i32> %1
+}
+
+define <4 x i32> @test_uge(<4 x float> %in) {
+entry:
+ ; CHECK: pcmpeqd %xmm0, %xmm0
+ ; CHECK-NEXT: ret
+ %0 = fcmp uge <4 x float> %in, %in
+ %1 = sext <4 x i1> %0 to <4 x i32>
+ ret <4 x i32> %1
+}
+
+define <4 x i32> @test_ule(<4 x float> %in) {
+entry:
+ ; CHECK: pcmpeqd %xmm0, %xmm0
+ ; CHECK-NEXT: ret
+ %0 = fcmp ule <4 x float> %in, %in
+ %1 = sext <4 x i1> %0 to <4 x i32>
+ ret <4 x i32> %1
+}
+
+define <4 x i32> @test_one(<4 x float> %in) {
+entry:
+ ; CHECK: xorps %xmm0, %xmm0
+ ; CHECK-NEXT: ret
+ %0 = fcmp one <4 x float> %in, %in
+ %1 = sext <4 x i1> %0 to <4 x i32>
+ ret <4 x i32> %1
+}
+
+define <4 x i32> @test_ogt(<4 x float> %in) {
+entry:
+ ; CHECK: xorps %xmm0, %xmm0
+ ; CHECK-NEXT: ret
+ %0 = fcmp ogt <4 x float> %in, %in
+ %1 = sext <4 x i1> %0 to <4 x i32>
+ ret <4 x i32> %1
+}
+
+define <4 x i32> @test_olt(<4 x float> %in) {
+entry:
+ ; CHECK: xorps %xmm0, %xmm0
+ ; CHECK-NEXT: ret
+ %0 = fcmp olt <4 x float> %in, %in
+ %1 = sext <4 x i1> %0 to <4 x i32>
+ ret <4 x i32> %1
+}
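+
+; These all fold because the value is compared with itself: ueq, uge, and ule
+; hold for every input (equal when ordered, unordered when %in is NaN), so the
+; sext is all ones (pcmpeqd %xmm0, %xmm0); one, ogt, and olt can never hold,
+; so the result is all zeros (xorps %xmm0, %xmm0).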
diff --git a/test/CodeGen/X86/shift-and.ll b/test/CodeGen/X86/shift-and.ll
index b747cc5..1de9151 100644
--- a/test/CodeGen/X86/shift-and.ll
+++ b/test/CodeGen/X86/shift-and.ll
@@ -1,13 +1,27 @@
-; RUN: llc < %s -march=x86 | grep and | count 2
-; RUN: llc < %s -march=x86-64 | not grep and
+; RUN: llc < %s -mtriple=i386-apple-macosx | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-macosx | FileCheck %s --check-prefix=X64
define i32 @t1(i32 %t, i32 %val) nounwind {
+; X32: t1:
+; X32-NOT: andl
+; X32: shll
+
+; X64: t1:
+; X64-NOT: andl
+; X64: shll
%shamt = and i32 %t, 31
%res = shl i32 %val, %shamt
ret i32 %res
}
define i32 @t2(i32 %t, i32 %val) nounwind {
+; X32: t2:
+; X32-NOT: andl
+; X32: shll
+
+; X64: t2:
+; X64-NOT: andl
+; X64: shll
%shamt = and i32 %t, 63
%res = shl i32 %val, %shamt
ret i32 %res
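+
+; No masking instruction is needed in either function: x86 shifts already take
+; the count modulo 32 for 32-bit operands, so "and %t, 31" is a no-op, and for
+; "and %t, 63" any count of 32 or more would make the i32 shl undefined in IR
+; anyway.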
@@ -16,6 +30,13 @@ define i32 @t2(i32 %t, i32 %val) nounwind {
@X = internal global i16 0
define void @t3(i16 %t) nounwind {
+; X32: t3:
+; X32-NOT: andl
+; X32: sarw
+
+; X64: t3:
+; X64-NOT: andl
+; X64: sarw
%shamt = and i16 %t, 31
%tmp = load i16* @X
%tmp1 = ashr i16 %tmp, %shamt
@@ -24,13 +45,34 @@ define void @t3(i16 %t) nounwind {
}
define i64 @t4(i64 %t, i64 %val) nounwind {
+; X64: t4:
+; X64-NOT: and
+; X64: shrq
%shamt = and i64 %t, 63
%res = lshr i64 %val, %shamt
ret i64 %res
}
define i64 @t5(i64 %t, i64 %val) nounwind {
+; X64: t5:
+; X64-NOT: and
+; X64: shrq
%shamt = and i64 %t, 191
%res = lshr i64 %val, %shamt
ret i64 %res
}
+
+
+; rdar://11866926
+define i64 @t6(i64 %key, i64* nocapture %val) nounwind {
+entry:
+; X64: t6:
+; X64-NOT: movabsq
+; X64: decq
+; X64: andq
+ %shr = lshr i64 %key, 3
+ %0 = load i64* %val, align 8
+ %sub = add i64 %0, 2305843009213693951
+ %and = and i64 %sub, %shr
+ ret i64 %and
+}
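+
+; Why decq+andq suffices in t6: 2305843009213693951 is 2^61 - 1, and the other
+; andq operand is %key >> 3, whose top three bits are zero. Only the low 61
+; bits of the sum survive the and, and modulo 2^61 adding 2^61 - 1 is the same
+; as subtracting 1 -- so a decq replaces the movabsq/addq pair.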
diff --git a/test/CodeGen/X86/shift-coalesce.ll b/test/CodeGen/X86/shift-coalesce.ll
index d38f9a8..4f27e97 100644
--- a/test/CodeGen/X86/shift-coalesce.ll
+++ b/test/CodeGen/X86/shift-coalesce.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | \
-; RUN: grep {shld.*CL}
+; RUN: grep "shld.*CL"
; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | \
-; RUN: not grep {mov CL, BL}
+; RUN: not grep "mov CL, BL"
; PR687
diff --git a/test/CodeGen/X86/shift-double.ll b/test/CodeGen/X86/shift-double.ll
index 5adee7c..8d2b290 100644
--- a/test/CodeGen/X86/shift-double.ll
+++ b/test/CodeGen/X86/shift-double.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | \
-; RUN: grep {sh\[lr\]d} | count 5
+; RUN: grep "sh[lr]d" | count 5
define i64 @test1(i64 %X, i8 %C) {
%shift.upgrd.1 = zext i8 %C to i64 ; <i64> [#uses=1]
diff --git a/test/CodeGen/X86/shift-folding.ll b/test/CodeGen/X86/shift-folding.ll
index 3ea6011..c518cdd 100644
--- a/test/CodeGen/X86/shift-folding.ll
+++ b/test/CodeGen/X86/shift-folding.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -march=x86 -verify-coalescing | FileCheck %s
define i32* @test1(i32* %P, i32 %X) {
; CHECK: test1:
diff --git a/test/CodeGen/X86/shl_elim.ll b/test/CodeGen/X86/shl_elim.ll
index 0827221..83e1eb5 100644
--- a/test/CodeGen/X86/shl_elim.ll
+++ b/test/CodeGen/X86/shl_elim.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 | grep {movl 8(.esp), %eax}
-; RUN: llc < %s -march=x86 | grep {shrl .eax}
-; RUN: llc < %s -march=x86 | grep {movswl .ax, .eax}
+; RUN: llc < %s -march=x86 | grep "movl 8(.esp), %eax"
+; RUN: llc < %s -march=x86 | grep "shrl .eax"
+; RUN: llc < %s -march=x86 | grep "movswl .ax, .eax"
define i32 @test1(i64 %a) nounwind {
%tmp29 = lshr i64 %a, 24 ; <i64> [#uses=1]
diff --git a/test/CodeGen/X86/sincos.ll b/test/CodeGen/X86/sincos.ll
index 13f9329..1479be1 100644
--- a/test/CodeGen/X86/sincos.ll
+++ b/test/CodeGen/X86/sincos.ll
@@ -1,8 +1,6 @@
; Make sure this testcase codegens to the sin and cos instructions, not calls
-; RUN: llc < %s -march=x86 -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math | \
-; RUN: grep sin\$ | count 3
-; RUN: llc < %s -march=x86 -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math | \
-; RUN: grep cos\$ | count 3
+; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math | FileCheck %s --check-prefix=SIN
+; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math | FileCheck %s --check-prefix=COS
declare float @sinf(float) readonly
@@ -10,39 +8,59 @@ declare double @sin(double) readonly
declare x86_fp80 @sinl(x86_fp80) readonly
+; SIN: test1:
define float @test1(float %X) {
%Y = call float @sinf(float %X) readonly
ret float %Y
}
+; SIN: {{^[ \t]*fsin$}}
+; SIN-NOT: fsin
+
+; SIN: test2:
define double @test2(double %X) {
%Y = call double @sin(double %X) readonly
ret double %Y
}
+; SIN: {{^[ \t]*fsin$}}
+
+; SIN-NOT: fsin
+; SIN: test3:
define x86_fp80 @test3(x86_fp80 %X) {
%Y = call x86_fp80 @sinl(x86_fp80 %X) readonly
ret x86_fp80 %Y
}
+; SIN: {{^[ \t]*fsin$}}
+; SIN-NOT: fsin
+; COS-NOT: fcos
declare float @cosf(float) readonly
declare double @cos(double) readonly
declare x86_fp80 @cosl(x86_fp80) readonly
+
+; SIN: test4:
+; COS: test3:
define float @test4(float %X) {
%Y = call float @cosf(float %X) readonly
ret float %Y
}
+; COS: {{^[ \t]*fcos}}
define double @test5(double %X) {
%Y = call double @cos(double %X) readonly
ret double %Y
}
+; COS: {{^[ \t]*fcos}}
define x86_fp80 @test6(x86_fp80 %X) {
%Y = call x86_fp80 @cosl(x86_fp80 %X) readonly
ret x86_fp80 %Y
}
+; COS: {{^[ \t]*fcos}}
+; SIN-NOT: fsin
+; COS-NOT: fcos
diff --git a/test/CodeGen/X86/sink-hoist.ll b/test/CodeGen/X86/sink-hoist.ll
index 7957eb8..649cd61 100644
--- a/test/CodeGen/X86/sink-hoist.ll
+++ b/test/CodeGen/X86/sink-hoist.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -asm-verbose=false -mtriple=x86_64-unknown-linux-gnu -post-RA-scheduler=true | FileCheck %s
+; RUN: llc < %s -march=x86-64 -asm-verbose=false -mtriple=x86_64-unknown-linux-gnu -mcpu=nehalem -post-RA-scheduler=true | FileCheck %s
; Currently, floating-point selects are lowered to CFG triangles.
; This means that one side of the select is always unconditionally
diff --git a/test/CodeGen/X86/sink-out-of-loop.ll b/test/CodeGen/X86/sink-out-of-loop.ll
new file mode 100644
index 0000000..c600f925
--- /dev/null
+++ b/test/CodeGen/X86/sink-out-of-loop.ll
@@ -0,0 +1,54 @@
+; RUN: llc -mtriple=x86_64-apple-darwin < %s | FileCheck %s
+
+; A MOV32ri is inside a loop; its block has two successors, one inside the
+; same loop and the other outside the loop. We should be able to sink the
+; MOV32ri out of the loop.
+; rdar://11980766
+define i32 @sink_succ(i32 %argc, i8** nocapture %argv) nounwind uwtable ssp {
+; CHECK: sink_succ
+; CHECK: [[OUTER_LN1:LBB0_[0-9]+]]: ## %preheader
+; CHECK: %exit
+; CHECK-NOT: movl
+; CHECK: jne [[OUTER_LN1]]
+; CHECK: movl
+; CHECK: [[LN2:LBB0_[0-9]+]]: ## %for.body2
+; CHECK: jne [[LN2]]
+; CHECK: ret
+entry:
+ br label %preheader
+
+preheader:
+ %i.127 = phi i32 [ 0, %entry ], [ %inc9, %exit ]
+ br label %for.body1.lr
+
+for.body1.lr:
+ %iv30 = phi i32 [ 1, %preheader ], [ %iv.next31, %for.inc40.i ]
+ br label %for.body1
+
+for.body1:
+ %iv.i = phi i64 [ 0, %for.body1.lr ], [ %iv.next.i, %for.body1 ]
+ %iv.next.i = add i64 %iv.i, 1
+ %lftr.wideiv32 = trunc i64 %iv.next.i to i32
+ %exitcond33 = icmp eq i32 %lftr.wideiv32, %iv30
+ br i1 %exitcond33, label %for.inc40.i, label %for.body1
+
+for.inc40.i:
+ %iv.next31 = add i32 %iv30, 1
+ %exitcond49.i = icmp eq i32 %iv.next31, 32
+ br i1 %exitcond49.i, label %exit, label %for.body1.lr
+
+exit:
+ %inc9 = add nsw i32 %i.127, 1
+ %exitcond34 = icmp eq i32 %inc9, 10
+ br i1 %exitcond34, label %for.body2, label %preheader
+
+for.body2:
+ %iv = phi i64 [ %iv.next, %for.body2 ], [ 0, %exit ]
+ %iv.next = add i64 %iv, 1
+ %lftr.wideiv = trunc i64 %iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 2048
+ br i1 %exitcond, label %for.end20, label %for.body2
+
+for.end20:
+ ret i32 0
+}
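+
+; Sinking is profitable here because the materialized constant is consumed
+; only on the loop-exit path; left inside the loop, the MOV32ri would execute
+; on every iteration for a value used at most once per exit.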
diff --git a/test/CodeGen/X86/splat-scalar-load.ll b/test/CodeGen/X86/splat-scalar-load.ll
index 81a072f..980f18c 100644
--- a/test/CodeGen/X86/splat-scalar-load.ll
+++ b/test/CodeGen/X86/splat-scalar-load.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse2 -mcpu=nehalem | FileCheck %s
; rdar://7434544
define <2 x i64> @t2() nounwind {
diff --git a/test/CodeGen/X86/sse-align-12.ll b/test/CodeGen/X86/sse-align-12.ll
index 118e393..71a42f4 100644
--- a/test/CodeGen/X86/sse-align-12.ll
+++ b/test/CodeGen/X86/sse-align-12.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=nehalem | FileCheck %s
; CHECK: a:
; CHECK: movdqu
diff --git a/test/CodeGen/X86/sse-domains.ll b/test/CodeGen/X86/sse-domains.ll
index d1e07c8..c99287b 100644
--- a/test/CodeGen/X86/sse-domains.ll
+++ b/test/CodeGen/X86/sse-domains.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc < %s -mcpu=nehalem | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.7"
diff --git a/test/CodeGen/X86/sse-minmax.ll b/test/CodeGen/X86/sse-minmax.ll
index 1112440..3839e87 100644
--- a/test/CodeGen/X86/sse-minmax.ll
+++ b/test/CodeGen/X86/sse-minmax.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86-64 -asm-verbose=false -join-physregs -promote-elements | FileCheck %s
-; RUN: llc < %s -march=x86-64 -asm-verbose=false -join-physregs -enable-unsafe-fp-math -enable-no-nans-fp-math -promote-elements | FileCheck -check-prefix=UNSAFE %s
-; RUN: llc < %s -march=x86-64 -asm-verbose=false -join-physregs -enable-no-nans-fp-math -promote-elements | FileCheck -check-prefix=FINITE %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=nehalem -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=nehalem -asm-verbose=false -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck -check-prefix=UNSAFE %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=nehalem -asm-verbose=false -enable-no-nans-fp-math | FileCheck -check-prefix=FINITE %s
; Some of these patterns can be matched as SSE min or max. Some of
; then can be matched provided that the operands are swapped.
@@ -8,13 +8,10 @@
; and a conditional branch.
; The naming convention is {,x_,y_}{o,u}{gt,lt,ge,le}{,_inverse}
-; x_ : use 0.0 instead of %y
-; y_ : use -0.0 instead of %y
+; _x: use 0.0 instead of %y
+; _y: use -0.0 instead of %y
; _inverse : swap the arms of the select.
-; Some of these tests depend on -join-physregs commuting instructions to
-; eliminate copies.
-
; CHECK: ogt:
; CHECK-NEXT: maxsd %xmm1, %xmm0
; CHECK-NEXT: ret
@@ -139,147 +136,147 @@ define double @ole_inverse(double %x, double %y) nounwind {
ret double %d
}
-; CHECK: x_ogt:
+; CHECK: ogt_x:
; CHECK-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; CHECK-NEXT: maxsd %xmm1, %xmm0
; CHECK-NEXT: ret
-; UNSAFE: x_ogt:
+; UNSAFE: ogt_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; UNSAFE-NEXT: maxsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: x_ogt:
+; FINITE: ogt_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; FINITE-NEXT: maxsd %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @x_ogt(double %x) nounwind {
+define double @ogt_x(double %x) nounwind {
%c = fcmp ogt double %x, 0.000000e+00
%d = select i1 %c, double %x, double 0.000000e+00
ret double %d
}
-; CHECK: x_olt:
+; CHECK: olt_x:
; CHECK-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; CHECK-NEXT: minsd %xmm1, %xmm0
; CHECK-NEXT: ret
-; UNSAFE: x_olt:
+; UNSAFE: olt_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; UNSAFE-NEXT: minsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: x_olt:
+; FINITE: olt_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; FINITE-NEXT: minsd %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @x_olt(double %x) nounwind {
+define double @olt_x(double %x) nounwind {
%c = fcmp olt double %x, 0.000000e+00
%d = select i1 %c, double %x, double 0.000000e+00
ret double %d
}
-; CHECK: x_ogt_inverse:
+; CHECK: ogt_inverse_x:
; CHECK-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; CHECK-NEXT: minsd %xmm0, %xmm1
; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
; CHECK-NEXT: ret
-; UNSAFE: x_ogt_inverse:
+; UNSAFE: ogt_inverse_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; UNSAFE-NEXT: minsd %xmm0, %xmm1
; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: x_ogt_inverse:
+; FINITE: ogt_inverse_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; FINITE-NEXT: minsd %xmm0, %xmm1
; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @x_ogt_inverse(double %x) nounwind {
+define double @ogt_inverse_x(double %x) nounwind {
%c = fcmp ogt double %x, 0.000000e+00
%d = select i1 %c, double 0.000000e+00, double %x
ret double %d
}
-; CHECK: x_olt_inverse:
+; CHECK: olt_inverse_x:
; CHECK-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; CHECK-NEXT: maxsd %xmm0, %xmm1
; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
; CHECK-NEXT: ret
-; UNSAFE: x_olt_inverse:
+; UNSAFE: olt_inverse_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; UNSAFE-NEXT: maxsd %xmm0, %xmm1
; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: x_olt_inverse:
+; FINITE: olt_inverse_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; FINITE-NEXT: maxsd %xmm0, %xmm1
; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @x_olt_inverse(double %x) nounwind {
+define double @olt_inverse_x(double %x) nounwind {
%c = fcmp olt double %x, 0.000000e+00
%d = select i1 %c, double 0.000000e+00, double %x
ret double %d
}
-; CHECK: x_oge:
+; CHECK: oge_x:
; CHECK: ucomisd %xmm1, %xmm0
-; UNSAFE: x_oge:
+; UNSAFE: oge_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; UNSAFE-NEXT: maxsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: x_oge:
+; FINITE: oge_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; FINITE-NEXT: maxsd %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @x_oge(double %x) nounwind {
+define double @oge_x(double %x) nounwind {
%c = fcmp oge double %x, 0.000000e+00
%d = select i1 %c, double %x, double 0.000000e+00
ret double %d
}
-; CHECK: x_ole:
+; CHECK: ole_x:
; CHECK: ucomisd %xmm0, %xmm1
-; UNSAFE: x_ole:
+; UNSAFE: ole_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; UNSAFE-NEXT: minsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: x_ole:
+; FINITE: ole_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; FINITE-NEXT: minsd %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @x_ole(double %x) nounwind {
+define double @ole_x(double %x) nounwind {
%c = fcmp ole double %x, 0.000000e+00
%d = select i1 %c, double %x, double 0.000000e+00
ret double %d
}
-; CHECK: x_oge_inverse:
-; CHECK: ucomisd %xmm1, %xmm0
-; UNSAFE: x_oge_inverse:
+; CHECK: oge_inverse_x:
+; CHECK: ucomisd %xmm
+; UNSAFE: oge_inverse_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; UNSAFE-NEXT: minsd %xmm0, %xmm1
; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: x_oge_inverse:
+; FINITE: oge_inverse_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; FINITE-NEXT: minsd %xmm0, %xmm1
; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @x_oge_inverse(double %x) nounwind {
+define double @oge_inverse_x(double %x) nounwind {
%c = fcmp oge double %x, 0.000000e+00
%d = select i1 %c, double 0.000000e+00, double %x
ret double %d
}
-; CHECK: x_ole_inverse:
-; CHECK: ucomisd %xmm0, %xmm1
-; UNSAFE: x_ole_inverse:
+; CHECK: ole_inverse_x:
+; CHECK: ucomisd %xmm
+; UNSAFE: ole_inverse_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; UNSAFE-NEXT: maxsd %xmm0, %xmm1
; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: x_ole_inverse:
+; FINITE: ole_inverse_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; FINITE-NEXT: maxsd %xmm0, %xmm1
; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @x_ole_inverse(double %x) nounwind {
+define double @ole_inverse_x(double %x) nounwind {
%c = fcmp ole double %x, 0.000000e+00
%d = select i1 %c, double 0.000000e+00, double %x
ret double %d
@@ -411,419 +408,419 @@ define double @ule_inverse(double %x, double %y) nounwind {
ret double %d
}
-; CHECK: x_ugt:
+; CHECK: ugt_x:
; CHECK: ucomisd %xmm0, %xmm1
-; UNSAFE: x_ugt:
+; UNSAFE: ugt_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; UNSAFE-NEXT: maxsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: x_ugt:
+; FINITE: ugt_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; FINITE-NEXT: maxsd %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @x_ugt(double %x) nounwind {
+define double @ugt_x(double %x) nounwind {
%c = fcmp ugt double %x, 0.000000e+00
%d = select i1 %c, double %x, double 0.000000e+00
ret double %d
}
-; CHECK: x_ult:
+; CHECK: ult_x:
; CHECK: ucomisd %xmm1, %xmm0
-; UNSAFE: x_ult:
+; UNSAFE: ult_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; UNSAFE-NEXT: minsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: x_ult:
+; FINITE: ult_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; FINITE-NEXT: minsd %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @x_ult(double %x) nounwind {
+define double @ult_x(double %x) nounwind {
%c = fcmp ult double %x, 0.000000e+00
%d = select i1 %c, double %x, double 0.000000e+00
ret double %d
}
-; CHECK: x_ugt_inverse:
-; CHECK: ucomisd %xmm0, %xmm1
-; UNSAFE: x_ugt_inverse:
+; CHECK: ugt_inverse_x:
+; CHECK: ucomisd %xmm
+; UNSAFE: ugt_inverse_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; UNSAFE-NEXT: minsd %xmm0, %xmm1
; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: x_ugt_inverse:
+; FINITE: ugt_inverse_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; FINITE-NEXT: minsd %xmm0, %xmm1
; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @x_ugt_inverse(double %x) nounwind {
+define double @ugt_inverse_x(double %x) nounwind {
%c = fcmp ugt double %x, 0.000000e+00
%d = select i1 %c, double 0.000000e+00, double %x
ret double %d
}
-; CHECK: x_ult_inverse:
-; CHECK: ucomisd %xmm1, %xmm0
-; UNSAFE: x_ult_inverse:
+; CHECK: ult_inverse_x:
+; CHECK: ucomisd %xmm
+; UNSAFE: ult_inverse_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; UNSAFE-NEXT: maxsd %xmm0, %xmm1
; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: x_ult_inverse:
+; FINITE: ult_inverse_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; FINITE-NEXT: maxsd %xmm0, %xmm1
; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @x_ult_inverse(double %x) nounwind {
+define double @ult_inverse_x(double %x) nounwind {
%c = fcmp ult double %x, 0.000000e+00
%d = select i1 %c, double 0.000000e+00, double %x
ret double %d
}
-; CHECK: x_uge:
+; CHECK: uge_x:
; CHECK-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; CHECK-NEXT: maxsd %xmm0, %xmm1
; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
; CHECK-NEXT: ret
-; UNSAFE: x_uge:
+; UNSAFE: uge_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; UNSAFE-NEXT: maxsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: x_uge:
+; FINITE: uge_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; FINITE-NEXT: maxsd %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @x_uge(double %x) nounwind {
+define double @uge_x(double %x) nounwind {
%c = fcmp uge double %x, 0.000000e+00
%d = select i1 %c, double %x, double 0.000000e+00
ret double %d
}
-; CHECK: x_ule:
+; CHECK: ule_x:
; CHECK-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; CHECK-NEXT: minsd %xmm0, %xmm1
; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
; CHECK-NEXT: ret
-; UNSAFE: x_ule:
+; UNSAFE: ule_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; UNSAFE-NEXT: minsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: x_ule:
+; FINITE: ule_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; FINITE-NEXT: minsd %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @x_ule(double %x) nounwind {
+define double @ule_x(double %x) nounwind {
%c = fcmp ule double %x, 0.000000e+00
%d = select i1 %c, double %x, double 0.000000e+00
ret double %d
}
-; CHECK: x_uge_inverse:
+; CHECK: uge_inverse_x:
; CHECK-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; CHECK-NEXT: minsd %xmm1, %xmm0
; CHECK-NEXT: ret
-; UNSAFE: x_uge_inverse:
+; UNSAFE: uge_inverse_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; UNSAFE-NEXT: minsd %xmm0, %xmm1
; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: x_uge_inverse:
+; FINITE: uge_inverse_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; FINITE-NEXT: minsd %xmm0, %xmm1
; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @x_uge_inverse(double %x) nounwind {
+define double @uge_inverse_x(double %x) nounwind {
%c = fcmp uge double %x, 0.000000e+00
%d = select i1 %c, double 0.000000e+00, double %x
ret double %d
}
-; CHECK: x_ule_inverse:
+; CHECK: ule_inverse_x:
; CHECK-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; CHECK-NEXT: maxsd %xmm1, %xmm0
; CHECK-NEXT: ret
-; UNSAFE: x_ule_inverse:
+; UNSAFE: ule_inverse_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; UNSAFE-NEXT: maxsd %xmm0, %xmm1
; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: x_ule_inverse:
+; FINITE: ule_inverse_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
; FINITE-NEXT: maxsd %xmm0, %xmm1
; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @x_ule_inverse(double %x) nounwind {
+define double @ule_inverse_x(double %x) nounwind {
%c = fcmp ule double %x, 0.000000e+00
%d = select i1 %c, double 0.000000e+00, double %x
ret double %d
}
-; CHECK: y_ogt:
+; CHECK: ogt_y:
; CHECK-NEXT: maxsd {{[^,]*}}, %xmm0
; CHECK-NEXT: ret
-; UNSAFE: y_ogt:
+; UNSAFE: ogt_y:
; UNSAFE-NEXT: maxsd {{[^,]*}}, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: y_ogt:
+; FINITE: ogt_y:
; FINITE-NEXT: maxsd {{[^,]*}}, %xmm0
; FINITE-NEXT: ret
-define double @y_ogt(double %x) nounwind {
+define double @ogt_y(double %x) nounwind {
%c = fcmp ogt double %x, -0.000000e+00
%d = select i1 %c, double %x, double -0.000000e+00
ret double %d
}
-; CHECK: y_olt:
+; CHECK: olt_y:
; CHECK-NEXT: minsd {{[^,]*}}, %xmm0
; CHECK-NEXT: ret
-; UNSAFE: y_olt:
+; UNSAFE: olt_y:
; UNSAFE-NEXT: minsd {{[^,]*}}, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: y_olt:
+; FINITE: olt_y:
; FINITE-NEXT: minsd {{[^,]*}}, %xmm0
; FINITE-NEXT: ret
-define double @y_olt(double %x) nounwind {
+define double @olt_y(double %x) nounwind {
%c = fcmp olt double %x, -0.000000e+00
%d = select i1 %c, double %x, double -0.000000e+00
ret double %d
}
-; CHECK: y_ogt_inverse:
+; CHECK: ogt_inverse_y:
; CHECK-NEXT: movsd {{[^,]*}}, %xmm1
; CHECK-NEXT: minsd %xmm0, %xmm1
; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
; CHECK-NEXT: ret
-; UNSAFE: y_ogt_inverse:
+; UNSAFE: ogt_inverse_y:
; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
; UNSAFE-NEXT: minsd %xmm0, %xmm1
; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: y_ogt_inverse:
+; FINITE: ogt_inverse_y:
; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
; FINITE-NEXT: minsd %xmm0, %xmm1
; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @y_ogt_inverse(double %x) nounwind {
+define double @ogt_inverse_y(double %x) nounwind {
%c = fcmp ogt double %x, -0.000000e+00
%d = select i1 %c, double -0.000000e+00, double %x
ret double %d
}
-; CHECK: y_olt_inverse:
+; CHECK: olt_inverse_y:
; CHECK-NEXT: movsd {{[^,]*}}, %xmm1
; CHECK-NEXT: maxsd %xmm0, %xmm1
; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
; CHECK-NEXT: ret
-; UNSAFE: y_olt_inverse:
+; UNSAFE: olt_inverse_y:
; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
; UNSAFE-NEXT: maxsd %xmm0, %xmm1
; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: y_olt_inverse:
+; FINITE: olt_inverse_y:
; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
; FINITE-NEXT: maxsd %xmm0, %xmm1
; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @y_olt_inverse(double %x) nounwind {
+define double @olt_inverse_y(double %x) nounwind {
%c = fcmp olt double %x, -0.000000e+00
%d = select i1 %c, double -0.000000e+00, double %x
ret double %d
}
-; CHECK: y_oge:
+; CHECK: oge_y:
; CHECK: ucomisd %xmm1, %xmm0
-; UNSAFE: y_oge:
+; UNSAFE: oge_y:
; UNSAFE-NEXT: maxsd {{[^,]*}}, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: y_oge:
+; FINITE: oge_y:
; FINITE-NEXT: maxsd {{[^,]*}}, %xmm0
; FINITE-NEXT: ret
-define double @y_oge(double %x) nounwind {
+define double @oge_y(double %x) nounwind {
%c = fcmp oge double %x, -0.000000e+00
%d = select i1 %c, double %x, double -0.000000e+00
ret double %d
}
-; CHECK: y_ole:
+; CHECK: ole_y:
; CHECK: ucomisd %xmm0, %xmm1
-; UNSAFE: y_ole:
+; UNSAFE: ole_y:
; UNSAFE-NEXT: minsd {{[^,]*}}, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: y_ole:
+; FINITE: ole_y:
; FINITE-NEXT: minsd {{[^,]*}}, %xmm0
; FINITE-NEXT: ret
-define double @y_ole(double %x) nounwind {
+define double @ole_y(double %x) nounwind {
%c = fcmp ole double %x, -0.000000e+00
%d = select i1 %c, double %x, double -0.000000e+00
ret double %d
}
-; CHECK: y_oge_inverse:
-; CHECK: ucomisd %xmm1, %xmm0
-; UNSAFE: y_oge_inverse:
+; CHECK: oge_inverse_y:
+; CHECK: ucomisd %xmm
+; UNSAFE: oge_inverse_y:
; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
; UNSAFE-NEXT: minsd %xmm0, %xmm1
; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: y_oge_inverse:
+; FINITE: oge_inverse_y:
; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
; FINITE-NEXT: minsd %xmm0, %xmm1
; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @y_oge_inverse(double %x) nounwind {
+define double @oge_inverse_y(double %x) nounwind {
%c = fcmp oge double %x, -0.000000e+00
%d = select i1 %c, double -0.000000e+00, double %x
ret double %d
}
-; CHECK: y_ole_inverse:
-; CHECK: ucomisd %xmm0, %xmm1
-; UNSAFE: y_ole_inverse:
+; CHECK: ole_inverse_y:
+; CHECK: ucomisd %xmm
+; UNSAFE: ole_inverse_y:
; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
; UNSAFE-NEXT: maxsd %xmm0, %xmm1
; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: y_ole_inverse:
+; FINITE: ole_inverse_y:
; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
; FINITE-NEXT: maxsd %xmm0, %xmm1
; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @y_ole_inverse(double %x) nounwind {
+define double @ole_inverse_y(double %x) nounwind {
%c = fcmp ole double %x, -0.000000e+00
%d = select i1 %c, double -0.000000e+00, double %x
ret double %d
}
-; CHECK: y_ugt:
+; CHECK: ugt_y:
; CHECK: ucomisd %xmm0, %xmm1
-; UNSAFE: y_ugt:
+; UNSAFE: ugt_y:
; UNSAFE-NEXT: maxsd {{[^,]*}}, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: y_ugt:
+; FINITE: ugt_y:
; FINITE-NEXT: maxsd {{[^,]*}}, %xmm0
; FINITE-NEXT: ret
-define double @y_ugt(double %x) nounwind {
+define double @ugt_y(double %x) nounwind {
%c = fcmp ugt double %x, -0.000000e+00
%d = select i1 %c, double %x, double -0.000000e+00
ret double %d
}
-; CHECK: y_ult:
+; CHECK: ult_y:
; CHECK: ucomisd %xmm1, %xmm0
-; UNSAFE: y_ult:
+; UNSAFE: ult_y:
; UNSAFE-NEXT: minsd {{[^,]*}}, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: y_ult:
+; FINITE: ult_y:
; FINITE-NEXT: minsd {{[^,]*}}, %xmm0
; FINITE-NEXT: ret
-define double @y_ult(double %x) nounwind {
+define double @ult_y(double %x) nounwind {
%c = fcmp ult double %x, -0.000000e+00
%d = select i1 %c, double %x, double -0.000000e+00
ret double %d
}
-; CHECK: y_ugt_inverse:
-; CHECK: ucomisd %xmm0, %xmm1
-; UNSAFE: y_ugt_inverse:
+; CHECK: ugt_inverse_y:
+; CHECK: ucomisd %xmm
+; UNSAFE: ugt_inverse_y:
; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
; UNSAFE-NEXT: minsd %xmm0, %xmm1
; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: y_ugt_inverse:
+; FINITE: ugt_inverse_y:
; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
; FINITE-NEXT: minsd %xmm0, %xmm1
; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @y_ugt_inverse(double %x) nounwind {
+define double @ugt_inverse_y(double %x) nounwind {
%c = fcmp ugt double %x, -0.000000e+00
%d = select i1 %c, double -0.000000e+00, double %x
ret double %d
}
-; CHECK: y_ult_inverse:
-; CHECK: ucomisd %xmm1, %xmm0
-; UNSAFE: y_ult_inverse:
+; CHECK: ult_inverse_y:
+; CHECK: ucomisd %xmm
+; UNSAFE: ult_inverse_y:
; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
; UNSAFE-NEXT: maxsd %xmm0, %xmm1
; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: y_ult_inverse:
+; FINITE: ult_inverse_y:
; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
; FINITE-NEXT: maxsd %xmm0, %xmm1
; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @y_ult_inverse(double %x) nounwind {
+define double @ult_inverse_y(double %x) nounwind {
%c = fcmp ult double %x, -0.000000e+00
%d = select i1 %c, double -0.000000e+00, double %x
ret double %d
}
-; CHECK: y_uge:
+; CHECK: uge_y:
; CHECK-NEXT: movsd {{[^,]*}}, %xmm1
; CHECK-NEXT: maxsd %xmm0, %xmm1
; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
; CHECK-NEXT: ret
-; UNSAFE: y_uge:
+; UNSAFE: uge_y:
; UNSAFE-NEXT: maxsd {{[^,]*}}, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: y_uge:
+; FINITE: uge_y:
; FINITE-NEXT: maxsd {{[^,]*}}, %xmm0
; FINITE-NEXT: ret
-define double @y_uge(double %x) nounwind {
+define double @uge_y(double %x) nounwind {
%c = fcmp uge double %x, -0.000000e+00
%d = select i1 %c, double %x, double -0.000000e+00
ret double %d
}
-; CHECK: y_ule:
+; CHECK: ule_y:
; CHECK-NEXT: movsd {{[^,]*}}, %xmm1
; CHECK-NEXT: minsd %xmm0, %xmm1
; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
; CHECK-NEXT: ret
-; UNSAFE: y_ule:
+; UNSAFE: ule_y:
; UNSAFE-NEXT: minsd {{[^,]*}}, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: y_ule:
+; FINITE: ule_y:
; FINITE-NEXT: minsd {{[^,]*}}, %xmm0
; FINITE-NEXT: ret
-define double @y_ule(double %x) nounwind {
+define double @ule_y(double %x) nounwind {
%c = fcmp ule double %x, -0.000000e+00
%d = select i1 %c, double %x, double -0.000000e+00
ret double %d
}
-; CHECK: y_uge_inverse:
+; CHECK: uge_inverse_y:
; CHECK-NEXT: minsd {{[^,]*}}, %xmm0
; CHECK-NEXT: ret
-; UNSAFE: y_uge_inverse:
+; UNSAFE: uge_inverse_y:
; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
; UNSAFE-NEXT: minsd %xmm0, %xmm1
; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: y_uge_inverse:
+; FINITE: uge_inverse_y:
; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
; FINITE-NEXT: minsd %xmm0, %xmm1
; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @y_uge_inverse(double %x) nounwind {
+define double @uge_inverse_y(double %x) nounwind {
%c = fcmp uge double %x, -0.000000e+00
%d = select i1 %c, double -0.000000e+00, double %x
ret double %d
}
-; CHECK: y_ule_inverse:
+; CHECK: ule_inverse_y:
; CHECK-NEXT: maxsd {{[^,]*}}, %xmm0
; CHECK-NEXT: ret
-; UNSAFE: y_ule_inverse:
+; UNSAFE: ule_inverse_y:
; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
; UNSAFE-NEXT: maxsd %xmm0, %xmm1
; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
-; FINITE: y_ule_inverse:
+; FINITE: ule_inverse_y:
; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
; FINITE-NEXT: maxsd %xmm0, %xmm1
; FINITE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; FINITE-NEXT: ret
-define double @y_ule_inverse(double %x) nounwind {
+define double @ule_inverse_y(double %x) nounwind {
%c = fcmp ule double %x, -0.000000e+00
%d = select i1 %c, double -0.000000e+00, double %x
ret double %d
diff --git a/test/CodeGen/X86/sse3.ll b/test/CodeGen/X86/sse3.ll
index 5ea1b4d..48638b3 100644
--- a/test/CodeGen/X86/sse3.ll
+++ b/test/CodeGen/X86/sse3.ll
@@ -249,9 +249,10 @@ entry:
; X64: t16:
; X64: pextrw $8, %xmm0, %eax
; X64: pslldq $2, %xmm0
-; X64: movd %xmm0, %ecx
-; X64: pextrw $1, %xmm0, %edx
-; X64: pinsrw $0, %ecx, %xmm0
+; X64: pextrw $1, %xmm0, %ecx
+; X64: movzbl %cl, %ecx
+; X64: orl %eax, %ecx
+; X64: pinsrw $1, %ecx, %xmm0
; X64: ret
}
diff --git a/test/CodeGen/X86/sse41-blend.ll b/test/CodeGen/X86/sse41-blend.ll
index 1a1017d..a2a0deb 100644
--- a/test/CodeGen/X86/sse41-blend.ll
+++ b/test/CodeGen/X86/sse41-blend.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -promote-elements -mattr=+sse41 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse41 | FileCheck %s
;CHECK: vsel_float
;CHECK: blendvps
diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll
index 54264b1..c6f9f0c 100644
--- a/test/CodeGen/X86/sse41.ll
+++ b/test/CodeGen/X86/sse41.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse41 | FileCheck %s -check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse41 | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse41 -mcpu=penryn | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse41 -mcpu=penryn | FileCheck %s -check-prefix=X64
@g16 = external global i16
diff --git a/test/CodeGen/X86/sse4a.ll b/test/CodeGen/X86/sse4a.ll
new file mode 100644
index 0000000..076e213
--- /dev/null
+++ b/test/CodeGen/X86/sse4a.ll
@@ -0,0 +1,56 @@
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4a | FileCheck %s
+
+define void @test1(i8* %p, <4 x float> %a) nounwind optsize ssp {
+; CHECK: test1:
+; CHECK: movntss
+ tail call void @llvm.x86.sse4a.movnt.ss(i8* %p, <4 x float> %a) nounwind
+ ret void
+}
+
+declare void @llvm.x86.sse4a.movnt.ss(i8*, <4 x float>)
+
+define void @test2(i8* %p, <2 x double> %a) nounwind optsize ssp {
+; CHECK: test2:
+; CHECK: movntsd
+ tail call void @llvm.x86.sse4a.movnt.sd(i8* %p, <2 x double> %a) nounwind
+ ret void
+}
+
+declare void @llvm.x86.sse4a.movnt.sd(i8*, <2 x double>)
+
+define <2 x i64> @test3(<2 x i64> %x) nounwind uwtable ssp {
+; CHECK: test3:
+; CHECK: extrq
+ %1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 3, i8 2)
+ ret <2 x i64> %1
+}
+
+declare <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64>, i8, i8) nounwind
+
+define <2 x i64> @test4(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+; CHECK: test4:
+; CHECK: extrq
+ %1 = bitcast <2 x i64> %y to <16 x i8>
+ %2 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %x, <16 x i8> %1) nounwind
+ ret <2 x i64> %2
+}
+
+declare <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64>, <16 x i8>) nounwind
+
+define <2 x i64> @test5(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+; CHECK: test5:
+; CHECK: insertq
+ %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %y, i8 5, i8 6)
+ ret <2 x i64> %1
+}
+
+declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8) nounwind
+
+define <2 x i64> @test6(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
+; CHECK: test6:
+; CHECK: insertq
+ %1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %x, <2 x i64> %y) nounwind
+ ret <2 x i64> %1
+}
+
+declare <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64>, <2 x i64>) nounwind
diff --git a/test/CodeGen/X86/sse_reload_fold.ll b/test/CodeGen/X86/sse_reload_fold.ll
index a57fa58..fd8db3b 100644
--- a/test/CodeGen/X86/sse_reload_fold.ll
+++ b/test/CodeGen/X86/sse_reload_fold.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mattr=+64bit,+sse3 -print-failed-fuse-candidates -regalloc=basic |& FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=+64bit,+sse3 -print-failed-fuse-candidates -regalloc=basic 2>&1 | FileCheck %s
; CHECK: fail
; CHECK-NOT: fail
diff --git a/test/CodeGen/X86/stack-align.ll b/test/CodeGen/X86/stack-align.ll
index f6c13ec..0ddb237 100644
--- a/test/CodeGen/X86/stack-align.ll
+++ b/test/CodeGen/X86/stack-align.ll
@@ -10,11 +10,11 @@ target triple = "i686-apple-darwin8"
define void @test({ double, double }* byval %z, double* %P) nounwind {
entry:
%tmp3 = load double* @G, align 16 ; <double> [#uses=1]
- %tmp4 = tail call double @fabs( double %tmp3 ) ; <double> [#uses=1]
+ %tmp4 = tail call double @fabs( double %tmp3 ) readnone ; <double> [#uses=1]
store volatile double %tmp4, double* %P
%tmp = getelementptr { double, double }* %z, i32 0, i32 0 ; <double*> [#uses=1]
%tmp1 = load volatile double* %tmp, align 8 ; <double> [#uses=1]
- %tmp2 = tail call double @fabs( double %tmp1 ) ; <double> [#uses=1]
+ %tmp2 = tail call double @fabs( double %tmp1 ) readnone ; <double> [#uses=1]
; CHECK: andpd{{.*}}4(%esp), %xmm
%tmp6 = fadd double %tmp4, %tmp2 ; <double> [#uses=1]
store volatile double %tmp6, double* %P, align 8
diff --git a/test/CodeGen/X86/stack-protector-linux.ll b/test/CodeGen/X86/stack-protector.ll
index fe2a9c5..c075114 100644
--- a/test/CodeGen/X86/stack-protector-linux.ll
+++ b/test/CodeGen/X86/stack-protector.ll
@@ -1,8 +1,8 @@
; RUN: llc -mtriple=i386-pc-linux-gnu < %s -o - | grep %gs:
; RUN: llc -mtriple=x86_64-pc-linux-gnu < %s -o - | grep %fs:
; RUN: llc -code-model=kernel -mtriple=x86_64-pc-linux-gnu < %s -o - | grep %gs:
-; RUN: llc -mtriple=x86_64-apple-darwin < %s -o - | grep {__stack_chk_guard}
-; RUN: llc -mtriple=x86_64-apple-darwin < %s -o - | grep {__stack_chk_fail}
+; RUN: llc -mtriple=x86_64-apple-darwin < %s -o - | grep "__stack_chk_guard"
+; RUN: llc -mtriple=x86_64-apple-darwin < %s -o - | grep "__stack_chk_fail"
@"\01LC" = internal constant [11 x i8] c"buf == %s\0A\00" ; <[11 x i8]*> [#uses=1]
diff --git a/test/CodeGen/X86/store_op_load_fold2.ll b/test/CodeGen/X86/store_op_load_fold2.ll
index 8313166..6e4fe90 100644
--- a/test/CodeGen/X86/store_op_load_fold2.ll
+++ b/test/CodeGen/X86/store_op_load_fold2.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=i686-linux -x86-asm-syntax=att | FileCheck %s -check-prefix=ATT
-; RUN: llc < %s -mtriple=i686-linux -x86-asm-syntax=intel | FileCheck %s -check-prefix=INTEL
+; RUN: llc < %s -mtriple=i686-linux -mcpu=corei7 -x86-asm-syntax=att | FileCheck %s -check-prefix=ATT
+; RUN: llc < %s -mtriple=i686-linux -mcpu=corei7 -x86-asm-syntax=intel | FileCheck %s -check-prefix=INTEL
target datalayout = "e-p:32:32"
%struct.Macroblock = type { i32, i32, i32, i32, i32, [8 x i32], %struct.Macroblock*, %struct.Macroblock*, i32, [2 x [4 x [4 x [2 x i32]]]], [16 x i8], [16 x i8], i32, i64, [4 x i32], [4 x i32], i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i16, double, i32, i32, i32, i32, i32, i32, i32, i32, i32 }
diff --git a/test/CodeGen/X86/subreg-to-reg-1.ll b/test/CodeGen/X86/subreg-to-reg-1.ll
index a297728..4f31ab5 100644
--- a/test/CodeGen/X86/subreg-to-reg-1.ll
+++ b/test/CodeGen/X86/subreg-to-reg-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep {leal .*), %e.\*} | count 1
+; RUN: llc < %s -march=x86-64 | grep "leal .*), %e.*" | count 1
; Don't eliminate or coalesce away the explicit zero-extension!
; This is currently using an leal because of a 3-addressification detail,
diff --git a/test/CodeGen/X86/subreg-to-reg-4.ll b/test/CodeGen/X86/subreg-to-reg-4.ll
index 0ea5541..0693789 100644
--- a/test/CodeGen/X86/subreg-to-reg-4.ll
+++ b/test/CodeGen/X86/subreg-to-reg-4.ll
@@ -5,7 +5,7 @@
; RUN: not grep negq %t
; RUN: not grep addq %t
; RUN: not grep subq %t
-; RUN: not grep {movl %} %t
+; RUN: not grep "movl %" %t
; Utilize implicit zero-extension on x86-64 to eliminate explicit
; zero-extensions. Shrink 64-bit adds to 32-bit when the high
diff --git a/test/CodeGen/X86/switch-order-weight.ll b/test/CodeGen/X86/switch-order-weight.ll
new file mode 100644
index 0000000..0fdd56d
--- /dev/null
+++ b/test/CodeGen/X86/switch-order-weight.ll
@@ -0,0 +1,37 @@
+; RUN: llc -mtriple=x86_64-apple-darwin11 < %s | FileCheck %s
+
+; Check that the cases which lead to unreachable are checked after "10"
+
+define void @test1(i32 %x) nounwind uwtable ssp {
+entry:
+ switch i32 %x, label %if.end7 [
+ i32 0, label %if.then
+ i32 10, label %if.then2
+ i32 20, label %if.then5
+ ]
+
+; CHECK: test1:
+; CHECK-NOT: unr
+; CHECK: cmpl $10
+; CHECK: bar
+; CHECK: cmpl $20
+
+if.then:
+ tail call void @unr(i32 23) noreturn nounwind
+ unreachable
+
+if.then2:
+ tail call void @bar(i32 42) nounwind
+ br label %if.end7
+
+if.then5:
+ tail call void @unr(i32 5) noreturn nounwind
+ unreachable
+
+if.end7:
+ ret void
+}
+
+declare void @unr(i32) noreturn
+
+declare void @bar(i32)
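A sketch of the compare chain the CHECK lines anticipate (hypothetical register and label names; only the ordering is significant):

  cmpl  $10, %edi       # the one live case is tested first
  je    LBB_bar         # leads to the call to bar
  cmpl  $20, %edi       # unreachable-bound cases come later

Because the 0 and 20 cases end in calls to the noreturn @unr, they are cold, so the lowered switch should test the reachable case (10) before them; the CHECK-NOT guards that no unr call is emitted before that first compare.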
diff --git a/test/CodeGen/X86/tailcall-64.ll b/test/CodeGen/X86/tailcall-64.ll
new file mode 100644
index 0000000..7030753
--- /dev/null
+++ b/test/CodeGen/X86/tailcall-64.ll
@@ -0,0 +1,96 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin11.4.0"
+
+declare i64 @testi()
+
+define i64 @test_trivial() {
+ %A = tail call i64 @testi()
+ ret i64 %A
+}
+; CHECK: test_trivial:
+; CHECK: jmp _testi ## TAILCALL
+
+
+define i64 @test_noop_bitcast() {
+ %A = tail call i64 @testi()
+ %B = bitcast i64 %A to i64
+ ret i64 %B
+}
+; CHECK: test_noop_bitcast:
+; CHECK: jmp _testi ## TAILCALL
+
+
+; Tail call shouldn't be blocked by no-op inttoptr.
+define i8* @test_inttoptr() {
+ %A = tail call i64 @testi()
+ %B = inttoptr i64 %A to i8*
+ ret i8* %B
+}
+
+; CHECK: test_inttoptr:
+; CHECK: jmp _testi ## TAILCALL
+
+
+declare <4 x float> @testv()
+
+define <4 x i32> @test_vectorbitcast() {
+ %A = tail call <4 x float> @testv()
+ %B = bitcast <4 x float> %A to <4 x i32>
+ ret <4 x i32> %B
+}
+; CHECK: test_vectorbitcast:
+; CHECK: jmp _testv ## TAILCALL
+
+
+declare { i64, i64 } @testp()
+
+define {i64, i64} @test_pair_trivial() {
+ %A = tail call { i64, i64} @testp()
+ ret { i64, i64} %A
+}
+; CHECK: test_pair_trivial:
+; CHECK: jmp _testp ## TAILCALL
+
+
+
+define {i64, i64} @test_pair_trivial_extract() {
+ %A = tail call { i64, i64} @testp()
+ %x = extractvalue { i64, i64} %A, 0
+ %y = extractvalue { i64, i64} %A, 1
+
+ %b = insertvalue {i64, i64} undef, i64 %x, 0
+ %c = insertvalue {i64, i64} %b, i64 %y, 1
+
+ ret { i64, i64} %c
+}
+
+; CHECK: test_pair_trivial_extract:
+; CHECK: jmp _testp ## TAILCALL
+
+define {i8*, i64} @test_pair_conv_extract() {
+ %A = tail call { i64, i64} @testp()
+ %x = extractvalue { i64, i64} %A, 0
+ %y = extractvalue { i64, i64} %A, 1
+
+ %x1 = inttoptr i64 %x to i8*
+
+ %b = insertvalue {i8*, i64} undef, i8* %x1, 0
+ %c = insertvalue {i8*, i64} %b, i64 %y, 1
+
+ ret { i8*, i64} %c
+}
+
+; CHECK: test_pair_conv_extract:
+; CHECK: jmp _testp ## TAILCALL
+
+
+
+; PR13006
+define { i64, i64 } @crash(i8* %this) {
+ %c = tail call { i64, i64 } @testp()
+ %mrv7 = insertvalue { i64, i64 } %c, i64 undef, 1
+ ret { i64, i64 } %mrv7
+}
+
+
diff --git a/test/CodeGen/X86/tailcall-cgp-dup.ll b/test/CodeGen/X86/tailcall-cgp-dup.ll
new file mode 100644
index 0000000..a80b90f
--- /dev/null
+++ b/test/CodeGen/X86/tailcall-cgp-dup.ll
@@ -0,0 +1,87 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+
+; Teach CGP to dup returns to enable tail call optimization.
+; rdar://9147433
+
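Here "dup returns" means CodeGenPrepare duplicates a common return block into its predecessors, so that each call is immediately followed by a ret and qualifies for tail call lowering. A minimal before/after sketch (hypothetical blocks, not lifted verbatim from this test):

  ; before: every case branches to one shared return block
  sw.bb:
    %call = tail call i32 @f1()
    br label %return
  return:
    %retval = phi i32 [ %call, %sw.bb ], [ 0, %entry ]
    ret i32 %retval

  ; after duplication: the ret sits right behind the call
  sw.bb:
    %call = tail call i32 @f1()
    ret i32 %call

With the ret adjacent to the call, the backend can emit the direct jmp that the CHECK lines below expect.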
+define i32 @foo(i32 %x) nounwind ssp {
+; CHECK: foo:
+entry:
+ switch i32 %x, label %return [
+ i32 1, label %sw.bb
+ i32 2, label %sw.bb1
+ i32 3, label %sw.bb3
+ i32 4, label %sw.bb5
+ i32 5, label %sw.bb7
+ i32 6, label %sw.bb9
+ ]
+
+sw.bb: ; preds = %entry
+; CHECK: jmp _f1
+ %call = tail call i32 @f1() nounwind
+ br label %return
+
+sw.bb1: ; preds = %entry
+; CHECK: jmp _f2
+ %call2 = tail call i32 @f2() nounwind
+ br label %return
+
+sw.bb3: ; preds = %entry
+; CHECK: jmp _f3
+ %call4 = tail call i32 @f3() nounwind
+ br label %return
+
+sw.bb5: ; preds = %entry
+; CHECK: jmp _f4
+ %call6 = tail call i32 @f4() nounwind
+ br label %return
+
+sw.bb7: ; preds = %entry
+; CHECK: jmp _f5
+ %call8 = tail call i32 @f5() nounwind
+ br label %return
+
+sw.bb9: ; preds = %entry
+; CHECK: jmp _f6
+ %call10 = tail call i32 @f6() nounwind
+ br label %return
+
+return: ; preds = %entry, %sw.bb9, %sw.bb7, %sw.bb5, %sw.bb3, %sw.bb1, %sw.bb
+ %retval.0 = phi i32 [ %call10, %sw.bb9 ], [ %call8, %sw.bb7 ], [ %call6, %sw.bb5 ], [ %call4, %sw.bb3 ], [ %call2, %sw.bb1 ], [ %call, %sw.bb ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+declare i32 @f1()
+
+declare i32 @f2()
+
+declare i32 @f3()
+
+declare i32 @f4()
+
+declare i32 @f5()
+
+declare i32 @f6()
+
+; rdar://11958338
+%0 = type opaque
+
+declare i8* @bar(i8*) uwtable optsize noinline ssp
+
+define hidden %0* @thingWithValue(i8* %self) uwtable ssp {
+entry:
+; CHECK: thingWithValue:
+; CHECK: jmp _bar
+ br i1 undef, label %if.then.i, label %if.else.i
+
+if.then.i: ; preds = %entry
+ br label %someThingWithValue.exit
+
+if.else.i: ; preds = %entry
+ %call4.i = tail call i8* @bar(i8* undef) optsize
+ br label %someThingWithValue.exit
+
+someThingWithValue.exit: ; preds = %if.else.i, %if.then.i
+ %retval.0.in.i = phi i8* [ undef, %if.then.i ], [ %call4.i, %if.else.i ]
+ %retval.0.i = bitcast i8* %retval.0.in.i to %0*
+ ret %0* %retval.0.i
+}
diff --git a/test/CodeGen/X86/tailcall-i1.ll b/test/CodeGen/X86/tailcall-i1.ll
deleted file mode 100644
index 8ef1f11..0000000
--- a/test/CodeGen/X86/tailcall-i1.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-; RUN: llc < %s -march=x86 -tailcallopt | grep TAILCALL
-define fastcc i1 @i1test(i32, i32, i32, i32) {
- entry:
- %4 = tail call fastcc i1 @i1test( i32 %0, i32 %1, i32 %2, i32 %3)
- ret i1 %4
-}
diff --git a/test/CodeGen/X86/tailcall-largecode.ll b/test/CodeGen/X86/tailcall-largecode.ll
index c3f4278..e9b8721 100644
--- a/test/CodeGen/X86/tailcall-largecode.ll
+++ b/test/CodeGen/X86/tailcall-largecode.ll
@@ -49,6 +49,11 @@ define fastcc i32 @direct_manyargs() {
; CHECK: pushq
; Pass the stack argument.
; CHECK: movl $7, 16(%rsp)
+; This is the large code model, so &manyargs_callee may not fit into
+; the jmp instruction. Put it into a register which won't be clobbered
+; while restoring callee-saved registers and won't be used for passing
+; arguments.
+; CHECK: movabsq $manyargs_callee, %rax
; Pass the register arguments, in the right registers.
; CHECK: movl $1, %edi
; CHECK: movl $2, %esi
@@ -56,11 +61,6 @@ define fastcc i32 @direct_manyargs() {
; CHECK: movl $4, %ecx
; CHECK: movl $5, %r8d
; CHECK: movl $6, %r9d
-; This is the large code model, so &manyargs_callee may not fit into
-; the jmp instruction. Put it into R11, which won't be clobbered
-; while restoring callee-saved registers and won't be used for passing
-; arguments.
-; CHECK: movabsq $manyargs_callee, %rax
; Adjust the stack to "return".
; CHECK: popq
; And tail-call to the target.
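Stitched together, the sequence these CHECK lines now expect is roughly (a sketch; the elided moves load $2 through $5, and the final indirect jump is verified just past this hunk):

  movl    $7, 16(%rsp)              # stack argument
  movabsq $manyargs_callee, %rax    # 64-bit callee address, before the arg registers
  movl    $1, %edi                  # register arguments follow
  ...
  movl    $6, %r9d

The change is purely CHECK-line placement: the compiler now materializes the address before loading the argument registers, so the check moves with it.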
diff --git a/test/CodeGen/X86/tailcall-void.ll b/test/CodeGen/X86/tailcall-void.ll
deleted file mode 100644
index 4e578d1..0000000
--- a/test/CodeGen/X86/tailcall-void.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-; RUN: llc < %s -march=x86 -tailcallopt | grep TAILCALL
-define fastcc void @i1test(i32, i32, i32, i32) {
- entry:
- tail call fastcc void @i1test( i32 %0, i32 %1, i32 %2, i32 %3)
- ret void
-}
diff --git a/test/CodeGen/X86/tailcall1.ll b/test/CodeGen/X86/tailcall.ll
index f7ff5d5..36a38e0 100644
--- a/test/CodeGen/X86/tailcall1.ll
+++ b/test/CodeGen/X86/tailcall.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -tailcallopt | grep TAILCALL | count 5
+; RUN: llc < %s -march=x86 -tailcallopt | grep TAILCALL | count 7
; With -tailcallopt, CodeGen guarantees a tail call optimization
; for all of these.
@@ -38,3 +38,15 @@ define fastcc i32 @noret() nounwind {
tail call fastcc void @does_not_return()
unreachable
}
+
+define fastcc void @void_test(i32, i32, i32, i32) {
+ entry:
+ tail call fastcc void @void_test( i32 %0, i32 %1, i32 %2, i32 %3)
+ ret void
+}
+
+define fastcc i1 @i1test(i32, i32, i32, i32) {
+ entry:
+ %4 = tail call fastcc i1 @i1test( i32 %0, i32 %1, i32 %2, i32 %3)
+ ret i1 %4
+}
diff --git a/test/CodeGen/X86/tailcallbyval.ll b/test/CodeGen/X86/tailcallbyval.ll
index 03d6f94..118eee6 100644
--- a/test/CodeGen/X86/tailcallbyval.ll
+++ b/test/CodeGen/X86/tailcallbyval.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=x86 -tailcallopt | grep TAILCALL
-; RUN: llc < %s -march=x86 -tailcallopt | grep {movl\[\[:space:\]\]*4(%esp), %eax} | count 1
+; RUN: llc < %s -march=x86 -tailcallopt | grep "movl[[:space:]]*4(%esp), %eax" | count 1
%struct.s = type {i32, i32, i32, i32, i32, i32, i32, i32,
i32, i32, i32, i32, i32, i32, i32, i32,
i32, i32, i32, i32, i32, i32, i32, i32 }
diff --git a/test/CodeGen/X86/targetLoweringGeneric.ll b/test/CodeGen/X86/targetLoweringGeneric.ll
new file mode 100644
index 0000000..ba5f8f8
--- /dev/null
+++ b/test/CodeGen/X86/targetLoweringGeneric.ll
@@ -0,0 +1,38 @@
+; RUN: llc -mtriple=i386-apple-darwin9 -fast-isel=false -O0 < %s | FileCheck %s
+
+; Gather non-machine-specific tests for the transformations in
+; CodeGen/SelectionDAG/TargetLowering. These transformations act on
+; SDNodes, which cannot easily be tested directly, so the X86
+; assembler output is checked instead.
+
+; rdar://11195364 A problem with the transformation:
+; If all of the demanded bits on one side are known, and all of the set
+; bits on that side are also known to be set on the other side, turn this
+; into an AND, as we know the bits will be cleared.
+; The known set (one) bits of the arguments of %xor1 are not the same, so the
+; transformation should not occur.
+define void @foo(i32 %i32In1, i32 %i32In2, i32 %i32In3, i32 %i32In4,
+ i32 %i32In5, i32 %i32In6, i32* %i32StarOut, i1 %i1In1,
+ i32* %i32SelOut) nounwind {
+ %and3 = and i32 %i32In1, 1362779777
+ %or2 = or i32 %i32In2, %i32In3
+ %and2 = and i32 %or2, 1362779777
+ %xor3 = xor i32 %and3, %and2
+ ; CHECK: shll
+ %shl1 = shl i32 %xor3, %i32In4
+ %sub1 = sub i32 %or2, %shl1
+ %add1 = add i32 %sub1, %i32In5
+ %and1 = and i32 %add1, 1
+ %xor2 = xor i32 %and1, 1
+ %or1 = or i32 %xor2, 364806994 ;0x15BE8352
+ ; CHECK-NOT: andl $96239955
+ %xor1 = xor i32 %or1, 268567040 ;0x10020200
+ ; force an output so not DCE'd
+ store i32 %xor1, i32* %i32StarOut
+ ; use a select to keep this code out of fast isel
+ %i32SelVal = select i1 %i1In1, i32 %i32In1, i32 %xor1
+ store i32 %i32SelVal, i32* %i32SelOut
+ ; CHECK: ret
+ ret void
+}
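A worked miniature of that rule (hypothetical i8 values, not the test's constants): if %v is known to satisfy (%v & 12) == 12, then every set bit of the constant 12 is known set in %v, so the xor merely clears those bits and may become an and:

  %r = xor i8 %v, 12    ; legal to fold to:  %r = and i8 %v, -13  (that is, %v & ~12)

In the test the precondition fails: the known one bits of the two %xor1 operands differ, so the CHECK-NOT insists the andl never appears.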
diff --git a/test/CodeGen/X86/thiscall-struct-return.ll b/test/CodeGen/X86/thiscall-struct-return.ll
index a7be483..0507cb8 100644
--- a/test/CodeGen/X86/thiscall-struct-return.ll
+++ b/test/CodeGen/X86/thiscall-struct-return.ll
@@ -10,7 +10,7 @@ declare x86_thiscallcc void @_ZNK1C6MediumEv(%struct.M* noalias sret %agg.result
define void @testv() nounwind {
; CHECK: testv:
-; CHECK: leal
+; CHECK: leal 16(%esp), %esi
; CHECK-NEXT: movl %esi, (%esp)
; CHECK-NEXT: calll _ZN1CC1Ev
; CHECK: leal 8(%esp), %eax
@@ -29,7 +29,7 @@ entry:
define void @test2v() nounwind {
; CHECK: test2v:
-; CHECK: leal
+; CHECK: leal 16(%esp), %esi
; CHECK-NEXT: movl %esi, (%esp)
; CHECK-NEXT: calll _ZN1CC1Ev
; CHECK: leal 8(%esp), %eax
diff --git a/test/CodeGen/X86/tls-local-dynamic.ll b/test/CodeGen/X86/tls-local-dynamic.ll
new file mode 100644
index 0000000..c5fd16b
--- /dev/null
+++ b/test/CodeGen/X86/tls-local-dynamic.ll
@@ -0,0 +1,59 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu -relocation-model=pic | FileCheck %s
+
+@x = internal thread_local global i32 0, align 4
+@y = internal thread_local global i32 0, align 4
+
+; get_x and get_y are here to prevent x and y from being optimized away as 0
+
+define i32* @get_x() {
+entry:
+ ret i32* @x
+; FIXME: This function uses a single thread-local variable,
+; so we might want to fall back to general-dynamic here.
+; CHECK: get_x:
+; CHECK: leaq x@TLSLD(%rip), %rdi
+; CHECK-NEXT: callq __tls_get_addr@PLT
+; CHECK: x@DTPOFF
+}
+
+define i32* @get_y() {
+entry:
+ ret i32* @y
+}
+
+define i32 @f(i32 %i) {
+entry:
+ %cmp = icmp eq i32 %i, 1
+ br i1 %cmp, label %return, label %if.else
+; This bb does not access TLS, so should not call __tls_get_addr.
+; CHECK: f:
+; CHECK-NOT: __tls_get_addr
+; CHECK: je
+
+
+if.else:
+ %0 = load i32* @x, align 4
+ %cmp1 = icmp eq i32 %i, 2
+ br i1 %cmp1, label %if.then2, label %return
+; Now we call __tls_get_addr.
+; CHECK: # %if.else
+; CHECK: leaq x@TLSLD(%rip), %rdi
+; CHECK-NEXT: callq __tls_get_addr@PLT
+; CHECK: x@DTPOFF
+
+
+if.then2:
+ %1 = load i32* @y, align 4
+ %add = add nsw i32 %1, %0
+ br label %return
+; This accesses TLS, but is dominated by the previous block,
+; so should not have to call __tls_get_addr again.
+; CHECK: # %if.then2
+; CHECK-NOT: __tls_get_addr
+; CHECK: y@DTPOFF
+
+
+return:
+ %retval.0 = phi i32 [ %add, %if.then2 ], [ 5, %entry ], [ %0, %if.else ]
+ ret i32 %retval.0
+}
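Conceptually, local-dynamic lets both accesses share one base pointer; a sketch of the lowering the checks trace (pseudo operands, abbreviated):

  leaq  x@TLSLD(%rip), %rdi     # materialize the module id, once, in %if.else
  callq __tls_get_addr@PLT      # returns the module's TLS block base in %rax
  movl  x@DTPOFF(%rax), ...     # x at its offset from the base
  movl  y@DTPOFF(%rax), ...     # %if.then2 reuses the same base, no second call

The dominance argument in the comments is exactly this: %if.then2 can only be reached through %if.else, so the base computed there is still available.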
diff --git a/test/CodeGen/X86/tls-models.ll b/test/CodeGen/X86/tls-models.ll
new file mode 100644
index 0000000..7c527e2
--- /dev/null
+++ b/test/CodeGen/X86/tls-models.ll
@@ -0,0 +1,166 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu | FileCheck -check-prefix=X64 %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu -relocation-model=pic | FileCheck -check-prefix=X64_PIC %s
+; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu | FileCheck -check-prefix=X32 %s
+; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -relocation-model=pic | FileCheck -check-prefix=X32_PIC %s
+
+; Darwin always uses the same model.
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin | FileCheck -check-prefix=DARWIN %s
+
+@external_gd = external thread_local global i32
+@internal_gd = internal thread_local global i32 42
+
+@external_ld = external thread_local(localdynamic) global i32
+@internal_ld = internal thread_local(localdynamic) global i32 42
+
+@external_ie = external thread_local(initialexec) global i32
+@internal_ie = internal thread_local(initialexec) global i32 42
+
+@external_le = external thread_local(localexec) global i32
+@internal_le = internal thread_local(localexec) global i32 42
+
+; ----- no model specified -----
+
+define i32* @f1() {
+entry:
+ ret i32* @external_gd
+
+ ; Non-PIC code can use initial exec, PIC code has to use general dynamic.
+ ; X64: f1:
+ ; X64: external_gd@GOTTPOFF
+ ; X32: f1:
+ ; X32: external_gd@INDNTPOFF
+ ; X64_PIC: f1:
+ ; X64_PIC: external_gd@TLSGD
+ ; X32_PIC: f1:
+ ; X32_PIC: external_gd@TLSGD
+ ; DARWIN: f1:
+ ; DARWIN: _external_gd@TLVP
+}
+
+define i32* @f2() {
+entry:
+ ret i32* @internal_gd
+
+ ; Non-PIC code can use local exec, PIC code can use local dynamic.
+ ; X64: f2:
+ ; X64: internal_gd@TPOFF
+ ; X32: f2:
+ ; X32: internal_gd@NTPOFF
+ ; X64_PIC: f2:
+ ; X64_PIC: internal_gd@TLSLD
+ ; X32_PIC: f2:
+ ; X32_PIC: internal_gd@TLSLDM
+ ; DARWIN: f2:
+ ; DARWIN: _internal_gd@TLVP
+}
+
+
+; ----- localdynamic specified -----
+
+define i32* @f3() {
+entry:
+ ret i32* @external_ld
+
+ ; Non-PIC code can use initial exec, PIC code uses local dynamic as specified.
+ ; X64: f3:
+ ; X64: external_ld@GOTTPOFF
+ ; X32: f3:
+ ; X32: external_ld@INDNTPOFF
+ ; X64_PIC: f3:
+ ; X64_PIC: external_ld@TLSLD
+ ; X32_PIC: f3:
+ ; X32_PIC: external_ld@TLSLDM
+ ; DARWIN: f3:
+ ; DARWIN: _external_ld@TLVP
+}
+
+define i32* @f4() {
+entry:
+ ret i32* @internal_ld
+
+ ; Non-PIC code can use local exec, PIC code can use local dynamic.
+ ; X64: f4:
+ ; X64: internal_ld@TPOFF
+ ; X32: f4:
+ ; X32: internal_ld@NTPOFF
+ ; X64_PIC: f4:
+ ; X64_PIC: internal_ld@TLSLD
+ ; X32_PIC: f4:
+ ; X32_PIC: internal_ld@TLSLDM
+ ; DARWIN: f4:
+ ; DARWIN: _internal_ld@TLVP
+}
+
+
+; ----- initialexec specified -----
+
+define i32* @f5() {
+entry:
+ ret i32* @external_ie
+
+ ; Non-PIC and PIC code will use initial exec as specified.
+ ; X64: f5:
+ ; X64: external_ie@GOTTPOFF
+ ; X32: f5:
+ ; X32: external_ie@INDNTPOFF
+ ; X64_PIC: f5:
+ ; X64_PIC: external_ie@GOTTPOFF
+ ; X32_PIC: f5:
+ ; X32_PIC: external_ie@GOTNTPOFF
+ ; DARWIN: f5:
+ ; DARWIN: _external_ie@TLVP
+}
+
+define i32* @f6() {
+entry:
+ ret i32* @internal_ie
+
+ ; Non-PIC code can use local exec, PIC code uses initial exec as specified.
+ ; X64: f6:
+ ; X64: internal_ie@TPOFF
+ ; X32: f6:
+ ; X32: internal_ie@NTPOFF
+ ; X64_PIC: f6:
+ ; X64_PIC: internal_ie@GOTTPOFF
+ ; X32_PIC: f6:
+ ; X32_PIC: internal_ie@GOTNTPOFF
+ ; DARWIN: f6:
+ ; DARWIN: _internal_ie@TLVP
+}
+
+
+; ----- localexec specified -----
+
+define i32* @f7() {
+entry:
+ ret i32* @external_le
+
+ ; Non-PIC and PIC code will use local exec as specified.
+ ; X64: f7:
+ ; X64: external_le@TPOFF
+ ; X32: f7:
+ ; X32: external_le@NTPOFF
+ ; X64_PIC: f7:
+ ; X64_PIC: external_le@TPOFF
+ ; X32_PIC: f7:
+ ; X32_PIC: external_le@NTPOFF
+ ; DARWIN: f7:
+ ; DARWIN: _external_le@TLVP
+}
+
+define i32* @f8() {
+entry:
+ ret i32* @internal_le
+
+ ; Non-PIC and PIC code will use local exec as specified.
+ ; X64: f8:
+ ; X64: internal_le@TPOFF
+ ; X32: f8:
+ ; X32: internal_le@NTPOFF
+ ; X64_PIC: f8:
+ ; X64_PIC: internal_le@TPOFF
+ ; X32_PIC: f8:
+ ; X32_PIC: internal_le@NTPOFF
+ ; DARWIN: f8:
+ ; DARWIN: _internal_le@TLVP
+}
diff --git a/test/CodeGen/X86/tls-pic.ll b/test/CodeGen/X86/tls-pic.ll
index b83416d..51c3d23 100644
--- a/test/CodeGen/X86/tls-pic.ll
+++ b/test/CodeGen/X86/tls-pic.ll
@@ -2,6 +2,8 @@
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu -relocation-model=pic | FileCheck -check-prefix=X64 %s
@i = thread_local global i32 15
+@j = internal thread_local global i32 42
+@k = internal thread_local global i32 42
define i32 @f1() {
entry:
@@ -64,4 +66,22 @@ entry:
; X64: callq __tls_get_addr@PLT
+define i32 @f5() nounwind {
+entry:
+ %0 = load i32* @j, align 4
+ %1 = load i32* @k, align 4
+ %add = add nsw i32 %0, %1
+ ret i32 %add
+}
+; X32: f5:
+; X32: leal {{[jk]}}@TLSLDM(%ebx)
+; X32-NEXT: calll ___tls_get_addr@PLT
+; X32-NEXT: movl {{[jk]}}@DTPOFF(%eax)
+; X32-NEXT: addl {{[jk]}}@DTPOFF(%eax)
+
+; X64: f5:
+; X64: leaq {{[jk]}}@TLSLD(%rip), %rdi
+; X64-NEXT: callq __tls_get_addr@PLT
+; X64-NEXT: movl {{[jk]}}@DTPOFF(%rax)
+; X64-NEXT: addl {{[jk]}}@DTPOFF(%rax)
diff --git a/test/CodeGen/X86/tls-pie.ll b/test/CodeGen/X86/tls-pie.ll
index e2e58a54..3fca9f5 100644
--- a/test/CodeGen/X86/tls-pie.ll
+++ b/test/CodeGen/X86/tls-pie.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -relocation-model=pic -enable-pie \
+; RUN: llc < %s -march=x86 -mcpu=generic -mtriple=i386-linux-gnu -relocation-model=pic -enable-pie \
; RUN: | FileCheck -check-prefix=X32 %s
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu -relocation-model=pic -enable-pie \
+; RUN: llc < %s -march=x86-64 -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic -enable-pie \
; RUN: | FileCheck -check-prefix=X64 %s
@i = thread_local global i32 15
@@ -35,7 +35,12 @@ entry:
define i32 @f3() {
; X32: f3:
-; X32: movl i2@INDNTPOFF, %eax
+; X32: calll .L{{[0-9]+}}$pb
+; X32-NEXT: .L{{[0-9]+}}$pb:
+; X32-NEXT: popl %eax
+; X32-NEXT: .Ltmp{{[0-9]+}}:
+; X32-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp{{[0-9]+}}-.L{{[0-9]+}}$pb), %eax
+; X32-NEXT: movl i2@GOTNTPOFF(%eax), %eax
; X32-NEXT: movl %gs:(%eax), %eax
; X32-NEXT: ret
; X64: f3:
@@ -50,8 +55,13 @@ entry:
define i32* @f4() {
; X32: f4:
-; X32: movl %gs:0, %eax
-; X32-NEXT: addl i2@INDNTPOFF, %eax
+; X32: calll .L{{[0-9]+}}$pb
+; X32-NEXT: .L{{[0-9]+}}$pb:
+; X32-NEXT: popl %ecx
+; X32-NEXT: .Ltmp{{[0-9]+}}:
+; X32-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp{{[0-9]+}}-.L{{[0-9]+}}$pb), %ecx
+; X32-NEXT: movl %gs:0, %eax
+; X32-NEXT: addl i2@GOTNTPOFF(%ecx), %eax
; X32-NEXT: ret
; X64: f4:
; X64: movq %fs:0, %rax
diff --git a/test/CodeGen/X86/trap.ll b/test/CodeGen/X86/trap.ll
index 03ae6bf..3f44be0 100644
--- a/test/CodeGen/X86/trap.ll
+++ b/test/CodeGen/X86/trap.ll
@@ -1,9 +1,21 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | grep ud2
-define i32 @test() noreturn nounwind {
+; RUN: llc < %s -march=x86 -mcpu=yonah | FileCheck %s
+
+; CHECK: test0:
+; CHECK: ud2
+define i32 @test0() noreturn nounwind {
entry:
tail call void @llvm.trap( )
unreachable
}
+; CHECK: test1:
+; CHECK: int3
+define i32 @test1() noreturn nounwind {
+entry:
+ tail call void @llvm.debugtrap( )
+ unreachable
+}
+
declare void @llvm.trap() nounwind
+declare void @llvm.debugtrap() nounwind
diff --git a/test/CodeGen/X86/trunc-ext-ld-st.ll b/test/CodeGen/X86/trunc-ext-ld-st.ll
index 57d6e97..9877d7b 100644
--- a/test/CodeGen/X86/trunc-ext-ld-st.ll
+++ b/test/CodeGen/X86/trunc-ext-ld-st.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 -promote-elements -mattr=+sse41 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+sse41 | FileCheck %s
;CHECK: load_2_i8
; A single 16-bit load
diff --git a/test/CodeGen/X86/twoaddr-coalesce-2.ll b/test/CodeGen/X86/twoaddr-coalesce-2.ll
index 6f16a25..af6d47a 100644
--- a/test/CodeGen/X86/twoaddr-coalesce-2.ll
+++ b/test/CodeGen/X86/twoaddr-coalesce-2.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -stats |& \
-; RUN: grep {twoaddrinstr} | grep {Number of instructions aggressively commuted}
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn -stats 2>&1 | \
+; RUN: grep "twoaddrinstr" | grep "Number of instructions aggressively commuted"
; rdar://6480363
target triple = "i386-apple-darwin9.6"
diff --git a/test/CodeGen/X86/twoaddr-pass-sink.ll b/test/CodeGen/X86/twoaddr-pass-sink.ll
index 077fee0..513c304 100644
--- a/test/CodeGen/X86/twoaddr-pass-sink.ll
+++ b/test/CodeGen/X86/twoaddr-pass-sink.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -stats |& grep {Number of 3-address instructions sunk}
+; RUN: llc < %s -march=x86 -mattr=+sse2 -stats 2>&1 | grep "Number of 3-address instructions sunk"
define void @t2(<2 x i64>* %vDct, <2 x i64>* %vYp, i8* %skiplist, <2 x i64> %a1) nounwind {
entry:
diff --git a/test/CodeGen/X86/uint_to_fp.ll b/test/CodeGen/X86/uint_to_fp.ll
index 41ee194..0536eb0 100644
--- a/test/CodeGen/X86/uint_to_fp.ll
+++ b/test/CodeGen/X86/uint_to_fp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | not grep {sub.*esp}
+; RUN: llc < %s -march=x86 -mcpu=yonah | not grep "sub.*esp"
; RUN: llc < %s -march=x86 -mcpu=yonah | grep cvtsi2ss
; rdar://6034396
diff --git a/test/CodeGen/X86/umul-with-carry.ll b/test/CodeGen/X86/umul-with-carry.ll
index 7416051..56fdadb 100644
--- a/test/CodeGen/X86/umul-with-carry.ll
+++ b/test/CodeGen/X86/umul-with-carry.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep {jc} | count 1
+; RUN: llc < %s -march=x86 | grep "jc" | count 1
; XFAIL: *
; FIXME: umul-with-overflow not supported yet.
diff --git a/test/CodeGen/X86/unwindraise.ll b/test/CodeGen/X86/unwindraise.ll
new file mode 100644
index 0000000..a438723
--- /dev/null
+++ b/test/CodeGen/X86/unwindraise.ll
@@ -0,0 +1,252 @@
+; RUN: llc < %s -verify-machineinstrs
+; PR13188
+;
+; The _Unwind_RaiseException function can return normally and via eh.return.
+; This causes confusion about the function live-out registers, since the two
+; different ways of returning have different return values.
+;
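Reduced to a minimal sketch, the troublesome shape is a function with both kinds of exit (hypothetical function; the intrinsic is the same one declared at the bottom of the real test):

define i32 @two_exits(i64 %off, i8* %handler, i1 %unwind) {
entry:
  br i1 %unwind, label %eh, label %normal
normal:                                          ; ordinary return path
  ret i32 0
eh:                                              ; eh.return path, never returns
  call void @llvm.eh.return.i64(i64 %off, i8* %handler)
  unreachable
}

declare void @llvm.eh.return.i64(i64, i8*)

Each exit leaves different registers live-out, which is what -verify-machineinstrs is guarding here.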
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-freebsd9.0"
+
+%struct._Unwind_Context = type { [18 x i8*], i8*, i8*, i8*, %struct.dwarf_eh_bases, i64, i64, i64, [18 x i8] }
+%struct.dwarf_eh_bases = type { i8*, i8*, i8* }
+%struct._Unwind_FrameState = type { %struct.frame_state_reg_info, i64, i64, i8*, i32, i8*, i32 (i32, i32, i64, %struct._Unwind_Exception*, %struct._Unwind_Context*)*, i64, i64, i64, i8, i8, i8, i8, i8* }
+%struct.frame_state_reg_info = type { [18 x %struct.anon], %struct.frame_state_reg_info* }
+%struct.anon = type { %union.anon, i32 }
+%union.anon = type { i64 }
+%struct._Unwind_Exception = type { i64, void (i32, %struct._Unwind_Exception*)*, i64, i64 }
+
+@dwarf_reg_size_table = external hidden unnamed_addr global [18 x i8], align 16
+
+declare void @abort() noreturn
+
+declare fastcc i32 @uw_frame_state_for(%struct._Unwind_Context*, %struct._Unwind_FrameState*) uwtable
+
+define hidden i32 @_Unwind_RaiseException(%struct._Unwind_Exception* %exc) uwtable {
+entry:
+ %fs.i = alloca %struct._Unwind_FrameState, align 8
+ %this_context = alloca %struct._Unwind_Context, align 8
+ %cur_context = alloca %struct._Unwind_Context, align 8
+ %fs = alloca %struct._Unwind_FrameState, align 8
+ call void @llvm.eh.unwind.init()
+ %0 = call i8* @llvm.eh.dwarf.cfa(i32 0)
+ %1 = call i8* @llvm.returnaddress(i32 0)
+ call fastcc void @uw_init_context_1(%struct._Unwind_Context* %this_context, i8* %0, i8* %1)
+ %2 = bitcast %struct._Unwind_Context* %cur_context to i8*
+ %3 = bitcast %struct._Unwind_Context* %this_context to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 240, i32 8, i1 false)
+ %personality = getelementptr inbounds %struct._Unwind_FrameState* %fs, i64 0, i32 6
+ %retaddr_column.i = getelementptr inbounds %struct._Unwind_FrameState* %fs, i64 0, i32 9
+ %flags.i.i.i.i = getelementptr inbounds %struct._Unwind_Context* %cur_context, i64 0, i32 5
+ %ra.i = getelementptr inbounds %struct._Unwind_Context* %cur_context, i64 0, i32 2
+ %exception_class = getelementptr inbounds %struct._Unwind_Exception* %exc, i64 0, i32 0
+ br label %while.body
+
+while.body: ; preds = %uw_update_context.exit, %entry
+ %call = call fastcc i32 @uw_frame_state_for(%struct._Unwind_Context* %cur_context, %struct._Unwind_FrameState* %fs)
+ switch i32 %call, label %do.end21 [
+ i32 5, label %do.end21.loopexit46
+ i32 0, label %if.end3
+ ]
+
+if.end3: ; preds = %while.body
+ %4 = load i32 (i32, i32, i64, %struct._Unwind_Exception*, %struct._Unwind_Context*)** %personality, align 8, !tbaa !0
+ %tobool = icmp eq i32 (i32, i32, i64, %struct._Unwind_Exception*, %struct._Unwind_Context*)* %4, null
+ br i1 %tobool, label %if.end13, label %if.then4
+
+if.then4: ; preds = %if.end3
+ %5 = load i64* %exception_class, align 8, !tbaa !3
+ %call6 = call i32 %4(i32 1, i32 1, i64 %5, %struct._Unwind_Exception* %exc, %struct._Unwind_Context* %cur_context)
+ switch i32 %call6, label %do.end21.loopexit46 [
+ i32 6, label %while.end
+ i32 8, label %if.end13
+ ]
+
+if.end13: ; preds = %if.then4, %if.end3
+ call fastcc void @uw_update_context_1(%struct._Unwind_Context* %cur_context, %struct._Unwind_FrameState* %fs)
+ %6 = load i64* %retaddr_column.i, align 8, !tbaa !3
+ %conv.i = trunc i64 %6 to i32
+ %cmp.i.i.i = icmp slt i32 %conv.i, 18
+ br i1 %cmp.i.i.i, label %cond.end.i.i.i, label %cond.true.i.i.i
+
+cond.true.i.i.i: ; preds = %if.end13
+ call void @abort() noreturn
+ unreachable
+
+cond.end.i.i.i: ; preds = %if.end13
+ %sext.i = shl i64 %6, 32
+ %idxprom.i.i.i = ashr exact i64 %sext.i, 32
+ %arrayidx.i.i.i = getelementptr inbounds [18 x i8]* @dwarf_reg_size_table, i64 0, i64 %idxprom.i.i.i
+ %7 = load i8* %arrayidx.i.i.i, align 1, !tbaa !1
+ %arrayidx2.i.i.i = getelementptr inbounds %struct._Unwind_Context* %cur_context, i64 0, i32 0, i64 %idxprom.i.i.i
+ %8 = load i8** %arrayidx2.i.i.i, align 8, !tbaa !0
+ %9 = load i64* %flags.i.i.i.i, align 8, !tbaa !3
+ %and.i.i.i.i = and i64 %9, 4611686018427387904
+ %tobool.i.i.i = icmp eq i64 %and.i.i.i.i, 0
+ br i1 %tobool.i.i.i, label %if.end.i.i.i, label %land.lhs.true.i.i.i
+
+land.lhs.true.i.i.i: ; preds = %cond.end.i.i.i
+ %arrayidx4.i.i.i = getelementptr inbounds %struct._Unwind_Context* %cur_context, i64 0, i32 8, i64 %idxprom.i.i.i
+ %10 = load i8* %arrayidx4.i.i.i, align 1, !tbaa !1
+ %tobool6.i.i.i = icmp eq i8 %10, 0
+ br i1 %tobool6.i.i.i, label %if.end.i.i.i, label %if.then.i.i.i
+
+if.then.i.i.i: ; preds = %land.lhs.true.i.i.i
+ %11 = ptrtoint i8* %8 to i64
+ br label %uw_update_context.exit
+
+if.end.i.i.i: ; preds = %land.lhs.true.i.i.i, %cond.end.i.i.i
+ %cmp8.i.i.i = icmp eq i8 %7, 8
+ br i1 %cmp8.i.i.i, label %if.then10.i.i.i, label %cond.true14.i.i.i
+
+if.then10.i.i.i: ; preds = %if.end.i.i.i
+ %12 = bitcast i8* %8 to i64*
+ %13 = load i64* %12, align 8, !tbaa !3
+ br label %uw_update_context.exit
+
+cond.true14.i.i.i: ; preds = %if.end.i.i.i
+ call void @abort() noreturn
+ unreachable
+
+uw_update_context.exit: ; preds = %if.then10.i.i.i, %if.then.i.i.i
+ %retval.0.i.i.i = phi i64 [ %11, %if.then.i.i.i ], [ %13, %if.then10.i.i.i ]
+ %14 = inttoptr i64 %retval.0.i.i.i to i8*
+ store i8* %14, i8** %ra.i, align 8, !tbaa !0
+ br label %while.body
+
+while.end: ; preds = %if.then4
+ %private_1 = getelementptr inbounds %struct._Unwind_Exception* %exc, i64 0, i32 2
+ store i64 0, i64* %private_1, align 8, !tbaa !3
+ %15 = load i8** %ra.i, align 8, !tbaa !0
+ %16 = ptrtoint i8* %15 to i64
+ %private_2 = getelementptr inbounds %struct._Unwind_Exception* %exc, i64 0, i32 3
+ store i64 %16, i64* %private_2, align 8, !tbaa !3
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 240, i32 8, i1 false)
+ %17 = bitcast %struct._Unwind_FrameState* %fs.i to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %17)
+ %personality.i = getelementptr inbounds %struct._Unwind_FrameState* %fs.i, i64 0, i32 6
+ %retaddr_column.i22 = getelementptr inbounds %struct._Unwind_FrameState* %fs.i, i64 0, i32 9
+ br label %while.body.i
+
+while.body.i: ; preds = %uw_update_context.exit44, %while.end
+ %call.i = call fastcc i32 @uw_frame_state_for(%struct._Unwind_Context* %cur_context, %struct._Unwind_FrameState* %fs.i)
+ %18 = load i8** %ra.i, align 8, !tbaa !0
+ %19 = ptrtoint i8* %18 to i64
+ %20 = load i64* %private_2, align 8, !tbaa !3
+ %cmp.i = icmp eq i64 %19, %20
+ %cmp2.i = icmp eq i32 %call.i, 0
+ br i1 %cmp2.i, label %if.end.i, label %do.end21
+
+if.end.i: ; preds = %while.body.i
+ %21 = load i32 (i32, i32, i64, %struct._Unwind_Exception*, %struct._Unwind_Context*)** %personality.i, align 8, !tbaa !0
+ %tobool.i = icmp eq i32 (i32, i32, i64, %struct._Unwind_Exception*, %struct._Unwind_Context*)* %21, null
+ br i1 %tobool.i, label %if.end12.i, label %if.then3.i
+
+if.then3.i: ; preds = %if.end.i
+ %or.i = select i1 %cmp.i, i32 6, i32 2
+ %22 = load i64* %exception_class, align 8, !tbaa !3
+ %call5.i = call i32 %21(i32 1, i32 %or.i, i64 %22, %struct._Unwind_Exception* %exc, %struct._Unwind_Context* %cur_context)
+ switch i32 %call5.i, label %do.end21 [
+ i32 7, label %do.body19
+ i32 8, label %if.end12.i
+ ]
+
+if.end12.i: ; preds = %if.then3.i, %if.end.i
+ br i1 %cmp.i, label %cond.true.i, label %cond.end.i
+
+cond.true.i: ; preds = %if.end12.i
+ call void @abort() noreturn
+ unreachable
+
+cond.end.i: ; preds = %if.end12.i
+ call fastcc void @uw_update_context_1(%struct._Unwind_Context* %cur_context, %struct._Unwind_FrameState* %fs.i)
+ %23 = load i64* %retaddr_column.i22, align 8, !tbaa !3
+ %conv.i23 = trunc i64 %23 to i32
+ %cmp.i.i.i24 = icmp slt i32 %conv.i23, 18
+ br i1 %cmp.i.i.i24, label %cond.end.i.i.i33, label %cond.true.i.i.i25
+
+cond.true.i.i.i25: ; preds = %cond.end.i
+ call void @abort() noreturn
+ unreachable
+
+cond.end.i.i.i33: ; preds = %cond.end.i
+ %sext.i26 = shl i64 %23, 32
+ %idxprom.i.i.i27 = ashr exact i64 %sext.i26, 32
+ %arrayidx.i.i.i28 = getelementptr inbounds [18 x i8]* @dwarf_reg_size_table, i64 0, i64 %idxprom.i.i.i27
+ %24 = load i8* %arrayidx.i.i.i28, align 1, !tbaa !1
+ %arrayidx2.i.i.i29 = getelementptr inbounds %struct._Unwind_Context* %cur_context, i64 0, i32 0, i64 %idxprom.i.i.i27
+ %25 = load i8** %arrayidx2.i.i.i29, align 8, !tbaa !0
+ %26 = load i64* %flags.i.i.i.i, align 8, !tbaa !3
+ %and.i.i.i.i31 = and i64 %26, 4611686018427387904
+ %tobool.i.i.i32 = icmp eq i64 %and.i.i.i.i31, 0
+ br i1 %tobool.i.i.i32, label %if.end.i.i.i39, label %land.lhs.true.i.i.i36
+
+land.lhs.true.i.i.i36: ; preds = %cond.end.i.i.i33
+ %arrayidx4.i.i.i34 = getelementptr inbounds %struct._Unwind_Context* %cur_context, i64 0, i32 8, i64 %idxprom.i.i.i27
+ %27 = load i8* %arrayidx4.i.i.i34, align 1, !tbaa !1
+ %tobool6.i.i.i35 = icmp eq i8 %27, 0
+ br i1 %tobool6.i.i.i35, label %if.end.i.i.i39, label %if.then.i.i.i37
+
+if.then.i.i.i37: ; preds = %land.lhs.true.i.i.i36
+ %28 = ptrtoint i8* %25 to i64
+ br label %uw_update_context.exit44
+
+if.end.i.i.i39: ; preds = %land.lhs.true.i.i.i36, %cond.end.i.i.i33
+ %cmp8.i.i.i38 = icmp eq i8 %24, 8
+ br i1 %cmp8.i.i.i38, label %if.then10.i.i.i40, label %cond.true14.i.i.i41
+
+if.then10.i.i.i40: ; preds = %if.end.i.i.i39
+ %29 = bitcast i8* %25 to i64*
+ %30 = load i64* %29, align 8, !tbaa !3
+ br label %uw_update_context.exit44
+
+cond.true14.i.i.i41: ; preds = %if.end.i.i.i39
+ call void @abort() noreturn
+ unreachable
+
+uw_update_context.exit44: ; preds = %if.then10.i.i.i40, %if.then.i.i.i37
+ %retval.0.i.i.i42 = phi i64 [ %28, %if.then.i.i.i37 ], [ %30, %if.then10.i.i.i40 ]
+ %31 = inttoptr i64 %retval.0.i.i.i42 to i8*
+ store i8* %31, i8** %ra.i, align 8, !tbaa !0
+ br label %while.body.i
+
+do.body19: ; preds = %if.then3.i
+ call void @llvm.lifetime.end(i64 -1, i8* %17)
+ %call20 = call fastcc i64 @uw_install_context_1(%struct._Unwind_Context* %this_context, %struct._Unwind_Context* %cur_context)
+ %32 = load i8** %ra.i, align 8, !tbaa !0
+ call void @llvm.eh.return.i64(i64 %call20, i8* %32)
+ unreachable
+
+do.end21.loopexit46: ; preds = %if.then4, %while.body
+ %retval.0.ph = phi i32 [ 3, %if.then4 ], [ 5, %while.body ]
+ br label %do.end21
+
+do.end21: ; preds = %do.end21.loopexit46, %if.then3.i, %while.body.i, %while.body
+ %retval.0 = phi i32 [ %retval.0.ph, %do.end21.loopexit46 ], [ 3, %while.body ], [ 2, %while.body.i ], [ 2, %if.then3.i ]
+ ret i32 %retval.0
+}
+
+declare void @llvm.eh.unwind.init() nounwind
+
+declare fastcc void @uw_init_context_1(%struct._Unwind_Context*, i8*, i8*) uwtable
+
+declare i8* @llvm.eh.dwarf.cfa(i32) nounwind
+
+declare i8* @llvm.returnaddress(i32) nounwind readnone
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+
+declare fastcc i64 @uw_install_context_1(%struct._Unwind_Context*, %struct._Unwind_Context*) uwtable
+
+declare void @llvm.eh.return.i64(i64, i8*) nounwind
+
+declare fastcc void @uw_update_context_1(%struct._Unwind_Context*, %struct._Unwind_FrameState* nocapture) uwtable
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
+
+declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+
+!0 = metadata !{metadata !"any pointer", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!3 = metadata !{metadata !"long", metadata !1}
diff --git a/test/CodeGen/X86/v-binop-widen2.ll b/test/CodeGen/X86/v-binop-widen2.ll
index ae3f55a..569586a 100644
--- a/test/CodeGen/X86/v-binop-widen2.ll
+++ b/test/CodeGen/X86/v-binop-widen2.ll
@@ -1,9 +1,16 @@
-; RUN: llc -march=x86 -mattr=+sse < %s | FileCheck %s
+; RUN: llc -march=x86 -mcpu=generic -mattr=+sse < %s | FileCheck %s
+; RUN: llc -march=x86 -mcpu=atom -mattr=+sse < %s | FileCheck -check-prefix=ATOM %s
%vec = type <6 x float>
; CHECK: divss
; CHECK: divss
; CHECK: divps
+
+; Scheduler causes a different instruction order to be produced on Intel Atom
+; ATOM: divps
+; ATOM: divss
+; ATOM: divss
+
define %vec @vecdiv( %vec %p1, %vec %p2)
{
%result = fdiv %vec %p1, %p2
diff --git a/test/CodeGen/X86/vec_call.ll b/test/CodeGen/X86/vec_call.ll
index f2fc7e7..e0862ca 100644
--- a/test/CodeGen/X86/vec_call.ll
+++ b/test/CodeGen/X86/vec_call.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -mcpu=generic -march=x86 -mattr=+sse2 -mtriple=i686-apple-darwin8 | \
-; RUN: grep {subl.*60}
+; RUN: grep "subl.*60"
; RUN: llc < %s -mcpu=generic -march=x86 -mattr=+sse2 -mtriple=i686-apple-darwin8 | \
-; RUN: grep {movaps.*32}
+; RUN: grep "movaps.*32"
define void @test() {
diff --git a/test/CodeGen/X86/vec_cast2.ll b/test/CodeGen/X86/vec_cast2.ll
new file mode 100644
index 0000000..08eb16f
--- /dev/null
+++ b/test/CodeGen/X86/vec_cast2.ll
@@ -0,0 +1,49 @@
+; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+
+;CHECK: foo1_8
+;CHECK: vcvtdq2ps
+;CHECK: ret
+define <8 x float> @foo1_8(<8 x i8> %src) {
+ %res = sitofp <8 x i8> %src to <8 x float>
+ ret <8 x float> %res
+}
+
+;CHECK: foo1_4
+;CHECK: vcvtdq2ps
+;CHECK: ret
+define <4 x float> @foo1_4(<4 x i8> %src) {
+ %res = sitofp <4 x i8> %src to <4 x float>
+ ret <4 x float> %res
+}
+
+;CHECK: foo2_8
+;CHECK: vcvtdq2ps
+;CHECK: ret
+define <8 x float> @foo2_8(<8 x i8> %src) {
+ %res = uitofp <8 x i8> %src to <8 x float>
+ ret <8 x float> %res
+}
+
+;CHECK: foo2_4
+;CHECK: vcvtdq2ps
+;CHECK: ret
+define <4 x float> @foo2_4(<4 x i8> %src) {
+ %res = uitofp <4 x i8> %src to <4 x float>
+ ret <4 x float> %res
+}
+
+;CHECK: foo3_8
+;CHECK: vcvttps2dq
+;CHECK: ret
+define <8 x i8> @foo3_8(<8 x float> %src) {
+ %res = fptosi <8 x float> %src to <8 x i8>
+ ret <8 x i8> %res
+}
+;CHECK: foo3_4
+;CHECK: vcvttps2dq
+;CHECK: ret
+define <4 x i8> @foo3_4(<4 x float> %src) {
+ %res = fptosi <4 x float> %src to <4 x i8>
+ ret <4 x i8> %res
+}
+
diff --git a/test/CodeGen/X86/vec_compare-2.ll b/test/CodeGen/X86/vec_compare-2.ll
index 91777f7..46d6a23 100644
--- a/test/CodeGen/X86/vec_compare-2.ll
+++ b/test/CodeGen/X86/vec_compare-2.ll
@@ -10,8 +10,7 @@ define void @blackDespeckle_wrapper(i8** %args_list, i64* %gtid, i64 %xend) {
entry:
; CHECK: cfi_def_cfa_offset
; CHECK-NOT: set
-; CHECK: movzwl
-; CHECK: movzwl
+; CHECK: punpcklwd
; CHECK: pshufd
; CHECK: pshufb
%shr.i = ashr <4 x i32> zeroinitializer, <i32 3, i32 3, i32 3, i32 3> ; <<4 x i32>> [#uses=1]
diff --git a/test/CodeGen/X86/vec_compare.ll b/test/CodeGen/X86/vec_compare.ll
index 39c9b77..367dd27 100644
--- a/test/CodeGen/X86/vec_compare.ll
+++ b/test/CodeGen/X86/vec_compare.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=yonah -mtriple=i386-apple-darwin | FileCheck %s
define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) nounwind {
diff --git a/test/CodeGen/X86/vec_ins_extract-1.ll b/test/CodeGen/X86/vec_ins_extract-1.ll
index 2951193..565be7a 100644
--- a/test/CodeGen/X86/vec_ins_extract-1.ll
+++ b/test/CodeGen/X86/vec_ins_extract-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | grep {(%esp,%eax,4)} | count 4
+; RUN: llc < %s -march=x86 -mcpu=yonah | grep "(%esp,%eax,4)" | count 4
; Inserts and extracts with variable indices must be lowered
; to memory accesses.
diff --git a/test/CodeGen/X86/vec_insert-6.ll b/test/CodeGen/X86/vec_insert-6.ll
index de3b36f..2a4864a 100644
--- a/test/CodeGen/X86/vec_insert-6.ll
+++ b/test/CodeGen/X86/vec_insert-6.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep pslldq
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mtriple=i686-apple-darwin9 -o /dev/null -stats -info-output-file - | grep asm-printer | grep 6
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn | grep pslldq
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn -mtriple=i686-apple-darwin9 -o /dev/null -stats -info-output-file - | grep asm-printer | grep 6
define <4 x float> @t3(<4 x float>* %P) nounwind {
%tmp1 = load <4 x float>* %P
diff --git a/test/CodeGen/X86/vec_set-3.ll b/test/CodeGen/X86/vec_set-3.ll
index ada17e0..d1d7608 100644
--- a/test/CodeGen/X86/vec_set-3.ll
+++ b/test/CodeGen/X86/vec_set-3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -o %t
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn -o %t
; RUN: grep pshufd %t | count 2
define <4 x float> @test(float %a) nounwind {
diff --git a/test/CodeGen/X86/vec_set-9.ll b/test/CodeGen/X86/vec_set-9.ll
index 3656e5f..b8ec0cf 100644
--- a/test/CodeGen/X86/vec_set-9.ll
+++ b/test/CodeGen/X86/vec_set-9.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=x86-64 | grep movd | count 1
-; RUN: llc < %s -march=x86-64 | grep {movlhps.*%xmm0, %xmm0}
+; RUN: llc < %s -march=x86-64 | grep "movlhps.*%xmm0, %xmm0"
define <2 x i64> @test3(i64 %A) nounwind {
entry:
diff --git a/test/CodeGen/X86/vec_shuffle-16.ll b/test/CodeGen/X86/vec_shuffle-16.ll
index 06f38ed..09d4c1a 100644
--- a/test/CodeGen/X86/vec_shuffle-16.ll
+++ b/test/CodeGen/X86/vec_shuffle-16.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -mattr=+sse,-sse2 -mtriple=i386-apple-darwin | FileCheck %s -check-prefix=sse
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mtriple=i386-apple-darwin | FileCheck %s -check-prefix=sse2
+; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=+sse,-sse2 -mtriple=i386-apple-darwin | FileCheck %s -check-prefix=sse
+; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=+sse2 -mtriple=i386-apple-darwin | FileCheck %s -check-prefix=sse2
; sse: t1:
; sse2: t1:
diff --git a/test/CodeGen/X86/vec_shuffle-19.ll b/test/CodeGen/X86/vec_shuffle-19.ll
index 861a1cc..b26f920 100644
--- a/test/CodeGen/X86/vec_shuffle-19.ll
+++ b/test/CodeGen/X86/vec_shuffle-19.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -o /dev/null -march=x86 -mattr=+sse2 -mtriple=i686-apple-darwin9 -stats -info-output-file - | grep asm-printer | grep 4
+; RUN: llc < %s -o /dev/null -march=x86 -mcpu=penryn -mattr=+sse2 -mtriple=i686-apple-darwin9 -stats -info-output-file - | grep asm-printer | grep 4
; PR2485
define <4 x i32> @t(<4 x i32> %a, <4 x i32> %b) nounwind {
diff --git a/test/CodeGen/X86/vec_shuffle-27.ll b/test/CodeGen/X86/vec_shuffle-27.ll
index dec98c7..0aff822 100644
--- a/test/CodeGen/X86/vec_shuffle-27.ll
+++ b/test/CodeGen/X86/vec_shuffle-27.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=sse41 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=sse41 | FileCheck %s
; ModuleID = 'vec_shuffle-27.bc'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
@@ -35,4 +35,4 @@ entry:
store <4 x i64> %vect1487, <4 x i64>* %ap
store <4 x i64> %vect1488, <4 x i64>* %bp
ret void;
-}
\ No newline at end of file
+}
diff --git a/test/CodeGen/X86/vec_shuffle-35.ll b/test/CodeGen/X86/vec_shuffle-35.ll
index 7f0fcb5..f5083b4 100644
--- a/test/CodeGen/X86/vec_shuffle-35.ll
+++ b/test/CodeGen/X86/vec_shuffle-35.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -march=x86 -mcpu=yonah -stack-alignment=16 -o %t
-; RUN: grep pextrw %t | count 13
-; RUN: grep pinsrw %t | count 14
+; RUN: grep pextrw %t | count 12
+; RUN: grep pinsrw %t | count 13
; RUN: grep rolw %t | count 13
; RUN: not grep esp %t
; RUN: not grep ebp %t
diff --git a/test/CodeGen/X86/vec_shuffle-36.ll b/test/CodeGen/X86/vec_shuffle-36.ll
index 8090afc..9a06015 100644
--- a/test/CodeGen/X86/vec_shuffle-36.ll
+++ b/test/CodeGen/X86/vec_shuffle-36.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=sse41 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=penryn -mattr=sse41 | FileCheck %s
define <8 x i16> @shuf6(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
; CHECK: pshufb
diff --git a/test/CodeGen/X86/vec_shuffle-37.ll b/test/CodeGen/X86/vec_shuffle-37.ll
index 430aa04..ed285f9 100644
--- a/test/CodeGen/X86/vec_shuffle-37.ll
+++ b/test/CodeGen/X86/vec_shuffle-37.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=core2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=core2 | FileCheck %s
; RUN: llc -O0 < %s -march=x86 -mcpu=core2 | FileCheck %s --check-prefix=CHECK_O0
define <4 x i32> @t00(<4 x i32>* %a0) nounwind ssp {
diff --git a/test/CodeGen/X86/vec_shuffle-38.ll b/test/CodeGen/X86/vec_shuffle-38.ll
index 96ef883..ec196df 100644
--- a/test/CodeGen/X86/vec_shuffle-38.ll
+++ b/test/CodeGen/X86/vec_shuffle-38.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
define <2 x double> @ld(<2 x double> %p) nounwind optsize ssp {
; CHECK: unpcklpd
diff --git a/test/CodeGen/X86/vec_shuffle-39.ll b/test/CodeGen/X86/vec_shuffle-39.ll
index 55531e3..ee8d2d5 100644
--- a/test/CodeGen/X86/vec_shuffle-39.ll
+++ b/test/CodeGen/X86/vec_shuffle-39.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=penryn | FileCheck %s
; rdar://10050222, rdar://10134392
define <4 x float> @t1(<4 x float> %a, <1 x i64>* nocapture %p) nounwind {
diff --git a/test/CodeGen/X86/vec_splat-2.ll b/test/CodeGen/X86/vec_splat-2.ll
index cde5ae9..f105de4 100644
--- a/test/CodeGen/X86/vec_splat-2.ll
+++ b/test/CodeGen/X86/vec_splat-2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep pshufd | count 1
+; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=+sse2 | grep pshufd | count 1
define void @test(<2 x i64>* %P, i8 %x) nounwind {
%tmp = insertelement <16 x i8> zeroinitializer, i8 %x, i32 0 ; <<16 x i8>> [#uses=1]
diff --git a/test/CodeGen/X86/vec_splat-3.ll b/test/CodeGen/X86/vec_splat-3.ll
index 649b85c..feacc42 100644
--- a/test/CodeGen/X86/vec_splat-3.ll
+++ b/test/CodeGen/X86/vec_splat-3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=sse41 -o %t
+; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=sse41 -o %t
; RUN: grep punpcklwd %t | count 4
; RUN: grep punpckhwd %t | count 4
; RUN: grep "pshufd" %t | count 8
diff --git a/test/CodeGen/X86/vec_splat-4.ll b/test/CodeGen/X86/vec_splat-4.ll
index d9941e6..374acfa 100644
--- a/test/CodeGen/X86/vec_splat-4.ll
+++ b/test/CodeGen/X86/vec_splat-4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=sse41 -o %t
+; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=sse41 -o %t
; RUN: grep punpcklbw %t | count 16
; RUN: grep punpckhbw %t | count 16
; RUN: grep "pshufd" %t | count 16
diff --git a/test/CodeGen/X86/vec_splat.ll b/test/CodeGen/X86/vec_splat.ll
index a87fbd0..24d8487 100644
--- a/test/CodeGen/X86/vec_splat.ll
+++ b/test/CodeGen/X86/vec_splat.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep pshufd
-; RUN: llc < %s -march=x86 -mattr=+sse3 | grep movddup
+; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=+sse2 | grep pshufd
+; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=+sse3 | grep movddup
define void @test_v4sf(<4 x float>* %P, <4 x float>* %Q, float %X) nounwind {
%tmp = insertelement <4 x float> zeroinitializer, float %X, i32 0 ; <<4 x float>> [#uses=1]
diff --git a/test/CodeGen/X86/vec_ss_load_fold.ll b/test/CodeGen/X86/vec_ss_load_fold.ll
index 3bd3f7b..c294df5 100644
--- a/test/CodeGen/X86/vec_ss_load_fold.ll
+++ b/test/CodeGen/X86/vec_ss_load_fold.ll
@@ -70,3 +70,17 @@ define <4 x float> @test4(<4 x float> %A, float *%b, i32 %C) nounwind {
; CHECK: call
; CHECK: roundss $4, %xmm{{.*}}, %xmm0
}
+
+; PR13576
+define <2 x double> @test5() nounwind uwtable readnone noinline {
+entry:
+ %0 = tail call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> <double
+4.569870e+02, double 1.233210e+02>, i32 128) nounwind readnone
+ ret <2 x double> %0
+; CHECK: test5:
+; CHECK: mov
+; CHECK: mov
+; CHECK: cvtsi2sd
+}
+
+declare <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double>, i32) nounwind readnone
diff --git a/test/CodeGen/X86/vshift-1.ll b/test/CodeGen/X86/vshift-1.ll
index 4955156..e775750 100644
--- a/test/CodeGen/X86/vshift-1.ll
+++ b/test/CodeGen/X86/vshift-1.ll
@@ -16,7 +16,7 @@ define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, i64 %amt) nounwind {
entry:
; CHECK: shift1b:
; CHECK: movd
-; CHECK-NEXT: psllq
+; CHECK: psllq
%0 = insertelement <2 x i64> undef, i64 %amt, i32 0
%1 = insertelement <2 x i64> %0, i64 %amt, i32 1
%shl = shl <2 x i64> %val, %1
@@ -38,7 +38,7 @@ define void @shift2b(<4 x i32> %val, <4 x i32>* %dst, i32 %amt) nounwind {
entry:
; CHECK: shift2b:
; CHECK: movd
-; CHECK-NEXT: pslld
+; CHECK: pslld
%0 = insertelement <4 x i32> undef, i32 %amt, i32 0
%1 = insertelement <4 x i32> %0, i32 %amt, i32 1
%2 = insertelement <4 x i32> %1, i32 %amt, i32 2
diff --git a/test/CodeGen/X86/vshift-2.ll b/test/CodeGen/X86/vshift-2.ll
index 9a9b419..9496893 100644
--- a/test/CodeGen/X86/vshift-2.ll
+++ b/test/CodeGen/X86/vshift-2.ll
@@ -16,7 +16,7 @@ define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, i64 %amt) nounwind {
entry:
; CHECK: shift1b:
; CHECK: movd
-; CHECK-NEXT: psrlq
+; CHECK: psrlq
%0 = insertelement <2 x i64> undef, i64 %amt, i32 0
%1 = insertelement <2 x i64> %0, i64 %amt, i32 1
%lshr = lshr <2 x i64> %val, %1
@@ -37,7 +37,7 @@ define void @shift2b(<4 x i32> %val, <4 x i32>* %dst, i32 %amt) nounwind {
entry:
; CHECK: shift2b:
; CHECK: movd
-; CHECK-NEXT: psrld
+; CHECK: psrld
%0 = insertelement <4 x i32> undef, i32 %amt, i32 0
%1 = insertelement <4 x i32> %0, i32 %amt, i32 1
%2 = insertelement <4 x i32> %1, i32 %amt, i32 2
@@ -63,7 +63,7 @@ entry:
; CHECK: shift3b:
; CHECK: movzwl
; CHECK: movd
-; CHECK-NEXT: psrlw
+; CHECK: psrlw
%0 = insertelement <8 x i16> undef, i16 %amt, i32 0
%1 = insertelement <8 x i16> %0, i16 %amt, i32 1
%2 = insertelement <8 x i16> %0, i16 %amt, i32 2
diff --git a/test/CodeGen/X86/vshift-3.ll b/test/CodeGen/X86/vshift-3.ll
index 8e8a9aa..b2b48b9 100644
--- a/test/CodeGen/X86/vshift-3.ll
+++ b/test/CodeGen/X86/vshift-3.ll
@@ -28,7 +28,7 @@ define void @shift2b(<4 x i32> %val, <4 x i32>* %dst, i32 %amt) nounwind {
entry:
; CHECK: shift2b:
; CHECK: movd
-; CHECK-NEXT: psrad
+; CHECK: psrad
%0 = insertelement <4 x i32> undef, i32 %amt, i32 0
%1 = insertelement <4 x i32> %0, i32 %amt, i32 1
%2 = insertelement <4 x i32> %1, i32 %amt, i32 2
@@ -52,7 +52,7 @@ entry:
; CHECK: shift3b:
; CHECK: movzwl
; CHECK: movd
-; CHECK-NEXT: psraw
+; CHECK: psraw
%0 = insertelement <8 x i16> undef, i16 %amt, i32 0
%1 = insertelement <8 x i16> %0, i16 %amt, i32 1
%2 = insertelement <8 x i16> %0, i16 %amt, i32 2
diff --git a/test/CodeGen/X86/vshift-5.ll b/test/CodeGen/X86/vshift-5.ll
index cb254ae..f6c311d 100644
--- a/test/CodeGen/X86/vshift-5.ll
+++ b/test/CodeGen/X86/vshift-5.ll
@@ -6,7 +6,7 @@ define void @shift5a(<4 x i32> %val, <4 x i32>* %dst, i32* %pamt) nounwind {
entry:
; CHECK: shift5a:
; CHECK: movd
-; CHECK-NEXT: pslld
+; CHECK: pslld
%amt = load i32* %pamt
%tmp0 = insertelement <4 x i32> undef, i32 %amt, i32 0
%shamt = shufflevector <4 x i32> %tmp0, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -20,7 +20,7 @@ define void @shift5b(<4 x i32> %val, <4 x i32>* %dst, i32* %pamt) nounwind {
entry:
; CHECK: shift5b:
; CHECK: movd
-; CHECK-NEXT: psrad
+; CHECK: psrad
%amt = load i32* %pamt
%tmp0 = insertelement <4 x i32> undef, i32 %amt, i32 0
%shamt = shufflevector <4 x i32> %tmp0, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -34,7 +34,7 @@ define void @shift5c(<4 x i32> %val, <4 x i32>* %dst, i32 %amt) nounwind {
entry:
; CHECK: shift5c:
; CHECK: movd
-; CHECK-NEXT: pslld
+; CHECK: pslld
%tmp0 = insertelement <4 x i32> undef, i32 %amt, i32 0
%shamt = shufflevector <4 x i32> %tmp0, <4 x i32> undef, <4 x i32> zeroinitializer
%shl = shl <4 x i32> %val, %shamt
@@ -47,7 +47,7 @@ define void @shift5d(<4 x i32> %val, <4 x i32>* %dst, i32 %amt) nounwind {
entry:
; CHECK: shift5d:
; CHECK: movd
-; CHECK-NEXT: psrad
+; CHECK: psrad
%tmp0 = insertelement <4 x i32> undef, i32 %amt, i32 0
%shamt = shufflevector <4 x i32> %tmp0, <4 x i32> undef, <4 x i32> zeroinitializer
%shr = ashr <4 x i32> %val, %shamt
diff --git a/test/CodeGen/X86/widen_arith-3.ll b/test/CodeGen/X86/widen_arith-3.ll
index f55b184..d86042a 100644
--- a/test/CodeGen/X86/widen_arith-3.ll
+++ b/test/CodeGen/X86/widen_arith-3.ll
@@ -2,7 +2,6 @@
; CHECK: incl
; CHECK: incl
; CHECK: incl
-; CHECK: addl
; Widen a v3i16 to v8i16 to do a vector add
diff --git a/test/CodeGen/X86/widen_cast-1.ll b/test/CodeGen/X86/widen_cast-1.ll
index 4330aae..ebdfea9 100644
--- a/test/CodeGen/X86/widen_cast-1.ll
+++ b/test/CodeGen/X86/widen_cast-1.ll
@@ -1,7 +1,14 @@
-; RUN: llc -march=x86 -mattr=+sse42 < %s | FileCheck %s
+; RUN: llc -march=x86 -mcpu=generic -mattr=+sse42 < %s | FileCheck %s
+; RUN: llc -march=x86 -mcpu=atom -mattr=+sse42 < %s | FileCheck -check-prefix=ATOM %s
+
; CHECK: paddd
-; CHECK: pextrd
-; CHECK: movd
+; CHECK: movl
+; CHECK: movlpd
+
+; The scheduler produces a different instruction order on Intel Atom
+; ATOM: movl
+; ATOM: paddd
+; ATOM: movlpd
; bitcast a v4i16 to v2i32
diff --git a/test/CodeGen/X86/widen_cast-2.ll b/test/CodeGen/X86/widen_cast-2.ll
index 5c695ea..3979ce4 100644
--- a/test/CodeGen/X86/widen_cast-2.ll
+++ b/test/CodeGen/X86/widen_cast-2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=nehalem -mattr=+sse42 | FileCheck %s
; CHECK: pextrd
; CHECK: pextrd
; CHECK: movd
diff --git a/test/CodeGen/X86/widen_cast-5.ll b/test/CodeGen/X86/widen_cast-5.ll
index 136578d..9086d3a 100644
--- a/test/CodeGen/X86/widen_cast-5.ll
+++ b/test/CodeGen/X86/widen_cast-5.ll
@@ -1,9 +1,8 @@
; RUN: llc < %s -march=x86 -mattr=+sse42 | FileCheck %s
; CHECK: movl
-; CHECK: movd
+; CHECK: movlpd
; bitcast a i64 to v2i32
-
define void @convert(<2 x i32>* %dst.addr, i64 %src) nounwind {
entry:
%conv = bitcast i64 %src to <2 x i32>
diff --git a/test/CodeGen/X86/widen_conv-4.ll b/test/CodeGen/X86/widen_conv-4.ll
index affd796..1158e04 100644
--- a/test/CodeGen/X86/widen_conv-4.ll
+++ b/test/CodeGen/X86/widen_conv-4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=nehalem -mattr=+sse42 | FileCheck %s
; CHECK-NOT: cvtsi2ss
; unsigned to float v7i16 to v7f32
diff --git a/test/CodeGen/X86/widen_extract-1.ll b/test/CodeGen/X86/widen_extract-1.ll
index 4bcac58..8672742 100644
--- a/test/CodeGen/X86/widen_extract-1.ll
+++ b/test/CodeGen/X86/widen_extract-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=nehalem -mattr=+sse42 | FileCheck %s
; widen extract subvector
define void @convert(<2 x double>* %dst.addr, <3 x double> %src) {
diff --git a/test/CodeGen/X86/widen_load-0.ll b/test/CodeGen/X86/widen_load-0.ll
index 4aeec91..d543728 100644
--- a/test/CodeGen/X86/widen_load-0.ll
+++ b/test/CodeGen/X86/widen_load-0.ll
@@ -1,18 +1,12 @@
; RUN: llc < %s -o - -mtriple=x86_64-linux -mcpu=corei7 | FileCheck %s
-; RUN: llc < %s -o - -mtriple=x86_64-win32 -mcpu=corei7 | FileCheck %s -check-prefix=WIN64
; PR4891
; Both loads should happen before either store.
-; CHECK: movd ({{.*}}), {{.*}}
-; CHECK: movd ({{.*}}), {{.*}}
-; CHECK: movd {{.*}}, ({{.*}})
-; CHECK: movd {{.*}}, ({{.*}})
-
-; WIN64: movd ({{.*}}), {{.*}}
-; WIN64: movd ({{.*}}), {{.*}}
-; WIN64: movd {{.*}}, ({{.*}})
-; WIN64: movd {{.*}}, ({{.*}})
+; CHECK: movl ({{.*}}), {{.*}}
+; CHECK: movl ({{.*}}), {{.*}}
+; CHECK: movl {{.*}}, ({{.*}})
+; CHECK: movl {{.*}}, ({{.*}})
define void @short2_int_swap(<2 x i16>* nocapture %b, i32* nocapture %c) nounwind {
entry:
diff --git a/test/CodeGen/X86/win64_alloca_dynalloca.ll b/test/CodeGen/X86/win64_alloca_dynalloca.ll
index a961c6a..cc11e4c 100644
--- a/test/CodeGen/X86/win64_alloca_dynalloca.ll
+++ b/test/CodeGen/X86/win64_alloca_dynalloca.ll
@@ -1,12 +1,9 @@
-; RUN: llc < %s -join-physregs -mcpu=generic -mtriple=x86_64-mingw32 | FileCheck %s -check-prefix=M64
-; RUN: llc < %s -join-physregs -mcpu=generic -mtriple=x86_64-win32 | FileCheck %s -check-prefix=W64
-; RUN: llc < %s -join-physregs -mcpu=generic -mtriple=x86_64-win32-macho | FileCheck %s -check-prefix=EFI
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-mingw32 | FileCheck %s -check-prefix=M64
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-win32 | FileCheck %s -check-prefix=W64
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-win32-macho | FileCheck %s -check-prefix=EFI
; PR8777
; PR8778
-; Passing the same value in two registers creates a false interference that
-; only -join-physregs resolves. It could also be handled by a parallel copy.
-
define i64 @foo(i64 %n, i64 %x) nounwind {
entry:
@@ -31,19 +28,19 @@ entry:
%buf1 = alloca i8, i64 %n, align 1
-; M64: leaq 15(%rcx), %rax
+; M64: leaq 15(%{{.*}}), %rax
; M64: andq $-16, %rax
; M64: callq ___chkstk
; M64-NOT: %rsp
; M64: movq %rsp, %rax
-; W64: leaq 15(%rcx), %rax
+; W64: leaq 15(%{{.*}}), %rax
; W64: andq $-16, %rax
; W64: callq __chkstk
; W64: subq %rax, %rsp
; W64: movq %rsp, %rax
-; EFI: leaq 15(%rcx), [[R1:%r.*]]
+; EFI: leaq 15(%{{.*}}), [[R1:%r.*]]
; EFI: andq $-16, [[R1]]
; EFI: movq %rsp, [[R64:%r.*]]
; EFI: subq [[R1]], [[R64]]
diff --git a/test/CodeGen/X86/x86-64-arg.ll b/test/CodeGen/X86/x86-64-arg.ll
index ec8dd8e..9a959e8 100644
--- a/test/CodeGen/X86/x86-64-arg.ll
+++ b/test/CodeGen/X86/x86-64-arg.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | grep {movl %edi, %eax}
+; RUN: llc < %s | grep "movl %edi, %eax"
; The input value is already sign extended, don't re-extend it.
; This testcase corresponds to:
; int test(short X) { return (int)X; }
diff --git a/test/CodeGen/X86/x86-64-dead-stack-adjust.ll b/test/CodeGen/X86/x86-64-dead-stack-adjust.ll
index 79316f2..902c9d5 100644
--- a/test/CodeGen/X86/x86-64-dead-stack-adjust.ll
+++ b/test/CodeGen/X86/x86-64-dead-stack-adjust.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s | not grep rsp
-; RUN: llc < %s | grep cvttsd2siq
+; RUN: llc < %s -mcpu=nehalem | not grep rsp
+; RUN: llc < %s -mcpu=nehalem | grep cvttsd2siq
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-apple-darwin8"
diff --git a/test/CodeGen/X86/x86-64-pic-1.ll b/test/CodeGen/X86/x86-64-pic-1.ll
index 46f6d33..46cd4f8 100644
--- a/test/CodeGen/X86/x86-64-pic-1.ll
+++ b/test/CodeGen/X86/x86-64-pic-1.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
-; RUN: grep {callq f@PLT} %t1
+; RUN: grep "callq f@PLT" %t1
define void @g() {
entry:
diff --git a/test/CodeGen/X86/x86-64-pic-10.ll b/test/CodeGen/X86/x86-64-pic-10.ll
index b6f82e2..3ec172b 100644
--- a/test/CodeGen/X86/x86-64-pic-10.ll
+++ b/test/CodeGen/X86/x86-64-pic-10.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
-; RUN: grep {callq g@PLT} %t1
+; RUN: grep "callq g@PLT" %t1
@g = alias weak i32 ()* @f
diff --git a/test/CodeGen/X86/x86-64-pic-11.ll b/test/CodeGen/X86/x86-64-pic-11.ll
index 4db331c..fd64beb 100644
--- a/test/CodeGen/X86/x86-64-pic-11.ll
+++ b/test/CodeGen/X86/x86-64-pic-11.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
-; RUN: grep {callq __fixunsxfti@PLT} %t1
+; RUN: grep "callq __fixunsxfti@PLT" %t1
define i128 @f(x86_fp80 %a) nounwind {
entry:
diff --git a/test/CodeGen/X86/x86-64-pic-2.ll b/test/CodeGen/X86/x86-64-pic-2.ll
index 1ce2de7..f3f7b1d 100644
--- a/test/CodeGen/X86/x86-64-pic-2.ll
+++ b/test/CodeGen/X86/x86-64-pic-2.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
-; RUN: grep {callq f} %t1
-; RUN: not grep {callq f@PLT} %t1
+; RUN: grep "callq f" %t1
+; RUN: not grep "callq f@PLT" %t1
define void @g() {
entry:
diff --git a/test/CodeGen/X86/x86-64-pic-3.ll b/test/CodeGen/X86/x86-64-pic-3.ll
index aa3c888..ba93378 100644
--- a/test/CodeGen/X86/x86-64-pic-3.ll
+++ b/test/CodeGen/X86/x86-64-pic-3.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
-; RUN: grep {callq f} %t1
-; RUN: not grep {callq f@PLT} %t1
+; RUN: grep "callq f" %t1
+; RUN: not grep "callq f@PLT" %t1
define void @g() {
entry:
diff --git a/test/CodeGen/X86/x86-64-pic-4.ll b/test/CodeGen/X86/x86-64-pic-4.ll
index 90fc119..33b08c4 100644
--- a/test/CodeGen/X86/x86-64-pic-4.ll
+++ b/test/CodeGen/X86/x86-64-pic-4.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
-; RUN: grep {movq a@GOTPCREL(%rip),} %t1
+; RUN: grep "movq a@GOTPCREL(%rip)," %t1
@a = global i32 0
diff --git a/test/CodeGen/X86/x86-64-pic-5.ll b/test/CodeGen/X86/x86-64-pic-5.ll
index 6369bde..234bc0d 100644
--- a/test/CodeGen/X86/x86-64-pic-5.ll
+++ b/test/CodeGen/X86/x86-64-pic-5.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
-; RUN: grep {movl a(%rip),} %t1
+; RUN: grep "movl a(%rip)," %t1
; RUN: not grep GOTPCREL %t1
@a = hidden global i32 0
diff --git a/test/CodeGen/X86/x86-64-pic-6.ll b/test/CodeGen/X86/x86-64-pic-6.ll
index 6e19ad3..ae5b583 100644
--- a/test/CodeGen/X86/x86-64-pic-6.ll
+++ b/test/CodeGen/X86/x86-64-pic-6.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
-; RUN: grep {movl a(%rip),} %t1
+; RUN: grep "movl a(%rip)," %t1
; RUN: not grep GOTPCREL %t1
@a = internal global i32 0
diff --git a/test/CodeGen/X86/x86-64-pic-7.ll b/test/CodeGen/X86/x86-64-pic-7.ll
index 4d98ee6..de240a3 100644
--- a/test/CodeGen/X86/x86-64-pic-7.ll
+++ b/test/CodeGen/X86/x86-64-pic-7.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
-; RUN: grep {movq f@GOTPCREL(%rip),} %t1
+; RUN: grep "movq f@GOTPCREL(%rip)," %t1
define void ()* @g() nounwind {
entry:
diff --git a/test/CodeGen/X86/x86-64-pic-8.ll b/test/CodeGen/X86/x86-64-pic-8.ll
index d3b567c..db35c33 100644
--- a/test/CodeGen/X86/x86-64-pic-8.ll
+++ b/test/CodeGen/X86/x86-64-pic-8.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
-; RUN: grep {leaq f(%rip),} %t1
+; RUN: grep "leaq f(%rip)," %t1
; RUN: not grep GOTPCREL %t1
define void ()* @g() {
diff --git a/test/CodeGen/X86/x86-64-pic-9.ll b/test/CodeGen/X86/x86-64-pic-9.ll
index 0761031..6daea84 100644
--- a/test/CodeGen/X86/x86-64-pic-9.ll
+++ b/test/CodeGen/X86/x86-64-pic-9.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
-; RUN: grep {leaq f(%rip),} %t1
+; RUN: grep "leaq f(%rip)," %t1
; RUN: not grep GOTPCREL %t1
define void ()* @g() nounwind {
diff --git a/test/CodeGen/X86/xop-intrinsics-x86_64.ll b/test/CodeGen/X86/xop-intrinsics-x86_64.ll
index a2521b0..8af782c 100644
--- a/test/CodeGen/X86/xop-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/xop-intrinsics-x86_64.ll
@@ -875,37 +875,37 @@ define <8 x i16> @test_int_x86_xop_vpshlw_mr(<8 x i16>* %a0, <8 x i16> %a1) {
}
declare <8 x i16> @llvm.x86.xop.vpshlw(<8 x i16>, <8 x i16>) nounwind readnone
-define <4 x float> @test_int_x86_xop_vfrcz_ss(<4 x float> %a0, <4 x float> %a1) {
+define <4 x float> @test_int_x86_xop_vfrcz_ss(<4 x float> %a0) {
; CHECK-NOT: mov
; CHECK: vfrczss
- %res = call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %a0, <4 x float> %a1) ;
+ %res = call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %a0) ;
ret <4 x float> %res
}
-define <4 x float> @test_int_x86_xop_vfrcz_ss_mem(<4 x float> %a0, float* %a1) {
+define <4 x float> @test_int_x86_xop_vfrcz_ss_mem(float* %a0) {
; CHECK-NOT: mov
; CHECK: vfrczss
- %elem = load float* %a1
+ %elem = load float* %a0
%vec = insertelement <4 x float> undef, float %elem, i32 0
- %res = call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %a0, <4 x float> %vec) ;
+ %res = call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %vec) ;
ret <4 x float> %res
}
-declare <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float>) nounwind readnone
-define <2 x double> @test_int_x86_xop_vfrcz_sd(<2 x double> %a0, <2 x double> %a1) {
+define <2 x double> @test_int_x86_xop_vfrcz_sd(<2 x double> %a0) {
; CHECK-NOT: mov
; CHECK: vfrczsd
- %res = call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %a0, <2 x double> %a1) ;
+ %res = call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %a0) ;
ret <2 x double> %res
}
-define <2 x double> @test_int_x86_xop_vfrcz_sd_mem(<2 x double> %a0, double* %a1) {
+define <2 x double> @test_int_x86_xop_vfrcz_sd_mem(double* %a0) {
; CHECK-NOT: mov
; CHECK: vfrczsd
- %elem = load double* %a1
+ %elem = load double* %a0
%vec = insertelement <2 x double> undef, double %elem, i32 0
- %res = call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %a0, <2 x double> %vec) ;
+ %res = call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %vec) ;
ret <2 x double> %res
}
-declare <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double>) nounwind readnone
define <2 x double> @test_int_x86_xop_vfrcz_pd(<2 x double> %a0) {
; CHECK: vfrczpd
@@ -967,3 +967,59 @@ define <8 x float> @test_int_x86_xop_vfrcz_ps_256_mem(<8 x float>* %a0) {
}
declare <8 x float> @llvm.x86.xop.vfrcz.ps.256(<8 x float>) nounwind readnone
+define <16 x i8> @test_int_x86_xop_vpcomb(<16 x i8> %a0, <16 x i8> %a1) {
+ ; CHECK: vpcomb
+ %res = call <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8> %a0, <16 x i8> %a1, i8 0) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomw(<8 x i16> %a0, <8 x i16> %a1) {
+ ; CHECK: vpcomw
+ %res = call <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16> %a0, <8 x i16> %a1, i8 0) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16>, <8 x i16>, i8) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomd(<4 x i32> %a0, <4 x i32> %a1) {
+ ; CHECK: vpcomd
+ %res = call <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32> %a0, <4 x i32> %a1, i8 0) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32>, <4 x i32>, i8) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomq(<2 x i64> %a0, <2 x i64> %a1) {
+ ; CHECK: vpcomq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64> %a0, <2 x i64> %a1, i8 0) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64>, <2 x i64>, i8) nounwind readnone
+
+define <16 x i8> @test_int_x86_xop_vpcomub(<16 x i8> %a0, <16 x i8> %a1) {
+ ; CHECK: vpcomub
+ %res = call <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8> %a0, <16 x i8> %a1, i8 0) ;
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+define <8 x i16> @test_int_x86_xop_vpcomuw(<8 x i16> %a0, <8 x i16> %a1) {
+ ; CHECK: vpcomuw
+ %res = call <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16> %a0, <8 x i16> %a1, i8 0) ;
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16>, <8 x i16>, i8) nounwind readnone
+
+define <4 x i32> @test_int_x86_xop_vpcomud(<4 x i32> %a0, <4 x i32> %a1) {
+ ; CHECK: vpcomud
+ %res = call <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32> %a0, <4 x i32> %a1, i8 0) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32>, <4 x i32>, i8) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcomuq(<2 x i64> %a0, <2 x i64> %a1) {
+ ; CHECK: vpcomuq
+ %res = call <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64> %a0, <2 x i64> %a1, i8 0) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64>, <2 x i64>, i8) nounwind readnone
+
diff --git a/test/CodeGen/X86/xor.ll b/test/CodeGen/X86/xor.ll
index ddc4cab..996bfc4 100644
--- a/test/CodeGen/X86/xor.ll
+++ b/test/CodeGen/X86/xor.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s -check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefix=X64
-; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse2 | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -mcpu=corei7 -mtriple=x86_64-linux | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mcpu=corei7 -mtriple=x86_64-win32 | FileCheck %s -check-prefix=X64
; Though it is undefined, we want xor undef,undef to produce zero.
define <4 x i32> @test1() nounwind {
@@ -31,7 +31,7 @@ entry:
; X64: test3:
; X64: notl
; X64: andl
-; X64: shrl %eax
+; X64: shrl
; X64: ret
; X32: test3: