Diffstat (limited to 'test/CodeGen/ARM')
-rw-r--r--  test/CodeGen/ARM/2008-02-04-LocalRegAllocBug.ll   |    1
-rw-r--r--  test/CodeGen/ARM/2008-02-29-RegAllocLocal.ll      |    1
-rw-r--r--  test/CodeGen/ARM/2009-05-05-DAGCombineBug.ll      |    2
-rw-r--r--  test/CodeGen/ARM/2009-05-07-RegAllocLocal.ll      |    1
-rw-r--r--  test/CodeGen/ARM/2009-11-02-NegativeLane.ll       |   14
-rw-r--r--  test/CodeGen/ARM/2010-05-14-IllegalType.ll        |   10
-rw-r--r--  test/CodeGen/ARM/2010-05-17-DAGCombineAssert.ll   |   17
-rw-r--r--  test/CodeGen/ARM/2010-05-17-FastAllocCrash.ll     |  105
-rw-r--r--  test/CodeGen/ARM/2010-05-18-LocalAllocCrash.ll    |   37
-rw-r--r--  test/CodeGen/ARM/2010-05-18-PostIndexBug.ll       |   25
-rw-r--r--  test/CodeGen/ARM/2010-05-19-Shuffles.ll           |   21
-rw-r--r--  test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll     |   45
-rw-r--r--  test/CodeGen/ARM/2010-05-21-BuildVector.ll        |   43
-rw-r--r--  test/CodeGen/ARM/arm-frameaddr.ll                 |    9
-rw-r--r--  test/CodeGen/ARM/arm-returnaddr.ll                |   24
-rw-r--r--  test/CodeGen/ARM/div.ll                           |   24
-rw-r--r--  test/CodeGen/ARM/fabss.ll                         |    2
-rw-r--r--  test/CodeGen/ARM/fadds.ll                         |    2
-rw-r--r--  test/CodeGen/ARM/fdivs.ll                         |    2
-rw-r--r--  test/CodeGen/ARM/fmacs.ll                         |    2
-rw-r--r--  test/CodeGen/ARM/fmscs.ll                         |    2
-rw-r--r--  test/CodeGen/ARM/fmuls.ll                         |    2
-rw-r--r--  test/CodeGen/ARM/fnmscs.ll                        |    4
-rw-r--r--  test/CodeGen/ARM/lsr-on-unrolled-loops.ll         |  642
-rw-r--r--  test/CodeGen/ARM/mul_const.ll                     |   34
-rw-r--r--  test/CodeGen/ARM/reg_sequence.ll                  |  348
-rw-r--r--  test/CodeGen/ARM/spill-q.ll                       |    3
-rw-r--r--  test/CodeGen/ARM/trap.ll                          |   12
-rw-r--r--  test/CodeGen/ARM/vcgt.ll                          |   13
29 files changed, 1419 insertions, 28 deletions
diff --git a/test/CodeGen/ARM/2008-02-04-LocalRegAllocBug.ll b/test/CodeGen/ARM/2008-02-04-LocalRegAllocBug.ll
index ff01506..f775c61 100644
--- a/test/CodeGen/ARM/2008-02-04-LocalRegAllocBug.ll
+++ b/test/CodeGen/ARM/2008-02-04-LocalRegAllocBug.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple=arm-linux-gnueabi -regalloc=local
+; RUN: llc < %s -mtriple=arm-linux-gnueabi -regalloc=fast
; PR1925
%struct.encode_aux_nearestmatch = type { i32*, i32*, i32*, i32*, i32, i32 }
diff --git a/test/CodeGen/ARM/2008-02-29-RegAllocLocal.ll b/test/CodeGen/ARM/2008-02-29-RegAllocLocal.ll
index 06bc987..8ef8c7b 100644
--- a/test/CodeGen/ARM/2008-02-29-RegAllocLocal.ll
+++ b/test/CodeGen/ARM/2008-02-29-RegAllocLocal.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple=arm-apple-darwin -regalloc=local
+; RUN: llc < %s -mtriple=arm-apple-darwin -regalloc=fast
; PR1925
%"struct.kc::impl_Ccode_option" = type { %"struct.kc::impl_abstract_phylum" }
diff --git a/test/CodeGen/ARM/2009-05-05-DAGCombineBug.ll b/test/CodeGen/ARM/2009-05-05-DAGCombineBug.ll
index 670d204..a48e41f 100644
--- a/test/CodeGen/ARM/2009-05-05-DAGCombineBug.ll
+++ b/test/CodeGen/ARM/2009-05-05-DAGCombineBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=arm-linuxeabi-unknown-gnu -mattr=+v6
+; RUN: llc < %s -mtriple=arm-unknown-linux-gnueabi -mattr=+v6
; PR4166
%"byte[]" = type { i32, i8* }
diff --git a/test/CodeGen/ARM/2009-05-07-RegAllocLocal.ll b/test/CodeGen/ARM/2009-05-07-RegAllocLocal.ll
index 75610ff..912e6f9 100644
--- a/test/CodeGen/ARM/2009-05-07-RegAllocLocal.ll
+++ b/test/CodeGen/ARM/2009-05-07-RegAllocLocal.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple=armv5-unknown-linux-gnueabi -O0 -regalloc=local
+; RUN: llc < %s -mtriple=armv5-unknown-linux-gnueabi -O0 -regalloc=fast
; PR4100
@.str = external constant [30 x i8] ; <[30 x i8]*> [#uses=1]
diff --git a/test/CodeGen/ARM/2009-11-02-NegativeLane.ll b/test/CodeGen/ARM/2009-11-02-NegativeLane.ll
index f2288c3..89c9037 100644
--- a/test/CodeGen/ARM/2009-11-02-NegativeLane.ll
+++ b/test/CodeGen/ARM/2009-11-02-NegativeLane.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=cortex-a8 < %s | grep vdup.32
+; RUN: llc -mcpu=cortex-a8 < %s | grep vdup.16
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
target triple = "armv7-eabi"
@@ -7,12 +7,12 @@ entry:
br i1 undef, label %return, label %bb
bb: ; preds = %bb, %entry
- %0 = load float* undef, align 4 ; <float> [#uses=1]
- %1 = insertelement <4 x float> undef, float %0, i32 2 ; <<4 x float>> [#uses=1]
- %2 = insertelement <4 x float> %1, float undef, i32 3 ; <<4 x float>> [#uses=1]
- %3 = fmul <4 x float> undef, %2 ; <<4 x float>> [#uses=1]
- %4 = extractelement <4 x float> %3, i32 1 ; <float> [#uses=1]
- store float %4, float* undef, align 4
+ %0 = load i16* undef, align 2
+ %1 = insertelement <8 x i16> undef, i16 %0, i32 2
+ %2 = insertelement <8 x i16> %1, i16 undef, i32 3
+ %3 = mul <8 x i16> %2, %2
+ %4 = extractelement <8 x i16> %3, i32 2
+ store i16 %4, i16* undef, align 2
br i1 undef, label %return, label %bb
return: ; preds = %bb, %entry
diff --git a/test/CodeGen/ARM/2010-05-14-IllegalType.ll b/test/CodeGen/ARM/2010-05-14-IllegalType.ll
new file mode 100644
index 0000000..99e5b09
--- /dev/null
+++ b/test/CodeGen/ARM/2010-05-14-IllegalType.ll
@@ -0,0 +1,10 @@
+; RUN: llc -march=thumb -mcpu=cortex-a8 -mtriple=thumbv7-eabi -float-abi=hard < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32-n32"
+target triple = "thumbv7-apple-darwin10"
+
+define <4 x i64> @f_4_i64(<4 x i64> %a, <4 x i64> %b) nounwind {
+; CHECK: vadd.i64
+ %y = add <4 x i64> %a, %b
+ ret <4 x i64> %y
+}
diff --git a/test/CodeGen/ARM/2010-05-17-DAGCombineAssert.ll b/test/CodeGen/ARM/2010-05-17-DAGCombineAssert.ll
new file mode 100644
index 0000000..2a4bbd1
--- /dev/null
+++ b/test/CodeGen/ARM/2010-05-17-DAGCombineAssert.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -mtriple=armv7-eabi -mcpu=cortex-a8
+; PR7158
+
+define arm_aapcs_vfpcc i32 @main() nounwind {
+bb.nph55.bb.nph55.split_crit_edge:
+ br label %bb3
+
+bb3: ; preds = %bb3, %bb.nph55.bb.nph55.split_crit_edge
+ br i1 undef, label %bb.i19, label %bb3
+
+bb.i19: ; preds = %bb.i19, %bb3
+ %0 = insertelement <4 x float> undef, float undef, i32 3 ; <<4 x float>> [#uses=3]
+ %1 = fmul <4 x float> %0, %0 ; <<4 x float>> [#uses=1]
+ %2 = bitcast <4 x float> %1 to <2 x double> ; <<2 x double>> [#uses=0]
+ %3 = fmul <4 x float> %0, undef ; <<4 x float>> [#uses=0]
+ br label %bb.i19
+}
diff --git a/test/CodeGen/ARM/2010-05-17-FastAllocCrash.ll b/test/CodeGen/ARM/2010-05-17-FastAllocCrash.ll
new file mode 100644
index 0000000..813bf3c
--- /dev/null
+++ b/test/CodeGen/ARM/2010-05-17-FastAllocCrash.ll
@@ -0,0 +1,105 @@
+; RUN: llc < %s -regalloc=fast -verify-machineinstrs
+target triple = "arm-pc-linux-gnu"
+
+; This test case would accidentally use the same physreg for two virtregs
+; because allocVirtReg forgot to check if registers were already used in the
+; instruction.
+; This caused the RegScavenger to complain, but -verify-machineinstrs also
+; catches it.
+
+%struct.CHESS_POSITION = type { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i32, i32, i8, i8, [64 x i8], i8, i8, i8, i8, i8 }
+
+@search = external global %struct.CHESS_POSITION ; <%struct.CHESS_POSITION*> [#uses=1]
+@bishop_mobility_rr45 = external global [64 x [256 x i32]] ; <[64 x [256 x i32]]*> [#uses=1]
+
+declare fastcc i32 @FirstOne()
+
+define fastcc void @Evaluate() {
+entry:
+ br i1 false, label %cond_false186, label %cond_true
+
+cond_true: ; preds = %entry
+ ret void
+
+cond_false186: ; preds = %entry
+ br i1 false, label %cond_true293, label %bb203
+
+bb203: ; preds = %cond_false186
+ ret void
+
+cond_true293: ; preds = %cond_false186
+ br i1 false, label %cond_true298, label %cond_next317
+
+cond_true298: ; preds = %cond_true293
+ br i1 false, label %cond_next518, label %cond_true397.preheader
+
+cond_next317: ; preds = %cond_true293
+ ret void
+
+cond_true397.preheader: ; preds = %cond_true298
+ ret void
+
+cond_next518: ; preds = %cond_true298
+ br i1 false, label %bb1069, label %cond_true522
+
+cond_true522: ; preds = %cond_next518
+ ret void
+
+bb1069: ; preds = %cond_next518
+ br i1 false, label %cond_next1131, label %bb1096
+
+bb1096: ; preds = %bb1069
+ ret void
+
+cond_next1131: ; preds = %bb1069
+ br i1 false, label %cond_next1207, label %cond_true1150
+
+cond_true1150: ; preds = %cond_next1131
+ ret void
+
+cond_next1207: ; preds = %cond_next1131
+ br i1 false, label %cond_next1219, label %cond_true1211
+
+cond_true1211: ; preds = %cond_next1207
+ ret void
+
+cond_next1219: ; preds = %cond_next1207
+ br i1 false, label %cond_true1223, label %cond_next1283
+
+cond_true1223: ; preds = %cond_next1219
+ br i1 false, label %cond_true1254, label %cond_true1264
+
+cond_true1254: ; preds = %cond_true1223
+ br i1 false, label %bb1567, label %cond_true1369.preheader
+
+cond_true1264: ; preds = %cond_true1223
+ ret void
+
+cond_next1283: ; preds = %cond_next1219
+ ret void
+
+cond_true1369.preheader: ; preds = %cond_true1254
+ ret void
+
+bb1567: ; preds = %cond_true1254
+ %tmp1591 = load i64* getelementptr inbounds (%struct.CHESS_POSITION* @search, i32 0, i32 4) ; <i64> [#uses=1]
+ %tmp1572 = tail call fastcc i32 @FirstOne() ; <i32> [#uses=1]
+ %tmp1594 = load i32* undef ; <i32> [#uses=1]
+ %tmp1594.upgrd.5 = trunc i32 %tmp1594 to i8 ; <i8> [#uses=1]
+ %shift.upgrd.6 = zext i8 %tmp1594.upgrd.5 to i64 ; <i64> [#uses=1]
+ %tmp1595 = lshr i64 %tmp1591, %shift.upgrd.6 ; <i64> [#uses=1]
+ %tmp1595.upgrd.7 = trunc i64 %tmp1595 to i32 ; <i32> [#uses=1]
+ %tmp1596 = and i32 %tmp1595.upgrd.7, 255 ; <i32> [#uses=1]
+ %gep.upgrd.8 = zext i32 %tmp1596 to i64 ; <i64> [#uses=1]
+ %tmp1598 = getelementptr [64 x [256 x i32]]* @bishop_mobility_rr45, i32 0, i32 %tmp1572, i64 %gep.upgrd.8 ; <i32*> [#uses=1]
+ %tmp1599 = load i32* %tmp1598 ; <i32> [#uses=1]
+ %tmp1602 = sub i32 0, %tmp1599 ; <i32> [#uses=1]
+ br i1 undef, label %cond_next1637, label %cond_true1607
+
+cond_true1607: ; preds = %bb1567
+ ret void
+
+cond_next1637: ; preds = %bb1567
+ %tmp1662 = sub i32 %tmp1602, 0 ; <i32> [#uses=0]
+ ret void
+}
diff --git a/test/CodeGen/ARM/2010-05-18-LocalAllocCrash.ll b/test/CodeGen/ARM/2010-05-18-LocalAllocCrash.ll
new file mode 100644
index 0000000..b158afd
--- /dev/null
+++ b/test/CodeGen/ARM/2010-05-18-LocalAllocCrash.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s -O0 -verify-machineinstrs -regalloc=local
+; RUN: llc < %s -O0 -verify-machineinstrs -regalloc=fast
+; rdar://problem/7948106
+;; This test would spill %R4 before the call to zz, but it forgot to move the
+; 'last use' marker to the spill.
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:64-n32"
+target triple = "armv6-apple-darwin"
+
+%struct.q = type { i32, i32 }
+
+@.str = external constant [1 x i8] ; <[1 x i8]*> [#uses=1]
+
+define arm_apcscc void @yy(%struct.q* %qq) nounwind {
+entry:
+ %vla6 = alloca i8, i32 undef, align 1 ; <i8*> [#uses=1]
+ %vla10 = alloca i8, i32 undef, align 1 ; <i8*> [#uses=1]
+ %vla14 = alloca i8, i32 undef, align 1 ; <i8*> [#uses=1]
+ %vla18 = alloca i8, i32 undef, align 1 ; <i8*> [#uses=1]
+ %tmp21 = load i32* undef ; <i32> [#uses=1]
+ %0 = mul i32 1, %tmp21 ; <i32> [#uses=1]
+ %vla22 = alloca i8, i32 %0, align 1 ; <i8*> [#uses=1]
+ call arm_apcscc void (...)* @zz(i8* getelementptr inbounds ([1 x i8]* @.str, i32 0, i32 0), i32 2, i32 1)
+ br i1 undef, label %if.then, label %if.end36
+
+if.then: ; preds = %entry
+ %call = call arm_apcscc i32 (...)* @x(%struct.q* undef, i8* undef, i8* %vla6, i8* %vla10, i32 undef) ; <i32> [#uses=0]
+ %call35 = call arm_apcscc i32 (...)* @x(%struct.q* undef, i8* %vla14, i8* %vla18, i8* %vla22, i32 undef) ; <i32> [#uses=0]
+ unreachable
+
+if.end36: ; preds = %entry
+ ret void
+}
+
+declare arm_apcscc void @zz(...)
+
+declare arm_apcscc i32 @x(...)
diff --git a/test/CodeGen/ARM/2010-05-18-PostIndexBug.ll b/test/CodeGen/ARM/2010-05-18-PostIndexBug.ll
new file mode 100644
index 0000000..9907228
--- /dev/null
+++ b/test/CodeGen/ARM/2010-05-18-PostIndexBug.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -mtriple=armv7-apple-darwin | FileCheck %s -check-prefix=ARM
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin | FileCheck %s -check-prefix=THUMB
+; rdar://7998649
+
+%struct.foo = type { i64, i64 }
+
+define arm_apcscc zeroext i8 @t(%struct.foo* %this) noreturn optsize {
+entry:
+; ARM: t:
+; ARM: str r0, [r1], r0
+
+; THUMB: t:
+; THUMB-NOT: str r0, [r1], r0
+; THUMB: str r0, [r1]
+ %0 = getelementptr inbounds %struct.foo* %this, i32 0, i32 1 ; <i64*> [#uses=1]
+ store i32 undef, i32* inttoptr (i32 8 to i32*), align 8
+ br i1 undef, label %bb.nph96, label %bb3
+
+bb3: ; preds = %entry
+ %1 = load i64* %0, align 4 ; <i64> [#uses=0]
+ unreachable
+
+bb.nph96: ; preds = %entry
+ unreachable
+}
diff --git a/test/CodeGen/ARM/2010-05-19-Shuffles.ll b/test/CodeGen/ARM/2010-05-19-Shuffles.ll
new file mode 100644
index 0000000..587c0af
--- /dev/null
+++ b/test/CodeGen/ARM/2010-05-19-Shuffles.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -mtriple=armv7-eabi -mcpu=cortex-a8
+; pr7167
+
+define <8 x i8> @f1(<8 x i8> %x) nounwind {
+ %y = shufflevector <8 x i8> %x, <8 x i8> undef,
+ <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
+ ret <8 x i8> %y
+}
+
+define <8 x i8> @f2(<8 x i8> %x) nounwind {
+ %y = shufflevector <8 x i8> %x, <8 x i8> undef,
+ <8 x i32> <i32 1, i32 2, i32 0, i32 5, i32 3, i32 6, i32 7, i32 4>
+ ret <8 x i8> %y
+}
+
+define void @f3(<4 x i64>* %xp) nounwind {
+ %x = load <4 x i64>* %xp
+ %y = shufflevector <4 x i64> %x, <4 x i64> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
+ store <4 x i64> %y, <4 x i64>* %xp
+ ret void
+}
diff --git a/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll b/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll
new file mode 100644
index 0000000..b6fbf9b
--- /dev/null
+++ b/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -march=arm -mattr=+neon -O0
+
+; This test would crash the rewriter when trying to handle a spill after one of
+; the @llvm.arm.neon.vld3.v8i8 calls defined three parts of a register.
+
+%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
+
+declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8*) nounwind readonly
+
+declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>) nounwind
+
+define <8 x i8> @t3(i8* %A1, i8* %A2, i8* %A3, i8* %A4, i8* %A5, i8* %A6, i8* %A7, i8* %A8, i8* %B) nounwind {
+ %tmp1b = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A2) ; <%struct.__neon_int8x8x3_t> [#uses=2]
+ %tmp2b = extractvalue %struct.__neon_int8x8x3_t %tmp1b, 0 ; <<8 x i8>> [#uses=1]
+ %tmp4b = extractvalue %struct.__neon_int8x8x3_t %tmp1b, 1 ; <<8 x i8>> [#uses=1]
+ %tmp1d = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A4) ; <%struct.__neon_int8x8x3_t> [#uses=2]
+ %tmp2d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 0 ; <<8 x i8>> [#uses=1]
+ %tmp4d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 1 ; <<8 x i8>> [#uses=1]
+ %tmp1e = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A5) ; <%struct.__neon_int8x8x3_t> [#uses=1]
+ %tmp2e = extractvalue %struct.__neon_int8x8x3_t %tmp1e, 0 ; <<8 x i8>> [#uses=1]
+ %tmp1f = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A6) ; <%struct.__neon_int8x8x3_t> [#uses=1]
+ %tmp2f = extractvalue %struct.__neon_int8x8x3_t %tmp1f, 0 ; <<8 x i8>> [#uses=1]
+ %tmp1g = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A7) ; <%struct.__neon_int8x8x3_t> [#uses=2]
+ %tmp2g = extractvalue %struct.__neon_int8x8x3_t %tmp1g, 0 ; <<8 x i8>> [#uses=1]
+ %tmp4g = extractvalue %struct.__neon_int8x8x3_t %tmp1g, 1 ; <<8 x i8>> [#uses=1]
+ %tmp1h = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A8) ; <%struct.__neon_int8x8x3_t> [#uses=2]
+ %tmp2h = extractvalue %struct.__neon_int8x8x3_t %tmp1h, 0 ; <<8 x i8>> [#uses=1]
+ %tmp3h = extractvalue %struct.__neon_int8x8x3_t %tmp1h, 2 ; <<8 x i8>> [#uses=1]
+ %tmp2bd = add <8 x i8> %tmp2b, %tmp2d ; <<8 x i8>> [#uses=1]
+ %tmp4bd = add <8 x i8> %tmp4b, %tmp4d ; <<8 x i8>> [#uses=1]
+ %tmp2abcd = mul <8 x i8> undef, %tmp2bd ; <<8 x i8>> [#uses=1]
+ %tmp4abcd = mul <8 x i8> undef, %tmp4bd ; <<8 x i8>> [#uses=2]
+ call void @llvm.arm.neon.vst3.v8i8(i8* %A1, <8 x i8> %tmp4abcd, <8 x i8> zeroinitializer, <8 x i8> %tmp2abcd)
+ %tmp2ef = sub <8 x i8> %tmp2e, %tmp2f ; <<8 x i8>> [#uses=1]
+ %tmp2gh = sub <8 x i8> %tmp2g, %tmp2h ; <<8 x i8>> [#uses=1]
+ %tmp3gh = sub <8 x i8> zeroinitializer, %tmp3h ; <<8 x i8>> [#uses=1]
+ %tmp4ef = sub <8 x i8> zeroinitializer, %tmp4g ; <<8 x i8>> [#uses=1]
+ %tmp2efgh = mul <8 x i8> %tmp2ef, %tmp2gh ; <<8 x i8>> [#uses=1]
+ %tmp3efgh = mul <8 x i8> undef, %tmp3gh ; <<8 x i8>> [#uses=1]
+ %tmp4efgh = mul <8 x i8> %tmp4ef, undef ; <<8 x i8>> [#uses=2]
+ call void @llvm.arm.neon.vst3.v8i8(i8* %A2, <8 x i8> %tmp4efgh, <8 x i8> %tmp3efgh, <8 x i8> %tmp2efgh)
+ %tmp4 = sub <8 x i8> %tmp4efgh, %tmp4abcd ; <<8 x i8>> [#uses=1]
+ tail call void @llvm.arm.neon.vst3.v8i8(i8* %B, <8 x i8> zeroinitializer, <8 x i8> undef, <8 x i8> undef)
+ ret <8 x i8> %tmp4
+}
diff --git a/test/CodeGen/ARM/2010-05-21-BuildVector.ll b/test/CodeGen/ARM/2010-05-21-BuildVector.ll
new file mode 100644
index 0000000..6b19490
--- /dev/null
+++ b/test/CodeGen/ARM/2010-05-21-BuildVector.ll
@@ -0,0 +1,43 @@
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s
+; Radar 7872877
+
+define arm_apcscc void @test(float* %fltp, i32 %packedValue, float* %table) nounwind {
+entry:
+ %0 = load float* %fltp
+ %1 = insertelement <4 x float> undef, float %0, i32 0
+ %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
+ %3 = shl i32 %packedValue, 16
+ %4 = ashr i32 %3, 30
+ %.sum = add i32 %4, 4
+ %5 = getelementptr inbounds float* %table, i32 %.sum
+;CHECK: vldr.32 s
+ %6 = load float* %5, align 4
+ %tmp11 = insertelement <4 x float> undef, float %6, i32 0
+ %7 = shl i32 %packedValue, 18
+ %8 = ashr i32 %7, 30
+ %.sum12 = add i32 %8, 4
+ %9 = getelementptr inbounds float* %table, i32 %.sum12
+;CHECK: vldr.32 s
+ %10 = load float* %9, align 4
+ %tmp9 = insertelement <4 x float> %tmp11, float %10, i32 1
+ %11 = shl i32 %packedValue, 20
+ %12 = ashr i32 %11, 30
+ %.sum13 = add i32 %12, 4
+ %13 = getelementptr inbounds float* %table, i32 %.sum13
+;CHECK: vldr.32 s
+ %14 = load float* %13, align 4
+ %tmp7 = insertelement <4 x float> %tmp9, float %14, i32 2
+ %15 = shl i32 %packedValue, 22
+ %16 = ashr i32 %15, 30
+ %.sum14 = add i32 %16, 4
+ %17 = getelementptr inbounds float* %table, i32 %.sum14
+;CHECK: vldr.32 s
+ %18 = load float* %17, align 4
+ %tmp5 = insertelement <4 x float> %tmp7, float %18, i32 3
+ %19 = fmul <4 x float> %tmp5, %2
+ %20 = bitcast float* %fltp to i8*
+ tail call void @llvm.arm.neon.vst1.v4f32(i8* %20, <4 x float> %19)
+ ret void
+}
+
+declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>) nounwind
diff --git a/test/CodeGen/ARM/arm-frameaddr.ll b/test/CodeGen/ARM/arm-frameaddr.ll
index 2739860..1c7ac25 100644
--- a/test/CodeGen/ARM/arm-frameaddr.ll
+++ b/test/CodeGen/ARM/arm-frameaddr.ll
@@ -1,10 +1,15 @@
-; RUN: llc < %s -mtriple=arm-apple-darwin | grep mov | grep r7
-; RUN: llc < %s -mtriple=arm-linux-gnueabi | grep mov | grep r11
+; RUN: llc < %s -mtriple=arm-apple-darwin | FileCheck %s -check-prefix=DARWIN
+; RUN: llc < %s -mtriple=arm-linux-gnueabi | FileCheck %s -check-prefix=LINUX
; PR4344
; PR4416
define arm_aapcscc i8* @t() nounwind {
entry:
+; DARWIN: t:
+; DARWIN: mov r0, r7
+
+; LINUX: t:
+; LINUX: mov r0, r11
%0 = call i8* @llvm.frameaddress(i32 0)
ret i8* %0
}
diff --git a/test/CodeGen/ARM/arm-returnaddr.ll b/test/CodeGen/ARM/arm-returnaddr.ll
new file mode 100644
index 0000000..2c8f2ab
--- /dev/null
+++ b/test/CodeGen/ARM/arm-returnaddr.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s -mtriple=arm-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv6-apple-darwin
+; rdar://8015977
+; rdar://8020118
+
+define arm_apcscc i8* @rt0(i32 %x) nounwind readnone {
+entry:
+; CHECK: rt0:
+; CHECK: mov r0, lr
+ %0 = tail call i8* @llvm.returnaddress(i32 0)
+ ret i8* %0
+}
+
+define arm_apcscc i8* @rt2() nounwind readnone {
+entry:
+; CHECK: rt2:
+; CHECK: ldr r0, [r7]
+; CHECK: ldr r0, [r0]
+; CHECK: ldr r0, [r0, #4]
+ %0 = tail call i8* @llvm.returnaddress(i32 2)
+ ret i8* %0
+}
+
+declare i8* @llvm.returnaddress(i32) nounwind readnone
diff --git a/test/CodeGen/ARM/div.ll b/test/CodeGen/ARM/div.ll
index 2f724e7..d833afa 100644
--- a/test/CodeGen/ARM/div.ll
+++ b/test/CodeGen/ARM/div.ll
@@ -1,29 +1,43 @@
-; RUN: llc < %s -march=arm > %t
-; RUN: grep __divsi3 %t
-; RUN: grep __udivsi3 %t
-; RUN: grep __modsi3 %t
-; RUN: grep __umodsi3 %t
+; RUN: llc < %s -march=arm | FileCheck %s -check-prefix=CHECK-ARM
+; RUN: llc < %s -march=arm -mcpu=cortex-m3 \
+; RUN: | FileCheck %s -check-prefix=CHECK-ARMV7M
define i32 @f1(i32 %a, i32 %b) {
entry:
+; CHECK-ARM: f1
+; CHECK-ARM: __divsi3
+; CHECK-ARMV7M: f1
+; CHECK-ARMV7M: sdiv
%tmp1 = sdiv i32 %a, %b ; <i32> [#uses=1]
ret i32 %tmp1
}
define i32 @f2(i32 %a, i32 %b) {
entry:
+; CHECK-ARM: f2
+; CHECK-ARM: __udivsi3
+; CHECK-ARMV7M: f2
+; CHECK-ARMV7M: udiv
%tmp1 = udiv i32 %a, %b ; <i32> [#uses=1]
ret i32 %tmp1
}
define i32 @f3(i32 %a, i32 %b) {
entry:
+; CHECK-ARM: f3
+; CHECK-ARM: __modsi3
+; CHECK-ARMV7M: f3
+; CHECK-ARMV7M: sdiv
%tmp1 = srem i32 %a, %b ; <i32> [#uses=1]
ret i32 %tmp1
}
define i32 @f4(i32 %a, i32 %b) {
entry:
+; CHECK-ARM: f4
+; CHECK-ARM: __umodsi3
+; CHECK-ARMV7M: f4
+; CHECK-ARMV7M: udiv
%tmp1 = urem i32 %a, %b ; <i32> [#uses=1]
ret i32 %tmp1
}
diff --git a/test/CodeGen/ARM/fabss.ll b/test/CodeGen/ARM/fabss.ll
index f03282b..dfc1e0a 100644
--- a/test/CodeGen/ARM/fabss.ll
+++ b/test/CodeGen/ARM/fabss.ll
@@ -24,4 +24,4 @@ declare float @fabsf(float)
; CORTEXA8: test:
; CORTEXA8: vabs.f32 d1, d1
; CORTEXA9: test:
-; CORTEXA9: vabs.f32 s1, s1
+; CORTEXA9: vabs.f32 s0, s0
diff --git a/test/CodeGen/ARM/fadds.ll b/test/CodeGen/ARM/fadds.ll
index 749690e..113f0e2 100644
--- a/test/CodeGen/ARM/fadds.ll
+++ b/test/CodeGen/ARM/fadds.ll
@@ -20,4 +20,4 @@ entry:
; CORTEXA8: test:
; CORTEXA8: vadd.f32 d0, d1, d0
; CORTEXA9: test:
-; CORTEXA9: vadd.f32 s0, s1, s0
+; CORTEXA9: vadd.f32 s0, s0, s1
diff --git a/test/CodeGen/ARM/fdivs.ll b/test/CodeGen/ARM/fdivs.ll
index 0c31495..9af1217 100644
--- a/test/CodeGen/ARM/fdivs.ll
+++ b/test/CodeGen/ARM/fdivs.ll
@@ -20,4 +20,4 @@ entry:
; CORTEXA8: test:
; CORTEXA8: vdiv.f32 s0, s1, s0
; CORTEXA9: test:
-; CORTEXA9: vdiv.f32 s0, s1, s0
+; CORTEXA9: vdiv.f32 s0, s0, s1
diff --git a/test/CodeGen/ARM/fmacs.ll b/test/CodeGen/ARM/fmacs.ll
index f8b47b5..c4ceca9 100644
--- a/test/CodeGen/ARM/fmacs.ll
+++ b/test/CodeGen/ARM/fmacs.ll
@@ -21,4 +21,4 @@ entry:
; CORTEXA8: test:
; CORTEXA8: vmul.f32 d0, d1, d0
; CORTEXA9: test:
-; CORTEXA9: vmla.f32 s2, s1, s0
+; CORTEXA9: vmla.f32 s0, s1, s2
diff --git a/test/CodeGen/ARM/fmscs.ll b/test/CodeGen/ARM/fmscs.ll
index 7a70543..103ce33 100644
--- a/test/CodeGen/ARM/fmscs.ll
+++ b/test/CodeGen/ARM/fmscs.ll
@@ -21,4 +21,4 @@ entry:
; CORTEXA8: test:
; CORTEXA8: vnmls.f32 s2, s1, s0
; CORTEXA9: test:
-; CORTEXA9: vnmls.f32 s2, s1, s0
+; CORTEXA9: vnmls.f32 s0, s1, s2
diff --git a/test/CodeGen/ARM/fmuls.ll b/test/CodeGen/ARM/fmuls.ll
index ef4e3e5..bfafd20 100644
--- a/test/CodeGen/ARM/fmuls.ll
+++ b/test/CodeGen/ARM/fmuls.ll
@@ -20,4 +20,4 @@ entry:
; CORTEXA8: test:
; CORTEXA8: vmul.f32 d0, d1, d0
; CORTEXA9: test:
-; CORTEXA9: vmul.f32 s0, s1, s0
+; CORTEXA9: vmul.f32 s0, s0, s1
diff --git a/test/CodeGen/ARM/fnmscs.ll b/test/CodeGen/ARM/fnmscs.ll
index 6b7cefa..0b47edd 100644
--- a/test/CodeGen/ARM/fnmscs.ll
+++ b/test/CodeGen/ARM/fnmscs.ll
@@ -4,7 +4,7 @@
; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s
define float @test1(float %acc, float %a, float %b) nounwind {
-; CHECK: vnmla.f32 s2, s1, s0
+; CHECK: vnmla.f32 s{{.*}}, s{{.*}}, s{{.*}}
entry:
%0 = fmul float %a, %b
%1 = fsub float -0.0, %0
@@ -13,7 +13,7 @@ entry:
}
define float @test2(float %acc, float %a, float %b) nounwind {
-; CHECK: vnmla.f32 s2, s1, s0
+; CHECK: vnmla.f32 s{{.*}}, s{{.*}}, s{{.*}}
entry:
%0 = fmul float %a, %b
%1 = fmul float -1.0, %0
diff --git a/test/CodeGen/ARM/lsr-on-unrolled-loops.ll b/test/CodeGen/ARM/lsr-on-unrolled-loops.ll
new file mode 100644
index 0000000..2ac4084
--- /dev/null
+++ b/test/CodeGen/ARM/lsr-on-unrolled-loops.ll
@@ -0,0 +1,642 @@
+; RUN: llc -mtriple=thumbv7-apple-darwin10 -mcpu=cortex-a8 < %s | FileCheck %s
+
+; LSR should recognize that this is an unrolled loop which can use
+; constant offset addressing, so that each of the following stores
+; uses the same register.
+
+; CHECK: vstr.32 s0, [r12, #-128]
+; CHECK: vstr.32 s0, [r12, #-96]
+; CHECK: vstr.32 s0, [r12, #-64]
+; CHECK: vstr.32 s0, [r12, #-32]
+; CHECK: vstr.32 s0, [r12]
+; CHECK: vstr.32 s0, [r12, #32]
+; CHECK: vstr.32 s0, [r12, #64]
+; CHECK: vstr.32 s0, [r12, #96]
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32-n32"
+
+%0 = type { %1*, %3*, %6*, i8*, i32, i32, %8*, i32, i32, i32, i32, i32, i32, i32, double, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i8**, i32, i32, i32, i32, i32, [64 x i32]*, [4 x %9*], [4 x %10*], [4 x %10*], i32, %11*, i32, i32, [16 x i8], [16 x i8], [16 x i8], i32, i32, i8, i8, i8, i16, i16, i32, i8, i32, %12*, i32, i32, i32, i32, i8*, i32, [4 x %11*], i32, i32, i32, [10 x i32], i32, i32, i32, i32, i32, %13*, %14*, %15*, %16*, %17*, %18*, %19*, %20*, %21*, %22*, %23* }
+%1 = type { void (%2*)*, void (%2*, i32)*, void (%2*)*, void (%2*, i8*)*, void (%2*)*, i32, %7, i32, i32, i8**, i32, i8**, i32, i32 }
+%2 = type { %1*, %3*, %6*, i8*, i32, i32 }
+%3 = type { i8* (%2*, i32, i32)*, i8* (%2*, i32, i32)*, i8** (%2*, i32, i32, i32)*, [64 x i16]** (%2*, i32, i32, i32)*, %4* (%2*, i32, i32, i32, i32, i32)*, %5* (%2*, i32, i32, i32, i32, i32)*, void (%2*)*, i8** (%2*, %4*, i32, i32, i32)*, [64 x i16]** (%2*, %5*, i32, i32, i32)*, void (%2*, i32)*, void (%2*)*, i32, i32 }
+%4 = type opaque
+%5 = type opaque
+%6 = type { void (%2*)*, i32, i32, i32, i32 }
+%7 = type { [8 x i32], [12 x i32] }
+%8 = type { i8*, i32, void (%0*)*, i32 (%0*)*, void (%0*, i32)*, i32 (%0*, i32)*, void (%0*)* }
+%9 = type { [64 x i16], i32 }
+%10 = type { [17 x i8], [256 x i8], i32 }
+%11 = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, %9*, i8* }
+%12 = type { %12*, i8, i32, i32, i8* }
+%13 = type { void (%0*)*, void (%0*)*, i32 }
+%14 = type { void (%0*, i32)*, void (%0*, i8**, i32*, i32)* }
+%15 = type { void (%0*)*, i32 (%0*)*, void (%0*)*, i32 (%0*, i8***)*, %5** }
+%16 = type { void (%0*, i32)*, void (%0*, i8***, i32*, i32, i8**, i32*, i32)* }
+%17 = type { i32 (%0*)*, void (%0*)*, void (%0*)*, void (%0*)*, i32, i32 }
+%18 = type { void (%0*)*, i32 (%0*)*, i32 (%0*)*, i32, i32, i32, i32 }
+%19 = type { void (%0*)*, i32 (%0*, [64 x i16]**)*, i32 }
+%20 = type { void (%0*)*, [10 x void (%0*, %11*, i16*, i8**, i32)*] }
+%21 = type { void (%0*)*, void (%0*, i8***, i32*, i32, i8**, i32*, i32)*, i32 }
+%22 = type { void (%0*)*, void (%0*, i8***, i32, i8**, i32)* }
+%23 = type { void (%0*, i32)*, void (%0*, i8**, i8**, i32)*, void (%0*)*, void (%0*)* }
+
+define arm_apcscc void @test(%0* nocapture %a0, %11* nocapture %a1, i16* nocapture %a2, i8** nocapture %a3, i32 %a4) nounwind {
+bb:
+ %t = alloca [64 x float], align 4
+ %t5 = getelementptr inbounds %0* %a0, i32 0, i32 65
+ %t6 = load i8** %t5, align 4
+ %t7 = getelementptr inbounds %11* %a1, i32 0, i32 20
+ %t8 = load i8** %t7, align 4
+ br label %bb9
+
+bb9:
+ %t10 = phi i32 [ 0, %bb ], [ %t157, %bb156 ]
+ %t11 = add i32 %t10, 8
+ %t12 = getelementptr [64 x float]* %t, i32 0, i32 %t11
+ %t13 = add i32 %t10, 16
+ %t14 = getelementptr [64 x float]* %t, i32 0, i32 %t13
+ %t15 = add i32 %t10, 24
+ %t16 = getelementptr [64 x float]* %t, i32 0, i32 %t15
+ %t17 = add i32 %t10, 32
+ %t18 = getelementptr [64 x float]* %t, i32 0, i32 %t17
+ %t19 = add i32 %t10, 40
+ %t20 = getelementptr [64 x float]* %t, i32 0, i32 %t19
+ %t21 = add i32 %t10, 48
+ %t22 = getelementptr [64 x float]* %t, i32 0, i32 %t21
+ %t23 = add i32 %t10, 56
+ %t24 = getelementptr [64 x float]* %t, i32 0, i32 %t23
+ %t25 = getelementptr [64 x float]* %t, i32 0, i32 %t10
+ %t26 = shl i32 %t10, 5
+ %t27 = or i32 %t26, 8
+ %t28 = getelementptr i8* %t8, i32 %t27
+ %t29 = bitcast i8* %t28 to float*
+ %t30 = or i32 %t26, 16
+ %t31 = getelementptr i8* %t8, i32 %t30
+ %t32 = bitcast i8* %t31 to float*
+ %t33 = or i32 %t26, 24
+ %t34 = getelementptr i8* %t8, i32 %t33
+ %t35 = bitcast i8* %t34 to float*
+ %t36 = or i32 %t26, 4
+ %t37 = getelementptr i8* %t8, i32 %t36
+ %t38 = bitcast i8* %t37 to float*
+ %t39 = or i32 %t26, 12
+ %t40 = getelementptr i8* %t8, i32 %t39
+ %t41 = bitcast i8* %t40 to float*
+ %t42 = or i32 %t26, 20
+ %t43 = getelementptr i8* %t8, i32 %t42
+ %t44 = bitcast i8* %t43 to float*
+ %t45 = or i32 %t26, 28
+ %t46 = getelementptr i8* %t8, i32 %t45
+ %t47 = bitcast i8* %t46 to float*
+ %t48 = getelementptr i8* %t8, i32 %t26
+ %t49 = bitcast i8* %t48 to float*
+ %t50 = shl i32 %t10, 3
+ %t51 = or i32 %t50, 1
+ %t52 = getelementptr i16* %a2, i32 %t51
+ %t53 = or i32 %t50, 2
+ %t54 = getelementptr i16* %a2, i32 %t53
+ %t55 = or i32 %t50, 3
+ %t56 = getelementptr i16* %a2, i32 %t55
+ %t57 = or i32 %t50, 4
+ %t58 = getelementptr i16* %a2, i32 %t57
+ %t59 = or i32 %t50, 5
+ %t60 = getelementptr i16* %a2, i32 %t59
+ %t61 = or i32 %t50, 6
+ %t62 = getelementptr i16* %a2, i32 %t61
+ %t63 = or i32 %t50, 7
+ %t64 = getelementptr i16* %a2, i32 %t63
+ %t65 = getelementptr i16* %a2, i32 %t50
+ %t66 = load i16* %t52, align 2
+ %t67 = icmp eq i16 %t66, 0
+ %t68 = load i16* %t54, align 2
+ %t69 = icmp eq i16 %t68, 0
+ %t70 = and i1 %t67, %t69
+ br i1 %t70, label %bb71, label %bb91
+
+bb71:
+ %t72 = load i16* %t56, align 2
+ %t73 = icmp eq i16 %t72, 0
+ br i1 %t73, label %bb74, label %bb91
+
+bb74:
+ %t75 = load i16* %t58, align 2
+ %t76 = icmp eq i16 %t75, 0
+ br i1 %t76, label %bb77, label %bb91
+
+bb77:
+ %t78 = load i16* %t60, align 2
+ %t79 = icmp eq i16 %t78, 0
+ br i1 %t79, label %bb80, label %bb91
+
+bb80:
+ %t81 = load i16* %t62, align 2
+ %t82 = icmp eq i16 %t81, 0
+ br i1 %t82, label %bb83, label %bb91
+
+bb83:
+ %t84 = load i16* %t64, align 2
+ %t85 = icmp eq i16 %t84, 0
+ br i1 %t85, label %bb86, label %bb91
+
+bb86:
+ %t87 = load i16* %t65, align 2
+ %t88 = sitofp i16 %t87 to float
+ %t89 = load float* %t49, align 4
+ %t90 = fmul float %t88, %t89
+ store float %t90, float* %t25, align 4
+ store float %t90, float* %t12, align 4
+ store float %t90, float* %t14, align 4
+ store float %t90, float* %t16, align 4
+ store float %t90, float* %t18, align 4
+ store float %t90, float* %t20, align 4
+ store float %t90, float* %t22, align 4
+ store float %t90, float* %t24, align 4
+ br label %bb156
+
+bb91:
+ %t92 = load i16* %t65, align 2
+ %t93 = sitofp i16 %t92 to float
+ %t94 = load float* %t49, align 4
+ %t95 = fmul float %t93, %t94
+ %t96 = sitofp i16 %t68 to float
+ %t97 = load float* %t29, align 4
+ %t98 = fmul float %t96, %t97
+ %t99 = load i16* %t58, align 2
+ %t100 = sitofp i16 %t99 to float
+ %t101 = load float* %t32, align 4
+ %t102 = fmul float %t100, %t101
+ %t103 = load i16* %t62, align 2
+ %t104 = sitofp i16 %t103 to float
+ %t105 = load float* %t35, align 4
+ %t106 = fmul float %t104, %t105
+ %t107 = fadd float %t95, %t102
+ %t108 = fsub float %t95, %t102
+ %t109 = fadd float %t98, %t106
+ %t110 = fsub float %t98, %t106
+ %t111 = fmul float %t110, 0x3FF6A09E60000000
+ %t112 = fsub float %t111, %t109
+ %t113 = fadd float %t107, %t109
+ %t114 = fsub float %t107, %t109
+ %t115 = fadd float %t108, %t112
+ %t116 = fsub float %t108, %t112
+ %t117 = sitofp i16 %t66 to float
+ %t118 = load float* %t38, align 4
+ %t119 = fmul float %t117, %t118
+ %t120 = load i16* %t56, align 2
+ %t121 = sitofp i16 %t120 to float
+ %t122 = load float* %t41, align 4
+ %t123 = fmul float %t121, %t122
+ %t124 = load i16* %t60, align 2
+ %t125 = sitofp i16 %t124 to float
+ %t126 = load float* %t44, align 4
+ %t127 = fmul float %t125, %t126
+ %t128 = load i16* %t64, align 2
+ %t129 = sitofp i16 %t128 to float
+ %t130 = load float* %t47, align 4
+ %t131 = fmul float %t129, %t130
+ %t132 = fadd float %t127, %t123
+ %t133 = fsub float %t127, %t123
+ %t134 = fadd float %t119, %t131
+ %t135 = fsub float %t119, %t131
+ %t136 = fadd float %t134, %t132
+ %t137 = fsub float %t134, %t132
+ %t138 = fmul float %t137, 0x3FF6A09E60000000
+ %t139 = fadd float %t133, %t135
+ %t140 = fmul float %t139, 0x3FFD906BC0000000
+ %t141 = fmul float %t135, 0x3FF1517A80000000
+ %t142 = fsub float %t141, %t140
+ %t143 = fmul float %t133, 0xC004E7AEA0000000
+ %t144 = fadd float %t143, %t140
+ %t145 = fsub float %t144, %t136
+ %t146 = fsub float %t138, %t145
+ %t147 = fadd float %t142, %t146
+ %t148 = fadd float %t113, %t136
+ store float %t148, float* %t25, align 4
+ %t149 = fsub float %t113, %t136
+ store float %t149, float* %t24, align 4
+ %t150 = fadd float %t115, %t145
+ store float %t150, float* %t12, align 4
+ %t151 = fsub float %t115, %t145
+ store float %t151, float* %t22, align 4
+ %t152 = fadd float %t116, %t146
+ store float %t152, float* %t14, align 4
+ %t153 = fsub float %t116, %t146
+ store float %t153, float* %t20, align 4
+ %t154 = fadd float %t114, %t147
+ store float %t154, float* %t18, align 4
+ %t155 = fsub float %t114, %t147
+ store float %t155, float* %t16, align 4
+ br label %bb156
+
+bb156:
+ %t157 = add i32 %t10, 1
+ %t158 = icmp eq i32 %t157, 8
+ br i1 %t158, label %bb159, label %bb9
+
+bb159:
+ %t160 = add i32 %a4, 7
+ %t161 = add i32 %a4, 1
+ %t162 = add i32 %a4, 6
+ %t163 = add i32 %a4, 2
+ %t164 = add i32 %a4, 5
+ %t165 = add i32 %a4, 4
+ %t166 = add i32 %a4, 3
+ br label %bb167
+
+bb167:
+ %t168 = phi i32 [ 0, %bb159 ], [ %t293, %bb167 ]
+ %t169 = getelementptr i8** %a3, i32 %t168
+ %t170 = shl i32 %t168, 3
+ %t171 = or i32 %t170, 4
+ %t172 = getelementptr [64 x float]* %t, i32 0, i32 %t171
+ %t173 = or i32 %t170, 2
+ %t174 = getelementptr [64 x float]* %t, i32 0, i32 %t173
+ %t175 = or i32 %t170, 6
+ %t176 = getelementptr [64 x float]* %t, i32 0, i32 %t175
+ %t177 = or i32 %t170, 5
+ %t178 = getelementptr [64 x float]* %t, i32 0, i32 %t177
+ %t179 = or i32 %t170, 3
+ %t180 = getelementptr [64 x float]* %t, i32 0, i32 %t179
+ %t181 = or i32 %t170, 1
+ %t182 = getelementptr [64 x float]* %t, i32 0, i32 %t181
+ %t183 = or i32 %t170, 7
+ %t184 = getelementptr [64 x float]* %t, i32 0, i32 %t183
+ %t185 = getelementptr [64 x float]* %t, i32 0, i32 %t170
+ %t186 = load i8** %t169, align 4
+ %t187 = getelementptr inbounds i8* %t186, i32 %a4
+ %t188 = load float* %t185, align 4
+ %t189 = load float* %t172, align 4
+ %t190 = fadd float %t188, %t189
+ %t191 = fsub float %t188, %t189
+ %t192 = load float* %t174, align 4
+ %t193 = load float* %t176, align 4
+ %t194 = fadd float %t192, %t193
+ %t195 = fsub float %t192, %t193
+ %t196 = fmul float %t195, 0x3FF6A09E60000000
+ %t197 = fsub float %t196, %t194
+ %t198 = fadd float %t190, %t194
+ %t199 = fsub float %t190, %t194
+ %t200 = fadd float %t191, %t197
+ %t201 = fsub float %t191, %t197
+ %t202 = load float* %t178, align 4
+ %t203 = load float* %t180, align 4
+ %t204 = fadd float %t202, %t203
+ %t205 = fsub float %t202, %t203
+ %t206 = load float* %t182, align 4
+ %t207 = load float* %t184, align 4
+ %t208 = fadd float %t206, %t207
+ %t209 = fsub float %t206, %t207
+ %t210 = fadd float %t208, %t204
+ %t211 = fsub float %t208, %t204
+ %t212 = fmul float %t211, 0x3FF6A09E60000000
+ %t213 = fadd float %t205, %t209
+ %t214 = fmul float %t213, 0x3FFD906BC0000000
+ %t215 = fmul float %t209, 0x3FF1517A80000000
+ %t216 = fsub float %t215, %t214
+ %t217 = fmul float %t205, 0xC004E7AEA0000000
+ %t218 = fadd float %t217, %t214
+ %t219 = fsub float %t218, %t210
+ %t220 = fsub float %t212, %t219
+ %t221 = fadd float %t216, %t220
+ %t222 = fadd float %t198, %t210
+ %t223 = fptosi float %t222 to i32
+ %t224 = add nsw i32 %t223, 4
+ %t225 = lshr i32 %t224, 3
+ %t226 = and i32 %t225, 1023
+ %t227 = add i32 %t226, 128
+ %t228 = getelementptr inbounds i8* %t6, i32 %t227
+ %t229 = load i8* %t228, align 1
+ store i8 %t229, i8* %t187, align 1
+ %t230 = fsub float %t198, %t210
+ %t231 = fptosi float %t230 to i32
+ %t232 = add nsw i32 %t231, 4
+ %t233 = lshr i32 %t232, 3
+ %t234 = and i32 %t233, 1023
+ %t235 = add i32 %t234, 128
+ %t236 = getelementptr inbounds i8* %t6, i32 %t235
+ %t237 = load i8* %t236, align 1
+ %t238 = getelementptr inbounds i8* %t186, i32 %t160
+ store i8 %t237, i8* %t238, align 1
+ %t239 = fadd float %t200, %t219
+ %t240 = fptosi float %t239 to i32
+ %t241 = add nsw i32 %t240, 4
+ %t242 = lshr i32 %t241, 3
+ %t243 = and i32 %t242, 1023
+ %t244 = add i32 %t243, 128
+ %t245 = getelementptr inbounds i8* %t6, i32 %t244
+ %t246 = load i8* %t245, align 1
+ %t247 = getelementptr inbounds i8* %t186, i32 %t161
+ store i8 %t246, i8* %t247, align 1
+ %t248 = fsub float %t200, %t219
+ %t249 = fptosi float %t248 to i32
+ %t250 = add nsw i32 %t249, 4
+ %t251 = lshr i32 %t250, 3
+ %t252 = and i32 %t251, 1023
+ %t253 = add i32 %t252, 128
+ %t254 = getelementptr inbounds i8* %t6, i32 %t253
+ %t255 = load i8* %t254, align 1
+ %t256 = getelementptr inbounds i8* %t186, i32 %t162
+ store i8 %t255, i8* %t256, align 1
+ %t257 = fadd float %t201, %t220
+ %t258 = fptosi float %t257 to i32
+ %t259 = add nsw i32 %t258, 4
+ %t260 = lshr i32 %t259, 3
+ %t261 = and i32 %t260, 1023
+ %t262 = add i32 %t261, 128
+ %t263 = getelementptr inbounds i8* %t6, i32 %t262
+ %t264 = load i8* %t263, align 1
+ %t265 = getelementptr inbounds i8* %t186, i32 %t163
+ store i8 %t264, i8* %t265, align 1
+ %t266 = fsub float %t201, %t220
+ %t267 = fptosi float %t266 to i32
+ %t268 = add nsw i32 %t267, 4
+ %t269 = lshr i32 %t268, 3
+ %t270 = and i32 %t269, 1023
+ %t271 = add i32 %t270, 128
+ %t272 = getelementptr inbounds i8* %t6, i32 %t271
+ %t273 = load i8* %t272, align 1
+ %t274 = getelementptr inbounds i8* %t186, i32 %t164
+ store i8 %t273, i8* %t274, align 1
+ %t275 = fadd float %t199, %t221
+ %t276 = fptosi float %t275 to i32
+ %t277 = add nsw i32 %t276, 4
+ %t278 = lshr i32 %t277, 3
+ %t279 = and i32 %t278, 1023
+ %t280 = add i32 %t279, 128
+ %t281 = getelementptr inbounds i8* %t6, i32 %t280
+ %t282 = load i8* %t281, align 1
+ %t283 = getelementptr inbounds i8* %t186, i32 %t165
+ store i8 %t282, i8* %t283, align 1
+ %t284 = fsub float %t199, %t221
+ %t285 = fptosi float %t284 to i32
+ %t286 = add nsw i32 %t285, 4
+ %t287 = lshr i32 %t286, 3
+ %t288 = and i32 %t287, 1023
+ %t289 = add i32 %t288, 128
+ %t290 = getelementptr inbounds i8* %t6, i32 %t289
+ %t291 = load i8* %t290, align 1
+ %t292 = getelementptr inbounds i8* %t186, i32 %t166
+ store i8 %t291, i8* %t292, align 1
+ %t293 = add nsw i32 %t168, 1
+ %t294 = icmp eq i32 %t293, 8
+ br i1 %t294, label %bb295, label %bb167
+
+bb295:
+ ret void
+}
+
+%struct.ct_data_s = type { %union.anon, %union.anon }
+%struct.gz_header = type { i32, i32, i32, i32, i8*, i32, i32, i8*, i32, i8*, i32, i32, i32 }
+%struct.internal_state = type { %struct.z_stream*, i32, i8*, i32, i8*, i32, i32, %struct.gz_header*, i32, i8, i32, i32, i32, i32, i8*, i32, i16*, i16*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [573 x %struct.ct_data_s], [61 x %struct.ct_data_s], [39 x %struct.ct_data_s], %struct.tree_desc_s, %struct.tree_desc_s, %struct.tree_desc_s, [16 x i16], [573 x i32], i32, i32, [573 x i8], i8*, i32, i32, i16*, i32, i32, i32, i32, i16, i32 }
+%struct.static_tree_desc = type { i32 }
+%struct.tree_desc_s = type { %struct.ct_data_s*, i32, %struct.static_tree_desc* }
+%struct.z_stream = type { i8*, i32, i32, i8*, i32, i32, i8*, %struct.internal_state*, i8* (i8*, i32, i32)*, void (i8*, i8*)*, i8*, i32, i32, i32 }
+%union.anon = type { i16 }
+
+define arm_apcscc i32 @longest_match(%struct.internal_state* %s, i32 %cur_match) nounwind optsize {
+entry:
+ %0 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 31 ; <i32*> [#uses=1]
+ %1 = load i32* %0, align 4 ; <i32> [#uses=2]
+ %2 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 14 ; <i8**> [#uses=1]
+ %3 = load i8** %2, align 4 ; <i8*> [#uses=27]
+ %4 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 27 ; <i32*> [#uses=1]
+ %5 = load i32* %4, align 4 ; <i32> [#uses=17]
+ %6 = getelementptr inbounds i8* %3, i32 %5 ; <i8*> [#uses=1]
+ %7 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 30 ; <i32*> [#uses=1]
+ %8 = load i32* %7, align 4 ; <i32> [#uses=4]
+ %9 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 36 ; <i32*> [#uses=1]
+ %10 = load i32* %9, align 4 ; <i32> [#uses=2]
+ %11 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 11 ; <i32*> [#uses=1]
+ %12 = load i32* %11, align 4 ; <i32> [#uses=2]
+ %13 = add i32 %12, -262 ; <i32> [#uses=1]
+ %14 = icmp ugt i32 %5, %13 ; <i1> [#uses=1]
+ br i1 %14, label %bb, label %bb2
+
+bb: ; preds = %entry
+ %15 = add i32 %5, 262 ; <i32> [#uses=1]
+ %16 = sub i32 %15, %12 ; <i32> [#uses=1]
+ br label %bb2
+
+bb2: ; preds = %bb, %entry
+ %iftmp.48.0 = phi i32 [ %16, %bb ], [ 0, %entry ] ; <i32> [#uses=1]
+ %17 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 16 ; <i16**> [#uses=1]
+ %18 = load i16** %17, align 4 ; <i16*> [#uses=1]
+ %19 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 13 ; <i32*> [#uses=1]
+ %20 = load i32* %19, align 4 ; <i32> [#uses=1]
+ %.sum = add i32 %5, 258 ; <i32> [#uses=2]
+ %21 = getelementptr inbounds i8* %3, i32 %.sum ; <i8*> [#uses=1]
+ %22 = add nsw i32 %5, -1 ; <i32> [#uses=1]
+ %.sum30 = add i32 %22, %8 ; <i32> [#uses=1]
+ %23 = getelementptr inbounds i8* %3, i32 %.sum30 ; <i8*> [#uses=1]
+ %24 = load i8* %23, align 1 ; <i8> [#uses=1]
+ %.sum31 = add i32 %8, %5 ; <i32> [#uses=1]
+ %25 = getelementptr inbounds i8* %3, i32 %.sum31 ; <i8*> [#uses=1]
+ %26 = load i8* %25, align 1 ; <i8> [#uses=1]
+ %27 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 35 ; <i32*> [#uses=1]
+ %28 = load i32* %27, align 4 ; <i32> [#uses=1]
+ %29 = lshr i32 %1, 2 ; <i32> [#uses=1]
+ %30 = icmp ult i32 %8, %28 ; <i1> [#uses=1]
+ %. = select i1 %30, i32 %1, i32 %29 ; <i32> [#uses=1]
+ %31 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 29 ; <i32*> [#uses=1]
+ %32 = load i32* %31, align 4 ; <i32> [#uses=4]
+ %33 = icmp ugt i32 %10, %32 ; <i1> [#uses=1]
+ %nice_match.0.ph = select i1 %33, i32 %32, i32 %10 ; <i32> [#uses=1]
+ %34 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 28 ; <i32*> [#uses=1]
+ %35 = ptrtoint i8* %21 to i32 ; <i32> [#uses=1]
+ %36 = add nsw i32 %5, 257 ; <i32> [#uses=1]
+ %tmp81 = add i32 %., -1 ; <i32> [#uses=1]
+ br label %bb6
+
+bb6: ; preds = %bb24, %bb2
+ %indvar78 = phi i32 [ 0, %bb2 ], [ %indvar.next79, %bb24 ] ; <i32> [#uses=2]
+ %best_len.2 = phi i32 [ %8, %bb2 ], [ %best_len.0, %bb24 ] ; <i32> [#uses=8]
+ %scan_end1.1 = phi i8 [ %24, %bb2 ], [ %scan_end1.0, %bb24 ] ; <i8> [#uses=6]
+ %cur_match_addr.0 = phi i32 [ %cur_match, %bb2 ], [ %90, %bb24 ] ; <i32> [#uses=14]
+ %scan_end.1 = phi i8 [ %26, %bb2 ], [ %scan_end.0, %bb24 ] ; <i8> [#uses=6]
+ %37 = getelementptr inbounds i8* %3, i32 %cur_match_addr.0 ; <i8*> [#uses=1]
+ %.sum32 = add i32 %cur_match_addr.0, %best_len.2 ; <i32> [#uses=1]
+ %38 = getelementptr inbounds i8* %3, i32 %.sum32 ; <i8*> [#uses=1]
+ %39 = load i8* %38, align 1 ; <i8> [#uses=1]
+ %40 = icmp eq i8 %39, %scan_end.1 ; <i1> [#uses=1]
+ br i1 %40, label %bb7, label %bb23
+
+bb7: ; preds = %bb6
+ %41 = add nsw i32 %best_len.2, -1 ; <i32> [#uses=1]
+ %.sum33 = add i32 %41, %cur_match_addr.0 ; <i32> [#uses=1]
+ %42 = getelementptr inbounds i8* %3, i32 %.sum33 ; <i8*> [#uses=1]
+ %43 = load i8* %42, align 1 ; <i8> [#uses=1]
+ %44 = icmp eq i8 %43, %scan_end1.1 ; <i1> [#uses=1]
+ br i1 %44, label %bb8, label %bb23
+
+bb8: ; preds = %bb7
+ %45 = load i8* %37, align 1 ; <i8> [#uses=1]
+ %46 = load i8* %6, align 1 ; <i8> [#uses=1]
+ %47 = icmp eq i8 %45, %46 ; <i1> [#uses=1]
+ br i1 %47, label %bb9, label %bb23
+
+bb9: ; preds = %bb8
+ %.sum34 = add i32 %cur_match_addr.0, 1 ; <i32> [#uses=1]
+ %48 = getelementptr inbounds i8* %3, i32 %.sum34 ; <i8*> [#uses=1]
+ %49 = load i8* %48, align 1 ; <i8> [#uses=1]
+ %.sum88 = add i32 %5, 1 ; <i32> [#uses=1]
+ %50 = getelementptr inbounds i8* %3, i32 %.sum88 ; <i8*> [#uses=1]
+ %51 = load i8* %50, align 1 ; <i8> [#uses=1]
+ %52 = icmp eq i8 %49, %51 ; <i1> [#uses=1]
+ br i1 %52, label %bb10, label %bb23
+
+bb10: ; preds = %bb9
+ %tmp39 = add i32 %cur_match_addr.0, 10 ; <i32> [#uses=1]
+ %tmp41 = add i32 %cur_match_addr.0, 9 ; <i32> [#uses=1]
+ %tmp44 = add i32 %cur_match_addr.0, 8 ; <i32> [#uses=1]
+ %tmp47 = add i32 %cur_match_addr.0, 7 ; <i32> [#uses=1]
+ %tmp50 = add i32 %cur_match_addr.0, 6 ; <i32> [#uses=1]
+ %tmp53 = add i32 %cur_match_addr.0, 5 ; <i32> [#uses=1]
+ %tmp56 = add i32 %cur_match_addr.0, 4 ; <i32> [#uses=1]
+ %tmp59 = add i32 %cur_match_addr.0, 3 ; <i32> [#uses=1]
+ br label %bb11
+
+bb11: ; preds = %bb18, %bb10
+ %indvar = phi i32 [ %indvar.next, %bb18 ], [ 0, %bb10 ] ; <i32> [#uses=2]
+ %tmp = shl i32 %indvar, 3 ; <i32> [#uses=16]
+ %tmp40 = add i32 %tmp39, %tmp ; <i32> [#uses=1]
+ %scevgep = getelementptr i8* %3, i32 %tmp40 ; <i8*> [#uses=1]
+ %tmp42 = add i32 %tmp41, %tmp ; <i32> [#uses=1]
+ %scevgep43 = getelementptr i8* %3, i32 %tmp42 ; <i8*> [#uses=1]
+ %tmp45 = add i32 %tmp44, %tmp ; <i32> [#uses=1]
+ %scevgep46 = getelementptr i8* %3, i32 %tmp45 ; <i8*> [#uses=1]
+ %tmp48 = add i32 %tmp47, %tmp ; <i32> [#uses=1]
+ %scevgep49 = getelementptr i8* %3, i32 %tmp48 ; <i8*> [#uses=1]
+ %tmp51 = add i32 %tmp50, %tmp ; <i32> [#uses=1]
+ %scevgep52 = getelementptr i8* %3, i32 %tmp51 ; <i8*> [#uses=1]
+ %tmp54 = add i32 %tmp53, %tmp ; <i32> [#uses=1]
+ %scevgep55 = getelementptr i8* %3, i32 %tmp54 ; <i8*> [#uses=1]
+ %tmp60 = add i32 %tmp59, %tmp ; <i32> [#uses=1]
+ %scevgep61 = getelementptr i8* %3, i32 %tmp60 ; <i8*> [#uses=1]
+ %tmp62 = add i32 %tmp, 10 ; <i32> [#uses=1]
+ %.sum89 = add i32 %5, %tmp62 ; <i32> [#uses=2]
+ %scevgep63 = getelementptr i8* %3, i32 %.sum89 ; <i8*> [#uses=2]
+ %tmp64 = add i32 %tmp, 9 ; <i32> [#uses=1]
+ %.sum90 = add i32 %5, %tmp64 ; <i32> [#uses=1]
+ %scevgep65 = getelementptr i8* %3, i32 %.sum90 ; <i8*> [#uses=2]
+ %tmp66 = add i32 %tmp, 8 ; <i32> [#uses=1]
+ %.sum91 = add i32 %5, %tmp66 ; <i32> [#uses=1]
+ %scevgep67 = getelementptr i8* %3, i32 %.sum91 ; <i8*> [#uses=2]
+ %tmp6883 = or i32 %tmp, 7 ; <i32> [#uses=1]
+ %.sum92 = add i32 %5, %tmp6883 ; <i32> [#uses=1]
+ %scevgep69 = getelementptr i8* %3, i32 %.sum92 ; <i8*> [#uses=2]
+ %tmp7084 = or i32 %tmp, 6 ; <i32> [#uses=1]
+ %.sum93 = add i32 %5, %tmp7084 ; <i32> [#uses=1]
+ %scevgep71 = getelementptr i8* %3, i32 %.sum93 ; <i8*> [#uses=2]
+ %tmp7285 = or i32 %tmp, 5 ; <i32> [#uses=1]
+ %.sum94 = add i32 %5, %tmp7285 ; <i32> [#uses=1]
+ %scevgep73 = getelementptr i8* %3, i32 %.sum94 ; <i8*> [#uses=2]
+ %tmp7486 = or i32 %tmp, 4 ; <i32> [#uses=1]
+ %.sum95 = add i32 %5, %tmp7486 ; <i32> [#uses=1]
+ %scevgep75 = getelementptr i8* %3, i32 %.sum95 ; <i8*> [#uses=2]
+ %tmp7687 = or i32 %tmp, 3 ; <i32> [#uses=1]
+ %.sum96 = add i32 %5, %tmp7687 ; <i32> [#uses=1]
+ %scevgep77 = getelementptr i8* %3, i32 %.sum96 ; <i8*> [#uses=2]
+ %53 = load i8* %scevgep77, align 1 ; <i8> [#uses=1]
+ %54 = load i8* %scevgep61, align 1 ; <i8> [#uses=1]
+ %55 = icmp eq i8 %53, %54 ; <i1> [#uses=1]
+ br i1 %55, label %bb12, label %bb20
+
+bb12: ; preds = %bb11
+ %tmp57 = add i32 %tmp56, %tmp ; <i32> [#uses=1]
+ %scevgep58 = getelementptr i8* %3, i32 %tmp57 ; <i8*> [#uses=1]
+ %56 = load i8* %scevgep75, align 1 ; <i8> [#uses=1]
+ %57 = load i8* %scevgep58, align 1 ; <i8> [#uses=1]
+ %58 = icmp eq i8 %56, %57 ; <i1> [#uses=1]
+ br i1 %58, label %bb13, label %bb20
+
+bb13: ; preds = %bb12
+ %59 = load i8* %scevgep73, align 1 ; <i8> [#uses=1]
+ %60 = load i8* %scevgep55, align 1 ; <i8> [#uses=1]
+ %61 = icmp eq i8 %59, %60 ; <i1> [#uses=1]
+ br i1 %61, label %bb14, label %bb20
+
+bb14: ; preds = %bb13
+ %62 = load i8* %scevgep71, align 1 ; <i8> [#uses=1]
+ %63 = load i8* %scevgep52, align 1 ; <i8> [#uses=1]
+ %64 = icmp eq i8 %62, %63 ; <i1> [#uses=1]
+ br i1 %64, label %bb15, label %bb20
+
+bb15: ; preds = %bb14
+ %65 = load i8* %scevgep69, align 1 ; <i8> [#uses=1]
+ %66 = load i8* %scevgep49, align 1 ; <i8> [#uses=1]
+ %67 = icmp eq i8 %65, %66 ; <i1> [#uses=1]
+ br i1 %67, label %bb16, label %bb20
+
+bb16: ; preds = %bb15
+ %68 = load i8* %scevgep67, align 1 ; <i8> [#uses=1]
+ %69 = load i8* %scevgep46, align 1 ; <i8> [#uses=1]
+ %70 = icmp eq i8 %68, %69 ; <i1> [#uses=1]
+ br i1 %70, label %bb17, label %bb20
+
+bb17: ; preds = %bb16
+ %71 = load i8* %scevgep65, align 1 ; <i8> [#uses=1]
+ %72 = load i8* %scevgep43, align 1 ; <i8> [#uses=1]
+ %73 = icmp eq i8 %71, %72 ; <i1> [#uses=1]
+ br i1 %73, label %bb18, label %bb20
+
+bb18: ; preds = %bb17
+ %74 = load i8* %scevgep63, align 1 ; <i8> [#uses=1]
+ %75 = load i8* %scevgep, align 1 ; <i8> [#uses=1]
+ %76 = icmp eq i8 %74, %75 ; <i1> [#uses=1]
+ %77 = icmp slt i32 %.sum89, %.sum ; <i1> [#uses=1]
+ %or.cond = and i1 %76, %77 ; <i1> [#uses=1]
+ %indvar.next = add i32 %indvar, 1 ; <i32> [#uses=1]
+ br i1 %or.cond, label %bb11, label %bb20
+
+bb20: ; preds = %bb18, %bb17, %bb16, %bb15, %bb14, %bb13, %bb12, %bb11
+ %scan.3 = phi i8* [ %scevgep77, %bb11 ], [ %scevgep75, %bb12 ], [ %scevgep73, %bb13 ], [ %scevgep71, %bb14 ], [ %scevgep69, %bb15 ], [ %scevgep67, %bb16 ], [ %scevgep65, %bb17 ], [ %scevgep63, %bb18 ] ; <i8*> [#uses=1]
+ %78 = ptrtoint i8* %scan.3 to i32 ; <i32> [#uses=1]
+ %79 = sub nsw i32 %78, %35 ; <i32> [#uses=2]
+ %80 = add i32 %79, 258 ; <i32> [#uses=5]
+ %81 = icmp sgt i32 %80, %best_len.2 ; <i1> [#uses=1]
+ br i1 %81, label %bb21, label %bb23
+
+bb21: ; preds = %bb20
+ store i32 %cur_match_addr.0, i32* %34, align 4
+ %82 = icmp slt i32 %80, %nice_match.0.ph ; <i1> [#uses=1]
+ br i1 %82, label %bb22, label %bb25
+
+bb22: ; preds = %bb21
+ %.sum37 = add i32 %36, %79 ; <i32> [#uses=1]
+ %83 = getelementptr inbounds i8* %3, i32 %.sum37 ; <i8*> [#uses=1]
+ %84 = load i8* %83, align 1 ; <i8> [#uses=1]
+ %.sum38 = add i32 %80, %5 ; <i32> [#uses=1]
+ %85 = getelementptr inbounds i8* %3, i32 %.sum38 ; <i8*> [#uses=1]
+ %86 = load i8* %85, align 1 ; <i8> [#uses=1]
+ br label %bb23
+
+bb23: ; preds = %bb22, %bb20, %bb9, %bb8, %bb7, %bb6
+ %best_len.0 = phi i32 [ %best_len.2, %bb6 ], [ %best_len.2, %bb7 ], [ %best_len.2, %bb8 ], [ %best_len.2, %bb9 ], [ %80, %bb22 ], [ %best_len.2, %bb20 ] ; <i32> [#uses=3]
+ %scan_end1.0 = phi i8 [ %scan_end1.1, %bb6 ], [ %scan_end1.1, %bb7 ], [ %scan_end1.1, %bb8 ], [ %scan_end1.1, %bb9 ], [ %84, %bb22 ], [ %scan_end1.1, %bb20 ] ; <i8> [#uses=1]
+ %scan_end.0 = phi i8 [ %scan_end.1, %bb6 ], [ %scan_end.1, %bb7 ], [ %scan_end.1, %bb8 ], [ %scan_end.1, %bb9 ], [ %86, %bb22 ], [ %scan_end.1, %bb20 ] ; <i8> [#uses=1]
+ %87 = and i32 %cur_match_addr.0, %20 ; <i32> [#uses=1]
+ %88 = getelementptr inbounds i16* %18, i32 %87 ; <i16*> [#uses=1]
+ %89 = load i16* %88, align 2 ; <i16> [#uses=1]
+ %90 = zext i16 %89 to i32 ; <i32> [#uses=2]
+ %91 = icmp ugt i32 %90, %iftmp.48.0 ; <i1> [#uses=1]
+ br i1 %91, label %bb24, label %bb25
+
+bb24: ; preds = %bb23
+
+; LSR should use count-down iteration to avoid requiring the trip count
+; in a register, and it shouldn't require any reloads here.
+
+; CHECK: sub.w r9, r9, #1
+; CHECK-NEXT: cmp.w r9, #0
+; CHECK-NEXT: bne.w
+
+ %92 = icmp eq i32 %tmp81, %indvar78 ; <i1> [#uses=1]
+ %indvar.next79 = add i32 %indvar78, 1 ; <i32> [#uses=1]
+ br i1 %92, label %bb25, label %bb6
+
+bb25: ; preds = %bb24, %bb23, %bb21
+ %best_len.1 = phi i32 [ %best_len.0, %bb23 ], [ %best_len.0, %bb24 ], [ %80, %bb21 ] ; <i32> [#uses=2]
+ %93 = icmp ugt i32 %best_len.1, %32 ; <i1> [#uses=1]
+ %merge = select i1 %93, i32 %32, i32 %best_len.1 ; <i32> [#uses=1]
+ ret i32 %merge
+}
diff --git a/test/CodeGen/ARM/mul_const.ll b/test/CodeGen/ARM/mul_const.ll
index 93188cd..8c10246 100644
--- a/test/CodeGen/ARM/mul_const.ll
+++ b/test/CodeGen/ARM/mul_const.ll
@@ -1,17 +1,43 @@
; RUN: llc < %s -march=arm | FileCheck %s
-define i32 @t1(i32 %v) nounwind readnone {
+define i32 @t9(i32 %v) nounwind readnone {
entry:
-; CHECK: t1:
+; CHECK: t9:
; CHECK: add r0, r0, r0, lsl #3
%0 = mul i32 %v, 9
ret i32 %0
}
-define i32 @t2(i32 %v) nounwind readnone {
+define i32 @t7(i32 %v) nounwind readnone {
entry:
-; CHECK: t2:
+; CHECK: t7:
; CHECK: rsb r0, r0, r0, lsl #3
%0 = mul i32 %v, 7
ret i32 %0
}
+
+define i32 @t5(i32 %v) nounwind readnone {
+entry:
+; CHECK: t5:
+; CHECK: add r0, r0, r0, lsl #2
+ %0 = mul i32 %v, 5
+ ret i32 %0
+}
+
+define i32 @t3(i32 %v) nounwind readnone {
+entry:
+; CHECK: t3:
+; CHECK: add r0, r0, r0, lsl #1
+ %0 = mul i32 %v, 3
+ ret i32 %0
+}
+
+define i32 @t12288(i32 %v) nounwind readnone {
+entry:
+; CHECK: t12288:
+; CHECK: add r0, r0, r0, lsl #1
+; CHECK: mov r0, r0, lsl #12
+ %0 = mul i32 %v, 12288
+ ret i32 %0
+}
+
diff --git a/test/CodeGen/ARM/reg_sequence.ll b/test/CodeGen/ARM/reg_sequence.ll
new file mode 100644
index 0000000..3ba82cc
--- /dev/null
+++ b/test/CodeGen/ARM/reg_sequence.ll
@@ -0,0 +1,348 @@
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s
+; Implementing vld / vst as REG_SEQUENCE eliminates the extra vmov's.
+
+%struct.int16x8_t = type { <8 x i16> }
+%struct.int32x4_t = type { <4 x i32> }
+%struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
+%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
+%struct.__neon_int16x8x2_t = type { <8 x i16>, <8 x i16> }
+%struct.__neon_int32x4x2_t = type { <4 x i32>, <4 x i32> }
+
+define arm_apcscc void @t1(i16* %i_ptr, i16* %o_ptr, %struct.int32x4_t* nocapture %vT0ptr, %struct.int32x4_t* nocapture %vT1ptr) nounwind {
+entry:
+; CHECK: t1:
+; CHECK: vld1.16
+; CHECK-NOT: vmov d
+; CHECK: vmovl.s16
+; CHECK: vshrn.i32
+; CHECK: vshrn.i32
+; CHECK-NOT: vmov d
+; CHECK-NEXT: vst1.16
+ %0 = getelementptr inbounds %struct.int32x4_t* %vT0ptr, i32 0, i32 0 ; <<4 x i32>*> [#uses=1]
+ %1 = load <4 x i32>* %0, align 16 ; <<4 x i32>> [#uses=1]
+ %2 = getelementptr inbounds %struct.int32x4_t* %vT1ptr, i32 0, i32 0 ; <<4 x i32>*> [#uses=1]
+ %3 = load <4 x i32>* %2, align 16 ; <<4 x i32>> [#uses=1]
+ %4 = bitcast i16* %i_ptr to i8* ; <i8*> [#uses=1]
+ %5 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %4) ; <<8 x i16>> [#uses=1]
+ %6 = bitcast <8 x i16> %5 to <2 x double> ; <<2 x double>> [#uses=2]
+ %7 = extractelement <2 x double> %6, i32 0 ; <double> [#uses=1]
+ %8 = bitcast double %7 to <4 x i16> ; <<4 x i16>> [#uses=1]
+ %9 = tail call <4 x i32> @llvm.arm.neon.vmovls.v4i32(<4 x i16> %8) ; <<4 x i32>> [#uses=1]
+ %10 = extractelement <2 x double> %6, i32 1 ; <double> [#uses=1]
+ %11 = bitcast double %10 to <4 x i16> ; <<4 x i16>> [#uses=1]
+ %12 = tail call <4 x i32> @llvm.arm.neon.vmovls.v4i32(<4 x i16> %11) ; <<4 x i32>> [#uses=1]
+ %13 = mul <4 x i32> %1, %9 ; <<4 x i32>> [#uses=1]
+ %14 = mul <4 x i32> %3, %12 ; <<4 x i32>> [#uses=1]
+ %15 = tail call <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32> %13, <4 x i32> <i32 -12, i32 -12, i32 -12, i32 -12>) ; <<4 x i16>> [#uses=1]
+ %16 = tail call <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32> %14, <4 x i32> <i32 -12, i32 -12, i32 -12, i32 -12>) ; <<4 x i16>> [#uses=1]
+ %17 = shufflevector <4 x i16> %15, <4 x i16> %16, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; <<8 x i16>> [#uses=1]
+ %18 = bitcast i16* %o_ptr to i8* ; <i8*> [#uses=1]
+ tail call void @llvm.arm.neon.vst1.v8i16(i8* %18, <8 x i16> %17)
+ ret void
+}
+
+define arm_apcscc void @t2(i16* %i_ptr, i16* %o_ptr, %struct.int16x8_t* nocapture %vT0ptr, %struct.int16x8_t* nocapture %vT1ptr) nounwind {
+entry:
+; CHECK: t2:
+; CHECK: vld1.16
+; CHECK: vld1.16
+; CHECK-NOT: vmov
+; CHECK: vmul.i16
+; CHECK: vmul.i16
+; CHECK-NOT: vmov
+; CHECK: vst1.16
+; CHECK: vst1.16
+ %0 = getelementptr inbounds %struct.int16x8_t* %vT0ptr, i32 0, i32 0 ; <<8 x i16>*> [#uses=1]
+ %1 = load <8 x i16>* %0, align 16 ; <<8 x i16>> [#uses=1]
+ %2 = getelementptr inbounds %struct.int16x8_t* %vT1ptr, i32 0, i32 0 ; <<8 x i16>*> [#uses=1]
+ %3 = load <8 x i16>* %2, align 16 ; <<8 x i16>> [#uses=1]
+ %4 = bitcast i16* %i_ptr to i8* ; <i8*> [#uses=1]
+ %5 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %4) ; <<8 x i16>> [#uses=1]
+ %6 = getelementptr inbounds i16* %i_ptr, i32 8 ; <i16*> [#uses=1]
+ %7 = bitcast i16* %6 to i8* ; <i8*> [#uses=1]
+ %8 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %7) ; <<8 x i16>> [#uses=1]
+ %9 = mul <8 x i16> %1, %5 ; <<8 x i16>> [#uses=1]
+ %10 = mul <8 x i16> %3, %8 ; <<8 x i16>> [#uses=1]
+ %11 = bitcast i16* %o_ptr to i8* ; <i8*> [#uses=1]
+ tail call void @llvm.arm.neon.vst1.v8i16(i8* %11, <8 x i16> %9)
+ %12 = getelementptr inbounds i16* %o_ptr, i32 8 ; <i16*> [#uses=1]
+ %13 = bitcast i16* %12 to i8* ; <i8*> [#uses=1]
+ tail call void @llvm.arm.neon.vst1.v8i16(i8* %13, <8 x i16> %10)
+ ret void
+}
+
+define <8 x i8> @t3(i8* %A, i8* %B) nounwind {
+; CHECK: t3:
+; CHECK: vld3.8
+; CHECK: vmul.i8
+; CHECK-NOT: vmov
+; CHECK: vst3.8
+ %tmp1 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A) ; <%struct.__neon_int8x8x3_t> [#uses=2]
+ %tmp2 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 0 ; <<8 x i8>> [#uses=1]
+ %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 2 ; <<8 x i8>> [#uses=1]
+ %tmp4 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 1 ; <<8 x i8>> [#uses=1]
+ %tmp5 = sub <8 x i8> %tmp3, %tmp4
+ %tmp6 = add <8 x i8> %tmp2, %tmp3 ; <<8 x i8>> [#uses=1]
+ %tmp7 = mul <8 x i8> %tmp4, %tmp2
+ tail call void @llvm.arm.neon.vst3.v8i8(i8* %B, <8 x i8> %tmp5, <8 x i8> %tmp6, <8 x i8> %tmp7)
+ ret <8 x i8> %tmp4
+}
+
+define arm_apcscc void @t4(i32* %in, i32* %out) nounwind {
+entry:
+; CHECK: t4:
+; CHECK: vld2.32
+; CHECK-NOT: vmov
+; CHECK: vld2.32
+; CHECK-NOT: vmov
+; CHECK: bne
+ %tmp1 = bitcast i32* %in to i8* ; <i8*> [#uses=1]
+ %tmp2 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %tmp1) ; <%struct.__neon_int32x4x2_t> [#uses=2]
+ %tmp3 = getelementptr inbounds i32* %in, i32 8 ; <i32*> [#uses=1]
+ %tmp4 = bitcast i32* %tmp3 to i8* ; <i8*> [#uses=1]
+ %tmp5 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %tmp4) ; <%struct.__neon_int32x4x2_t> [#uses=2]
+ %tmp8 = bitcast i32* %out to i8* ; <i8*> [#uses=1]
+ br i1 undef, label %return1, label %return2
+
+return1:
+; CHECK: %return1
+; CHECK-NOT: vmov
+; CHECK-NEXT: vadd.i32
+; CHECK-NEXT: vadd.i32
+; CHECK-NEXT: vst2.32
+ %tmp52 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0 ; <<4 x i32>> [#uses=1]
+ %tmp57 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 1 ; <<4 x i32>> [#uses=1]
+ %tmp = extractvalue %struct.__neon_int32x4x2_t %tmp5, 0 ; <<4 x i32>> [#uses=1]
+ %tmp39 = extractvalue %struct.__neon_int32x4x2_t %tmp5, 1 ; <<4 x i32>> [#uses=1]
+ %tmp6 = add <4 x i32> %tmp52, %tmp ; <<4 x i32>> [#uses=1]
+ %tmp7 = add <4 x i32> %tmp57, %tmp39 ; <<4 x i32>> [#uses=1]
+ tail call void @llvm.arm.neon.vst2.v4i32(i8* %tmp8, <4 x i32> %tmp6, <4 x i32> %tmp7)
+ ret void
+
+return2:
+; CHECK: %return2
+; CHECK: vadd.i32
+; CHECK: vmov q1, q3
+; CHECK-NOT: vmov
+; CHECK: vst2.32 {d0, d1, d2, d3}
+ %tmp100 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0 ; <<4 x i32>> [#uses=1]
+ %tmp101 = extractvalue %struct.__neon_int32x4x2_t %tmp5, 1 ; <<4 x i32>> [#uses=1]
+ %tmp102 = add <4 x i32> %tmp100, %tmp101 ; <<4 x i32>> [#uses=1]
+ tail call void @llvm.arm.neon.vst2.v4i32(i8* %tmp8, <4 x i32> %tmp102, <4 x i32> %tmp101)
+ call void @llvm.trap()
+ unreachable
+}
+
+define <8 x i16> @t5(i16* %A, <8 x i16>* %B) nounwind {
+; CHECK: t5:
+; CHECK: vldmia
+; CHECK: vmov q1, q0
+; CHECK-NOT: vmov
+; CHECK: vld2.16 {d0[1], d2[1]}, [r0]
+; CHECK-NOT: vmov
+; CHECK: vadd.i16
+ %tmp0 = bitcast i16* %A to i8* ; <i8*> [#uses=1]
+ %tmp1 = load <8 x i16>* %B ; <<8 x i16>> [#uses=2]
+ %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1) ; <%struct.__neon_int16x8x2_t> [#uses=2]
+ %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 0 ; <<8 x i16>> [#uses=1]
+ %tmp4 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 1 ; <<8 x i16>> [#uses=1]
+ %tmp5 = add <8 x i16> %tmp3, %tmp4 ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %tmp5
+}
+
+define <8 x i8> @t6(i8* %A, <8 x i8>* %B) nounwind {
+; CHECK: t6:
+; CHECK: vldr.64
+; CHECK: vmov d1, d0
+; CHECK-NEXT: vld2.8 {d0[1], d1[1]}
+ %tmp1 = load <8 x i8>* %B ; <<8 x i8>> [#uses=2]
+ %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1) ; <%struct.__neon_int8x8x2_t> [#uses=2]
+ %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0 ; <<8 x i8>> [#uses=1]
+ %tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1 ; <<8 x i8>> [#uses=1]
+ %tmp5 = add <8 x i8> %tmp3, %tmp4 ; <<8 x i8>> [#uses=1]
+ ret <8 x i8> %tmp5
+}
+
+define arm_apcscc void @t7(i32* %iptr, i32* %optr) nounwind {
+entry:
+; CHECK: t7:
+; CHECK: vld2.32
+; CHECK: vst2.32
+; CHECK: vld1.32 {d0, d1},
+; CHECK: vmov q1, q0
+; CHECK-NOT: vmov
+; CHECK: vuzp.32 q0, q1
+; CHECK: vst1.32
+ %0 = bitcast i32* %iptr to i8* ; <i8*> [#uses=2]
+ %1 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %0) ; <%struct.__neon_int32x4x2_t> [#uses=2]
+ %tmp57 = extractvalue %struct.__neon_int32x4x2_t %1, 0 ; <<4 x i32>> [#uses=1]
+ %tmp60 = extractvalue %struct.__neon_int32x4x2_t %1, 1 ; <<4 x i32>> [#uses=1]
+ %2 = bitcast i32* %optr to i8* ; <i8*> [#uses=2]
+ tail call void @llvm.arm.neon.vst2.v4i32(i8* %2, <4 x i32> %tmp57, <4 x i32> %tmp60)
+ %3 = tail call <4 x i32> @llvm.arm.neon.vld1.v4i32(i8* %0) ; <<4 x i32>> [#uses=1]
+ %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2> ; <<4 x i32>> [#uses=1]
+ tail call void @llvm.arm.neon.vst1.v4i32(i8* %2, <4 x i32> %4)
+ ret void
+}
+
+; PR7156
+define arm_aapcs_vfpcc i32 @t8() nounwind {
+; CHECK: t8:
+; CHECK: vrsqrte.f32 q0, q0
+bb.nph55.bb.nph55.split_crit_edge:
+ br label %bb3
+
+bb3: ; preds = %bb3, %bb.nph55.bb.nph55.split_crit_edge
+ br i1 undef, label %bb5, label %bb3
+
+bb5: ; preds = %bb3
+ br label %bb.i25
+
+bb.i25: ; preds = %bb.i25, %bb5
+ %0 = shufflevector <2 x float> undef, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; <<4 x float>> [#uses=1]
+ %1 = call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %0) nounwind ; <<4 x float>> [#uses=1]
+ %2 = fmul <4 x float> %1, undef ; <<4 x float>> [#uses=1]
+ %3 = fmul <4 x float> undef, %2 ; <<4 x float>> [#uses=1]
+ %tmp26.i = bitcast <4 x float> %3 to <2 x double> ; <<2 x double>> [#uses=1]
+ %4 = extractelement <2 x double> %tmp26.i, i32 0 ; <double> [#uses=1]
+ %5 = bitcast double %4 to <2 x float> ; <<2 x float>> [#uses=1]
+ %6 = extractelement <2 x float> %5, i32 1 ; <float> [#uses=1]
+ store float %6, float* undef, align 4
+ br i1 undef, label %bb6, label %bb.i25
+
+bb6: ; preds = %bb.i25
+ br i1 undef, label %bb7, label %bb14
+
+bb7: ; preds = %bb6
+ br label %bb.i49
+
+bb.i49: ; preds = %bb.i49, %bb7
+ br i1 undef, label %bb.i19, label %bb.i49
+
+bb.i19: ; preds = %bb.i19, %bb.i49
+ br i1 undef, label %exit, label %bb.i19
+
+exit: ; preds = %bb.i19
+ unreachable
+
+bb14: ; preds = %bb6
+ ret i32 0
+}
+
+%0 = type { %1, %1, %1, %1 }
+%1 = type { %2 }
+%2 = type { <4 x float> }
+%3 = type { %0, %1 }
+
+; PR7157
+define arm_aapcs_vfpcc float @t9(%0* nocapture, %3* nocapture) nounwind {
+; CHECK: t9:
+; CHECK: vldr.64
+; CHECK: vmov.i8 d1
+; CHECK-NEXT: vstmia r0, {d2,d3}
+; CHECK-NEXT: vstmia r0, {d0,d1}
+ %3 = bitcast double 0.000000e+00 to <2 x float> ; <<2 x float>> [#uses=2]
+ %4 = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; <<4 x float>> [#uses=1]
+ store <4 x float> %4, <4 x float>* undef, align 16
+ %5 = shufflevector <2 x float> %3, <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; <<4 x float>> [#uses=1]
+ store <4 x float> %5, <4 x float>* undef, align 16
+ br label %8
+
+; <label>:6 ; preds = %8
+ br i1 undef, label %7, label %10
+
+; <label>:7 ; preds = %6
+ br label %8
+
+; <label>:8 ; preds = %7, %2
+ br i1 undef, label %6, label %9
+
+; <label>:9 ; preds = %8
+ ret float undef
+
+; <label>:10 ; preds = %6
+ ret float 9.990000e+02
+}
+
+; PR7162
+define arm_aapcs_vfpcc i32 @t10() nounwind {
+entry:
+; CHECK: t10:
+; CHECK: vmov.i32 q1, #0x3F000000
+; CHECK: vdup.32 q0, d0[0]
+; CHECK: vmov d0, d1
+; CHECK: vmla.f32 q0, q0, d0[0]
+ %0 = shufflevector <4 x float> zeroinitializer, <4 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
+ %1 = insertelement <4 x float> %0, float undef, i32 1 ; <<4 x float>> [#uses=1]
+ %2 = insertelement <4 x float> %1, float undef, i32 2 ; <<4 x float>> [#uses=1]
+ %3 = insertelement <4 x float> %2, float undef, i32 3 ; <<4 x float>> [#uses=1]
+ %tmp54.i = bitcast <4 x float> %3 to <2 x double> ; <<2 x double>> [#uses=1]
+ %4 = extractelement <2 x double> %tmp54.i, i32 1 ; <double> [#uses=1]
+ %5 = bitcast double %4 to <2 x float> ; <<2 x float>> [#uses=1]
+ %6 = shufflevector <2 x float> %5, <2 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
+ %7 = fmul <4 x float> undef, %6 ; <<4 x float>> [#uses=1]
+ %8 = fadd <4 x float> %7, undef ; <<4 x float>> [#uses=1]
+ %9 = fadd <4 x float> %8, undef ; <<4 x float>> [#uses=1]
+ %10 = shufflevector <4 x float> undef, <4 x float> %9, <4 x i32> <i32 0, i32 1, i32 2, i32 7> ; <<4 x float>> [#uses=1]
+ %11 = fmul <4 x float> %10, <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> ; <<4 x float>> [#uses=1]
+ %12 = shufflevector <4 x float> %11, <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef> ; <<4 x float>> [#uses=1]
+ %13 = shufflevector <4 x float> %12, <4 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
+ %14 = fmul <4 x float> %13, undef ; <<4 x float>> [#uses=1]
+ %15 = fadd <4 x float> undef, %14 ; <<4 x float>> [#uses=1]
+ %16 = shufflevector <4 x float> undef, <4 x float> %15, <4 x i32> <i32 0, i32 1, i32 6, i32 3> ; <<4 x float>> [#uses=1]
+ %17 = fmul <4 x float> %16, undef ; <<4 x float>> [#uses=1]
+ %18 = extractelement <4 x float> %17, i32 2 ; <float> [#uses=1]
+ store float %18, float* undef, align 4
+ br i1 undef, label %exit, label %bb14
+
+exit: ; preds = %bb.i19
+ unreachable
+
+bb14: ; preds = %bb6
+ ret i32 0
+}
+
+; This test crashes the coalescer because live variables were not updated properly.
+define <8 x i8> @t11(i8* %A1, i8* %A2, i8* %A3, i8* %A4, i8* %A5, i8* %A6, i8* %A7, i8* %A8, i8* %B) nounwind {
+ %tmp1d = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A4) ; <%struct.__neon_int8x8x3_t> [#uses=1]
+ %tmp2d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 0 ; <<8 x i8>> [#uses=1]
+ %tmp1f = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A6) ; <%struct.__neon_int8x8x3_t> [#uses=1]
+ %tmp2f = extractvalue %struct.__neon_int8x8x3_t %tmp1f, 0 ; <<8 x i8>> [#uses=1]
+ %tmp2bd = add <8 x i8> zeroinitializer, %tmp2d ; <<8 x i8>> [#uses=1]
+ %tmp2abcd = mul <8 x i8> zeroinitializer, %tmp2bd ; <<8 x i8>> [#uses=1]
+ %tmp2ef = sub <8 x i8> zeroinitializer, %tmp2f ; <<8 x i8>> [#uses=1]
+ %tmp2efgh = mul <8 x i8> %tmp2ef, undef ; <<8 x i8>> [#uses=2]
+ call void @llvm.arm.neon.vst3.v8i8(i8* %A2, <8 x i8> undef, <8 x i8> undef, <8 x i8> %tmp2efgh)
+ %tmp2 = sub <8 x i8> %tmp2efgh, %tmp2abcd ; <<8 x i8>> [#uses=1]
+ %tmp7 = mul <8 x i8> undef, %tmp2 ; <<8 x i8>> [#uses=1]
+ tail call void @llvm.arm.neon.vst3.v8i8(i8* %B, <8 x i8> undef, <8 x i8> undef, <8 x i8> %tmp7)
+ ret <8 x i8> undef
+}
+
+declare <4 x i32> @llvm.arm.neon.vld1.v4i32(i8*) nounwind readonly
+
+declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*) nounwind readonly
+
+declare <4 x i32> @llvm.arm.neon.vmovls.v4i32(<4 x i16>) nounwind readnone
+
+declare <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare void @llvm.arm.neon.vst1.v4i32(i8*, <4 x i32>) nounwind
+
+declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>) nounwind
+
+declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>) nounwind
+
+declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8*) nounwind readonly
+
+declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8*) nounwind readonly
+
+declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind readonly
+
+declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32) nounwind readonly
+
+declare void @llvm.arm.neon.vst2.v4i32(i8*, <4 x i32>, <4 x i32>) nounwind
+
+declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone
+
+declare void @llvm.trap() nounwind
diff --git a/test/CodeGen/ARM/spill-q.ll b/test/CodeGen/ARM/spill-q.ll
index 5ad7ecc..03de0c8 100644
--- a/test/CodeGen/ARM/spill-q.ll
+++ b/test/CodeGen/ARM/spill-q.ll
@@ -46,7 +46,8 @@ bb4: ; preds = %bb193, %entry
%20 = shufflevector <2 x float> %19, <2 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
%21 = fadd <4 x float> zeroinitializer, %20 ; <<4 x float>> [#uses=2]
%22 = fcmp ogt <4 x float> %besterror.0.2264, %21 ; <<4 x i1>> [#uses=0]
- br i1 undef, label %bb193, label %bb186
+ %tmp = extractelement <4 x i1> %22, i32 0
+ br i1 %tmp, label %bb193, label %bb186
bb186: ; preds = %bb4
br label %bb193
diff --git a/test/CodeGen/ARM/trap.ll b/test/CodeGen/ARM/trap.ll
new file mode 100644
index 0000000..763dff3
--- /dev/null
+++ b/test/CodeGen/ARM/trap.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -march=arm | FileCheck %s
+; rdar://7961298
+
+define arm_apcscc void @t() nounwind {
+entry:
+; CHECK: t:
+; CHECK: trap
+ call void @llvm.trap()
+ unreachable
+}
+
+declare void @llvm.trap() nounwind
diff --git a/test/CodeGen/ARM/vcgt.ll b/test/CodeGen/ARM/vcgt.ll
index 6b11ba5..194093c 100644
--- a/test/CodeGen/ARM/vcgt.ll
+++ b/test/CodeGen/ARM/vcgt.ll
@@ -158,5 +158,18 @@ define <4 x i32> @vacgtQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
ret <4 x i32> %tmp3
}
+; rdar://7923010
+define <4 x i32> @vcgt_zext(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK: vcgt_zext:
+;CHECK: vcgt.f32 q0
+;CHECK: vmov.i32 q1, #0x1
+;CHECK: vand q0, q0, q1
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = fcmp ogt <4 x float> %tmp1, %tmp2
+ %tmp4 = zext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
declare <2 x i32> @llvm.arm.neon.vacgtd(<2 x float>, <2 x float>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vacgtq(<4 x float>, <4 x float>) nounwind readnone