Diffstat (limited to 'test/CodeGen/ARM')
29 files changed, 1419 insertions, 28 deletions
diff --git a/test/CodeGen/ARM/2008-02-04-LocalRegAllocBug.ll b/test/CodeGen/ARM/2008-02-04-LocalRegAllocBug.ll index ff01506..f775c61 100644 --- a/test/CodeGen/ARM/2008-02-04-LocalRegAllocBug.ll +++ b/test/CodeGen/ARM/2008-02-04-LocalRegAllocBug.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -mtriple=arm-linux-gnueabi -regalloc=local +; RUN: llc < %s -mtriple=arm-linux-gnueabi -regalloc=fast ; PR1925 %struct.encode_aux_nearestmatch = type { i32*, i32*, i32*, i32*, i32, i32 } diff --git a/test/CodeGen/ARM/2008-02-29-RegAllocLocal.ll b/test/CodeGen/ARM/2008-02-29-RegAllocLocal.ll index 06bc987..8ef8c7b 100644 --- a/test/CodeGen/ARM/2008-02-29-RegAllocLocal.ll +++ b/test/CodeGen/ARM/2008-02-29-RegAllocLocal.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -mtriple=arm-apple-darwin -regalloc=local +; RUN: llc < %s -mtriple=arm-apple-darwin -regalloc=fast ; PR1925 %"struct.kc::impl_Ccode_option" = type { %"struct.kc::impl_abstract_phylum" } diff --git a/test/CodeGen/ARM/2009-05-05-DAGCombineBug.ll b/test/CodeGen/ARM/2009-05-05-DAGCombineBug.ll index 670d204..a48e41f 100644 --- a/test/CodeGen/ARM/2009-05-05-DAGCombineBug.ll +++ b/test/CodeGen/ARM/2009-05-05-DAGCombineBug.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=arm-linuxeabi-unknown-gnu -mattr=+v6 +; RUN: llc < %s -mtriple=arm-unknown-linux-gnueabi -mattr=+v6 ; PR4166 %"byte[]" = type { i32, i8* } diff --git a/test/CodeGen/ARM/2009-05-07-RegAllocLocal.ll b/test/CodeGen/ARM/2009-05-07-RegAllocLocal.ll index 75610ff..912e6f9 100644 --- a/test/CodeGen/ARM/2009-05-07-RegAllocLocal.ll +++ b/test/CodeGen/ARM/2009-05-07-RegAllocLocal.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -mtriple=armv5-unknown-linux-gnueabi -O0 -regalloc=local +; RUN: llc < %s -mtriple=armv5-unknown-linux-gnueabi -O0 -regalloc=fast ; PR4100 @.str = external constant [30 x i8] ; <[30 x i8]*> [#uses=1] diff --git a/test/CodeGen/ARM/2009-11-02-NegativeLane.ll b/test/CodeGen/ARM/2009-11-02-NegativeLane.ll index f2288c3..89c9037 100644 --- a/test/CodeGen/ARM/2009-11-02-NegativeLane.ll +++ b/test/CodeGen/ARM/2009-11-02-NegativeLane.ll @@ -1,4 +1,4 @@ -; RUN: llc -mcpu=cortex-a8 < %s | grep vdup.32 +; RUN: llc -mcpu=cortex-a8 < %s | grep vdup.16 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64" target triple = "armv7-eabi" @@ -7,12 +7,12 @@ entry: br i1 undef, label %return, label %bb bb: ; preds = %bb, %entry - %0 = load float* undef, align 4 ; <float> [#uses=1] - %1 = insertelement <4 x float> undef, float %0, i32 2 ; <<4 x float>> [#uses=1] - %2 = insertelement <4 x float> %1, float undef, i32 3 ; <<4 x float>> [#uses=1] - %3 = fmul <4 x float> undef, %2 ; <<4 x float>> [#uses=1] - %4 = extractelement <4 x float> %3, i32 1 ; <float> [#uses=1] - store float %4, float* undef, align 4 + %0 = load i16* undef, align 2 + %1 = insertelement <8 x i16> undef, i16 %0, i32 2 + %2 = insertelement <8 x i16> %1, i16 undef, i32 3 + %3 = mul <8 x i16> %2, %2 + %4 = extractelement <8 x i16> %3, i32 2 + store i16 %4, i16* undef, align 2 br i1 undef, label %return, label %bb return: ; preds = %bb, %entry diff --git a/test/CodeGen/ARM/2010-05-14-IllegalType.ll b/test/CodeGen/ARM/2010-05-14-IllegalType.ll new file mode 100644 index 0000000..99e5b09 --- /dev/null +++ b/test/CodeGen/ARM/2010-05-14-IllegalType.ll @@ -0,0 +1,10 @@ +; RUN: llc -march=thumb -mcpu=cortex-a8 -mtriple=thumbv7-eabi -float-abi=hard < %s | FileCheck %s + +target datalayout = 
"e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32-n32" +target triple = "thumbv7-apple-darwin10" + +define <4 x i64> @f_4_i64(<4 x i64> %a, <4 x i64> %b) nounwind { +; CHECK: vadd.i64 + %y = add <4 x i64> %a, %b + ret <4 x i64> %y +} diff --git a/test/CodeGen/ARM/2010-05-17-DAGCombineAssert.ll b/test/CodeGen/ARM/2010-05-17-DAGCombineAssert.ll new file mode 100644 index 0000000..2a4bbd1 --- /dev/null +++ b/test/CodeGen/ARM/2010-05-17-DAGCombineAssert.ll @@ -0,0 +1,17 @@ +; RUN: llc < %s -mtriple=armv7-eabi -mcpu=cortex-a8 +; PR7158 + +define arm_aapcs_vfpcc i32 @main() nounwind { +bb.nph55.bb.nph55.split_crit_edge: + br label %bb3 + +bb3: ; preds = %bb3, %bb.nph55.bb.nph55.split_crit_edge + br i1 undef, label %bb.i19, label %bb3 + +bb.i19: ; preds = %bb.i19, %bb3 + %0 = insertelement <4 x float> undef, float undef, i32 3 ; <<4 x float>> [#uses=3] + %1 = fmul <4 x float> %0, %0 ; <<4 x float>> [#uses=1] + %2 = bitcast <4 x float> %1 to <2 x double> ; <<2 x double>> [#uses=0] + %3 = fmul <4 x float> %0, undef ; <<4 x float>> [#uses=0] + br label %bb.i19 +} diff --git a/test/CodeGen/ARM/2010-05-17-FastAllocCrash.ll b/test/CodeGen/ARM/2010-05-17-FastAllocCrash.ll new file mode 100644 index 0000000..813bf3c --- /dev/null +++ b/test/CodeGen/ARM/2010-05-17-FastAllocCrash.ll @@ -0,0 +1,105 @@ +; RUN: llc < %s -regalloc=fast -verify-machineinstrs +target triple = "arm-pc-linux-gnu" + +; This test case would accidentally use the same physreg for two virtregs +; because allocVirtReg forgot to check if registers were already used in the +; instruction. +; This caused the RegScavenger to complain, but -verify-machineinstrs also +; catches it. + +%struct.CHESS_POSITION = type { i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i32, i32, i8, i8, [64 x i8], i8, i8, i8, i8, i8 } + +@search = external global %struct.CHESS_POSITION ; <%struct.CHESS_POSITION*> [#uses=1] +@bishop_mobility_rr45 = external global [64 x [256 x i32]] ; <[64 x [256 x i32]]*> [#uses=1] + +declare fastcc i32 @FirstOne() + +define fastcc void @Evaluate() { +entry: + br i1 false, label %cond_false186, label %cond_true + +cond_true: ; preds = %entry + ret void + +cond_false186: ; preds = %entry + br i1 false, label %cond_true293, label %bb203 + +bb203: ; preds = %cond_false186 + ret void + +cond_true293: ; preds = %cond_false186 + br i1 false, label %cond_true298, label %cond_next317 + +cond_true298: ; preds = %cond_true293 + br i1 false, label %cond_next518, label %cond_true397.preheader + +cond_next317: ; preds = %cond_true293 + ret void + +cond_true397.preheader: ; preds = %cond_true298 + ret void + +cond_next518: ; preds = %cond_true298 + br i1 false, label %bb1069, label %cond_true522 + +cond_true522: ; preds = %cond_next518 + ret void + +bb1069: ; preds = %cond_next518 + br i1 false, label %cond_next1131, label %bb1096 + +bb1096: ; preds = %bb1069 + ret void + +cond_next1131: ; preds = %bb1069 + br i1 false, label %cond_next1207, label %cond_true1150 + +cond_true1150: ; preds = %cond_next1131 + ret void + +cond_next1207: ; preds = %cond_next1131 + br i1 false, label %cond_next1219, label %cond_true1211 + +cond_true1211: ; preds = %cond_next1207 + ret void + +cond_next1219: ; preds = %cond_next1207 + br i1 false, label %cond_true1223, label %cond_next1283 + +cond_true1223: ; preds = %cond_next1219 + br i1 false, label %cond_true1254, label %cond_true1264 + +cond_true1254: ; preds = %cond_true1223 + br i1 false, label %bb1567, label 
%cond_true1369.preheader + +cond_true1264: ; preds = %cond_true1223 + ret void + +cond_next1283: ; preds = %cond_next1219 + ret void + +cond_true1369.preheader: ; preds = %cond_true1254 + ret void + +bb1567: ; preds = %cond_true1254 + %tmp1591 = load i64* getelementptr inbounds (%struct.CHESS_POSITION* @search, i32 0, i32 4) ; <i64> [#uses=1] + %tmp1572 = tail call fastcc i32 @FirstOne() ; <i32> [#uses=1] + %tmp1594 = load i32* undef ; <i32> [#uses=1] + %tmp1594.upgrd.5 = trunc i32 %tmp1594 to i8 ; <i8> [#uses=1] + %shift.upgrd.6 = zext i8 %tmp1594.upgrd.5 to i64 ; <i64> [#uses=1] + %tmp1595 = lshr i64 %tmp1591, %shift.upgrd.6 ; <i64> [#uses=1] + %tmp1595.upgrd.7 = trunc i64 %tmp1595 to i32 ; <i32> [#uses=1] + %tmp1596 = and i32 %tmp1595.upgrd.7, 255 ; <i32> [#uses=1] + %gep.upgrd.8 = zext i32 %tmp1596 to i64 ; <i64> [#uses=1] + %tmp1598 = getelementptr [64 x [256 x i32]]* @bishop_mobility_rr45, i32 0, i32 %tmp1572, i64 %gep.upgrd.8 ; <i32*> [#uses=1] + %tmp1599 = load i32* %tmp1598 ; <i32> [#uses=1] + %tmp1602 = sub i32 0, %tmp1599 ; <i32> [#uses=1] + br i1 undef, label %cond_next1637, label %cond_true1607 + +cond_true1607: ; preds = %bb1567 + ret void + +cond_next1637: ; preds = %bb1567 + %tmp1662 = sub i32 %tmp1602, 0 ; <i32> [#uses=0] + ret void +} diff --git a/test/CodeGen/ARM/2010-05-18-LocalAllocCrash.ll b/test/CodeGen/ARM/2010-05-18-LocalAllocCrash.ll new file mode 100644 index 0000000..b158afd --- /dev/null +++ b/test/CodeGen/ARM/2010-05-18-LocalAllocCrash.ll @@ -0,0 +1,37 @@ +; RUN: llc < %s -O0 -verify-machineinstrs -regalloc=local +; RUN: llc < %s -O0 -verify-machineinstrs -regalloc=fast +; rdar://problem/7948106 +;; This test would spill %R4 before the call to zz, but it forgot to move the +; 'last use' marker to the spill. + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:64-n32" +target triple = "armv6-apple-darwin" + +%struct.q = type { i32, i32 } + +@.str = external constant [1 x i8] ; <[1 x i8]*> [#uses=1] + +define arm_apcscc void @yy(%struct.q* %qq) nounwind { +entry: + %vla6 = alloca i8, i32 undef, align 1 ; <i8*> [#uses=1] + %vla10 = alloca i8, i32 undef, align 1 ; <i8*> [#uses=1] + %vla14 = alloca i8, i32 undef, align 1 ; <i8*> [#uses=1] + %vla18 = alloca i8, i32 undef, align 1 ; <i8*> [#uses=1] + %tmp21 = load i32* undef ; <i32> [#uses=1] + %0 = mul i32 1, %tmp21 ; <i32> [#uses=1] + %vla22 = alloca i8, i32 %0, align 1 ; <i8*> [#uses=1] + call arm_apcscc void (...)* @zz(i8* getelementptr inbounds ([1 x i8]* @.str, i32 0, i32 0), i32 2, i32 1) + br i1 undef, label %if.then, label %if.end36 + +if.then: ; preds = %entry + %call = call arm_apcscc i32 (...)* @x(%struct.q* undef, i8* undef, i8* %vla6, i8* %vla10, i32 undef) ; <i32> [#uses=0] + %call35 = call arm_apcscc i32 (...)* @x(%struct.q* undef, i8* %vla14, i8* %vla18, i8* %vla22, i32 undef) ; <i32> [#uses=0] + unreachable + +if.end36: ; preds = %entry + ret void +} + +declare arm_apcscc void @zz(...) + +declare arm_apcscc i32 @x(...) 
diff --git a/test/CodeGen/ARM/2010-05-18-PostIndexBug.ll b/test/CodeGen/ARM/2010-05-18-PostIndexBug.ll new file mode 100644 index 0000000..9907228 --- /dev/null +++ b/test/CodeGen/ARM/2010-05-18-PostIndexBug.ll @@ -0,0 +1,25 @@ +; RUN: llc < %s -mtriple=armv7-apple-darwin | FileCheck %s -check-prefix=ARM +; RUN: llc < %s -mtriple=thumbv7-apple-darwin | FileCheck %s -check-prefix=THUMB +; rdar://7998649 + +%struct.foo = type { i64, i64 } + +define arm_apcscc zeroext i8 @t(%struct.foo* %this) noreturn optsize { +entry: +; ARM: t: +; ARM: str r0, [r1], r0 + +; THUMB: t: +; THUMB-NOT: str r0, [r1], r0 +; THUMB: str r0, [r1] + %0 = getelementptr inbounds %struct.foo* %this, i32 0, i32 1 ; <i64*> [#uses=1] + store i32 undef, i32* inttoptr (i32 8 to i32*), align 8 + br i1 undef, label %bb.nph96, label %bb3 + +bb3: ; preds = %entry + %1 = load i64* %0, align 4 ; <i64> [#uses=0] + unreachable + +bb.nph96: ; preds = %entry + unreachable +} diff --git a/test/CodeGen/ARM/2010-05-19-Shuffles.ll b/test/CodeGen/ARM/2010-05-19-Shuffles.ll new file mode 100644 index 0000000..587c0af --- /dev/null +++ b/test/CodeGen/ARM/2010-05-19-Shuffles.ll @@ -0,0 +1,21 @@ +; RUN: llc < %s -mtriple=armv7-eabi -mcpu=cortex-a8 +; pr7167 + +define <8 x i8> @f1(<8 x i8> %x) nounwind { + %y = shufflevector <8 x i8> %x, <8 x i8> undef, + <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5> + ret <8 x i8> %y +} + +define <8 x i8> @f2(<8 x i8> %x) nounwind { + %y = shufflevector <8 x i8> %x, <8 x i8> undef, + <8 x i32> <i32 1, i32 2, i32 0, i32 5, i32 3, i32 6, i32 7, i32 4> + ret <8 x i8> %y +} + +define void @f3(<4 x i64>* %xp) nounwind { + %x = load <4 x i64>* %xp + %y = shufflevector <4 x i64> %x, <4 x i64> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 1> + store <4 x i64> %y, <4 x i64>* %xp + ret void +} diff --git a/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll b/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll new file mode 100644 index 0000000..b6fbf9b --- /dev/null +++ b/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll @@ -0,0 +1,45 @@ +; RUN: llc < %s -march=arm -mattr=+neon -O0 + +; This test would crash the rewriter when trying to handle a spill after one of +; the @llvm.arm.neon.vld3.v8i8 defined three parts of a register. 
+ +%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> } + +declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8*) nounwind readonly + +declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>) nounwind + +define <8 x i8> @t3(i8* %A1, i8* %A2, i8* %A3, i8* %A4, i8* %A5, i8* %A6, i8* %A7, i8* %A8, i8* %B) nounwind { + %tmp1b = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A2) ; <%struct.__neon_int8x8x3_t> [#uses=2] + %tmp2b = extractvalue %struct.__neon_int8x8x3_t %tmp1b, 0 ; <<8 x i8>> [#uses=1] + %tmp4b = extractvalue %struct.__neon_int8x8x3_t %tmp1b, 1 ; <<8 x i8>> [#uses=1] + %tmp1d = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A4) ; <%struct.__neon_int8x8x3_t> [#uses=2] + %tmp2d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 0 ; <<8 x i8>> [#uses=1] + %tmp4d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 1 ; <<8 x i8>> [#uses=1] + %tmp1e = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A5) ; <%struct.__neon_int8x8x3_t> [#uses=1] + %tmp2e = extractvalue %struct.__neon_int8x8x3_t %tmp1e, 0 ; <<8 x i8>> [#uses=1] + %tmp1f = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A6) ; <%struct.__neon_int8x8x3_t> [#uses=1] + %tmp2f = extractvalue %struct.__neon_int8x8x3_t %tmp1f, 0 ; <<8 x i8>> [#uses=1] + %tmp1g = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A7) ; <%struct.__neon_int8x8x3_t> [#uses=2] + %tmp2g = extractvalue %struct.__neon_int8x8x3_t %tmp1g, 0 ; <<8 x i8>> [#uses=1] + %tmp4g = extractvalue %struct.__neon_int8x8x3_t %tmp1g, 1 ; <<8 x i8>> [#uses=1] + %tmp1h = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A8) ; <%struct.__neon_int8x8x3_t> [#uses=2] + %tmp2h = extractvalue %struct.__neon_int8x8x3_t %tmp1h, 0 ; <<8 x i8>> [#uses=1] + %tmp3h = extractvalue %struct.__neon_int8x8x3_t %tmp1h, 2 ; <<8 x i8>> [#uses=1] + %tmp2bd = add <8 x i8> %tmp2b, %tmp2d ; <<8 x i8>> [#uses=1] + %tmp4bd = add <8 x i8> %tmp4b, %tmp4d ; <<8 x i8>> [#uses=1] + %tmp2abcd = mul <8 x i8> undef, %tmp2bd ; <<8 x i8>> [#uses=1] + %tmp4abcd = mul <8 x i8> undef, %tmp4bd ; <<8 x i8>> [#uses=2] + call void @llvm.arm.neon.vst3.v8i8(i8* %A1, <8 x i8> %tmp4abcd, <8 x i8> zeroinitializer, <8 x i8> %tmp2abcd) + %tmp2ef = sub <8 x i8> %tmp2e, %tmp2f ; <<8 x i8>> [#uses=1] + %tmp2gh = sub <8 x i8> %tmp2g, %tmp2h ; <<8 x i8>> [#uses=1] + %tmp3gh = sub <8 x i8> zeroinitializer, %tmp3h ; <<8 x i8>> [#uses=1] + %tmp4ef = sub <8 x i8> zeroinitializer, %tmp4g ; <<8 x i8>> [#uses=1] + %tmp2efgh = mul <8 x i8> %tmp2ef, %tmp2gh ; <<8 x i8>> [#uses=1] + %tmp3efgh = mul <8 x i8> undef, %tmp3gh ; <<8 x i8>> [#uses=1] + %tmp4efgh = mul <8 x i8> %tmp4ef, undef ; <<8 x i8>> [#uses=2] + call void @llvm.arm.neon.vst3.v8i8(i8* %A2, <8 x i8> %tmp4efgh, <8 x i8> %tmp3efgh, <8 x i8> %tmp2efgh) + %tmp4 = sub <8 x i8> %tmp4efgh, %tmp4abcd ; <<8 x i8>> [#uses=1] + tail call void @llvm.arm.neon.vst3.v8i8(i8* %B, <8 x i8> zeroinitializer, <8 x i8> undef, <8 x i8> undef) + ret <8 x i8> %tmp4 +} diff --git a/test/CodeGen/ARM/2010-05-21-BuildVector.ll b/test/CodeGen/ARM/2010-05-21-BuildVector.ll new file mode 100644 index 0000000..6b19490 --- /dev/null +++ b/test/CodeGen/ARM/2010-05-21-BuildVector.ll @@ -0,0 +1,43 @@ +; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s +; Radar 7872877 + +define arm_apcscc void @test(float* %fltp, i32 %packedValue, float* %table) nounwind { +entry: + %0 = load float* %fltp + %1 = insertelement <4 x float> undef, float %0, i32 0 + %2 = shufflevector <4 x float> %1, <4 x float> undef, 
<4 x i32> zeroinitializer + %3 = shl i32 %packedValue, 16 + %4 = ashr i32 %3, 30 + %.sum = add i32 %4, 4 + %5 = getelementptr inbounds float* %table, i32 %.sum +;CHECK: vldr.32 s + %6 = load float* %5, align 4 + %tmp11 = insertelement <4 x float> undef, float %6, i32 0 + %7 = shl i32 %packedValue, 18 + %8 = ashr i32 %7, 30 + %.sum12 = add i32 %8, 4 + %9 = getelementptr inbounds float* %table, i32 %.sum12 +;CHECK: vldr.32 s + %10 = load float* %9, align 4 + %tmp9 = insertelement <4 x float> %tmp11, float %10, i32 1 + %11 = shl i32 %packedValue, 20 + %12 = ashr i32 %11, 30 + %.sum13 = add i32 %12, 4 + %13 = getelementptr inbounds float* %table, i32 %.sum13 +;CHECK: vldr.32 s + %14 = load float* %13, align 4 + %tmp7 = insertelement <4 x float> %tmp9, float %14, i32 2 + %15 = shl i32 %packedValue, 22 + %16 = ashr i32 %15, 30 + %.sum14 = add i32 %16, 4 + %17 = getelementptr inbounds float* %table, i32 %.sum14 +;CHECK: vldr.32 s + %18 = load float* %17, align 4 + %tmp5 = insertelement <4 x float> %tmp7, float %18, i32 3 + %19 = fmul <4 x float> %tmp5, %2 + %20 = bitcast float* %fltp to i8* + tail call void @llvm.arm.neon.vst1.v4f32(i8* %20, <4 x float> %19) + ret void +} + +declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>) nounwind diff --git a/test/CodeGen/ARM/arm-frameaddr.ll b/test/CodeGen/ARM/arm-frameaddr.ll index 2739860..1c7ac25 100644 --- a/test/CodeGen/ARM/arm-frameaddr.ll +++ b/test/CodeGen/ARM/arm-frameaddr.ll @@ -1,10 +1,15 @@ -; RUN: llc < %s -mtriple=arm-apple-darwin | grep mov | grep r7 -; RUN: llc < %s -mtriple=arm-linux-gnueabi | grep mov | grep r11 +; RUN: llc < %s -mtriple=arm-apple-darwin | FileCheck %s -check-prefix=DARWIN +; RUN: llc < %s -mtriple=arm-linux-gnueabi | FileCheck %s -check-prefix=LINUX ; PR4344 ; PR4416 define arm_aapcscc i8* @t() nounwind { entry: +; DARWIN: t: +; DARWIN: mov r0, r7 + +; LINUX: t: +; LINUX: mov r0, r11 %0 = call i8* @llvm.frameaddress(i32 0) ret i8* %0 } diff --git a/test/CodeGen/ARM/arm-returnaddr.ll b/test/CodeGen/ARM/arm-returnaddr.ll new file mode 100644 index 0000000..2c8f2ab --- /dev/null +++ b/test/CodeGen/ARM/arm-returnaddr.ll @@ -0,0 +1,24 @@ +; RUN: llc < %s -mtriple=arm-apple-darwin | FileCheck %s +; RUN: llc < %s -mtriple=thumbv6-apple-darwin +; rdar://8015977 +; rdar://8020118 + +define arm_apcscc i8* @rt0(i32 %x) nounwind readnone { +entry: +; CHECK: rt0: +; CHECK: mov r0, lr + %0 = tail call i8* @llvm.returnaddress(i32 0) + ret i8* %0 +} + +define arm_apcscc i8* @rt2() nounwind readnone { +entry: +; CHECK: rt2: +; CHECK: ldr r0, [r7] +; CHECK: ldr r0, [r0] +; CHECK: ldr r0, [r0, #4] + %0 = tail call i8* @llvm.returnaddress(i32 2) + ret i8* %0 +} + +declare i8* @llvm.returnaddress(i32) nounwind readnone diff --git a/test/CodeGen/ARM/div.ll b/test/CodeGen/ARM/div.ll index 2f724e7..d833afa 100644 --- a/test/CodeGen/ARM/div.ll +++ b/test/CodeGen/ARM/div.ll @@ -1,29 +1,43 @@ -; RUN: llc < %s -march=arm > %t -; RUN: grep __divsi3 %t -; RUN: grep __udivsi3 %t -; RUN: grep __modsi3 %t -; RUN: grep __umodsi3 %t +; RUN: llc < %s -march=arm | FileCheck %s -check-prefix=CHECK-ARM +; RUN: llc < %s -march=arm -mcpu=cortex-m3 \ +; RUN: | FileCheck %s -check-prefix=CHECK-ARMV7M define i32 @f1(i32 %a, i32 %b) { entry: +; CHECK-ARM: f1 +; CHECK-ARM: __divsi3 +; CHECK-ARMV7M: f1 +; CHECK-ARMV7M: sdiv %tmp1 = sdiv i32 %a, %b ; <i32> [#uses=1] ret i32 %tmp1 } define i32 @f2(i32 %a, i32 %b) { entry: +; CHECK-ARM: f2 +; CHECK-ARM: __udivsi3 +; CHECK-ARMV7M: f2 +; CHECK-ARMV7M: udiv %tmp1 = udiv i32 %a, %b ; <i32> [#uses=1] ret i32 %tmp1 } 
define i32 @f3(i32 %a, i32 %b) { entry: +; CHECK-ARM: f3 +; CHECK-ARM: __modsi3 +; CHECK-ARMV7M: f3 +; CHECK-ARMV7M: sdiv %tmp1 = srem i32 %a, %b ; <i32> [#uses=1] ret i32 %tmp1 } define i32 @f4(i32 %a, i32 %b) { entry: +; CHECK-ARM: f4 +; CHECK-ARM: __umodsi3 +; CHECK-ARMV7M: f4 +; CHECK-ARMV7M: udiv %tmp1 = urem i32 %a, %b ; <i32> [#uses=1] ret i32 %tmp1 } diff --git a/test/CodeGen/ARM/fabss.ll b/test/CodeGen/ARM/fabss.ll index f03282b..dfc1e0a 100644 --- a/test/CodeGen/ARM/fabss.ll +++ b/test/CodeGen/ARM/fabss.ll @@ -24,4 +24,4 @@ declare float @fabsf(float) ; CORTEXA8: test: ; CORTEXA8: vabs.f32 d1, d1 ; CORTEXA9: test: -; CORTEXA9: vabs.f32 s1, s1 +; CORTEXA9: vabs.f32 s0, s0 diff --git a/test/CodeGen/ARM/fadds.ll b/test/CodeGen/ARM/fadds.ll index 749690e..113f0e2 100644 --- a/test/CodeGen/ARM/fadds.ll +++ b/test/CodeGen/ARM/fadds.ll @@ -20,4 +20,4 @@ entry: ; CORTEXA8: test: ; CORTEXA8: vadd.f32 d0, d1, d0 ; CORTEXA9: test: -; CORTEXA9: vadd.f32 s0, s1, s0 +; CORTEXA9: vadd.f32 s0, s0, s1 diff --git a/test/CodeGen/ARM/fdivs.ll b/test/CodeGen/ARM/fdivs.ll index 0c31495..9af1217 100644 --- a/test/CodeGen/ARM/fdivs.ll +++ b/test/CodeGen/ARM/fdivs.ll @@ -20,4 +20,4 @@ entry: ; CORTEXA8: test: ; CORTEXA8: vdiv.f32 s0, s1, s0 ; CORTEXA9: test: -; CORTEXA9: vdiv.f32 s0, s1, s0 +; CORTEXA9: vdiv.f32 s0, s0, s1 diff --git a/test/CodeGen/ARM/fmacs.ll b/test/CodeGen/ARM/fmacs.ll index f8b47b5..c4ceca9 100644 --- a/test/CodeGen/ARM/fmacs.ll +++ b/test/CodeGen/ARM/fmacs.ll @@ -21,4 +21,4 @@ entry: ; CORTEXA8: test: ; CORTEXA8: vmul.f32 d0, d1, d0 ; CORTEXA9: test: -; CORTEXA9: vmla.f32 s2, s1, s0 +; CORTEXA9: vmla.f32 s0, s1, s2 diff --git a/test/CodeGen/ARM/fmscs.ll b/test/CodeGen/ARM/fmscs.ll index 7a70543..103ce33 100644 --- a/test/CodeGen/ARM/fmscs.ll +++ b/test/CodeGen/ARM/fmscs.ll @@ -21,4 +21,4 @@ entry: ; CORTEXA8: test: ; CORTEXA8: vnmls.f32 s2, s1, s0 ; CORTEXA9: test: -; CORTEXA9: vnmls.f32 s2, s1, s0 +; CORTEXA9: vnmls.f32 s0, s1, s2 diff --git a/test/CodeGen/ARM/fmuls.ll b/test/CodeGen/ARM/fmuls.ll index ef4e3e5..bfafd20 100644 --- a/test/CodeGen/ARM/fmuls.ll +++ b/test/CodeGen/ARM/fmuls.ll @@ -20,4 +20,4 @@ entry: ; CORTEXA8: test: ; CORTEXA8: vmul.f32 d0, d1, d0 ; CORTEXA9: test: -; CORTEXA9: vmul.f32 s0, s1, s0 +; CORTEXA9: vmul.f32 s0, s0, s1 diff --git a/test/CodeGen/ARM/fnmscs.ll b/test/CodeGen/ARM/fnmscs.ll index 6b7cefa..0b47edd 100644 --- a/test/CodeGen/ARM/fnmscs.ll +++ b/test/CodeGen/ARM/fnmscs.ll @@ -4,7 +4,7 @@ ; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s define float @test1(float %acc, float %a, float %b) nounwind { -; CHECK: vnmla.f32 s2, s1, s0 +; CHECK: vnmla.f32 s{{.*}}, s{{.*}}, s{{.*}} entry: %0 = fmul float %a, %b %1 = fsub float -0.0, %0 @@ -13,7 +13,7 @@ entry: } define float @test2(float %acc, float %a, float %b) nounwind { -; CHECK: vnmla.f32 s2, s1, s0 +; CHECK: vnmla.f32 s{{.*}}, s{{.*}}, s{{.*}} entry: %0 = fmul float %a, %b %1 = fmul float -1.0, %0 diff --git a/test/CodeGen/ARM/lsr-on-unrolled-loops.ll b/test/CodeGen/ARM/lsr-on-unrolled-loops.ll new file mode 100644 index 0000000..2ac4084 --- /dev/null +++ b/test/CodeGen/ARM/lsr-on-unrolled-loops.ll @@ -0,0 +1,642 @@ +; RUN: llc -mtriple=thumbv7-apple-darwin10 -mcpu=cortex-a8 < %s | FileCheck %s + +; LSR should recognize that this is an unrolled loop which can use +; constant offset addressing, so that each of the following stores +; uses the same register. 
+ +; CHECK: vstr.32 s0, [r12, #-128] +; CHECK: vstr.32 s0, [r12, #-96] +; CHECK: vstr.32 s0, [r12, #-64] +; CHECK: vstr.32 s0, [r12, #-32] +; CHECK: vstr.32 s0, [r12] +; CHECK: vstr.32 s0, [r12, #32] +; CHECK: vstr.32 s0, [r12, #64] +; CHECK: vstr.32 s0, [r12, #96] + +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32-n32" + +%0 = type { %1*, %3*, %6*, i8*, i32, i32, %8*, i32, i32, i32, i32, i32, i32, i32, double, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i8**, i32, i32, i32, i32, i32, [64 x i32]*, [4 x %9*], [4 x %10*], [4 x %10*], i32, %11*, i32, i32, [16 x i8], [16 x i8], [16 x i8], i32, i32, i8, i8, i8, i16, i16, i32, i8, i32, %12*, i32, i32, i32, i32, i8*, i32, [4 x %11*], i32, i32, i32, [10 x i32], i32, i32, i32, i32, i32, %13*, %14*, %15*, %16*, %17*, %18*, %19*, %20*, %21*, %22*, %23* } +%1 = type { void (%2*)*, void (%2*, i32)*, void (%2*)*, void (%2*, i8*)*, void (%2*)*, i32, %7, i32, i32, i8**, i32, i8**, i32, i32 } +%2 = type { %1*, %3*, %6*, i8*, i32, i32 } +%3 = type { i8* (%2*, i32, i32)*, i8* (%2*, i32, i32)*, i8** (%2*, i32, i32, i32)*, [64 x i16]** (%2*, i32, i32, i32)*, %4* (%2*, i32, i32, i32, i32, i32)*, %5* (%2*, i32, i32, i32, i32, i32)*, void (%2*)*, i8** (%2*, %4*, i32, i32, i32)*, [64 x i16]** (%2*, %5*, i32, i32, i32)*, void (%2*, i32)*, void (%2*)*, i32, i32 } +%4 = type opaque +%5 = type opaque +%6 = type { void (%2*)*, i32, i32, i32, i32 } +%7 = type { [8 x i32], [12 x i32] } +%8 = type { i8*, i32, void (%0*)*, i32 (%0*)*, void (%0*, i32)*, i32 (%0*, i32)*, void (%0*)* } +%9 = type { [64 x i16], i32 } +%10 = type { [17 x i8], [256 x i8], i32 } +%11 = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, %9*, i8* } +%12 = type { %12*, i8, i32, i32, i8* } +%13 = type { void (%0*)*, void (%0*)*, i32 } +%14 = type { void (%0*, i32)*, void (%0*, i8**, i32*, i32)* } +%15 = type { void (%0*)*, i32 (%0*)*, void (%0*)*, i32 (%0*, i8***)*, %5** } +%16 = type { void (%0*, i32)*, void (%0*, i8***, i32*, i32, i8**, i32*, i32)* } +%17 = type { i32 (%0*)*, void (%0*)*, void (%0*)*, void (%0*)*, i32, i32 } +%18 = type { void (%0*)*, i32 (%0*)*, i32 (%0*)*, i32, i32, i32, i32 } +%19 = type { void (%0*)*, i32 (%0*, [64 x i16]**)*, i32 } +%20 = type { void (%0*)*, [10 x void (%0*, %11*, i16*, i8**, i32)*] } +%21 = type { void (%0*)*, void (%0*, i8***, i32*, i32, i8**, i32*, i32)*, i32 } +%22 = type { void (%0*)*, void (%0*, i8***, i32, i8**, i32)* } +%23 = type { void (%0*, i32)*, void (%0*, i8**, i8**, i32)*, void (%0*)*, void (%0*)* } + +define arm_apcscc void @test(%0* nocapture %a0, %11* nocapture %a1, i16* nocapture %a2, i8** nocapture %a3, i32 %a4) nounwind { +bb: + %t = alloca [64 x float], align 4 + %t5 = getelementptr inbounds %0* %a0, i32 0, i32 65 + %t6 = load i8** %t5, align 4 + %t7 = getelementptr inbounds %11* %a1, i32 0, i32 20 + %t8 = load i8** %t7, align 4 + br label %bb9 + +bb9: + %t10 = phi i32 [ 0, %bb ], [ %t157, %bb156 ] + %t11 = add i32 %t10, 8 + %t12 = getelementptr [64 x float]* %t, i32 0, i32 %t11 + %t13 = add i32 %t10, 16 + %t14 = getelementptr [64 x float]* %t, i32 0, i32 %t13 + %t15 = add i32 %t10, 24 + %t16 = getelementptr [64 x float]* %t, i32 0, i32 %t15 + %t17 = add i32 %t10, 32 + %t18 = getelementptr [64 x float]* %t, i32 0, i32 %t17 + %t19 = add i32 %t10, 40 + %t20 = getelementptr [64 x float]* %t, i32 0, i32 %t19 + %t21 = add i32 %t10, 48 + %t22 = getelementptr [64 x 
float]* %t, i32 0, i32 %t21 + %t23 = add i32 %t10, 56 + %t24 = getelementptr [64 x float]* %t, i32 0, i32 %t23 + %t25 = getelementptr [64 x float]* %t, i32 0, i32 %t10 + %t26 = shl i32 %t10, 5 + %t27 = or i32 %t26, 8 + %t28 = getelementptr i8* %t8, i32 %t27 + %t29 = bitcast i8* %t28 to float* + %t30 = or i32 %t26, 16 + %t31 = getelementptr i8* %t8, i32 %t30 + %t32 = bitcast i8* %t31 to float* + %t33 = or i32 %t26, 24 + %t34 = getelementptr i8* %t8, i32 %t33 + %t35 = bitcast i8* %t34 to float* + %t36 = or i32 %t26, 4 + %t37 = getelementptr i8* %t8, i32 %t36 + %t38 = bitcast i8* %t37 to float* + %t39 = or i32 %t26, 12 + %t40 = getelementptr i8* %t8, i32 %t39 + %t41 = bitcast i8* %t40 to float* + %t42 = or i32 %t26, 20 + %t43 = getelementptr i8* %t8, i32 %t42 + %t44 = bitcast i8* %t43 to float* + %t45 = or i32 %t26, 28 + %t46 = getelementptr i8* %t8, i32 %t45 + %t47 = bitcast i8* %t46 to float* + %t48 = getelementptr i8* %t8, i32 %t26 + %t49 = bitcast i8* %t48 to float* + %t50 = shl i32 %t10, 3 + %t51 = or i32 %t50, 1 + %t52 = getelementptr i16* %a2, i32 %t51 + %t53 = or i32 %t50, 2 + %t54 = getelementptr i16* %a2, i32 %t53 + %t55 = or i32 %t50, 3 + %t56 = getelementptr i16* %a2, i32 %t55 + %t57 = or i32 %t50, 4 + %t58 = getelementptr i16* %a2, i32 %t57 + %t59 = or i32 %t50, 5 + %t60 = getelementptr i16* %a2, i32 %t59 + %t61 = or i32 %t50, 6 + %t62 = getelementptr i16* %a2, i32 %t61 + %t63 = or i32 %t50, 7 + %t64 = getelementptr i16* %a2, i32 %t63 + %t65 = getelementptr i16* %a2, i32 %t50 + %t66 = load i16* %t52, align 2 + %t67 = icmp eq i16 %t66, 0 + %t68 = load i16* %t54, align 2 + %t69 = icmp eq i16 %t68, 0 + %t70 = and i1 %t67, %t69 + br i1 %t70, label %bb71, label %bb91 + +bb71: + %t72 = load i16* %t56, align 2 + %t73 = icmp eq i16 %t72, 0 + br i1 %t73, label %bb74, label %bb91 + +bb74: + %t75 = load i16* %t58, align 2 + %t76 = icmp eq i16 %t75, 0 + br i1 %t76, label %bb77, label %bb91 + +bb77: + %t78 = load i16* %t60, align 2 + %t79 = icmp eq i16 %t78, 0 + br i1 %t79, label %bb80, label %bb91 + +bb80: + %t81 = load i16* %t62, align 2 + %t82 = icmp eq i16 %t81, 0 + br i1 %t82, label %bb83, label %bb91 + +bb83: + %t84 = load i16* %t64, align 2 + %t85 = icmp eq i16 %t84, 0 + br i1 %t85, label %bb86, label %bb91 + +bb86: + %t87 = load i16* %t65, align 2 + %t88 = sitofp i16 %t87 to float + %t89 = load float* %t49, align 4 + %t90 = fmul float %t88, %t89 + store float %t90, float* %t25, align 4 + store float %t90, float* %t12, align 4 + store float %t90, float* %t14, align 4 + store float %t90, float* %t16, align 4 + store float %t90, float* %t18, align 4 + store float %t90, float* %t20, align 4 + store float %t90, float* %t22, align 4 + store float %t90, float* %t24, align 4 + br label %bb156 + +bb91: + %t92 = load i16* %t65, align 2 + %t93 = sitofp i16 %t92 to float + %t94 = load float* %t49, align 4 + %t95 = fmul float %t93, %t94 + %t96 = sitofp i16 %t68 to float + %t97 = load float* %t29, align 4 + %t98 = fmul float %t96, %t97 + %t99 = load i16* %t58, align 2 + %t100 = sitofp i16 %t99 to float + %t101 = load float* %t32, align 4 + %t102 = fmul float %t100, %t101 + %t103 = load i16* %t62, align 2 + %t104 = sitofp i16 %t103 to float + %t105 = load float* %t35, align 4 + %t106 = fmul float %t104, %t105 + %t107 = fadd float %t95, %t102 + %t108 = fsub float %t95, %t102 + %t109 = fadd float %t98, %t106 + %t110 = fsub float %t98, %t106 + %t111 = fmul float %t110, 0x3FF6A09E60000000 + %t112 = fsub float %t111, %t109 + %t113 = fadd float %t107, %t109 + %t114 = fsub float %t107, %t109 + %t115 = fadd 
float %t108, %t112 + %t116 = fsub float %t108, %t112 + %t117 = sitofp i16 %t66 to float + %t118 = load float* %t38, align 4 + %t119 = fmul float %t117, %t118 + %t120 = load i16* %t56, align 2 + %t121 = sitofp i16 %t120 to float + %t122 = load float* %t41, align 4 + %t123 = fmul float %t121, %t122 + %t124 = load i16* %t60, align 2 + %t125 = sitofp i16 %t124 to float + %t126 = load float* %t44, align 4 + %t127 = fmul float %t125, %t126 + %t128 = load i16* %t64, align 2 + %t129 = sitofp i16 %t128 to float + %t130 = load float* %t47, align 4 + %t131 = fmul float %t129, %t130 + %t132 = fadd float %t127, %t123 + %t133 = fsub float %t127, %t123 + %t134 = fadd float %t119, %t131 + %t135 = fsub float %t119, %t131 + %t136 = fadd float %t134, %t132 + %t137 = fsub float %t134, %t132 + %t138 = fmul float %t137, 0x3FF6A09E60000000 + %t139 = fadd float %t133, %t135 + %t140 = fmul float %t139, 0x3FFD906BC0000000 + %t141 = fmul float %t135, 0x3FF1517A80000000 + %t142 = fsub float %t141, %t140 + %t143 = fmul float %t133, 0xC004E7AEA0000000 + %t144 = fadd float %t143, %t140 + %t145 = fsub float %t144, %t136 + %t146 = fsub float %t138, %t145 + %t147 = fadd float %t142, %t146 + %t148 = fadd float %t113, %t136 + store float %t148, float* %t25, align 4 + %t149 = fsub float %t113, %t136 + store float %t149, float* %t24, align 4 + %t150 = fadd float %t115, %t145 + store float %t150, float* %t12, align 4 + %t151 = fsub float %t115, %t145 + store float %t151, float* %t22, align 4 + %t152 = fadd float %t116, %t146 + store float %t152, float* %t14, align 4 + %t153 = fsub float %t116, %t146 + store float %t153, float* %t20, align 4 + %t154 = fadd float %t114, %t147 + store float %t154, float* %t18, align 4 + %t155 = fsub float %t114, %t147 + store float %t155, float* %t16, align 4 + br label %bb156 + +bb156: + %t157 = add i32 %t10, 1 + %t158 = icmp eq i32 %t157, 8 + br i1 %t158, label %bb159, label %bb9 + +bb159: + %t160 = add i32 %a4, 7 + %t161 = add i32 %a4, 1 + %t162 = add i32 %a4, 6 + %t163 = add i32 %a4, 2 + %t164 = add i32 %a4, 5 + %t165 = add i32 %a4, 4 + %t166 = add i32 %a4, 3 + br label %bb167 + +bb167: + %t168 = phi i32 [ 0, %bb159 ], [ %t293, %bb167 ] + %t169 = getelementptr i8** %a3, i32 %t168 + %t170 = shl i32 %t168, 3 + %t171 = or i32 %t170, 4 + %t172 = getelementptr [64 x float]* %t, i32 0, i32 %t171 + %t173 = or i32 %t170, 2 + %t174 = getelementptr [64 x float]* %t, i32 0, i32 %t173 + %t175 = or i32 %t170, 6 + %t176 = getelementptr [64 x float]* %t, i32 0, i32 %t175 + %t177 = or i32 %t170, 5 + %t178 = getelementptr [64 x float]* %t, i32 0, i32 %t177 + %t179 = or i32 %t170, 3 + %t180 = getelementptr [64 x float]* %t, i32 0, i32 %t179 + %t181 = or i32 %t170, 1 + %t182 = getelementptr [64 x float]* %t, i32 0, i32 %t181 + %t183 = or i32 %t170, 7 + %t184 = getelementptr [64 x float]* %t, i32 0, i32 %t183 + %t185 = getelementptr [64 x float]* %t, i32 0, i32 %t170 + %t186 = load i8** %t169, align 4 + %t187 = getelementptr inbounds i8* %t186, i32 %a4 + %t188 = load float* %t185, align 4 + %t189 = load float* %t172, align 4 + %t190 = fadd float %t188, %t189 + %t191 = fsub float %t188, %t189 + %t192 = load float* %t174, align 4 + %t193 = load float* %t176, align 4 + %t194 = fadd float %t192, %t193 + %t195 = fsub float %t192, %t193 + %t196 = fmul float %t195, 0x3FF6A09E60000000 + %t197 = fsub float %t196, %t194 + %t198 = fadd float %t190, %t194 + %t199 = fsub float %t190, %t194 + %t200 = fadd float %t191, %t197 + %t201 = fsub float %t191, %t197 + %t202 = load float* %t178, align 4 + %t203 = load float* %t180, 
align 4 + %t204 = fadd float %t202, %t203 + %t205 = fsub float %t202, %t203 + %t206 = load float* %t182, align 4 + %t207 = load float* %t184, align 4 + %t208 = fadd float %t206, %t207 + %t209 = fsub float %t206, %t207 + %t210 = fadd float %t208, %t204 + %t211 = fsub float %t208, %t204 + %t212 = fmul float %t211, 0x3FF6A09E60000000 + %t213 = fadd float %t205, %t209 + %t214 = fmul float %t213, 0x3FFD906BC0000000 + %t215 = fmul float %t209, 0x3FF1517A80000000 + %t216 = fsub float %t215, %t214 + %t217 = fmul float %t205, 0xC004E7AEA0000000 + %t218 = fadd float %t217, %t214 + %t219 = fsub float %t218, %t210 + %t220 = fsub float %t212, %t219 + %t221 = fadd float %t216, %t220 + %t222 = fadd float %t198, %t210 + %t223 = fptosi float %t222 to i32 + %t224 = add nsw i32 %t223, 4 + %t225 = lshr i32 %t224, 3 + %t226 = and i32 %t225, 1023 + %t227 = add i32 %t226, 128 + %t228 = getelementptr inbounds i8* %t6, i32 %t227 + %t229 = load i8* %t228, align 1 + store i8 %t229, i8* %t187, align 1 + %t230 = fsub float %t198, %t210 + %t231 = fptosi float %t230 to i32 + %t232 = add nsw i32 %t231, 4 + %t233 = lshr i32 %t232, 3 + %t234 = and i32 %t233, 1023 + %t235 = add i32 %t234, 128 + %t236 = getelementptr inbounds i8* %t6, i32 %t235 + %t237 = load i8* %t236, align 1 + %t238 = getelementptr inbounds i8* %t186, i32 %t160 + store i8 %t237, i8* %t238, align 1 + %t239 = fadd float %t200, %t219 + %t240 = fptosi float %t239 to i32 + %t241 = add nsw i32 %t240, 4 + %t242 = lshr i32 %t241, 3 + %t243 = and i32 %t242, 1023 + %t244 = add i32 %t243, 128 + %t245 = getelementptr inbounds i8* %t6, i32 %t244 + %t246 = load i8* %t245, align 1 + %t247 = getelementptr inbounds i8* %t186, i32 %t161 + store i8 %t246, i8* %t247, align 1 + %t248 = fsub float %t200, %t219 + %t249 = fptosi float %t248 to i32 + %t250 = add nsw i32 %t249, 4 + %t251 = lshr i32 %t250, 3 + %t252 = and i32 %t251, 1023 + %t253 = add i32 %t252, 128 + %t254 = getelementptr inbounds i8* %t6, i32 %t253 + %t255 = load i8* %t254, align 1 + %t256 = getelementptr inbounds i8* %t186, i32 %t162 + store i8 %t255, i8* %t256, align 1 + %t257 = fadd float %t201, %t220 + %t258 = fptosi float %t257 to i32 + %t259 = add nsw i32 %t258, 4 + %t260 = lshr i32 %t259, 3 + %t261 = and i32 %t260, 1023 + %t262 = add i32 %t261, 128 + %t263 = getelementptr inbounds i8* %t6, i32 %t262 + %t264 = load i8* %t263, align 1 + %t265 = getelementptr inbounds i8* %t186, i32 %t163 + store i8 %t264, i8* %t265, align 1 + %t266 = fsub float %t201, %t220 + %t267 = fptosi float %t266 to i32 + %t268 = add nsw i32 %t267, 4 + %t269 = lshr i32 %t268, 3 + %t270 = and i32 %t269, 1023 + %t271 = add i32 %t270, 128 + %t272 = getelementptr inbounds i8* %t6, i32 %t271 + %t273 = load i8* %t272, align 1 + %t274 = getelementptr inbounds i8* %t186, i32 %t164 + store i8 %t273, i8* %t274, align 1 + %t275 = fadd float %t199, %t221 + %t276 = fptosi float %t275 to i32 + %t277 = add nsw i32 %t276, 4 + %t278 = lshr i32 %t277, 3 + %t279 = and i32 %t278, 1023 + %t280 = add i32 %t279, 128 + %t281 = getelementptr inbounds i8* %t6, i32 %t280 + %t282 = load i8* %t281, align 1 + %t283 = getelementptr inbounds i8* %t186, i32 %t165 + store i8 %t282, i8* %t283, align 1 + %t284 = fsub float %t199, %t221 + %t285 = fptosi float %t284 to i32 + %t286 = add nsw i32 %t285, 4 + %t287 = lshr i32 %t286, 3 + %t288 = and i32 %t287, 1023 + %t289 = add i32 %t288, 128 + %t290 = getelementptr inbounds i8* %t6, i32 %t289 + %t291 = load i8* %t290, align 1 + %t292 = getelementptr inbounds i8* %t186, i32 %t166 + store i8 %t291, i8* %t292, align 1 + %t293 = 
add nsw i32 %t168, 1 + %t294 = icmp eq i32 %t293, 8 + br i1 %t294, label %bb295, label %bb167 + +bb295: + ret void +} + +%struct.ct_data_s = type { %union.anon, %union.anon } +%struct.gz_header = type { i32, i32, i32, i32, i8*, i32, i32, i8*, i32, i8*, i32, i32, i32 } +%struct.internal_state = type { %struct.z_stream*, i32, i8*, i32, i8*, i32, i32, %struct.gz_header*, i32, i8, i32, i32, i32, i32, i8*, i32, i16*, i16*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [573 x %struct.ct_data_s], [61 x %struct.ct_data_s], [39 x %struct.ct_data_s], %struct.tree_desc_s, %struct.tree_desc_s, %struct.tree_desc_s, [16 x i16], [573 x i32], i32, i32, [573 x i8], i8*, i32, i32, i16*, i32, i32, i32, i32, i16, i32 } +%struct.static_tree_desc = type { i32 } +%struct.tree_desc_s = type { %struct.ct_data_s*, i32, %struct.static_tree_desc* } +%struct.z_stream = type { i8*, i32, i32, i8*, i32, i32, i8*, %struct.internal_state*, i8* (i8*, i32, i32)*, void (i8*, i8*)*, i8*, i32, i32, i32 } +%union.anon = type { i16 } + +define arm_apcscc i32 @longest_match(%struct.internal_state* %s, i32 %cur_match) nounwind optsize { +entry: + %0 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 31 ; <i32*> [#uses=1] + %1 = load i32* %0, align 4 ; <i32> [#uses=2] + %2 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 14 ; <i8**> [#uses=1] + %3 = load i8** %2, align 4 ; <i8*> [#uses=27] + %4 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 27 ; <i32*> [#uses=1] + %5 = load i32* %4, align 4 ; <i32> [#uses=17] + %6 = getelementptr inbounds i8* %3, i32 %5 ; <i8*> [#uses=1] + %7 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 30 ; <i32*> [#uses=1] + %8 = load i32* %7, align 4 ; <i32> [#uses=4] + %9 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 36 ; <i32*> [#uses=1] + %10 = load i32* %9, align 4 ; <i32> [#uses=2] + %11 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 11 ; <i32*> [#uses=1] + %12 = load i32* %11, align 4 ; <i32> [#uses=2] + %13 = add i32 %12, -262 ; <i32> [#uses=1] + %14 = icmp ugt i32 %5, %13 ; <i1> [#uses=1] + br i1 %14, label %bb, label %bb2 + +bb: ; preds = %entry + %15 = add i32 %5, 262 ; <i32> [#uses=1] + %16 = sub i32 %15, %12 ; <i32> [#uses=1] + br label %bb2 + +bb2: ; preds = %bb, %entry + %iftmp.48.0 = phi i32 [ %16, %bb ], [ 0, %entry ] ; <i32> [#uses=1] + %17 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 16 ; <i16**> [#uses=1] + %18 = load i16** %17, align 4 ; <i16*> [#uses=1] + %19 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 13 ; <i32*> [#uses=1] + %20 = load i32* %19, align 4 ; <i32> [#uses=1] + %.sum = add i32 %5, 258 ; <i32> [#uses=2] + %21 = getelementptr inbounds i8* %3, i32 %.sum ; <i8*> [#uses=1] + %22 = add nsw i32 %5, -1 ; <i32> [#uses=1] + %.sum30 = add i32 %22, %8 ; <i32> [#uses=1] + %23 = getelementptr inbounds i8* %3, i32 %.sum30 ; <i8*> [#uses=1] + %24 = load i8* %23, align 1 ; <i8> [#uses=1] + %.sum31 = add i32 %8, %5 ; <i32> [#uses=1] + %25 = getelementptr inbounds i8* %3, i32 %.sum31 ; <i8*> [#uses=1] + %26 = load i8* %25, align 1 ; <i8> [#uses=1] + %27 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 35 ; <i32*> [#uses=1] + %28 = load i32* %27, align 4 ; <i32> [#uses=1] + %29 = lshr i32 %1, 2 ; <i32> [#uses=1] + %30 = icmp ult i32 %8, %28 ; <i1> [#uses=1] + %. 
= select i1 %30, i32 %1, i32 %29 ; <i32> [#uses=1] + %31 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 29 ; <i32*> [#uses=1] + %32 = load i32* %31, align 4 ; <i32> [#uses=4] + %33 = icmp ugt i32 %10, %32 ; <i1> [#uses=1] + %nice_match.0.ph = select i1 %33, i32 %32, i32 %10 ; <i32> [#uses=1] + %34 = getelementptr inbounds %struct.internal_state* %s, i32 0, i32 28 ; <i32*> [#uses=1] + %35 = ptrtoint i8* %21 to i32 ; <i32> [#uses=1] + %36 = add nsw i32 %5, 257 ; <i32> [#uses=1] + %tmp81 = add i32 %., -1 ; <i32> [#uses=1] + br label %bb6 + +bb6: ; preds = %bb24, %bb2 + %indvar78 = phi i32 [ 0, %bb2 ], [ %indvar.next79, %bb24 ] ; <i32> [#uses=2] + %best_len.2 = phi i32 [ %8, %bb2 ], [ %best_len.0, %bb24 ] ; <i32> [#uses=8] + %scan_end1.1 = phi i8 [ %24, %bb2 ], [ %scan_end1.0, %bb24 ] ; <i8> [#uses=6] + %cur_match_addr.0 = phi i32 [ %cur_match, %bb2 ], [ %90, %bb24 ] ; <i32> [#uses=14] + %scan_end.1 = phi i8 [ %26, %bb2 ], [ %scan_end.0, %bb24 ] ; <i8> [#uses=6] + %37 = getelementptr inbounds i8* %3, i32 %cur_match_addr.0 ; <i8*> [#uses=1] + %.sum32 = add i32 %cur_match_addr.0, %best_len.2 ; <i32> [#uses=1] + %38 = getelementptr inbounds i8* %3, i32 %.sum32 ; <i8*> [#uses=1] + %39 = load i8* %38, align 1 ; <i8> [#uses=1] + %40 = icmp eq i8 %39, %scan_end.1 ; <i1> [#uses=1] + br i1 %40, label %bb7, label %bb23 + +bb7: ; preds = %bb6 + %41 = add nsw i32 %best_len.2, -1 ; <i32> [#uses=1] + %.sum33 = add i32 %41, %cur_match_addr.0 ; <i32> [#uses=1] + %42 = getelementptr inbounds i8* %3, i32 %.sum33 ; <i8*> [#uses=1] + %43 = load i8* %42, align 1 ; <i8> [#uses=1] + %44 = icmp eq i8 %43, %scan_end1.1 ; <i1> [#uses=1] + br i1 %44, label %bb8, label %bb23 + +bb8: ; preds = %bb7 + %45 = load i8* %37, align 1 ; <i8> [#uses=1] + %46 = load i8* %6, align 1 ; <i8> [#uses=1] + %47 = icmp eq i8 %45, %46 ; <i1> [#uses=1] + br i1 %47, label %bb9, label %bb23 + +bb9: ; preds = %bb8 + %.sum34 = add i32 %cur_match_addr.0, 1 ; <i32> [#uses=1] + %48 = getelementptr inbounds i8* %3, i32 %.sum34 ; <i8*> [#uses=1] + %49 = load i8* %48, align 1 ; <i8> [#uses=1] + %.sum88 = add i32 %5, 1 ; <i32> [#uses=1] + %50 = getelementptr inbounds i8* %3, i32 %.sum88 ; <i8*> [#uses=1] + %51 = load i8* %50, align 1 ; <i8> [#uses=1] + %52 = icmp eq i8 %49, %51 ; <i1> [#uses=1] + br i1 %52, label %bb10, label %bb23 + +bb10: ; preds = %bb9 + %tmp39 = add i32 %cur_match_addr.0, 10 ; <i32> [#uses=1] + %tmp41 = add i32 %cur_match_addr.0, 9 ; <i32> [#uses=1] + %tmp44 = add i32 %cur_match_addr.0, 8 ; <i32> [#uses=1] + %tmp47 = add i32 %cur_match_addr.0, 7 ; <i32> [#uses=1] + %tmp50 = add i32 %cur_match_addr.0, 6 ; <i32> [#uses=1] + %tmp53 = add i32 %cur_match_addr.0, 5 ; <i32> [#uses=1] + %tmp56 = add i32 %cur_match_addr.0, 4 ; <i32> [#uses=1] + %tmp59 = add i32 %cur_match_addr.0, 3 ; <i32> [#uses=1] + br label %bb11 + +bb11: ; preds = %bb18, %bb10 + %indvar = phi i32 [ %indvar.next, %bb18 ], [ 0, %bb10 ] ; <i32> [#uses=2] + %tmp = shl i32 %indvar, 3 ; <i32> [#uses=16] + %tmp40 = add i32 %tmp39, %tmp ; <i32> [#uses=1] + %scevgep = getelementptr i8* %3, i32 %tmp40 ; <i8*> [#uses=1] + %tmp42 = add i32 %tmp41, %tmp ; <i32> [#uses=1] + %scevgep43 = getelementptr i8* %3, i32 %tmp42 ; <i8*> [#uses=1] + %tmp45 = add i32 %tmp44, %tmp ; <i32> [#uses=1] + %scevgep46 = getelementptr i8* %3, i32 %tmp45 ; <i8*> [#uses=1] + %tmp48 = add i32 %tmp47, %tmp ; <i32> [#uses=1] + %scevgep49 = getelementptr i8* %3, i32 %tmp48 ; <i8*> [#uses=1] + %tmp51 = add i32 %tmp50, %tmp ; <i32> [#uses=1] + %scevgep52 = getelementptr i8* %3, i32 %tmp51 ; 
<i8*> [#uses=1] + %tmp54 = add i32 %tmp53, %tmp ; <i32> [#uses=1] + %scevgep55 = getelementptr i8* %3, i32 %tmp54 ; <i8*> [#uses=1] + %tmp60 = add i32 %tmp59, %tmp ; <i32> [#uses=1] + %scevgep61 = getelementptr i8* %3, i32 %tmp60 ; <i8*> [#uses=1] + %tmp62 = add i32 %tmp, 10 ; <i32> [#uses=1] + %.sum89 = add i32 %5, %tmp62 ; <i32> [#uses=2] + %scevgep63 = getelementptr i8* %3, i32 %.sum89 ; <i8*> [#uses=2] + %tmp64 = add i32 %tmp, 9 ; <i32> [#uses=1] + %.sum90 = add i32 %5, %tmp64 ; <i32> [#uses=1] + %scevgep65 = getelementptr i8* %3, i32 %.sum90 ; <i8*> [#uses=2] + %tmp66 = add i32 %tmp, 8 ; <i32> [#uses=1] + %.sum91 = add i32 %5, %tmp66 ; <i32> [#uses=1] + %scevgep67 = getelementptr i8* %3, i32 %.sum91 ; <i8*> [#uses=2] + %tmp6883 = or i32 %tmp, 7 ; <i32> [#uses=1] + %.sum92 = add i32 %5, %tmp6883 ; <i32> [#uses=1] + %scevgep69 = getelementptr i8* %3, i32 %.sum92 ; <i8*> [#uses=2] + %tmp7084 = or i32 %tmp, 6 ; <i32> [#uses=1] + %.sum93 = add i32 %5, %tmp7084 ; <i32> [#uses=1] + %scevgep71 = getelementptr i8* %3, i32 %.sum93 ; <i8*> [#uses=2] + %tmp7285 = or i32 %tmp, 5 ; <i32> [#uses=1] + %.sum94 = add i32 %5, %tmp7285 ; <i32> [#uses=1] + %scevgep73 = getelementptr i8* %3, i32 %.sum94 ; <i8*> [#uses=2] + %tmp7486 = or i32 %tmp, 4 ; <i32> [#uses=1] + %.sum95 = add i32 %5, %tmp7486 ; <i32> [#uses=1] + %scevgep75 = getelementptr i8* %3, i32 %.sum95 ; <i8*> [#uses=2] + %tmp7687 = or i32 %tmp, 3 ; <i32> [#uses=1] + %.sum96 = add i32 %5, %tmp7687 ; <i32> [#uses=1] + %scevgep77 = getelementptr i8* %3, i32 %.sum96 ; <i8*> [#uses=2] + %53 = load i8* %scevgep77, align 1 ; <i8> [#uses=1] + %54 = load i8* %scevgep61, align 1 ; <i8> [#uses=1] + %55 = icmp eq i8 %53, %54 ; <i1> [#uses=1] + br i1 %55, label %bb12, label %bb20 + +bb12: ; preds = %bb11 + %tmp57 = add i32 %tmp56, %tmp ; <i32> [#uses=1] + %scevgep58 = getelementptr i8* %3, i32 %tmp57 ; <i8*> [#uses=1] + %56 = load i8* %scevgep75, align 1 ; <i8> [#uses=1] + %57 = load i8* %scevgep58, align 1 ; <i8> [#uses=1] + %58 = icmp eq i8 %56, %57 ; <i1> [#uses=1] + br i1 %58, label %bb13, label %bb20 + +bb13: ; preds = %bb12 + %59 = load i8* %scevgep73, align 1 ; <i8> [#uses=1] + %60 = load i8* %scevgep55, align 1 ; <i8> [#uses=1] + %61 = icmp eq i8 %59, %60 ; <i1> [#uses=1] + br i1 %61, label %bb14, label %bb20 + +bb14: ; preds = %bb13 + %62 = load i8* %scevgep71, align 1 ; <i8> [#uses=1] + %63 = load i8* %scevgep52, align 1 ; <i8> [#uses=1] + %64 = icmp eq i8 %62, %63 ; <i1> [#uses=1] + br i1 %64, label %bb15, label %bb20 + +bb15: ; preds = %bb14 + %65 = load i8* %scevgep69, align 1 ; <i8> [#uses=1] + %66 = load i8* %scevgep49, align 1 ; <i8> [#uses=1] + %67 = icmp eq i8 %65, %66 ; <i1> [#uses=1] + br i1 %67, label %bb16, label %bb20 + +bb16: ; preds = %bb15 + %68 = load i8* %scevgep67, align 1 ; <i8> [#uses=1] + %69 = load i8* %scevgep46, align 1 ; <i8> [#uses=1] + %70 = icmp eq i8 %68, %69 ; <i1> [#uses=1] + br i1 %70, label %bb17, label %bb20 + +bb17: ; preds = %bb16 + %71 = load i8* %scevgep65, align 1 ; <i8> [#uses=1] + %72 = load i8* %scevgep43, align 1 ; <i8> [#uses=1] + %73 = icmp eq i8 %71, %72 ; <i1> [#uses=1] + br i1 %73, label %bb18, label %bb20 + +bb18: ; preds = %bb17 + %74 = load i8* %scevgep63, align 1 ; <i8> [#uses=1] + %75 = load i8* %scevgep, align 1 ; <i8> [#uses=1] + %76 = icmp eq i8 %74, %75 ; <i1> [#uses=1] + %77 = icmp slt i32 %.sum89, %.sum ; <i1> [#uses=1] + %or.cond = and i1 %76, %77 ; <i1> [#uses=1] + %indvar.next = add i32 %indvar, 1 ; <i32> [#uses=1] + br i1 %or.cond, label %bb11, label %bb20 + +bb20: ; preds = %bb18, 
%bb17, %bb16, %bb15, %bb14, %bb13, %bb12, %bb11 + %scan.3 = phi i8* [ %scevgep77, %bb11 ], [ %scevgep75, %bb12 ], [ %scevgep73, %bb13 ], [ %scevgep71, %bb14 ], [ %scevgep69, %bb15 ], [ %scevgep67, %bb16 ], [ %scevgep65, %bb17 ], [ %scevgep63, %bb18 ] ; <i8*> [#uses=1] + %78 = ptrtoint i8* %scan.3 to i32 ; <i32> [#uses=1] + %79 = sub nsw i32 %78, %35 ; <i32> [#uses=2] + %80 = add i32 %79, 258 ; <i32> [#uses=5] + %81 = icmp sgt i32 %80, %best_len.2 ; <i1> [#uses=1] + br i1 %81, label %bb21, label %bb23 + +bb21: ; preds = %bb20 + store i32 %cur_match_addr.0, i32* %34, align 4 + %82 = icmp slt i32 %80, %nice_match.0.ph ; <i1> [#uses=1] + br i1 %82, label %bb22, label %bb25 + +bb22: ; preds = %bb21 + %.sum37 = add i32 %36, %79 ; <i32> [#uses=1] + %83 = getelementptr inbounds i8* %3, i32 %.sum37 ; <i8*> [#uses=1] + %84 = load i8* %83, align 1 ; <i8> [#uses=1] + %.sum38 = add i32 %80, %5 ; <i32> [#uses=1] + %85 = getelementptr inbounds i8* %3, i32 %.sum38 ; <i8*> [#uses=1] + %86 = load i8* %85, align 1 ; <i8> [#uses=1] + br label %bb23 + +bb23: ; preds = %bb22, %bb20, %bb9, %bb8, %bb7, %bb6 + %best_len.0 = phi i32 [ %best_len.2, %bb6 ], [ %best_len.2, %bb7 ], [ %best_len.2, %bb8 ], [ %best_len.2, %bb9 ], [ %80, %bb22 ], [ %best_len.2, %bb20 ] ; <i32> [#uses=3] + %scan_end1.0 = phi i8 [ %scan_end1.1, %bb6 ], [ %scan_end1.1, %bb7 ], [ %scan_end1.1, %bb8 ], [ %scan_end1.1, %bb9 ], [ %84, %bb22 ], [ %scan_end1.1, %bb20 ] ; <i8> [#uses=1] + %scan_end.0 = phi i8 [ %scan_end.1, %bb6 ], [ %scan_end.1, %bb7 ], [ %scan_end.1, %bb8 ], [ %scan_end.1, %bb9 ], [ %86, %bb22 ], [ %scan_end.1, %bb20 ] ; <i8> [#uses=1] + %87 = and i32 %cur_match_addr.0, %20 ; <i32> [#uses=1] + %88 = getelementptr inbounds i16* %18, i32 %87 ; <i16*> [#uses=1] + %89 = load i16* %88, align 2 ; <i16> [#uses=1] + %90 = zext i16 %89 to i32 ; <i32> [#uses=2] + %91 = icmp ugt i32 %90, %iftmp.48.0 ; <i1> [#uses=1] + br i1 %91, label %bb24, label %bb25 + +bb24: ; preds = %bb23 + +; LSR should use count-down iteration to avoid requiring the trip count +; in a register, and it shouldn't require any reloads here. 
+ +; CHECK: sub.w r9, r9, #1 +; CHECK-NEXT: cmp.w r9, #0 +; CHECK-NEXT: bne.w + + %92 = icmp eq i32 %tmp81, %indvar78 ; <i1> [#uses=1] + %indvar.next79 = add i32 %indvar78, 1 ; <i32> [#uses=1] + br i1 %92, label %bb25, label %bb6 + +bb25: ; preds = %bb24, %bb23, %bb21 + %best_len.1 = phi i32 [ %best_len.0, %bb23 ], [ %best_len.0, %bb24 ], [ %80, %bb21 ] ; <i32> [#uses=2] + %93 = icmp ugt i32 %best_len.1, %32 ; <i1> [#uses=1] + %merge = select i1 %93, i32 %32, i32 %best_len.1 ; <i32> [#uses=1] + ret i32 %merge +} diff --git a/test/CodeGen/ARM/mul_const.ll b/test/CodeGen/ARM/mul_const.ll index 93188cd..8c10246 100644 --- a/test/CodeGen/ARM/mul_const.ll +++ b/test/CodeGen/ARM/mul_const.ll @@ -1,17 +1,43 @@ ; RUN: llc < %s -march=arm | FileCheck %s -define i32 @t1(i32 %v) nounwind readnone { +define i32 @t9(i32 %v) nounwind readnone { entry: -; CHECK: t1: +; CHECK: t9: ; CHECK: add r0, r0, r0, lsl #3 %0 = mul i32 %v, 9 ret i32 %0 } -define i32 @t2(i32 %v) nounwind readnone { +define i32 @t7(i32 %v) nounwind readnone { entry: -; CHECK: t2: +; CHECK: t7: ; CHECK: rsb r0, r0, r0, lsl #3 %0 = mul i32 %v, 7 ret i32 %0 } + +define i32 @t5(i32 %v) nounwind readnone { +entry: +; CHECK: t5: +; CHECK: add r0, r0, r0, lsl #2 + %0 = mul i32 %v, 5 + ret i32 %0 +} + +define i32 @t3(i32 %v) nounwind readnone { +entry: +; CHECK: t3: +; CHECK: add r0, r0, r0, lsl #1 + %0 = mul i32 %v, 3 + ret i32 %0 +} + +define i32 @t12288(i32 %v) nounwind readnone { +entry: +; CHECK: t12288: +; CHECK: add r0, r0, r0, lsl #1 +; CHECK: mov r0, r0, lsl #12 + %0 = mul i32 %v, 12288 + ret i32 %0 +} + diff --git a/test/CodeGen/ARM/reg_sequence.ll b/test/CodeGen/ARM/reg_sequence.ll new file mode 100644 index 0000000..3ba82cc --- /dev/null +++ b/test/CodeGen/ARM/reg_sequence.ll @@ -0,0 +1,348 @@ +; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s +; Implementing vld / vst as REG_SEQUENCE eliminates the extra vmov's. 
+ +%struct.int16x8_t = type { <8 x i16> } +%struct.int32x4_t = type { <4 x i32> } +%struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> } +%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> } +%struct.__neon_int16x8x2_t = type { <8 x i16>, <8 x i16> } +%struct.__neon_int32x4x2_t = type { <4 x i32>, <4 x i32> } + +define arm_apcscc void @t1(i16* %i_ptr, i16* %o_ptr, %struct.int32x4_t* nocapture %vT0ptr, %struct.int32x4_t* nocapture %vT1ptr) nounwind { +entry: +; CHECK: t1: +; CHECK: vld1.16 +; CHECK-NOT: vmov d +; CHECK: vmovl.s16 +; CHECK: vshrn.i32 +; CHECK: vshrn.i32 +; CHECK-NOT: vmov d +; CHECK-NEXT: vst1.16 + %0 = getelementptr inbounds %struct.int32x4_t* %vT0ptr, i32 0, i32 0 ; <<4 x i32>*> [#uses=1] + %1 = load <4 x i32>* %0, align 16 ; <<4 x i32>> [#uses=1] + %2 = getelementptr inbounds %struct.int32x4_t* %vT1ptr, i32 0, i32 0 ; <<4 x i32>*> [#uses=1] + %3 = load <4 x i32>* %2, align 16 ; <<4 x i32>> [#uses=1] + %4 = bitcast i16* %i_ptr to i8* ; <i8*> [#uses=1] + %5 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %4) ; <<8 x i16>> [#uses=1] + %6 = bitcast <8 x i16> %5 to <2 x double> ; <<2 x double>> [#uses=2] + %7 = extractelement <2 x double> %6, i32 0 ; <double> [#uses=1] + %8 = bitcast double %7 to <4 x i16> ; <<4 x i16>> [#uses=1] + %9 = tail call <4 x i32> @llvm.arm.neon.vmovls.v4i32(<4 x i16> %8) ; <<4 x i32>> [#uses=1] + %10 = extractelement <2 x double> %6, i32 1 ; <double> [#uses=1] + %11 = bitcast double %10 to <4 x i16> ; <<4 x i16>> [#uses=1] + %12 = tail call <4 x i32> @llvm.arm.neon.vmovls.v4i32(<4 x i16> %11) ; <<4 x i32>> [#uses=1] + %13 = mul <4 x i32> %1, %9 ; <<4 x i32>> [#uses=1] + %14 = mul <4 x i32> %3, %12 ; <<4 x i32>> [#uses=1] + %15 = tail call <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32> %13, <4 x i32> <i32 -12, i32 -12, i32 -12, i32 -12>) ; <<4 x i16>> [#uses=1] + %16 = tail call <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32> %14, <4 x i32> <i32 -12, i32 -12, i32 -12, i32 -12>) ; <<4 x i16>> [#uses=1] + %17 = shufflevector <4 x i16> %15, <4 x i16> %16, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; <<8 x i16>> [#uses=1] + %18 = bitcast i16* %o_ptr to i8* ; <i8*> [#uses=1] + tail call void @llvm.arm.neon.vst1.v8i16(i8* %18, <8 x i16> %17) + ret void +} + +define arm_apcscc void @t2(i16* %i_ptr, i16* %o_ptr, %struct.int16x8_t* nocapture %vT0ptr, %struct.int16x8_t* nocapture %vT1ptr) nounwind { +entry: +; CHECK: t2: +; CHECK: vld1.16 +; CHECK: vld1.16 +; CHECK-NOT: vmov +; CHECK: vmul.i16 +; CHECK: vmul.i16 +; CHECK-NOT: vmov +; CHECK: vst1.16 +; CHECK: vst1.16 + %0 = getelementptr inbounds %struct.int16x8_t* %vT0ptr, i32 0, i32 0 ; <<8 x i16>*> [#uses=1] + %1 = load <8 x i16>* %0, align 16 ; <<8 x i16>> [#uses=1] + %2 = getelementptr inbounds %struct.int16x8_t* %vT1ptr, i32 0, i32 0 ; <<8 x i16>*> [#uses=1] + %3 = load <8 x i16>* %2, align 16 ; <<8 x i16>> [#uses=1] + %4 = bitcast i16* %i_ptr to i8* ; <i8*> [#uses=1] + %5 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %4) ; <<8 x i16>> [#uses=1] + %6 = getelementptr inbounds i16* %i_ptr, i32 8 ; <i16*> [#uses=1] + %7 = bitcast i16* %6 to i8* ; <i8*> [#uses=1] + %8 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %7) ; <<8 x i16>> [#uses=1] + %9 = mul <8 x i16> %1, %5 ; <<8 x i16>> [#uses=1] + %10 = mul <8 x i16> %3, %8 ; <<8 x i16>> [#uses=1] + %11 = bitcast i16* %o_ptr to i8* ; <i8*> [#uses=1] + tail call void @llvm.arm.neon.vst1.v8i16(i8* %11, <8 x i16> %9) + %12 = getelementptr inbounds i16* %o_ptr, i32 8 ; <i16*> [#uses=1] + %13 = bitcast i16* %12 
to i8* ; <i8*> [#uses=1] + tail call void @llvm.arm.neon.vst1.v8i16(i8* %13, <8 x i16> %10) + ret void +} + +define <8 x i8> @t3(i8* %A, i8* %B) nounwind { +; CHECK: t3: +; CHECK: vld3.8 +; CHECK: vmul.i8 +; CHECK-NOT: vmov +; CHECK: vst3.8 + %tmp1 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A) ; <%struct.__neon_int8x8x3_t> [#uses=2] + %tmp2 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 0 ; <<8 x i8>> [#uses=1] + %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 2 ; <<8 x i8>> [#uses=1] + %tmp4 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 1 ; <<8 x i8>> [#uses=1] + %tmp5 = sub <8 x i8> %tmp3, %tmp4 + %tmp6 = add <8 x i8> %tmp2, %tmp3 ; <<8 x i8>> [#uses=1] + %tmp7 = mul <8 x i8> %tmp4, %tmp2 + tail call void @llvm.arm.neon.vst3.v8i8(i8* %B, <8 x i8> %tmp5, <8 x i8> %tmp6, <8 x i8> %tmp7) + ret <8 x i8> %tmp4 +} + +define arm_apcscc void @t4(i32* %in, i32* %out) nounwind { +entry: +; CHECK: t4: +; CHECK: vld2.32 +; CHECK-NOT: vmov +; CHECK: vld2.32 +; CHECK-NOT: vmov +; CHECK: bne + %tmp1 = bitcast i32* %in to i8* ; <i8*> [#uses=1] + %tmp2 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %tmp1) ; <%struct.__neon_int32x4x2_t> [#uses=2] + %tmp3 = getelementptr inbounds i32* %in, i32 8 ; <i32*> [#uses=1] + %tmp4 = bitcast i32* %tmp3 to i8* ; <i8*> [#uses=1] + %tmp5 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %tmp4) ; <%struct.__neon_int32x4x2_t> [#uses=2] + %tmp8 = bitcast i32* %out to i8* ; <i8*> [#uses=1] + br i1 undef, label %return1, label %return2 + +return1: +; CHECK: %return1 +; CHECK-NOT: vmov +; CHECK-NEXT: vadd.i32 +; CHECK-NEXT: vadd.i32 +; CHECK-NEXT: vst2.32 + %tmp52 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0 ; <<4 x i32>> [#uses=1] + %tmp57 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 1 ; <<4 x i32>> [#uses=1] + %tmp = extractvalue %struct.__neon_int32x4x2_t %tmp5, 0 ; <<4 x i32>> [#uses=1] + %tmp39 = extractvalue %struct.__neon_int32x4x2_t %tmp5, 1 ; <<4 x i32>> [#uses=1] + %tmp6 = add <4 x i32> %tmp52, %tmp ; <<4 x i32>> [#uses=1] + %tmp7 = add <4 x i32> %tmp57, %tmp39 ; <<4 x i32>> [#uses=1] + tail call void @llvm.arm.neon.vst2.v4i32(i8* %tmp8, <4 x i32> %tmp6, <4 x i32> %tmp7) + ret void + +return2: +; CHECK: %return2 +; CHECK: vadd.i32 +; CHECK: vmov q1, q3 +; CHECK-NOT: vmov +; CHECK: vst2.32 {d0, d1, d2, d3} + %tmp100 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0 ; <<4 x i32>> [#uses=1] + %tmp101 = extractvalue %struct.__neon_int32x4x2_t %tmp5, 1 ; <<4 x i32>> [#uses=1] + %tmp102 = add <4 x i32> %tmp100, %tmp101 ; <<4 x i32>> [#uses=1] + tail call void @llvm.arm.neon.vst2.v4i32(i8* %tmp8, <4 x i32> %tmp102, <4 x i32> %tmp101) + call void @llvm.trap() + unreachable +} + +define <8 x i16> @t5(i16* %A, <8 x i16>* %B) nounwind { +; CHECK: t5: +; CHECK: vldmia +; CHECK: vmov q1, q0 +; CHECK-NOT: vmov +; CHECK: vld2.16 {d0[1], d2[1]}, [r0] +; CHECK-NOT: vmov +; CHECK: vadd.i16 + %tmp0 = bitcast i16* %A to i8* ; <i8*> [#uses=1] + %tmp1 = load <8 x i16>* %B ; <<8 x i16>> [#uses=2] + %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1) ; <%struct.__neon_int16x8x2_t> [#uses=2] + %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 0 ; <<8 x i16>> [#uses=1] + %tmp4 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 1 ; <<8 x i16>> [#uses=1] + %tmp5 = add <8 x i16> %tmp3, %tmp4 ; <<8 x i16>> [#uses=1] + ret <8 x i16> %tmp5 +} + +define <8 x i8> @t6(i8* %A, <8 x i8>* %B) nounwind { +; CHECK: t6: +; CHECK: vldr.64 +; CHECK: vmov 
+; CHECK-NEXT: vld2.8 {d0[1], d1[1]}
+  %tmp1 = load <8 x i8>* %B ; <<8 x i8>> [#uses=2]
+  %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1) ; <%struct.__neon_int8x8x2_t> [#uses=2]
+  %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0 ; <<8 x i8>> [#uses=1]
+  %tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1 ; <<8 x i8>> [#uses=1]
+  %tmp5 = add <8 x i8> %tmp3, %tmp4 ; <<8 x i8>> [#uses=1]
+  ret <8 x i8> %tmp5
+}
+
+define arm_apcscc void @t7(i32* %iptr, i32* %optr) nounwind {
+entry:
+; CHECK: t7:
+; CHECK: vld2.32
+; CHECK: vst2.32
+; CHECK: vld1.32 {d0, d1},
+; CHECK: vmov q1, q0
+; CHECK-NOT: vmov
+; CHECK: vuzp.32 q0, q1
+; CHECK: vst1.32
+  %0 = bitcast i32* %iptr to i8* ; <i8*> [#uses=2]
+  %1 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %0) ; <%struct.__neon_int32x4x2_t> [#uses=2]
+  %tmp57 = extractvalue %struct.__neon_int32x4x2_t %1, 0 ; <<4 x i32>> [#uses=1]
+  %tmp60 = extractvalue %struct.__neon_int32x4x2_t %1, 1 ; <<4 x i32>> [#uses=1]
+  %2 = bitcast i32* %optr to i8* ; <i8*> [#uses=2]
+  tail call void @llvm.arm.neon.vst2.v4i32(i8* %2, <4 x i32> %tmp57, <4 x i32> %tmp60)
+  %3 = tail call <4 x i32> @llvm.arm.neon.vld1.v4i32(i8* %0) ; <<4 x i32>> [#uses=1]
+  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2> ; <<4 x i32>> [#uses=1]
+  tail call void @llvm.arm.neon.vst1.v4i32(i8* %2, <4 x i32> %4)
+  ret void
+}
+
+; PR7156
+define arm_aapcs_vfpcc i32 @t8() nounwind {
+; CHECK: t8:
+; CHECK: vrsqrte.f32 q0, q0
+bb.nph55.bb.nph55.split_crit_edge:
+  br label %bb3
+
+bb3: ; preds = %bb3, %bb.nph55.bb.nph55.split_crit_edge
+  br i1 undef, label %bb5, label %bb3
+
+bb5: ; preds = %bb3
+  br label %bb.i25
+
+bb.i25: ; preds = %bb.i25, %bb5
+  %0 = shufflevector <2 x float> undef, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; <<4 x float>> [#uses=1]
+  %1 = call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %0) nounwind ; <<4 x float>> [#uses=1]
+  %2 = fmul <4 x float> %1, undef ; <<4 x float>> [#uses=1]
+  %3 = fmul <4 x float> undef, %2 ; <<4 x float>> [#uses=1]
+  %tmp26.i = bitcast <4 x float> %3 to <2 x double> ; <<2 x double>> [#uses=1]
+  %4 = extractelement <2 x double> %tmp26.i, i32 0 ; <double> [#uses=1]
+  %5 = bitcast double %4 to <2 x float> ; <<2 x float>> [#uses=1]
+  %6 = extractelement <2 x float> %5, i32 1 ; <float> [#uses=1]
+  store float %6, float* undef, align 4
+  br i1 undef, label %bb6, label %bb.i25
+
+bb6: ; preds = %bb.i25
+  br i1 undef, label %bb7, label %bb14
+
+bb7: ; preds = %bb6
+  br label %bb.i49
+
+bb.i49: ; preds = %bb.i49, %bb7
+  br i1 undef, label %bb.i19, label %bb.i49
+
+bb.i19: ; preds = %bb.i19, %bb.i49
+  br i1 undef, label %exit, label %bb.i19
+
+exit: ; preds = %bb.i19
+  unreachable
+
+bb14: ; preds = %bb6
+  ret i32 0
+}
+
+%0 = type { %1, %1, %1, %1 }
+%1 = type { %2 }
+%2 = type { <4 x float> }
+%3 = type { %0, %1 }
+
+; PR7157
+define arm_aapcs_vfpcc float @t9(%0* nocapture, %3* nocapture) nounwind {
+; CHECK: t9:
+; CHECK: vldr.64
+; CHECK: vmov.i8 d1
+; CHECK-NEXT: vstmia r0, {d2,d3}
+; CHECK-NEXT: vstmia r0, {d0,d1}
+  %3 = bitcast double 0.000000e+00 to <2 x float> ; <<2 x float>> [#uses=2]
+  %4 = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; <<4 x float>> [#uses=1]
+  store <4 x float> %4, <4 x float>* undef, align 16
+  %5 = shufflevector <2 x float> %3, <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; <<4 x float>> [#uses=1]
+  store <4 x float> %5, <4 x float>* undef, align 16
+  br label %8
+
+; <label>:6 ; preds = %8
+  br i1 undef, label %7, label %10
+
+; <label>:7 ; preds = %6
+  br label %8
+
+; <label>:8 ; preds = %7, %2
+  br i1 undef, label %6, label %9
+
+; <label>:9 ; preds = %8
+  ret float undef
+
+; <label>:10 ; preds = %6
+  ret float 9.990000e+02
+}
+
+; PR7162
+define arm_aapcs_vfpcc i32 @t10() nounwind {
+entry:
+; CHECK: t10:
+; CHECK: vmov.i32 q1, #0x3F000000
+; CHECK: vdup.32 q0, d0[0]
+; CHECK: vmov d0, d1
+; CHECK: vmla.f32 q0, q0, d0[0]
+  %0 = shufflevector <4 x float> zeroinitializer, <4 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
+  %1 = insertelement <4 x float> %0, float undef, i32 1 ; <<4 x float>> [#uses=1]
+  %2 = insertelement <4 x float> %1, float undef, i32 2 ; <<4 x float>> [#uses=1]
+  %3 = insertelement <4 x float> %2, float undef, i32 3 ; <<4 x float>> [#uses=1]
+  %tmp54.i = bitcast <4 x float> %3 to <2 x double> ; <<2 x double>> [#uses=1]
+  %4 = extractelement <2 x double> %tmp54.i, i32 1 ; <double> [#uses=1]
+  %5 = bitcast double %4 to <2 x float> ; <<2 x float>> [#uses=1]
+  %6 = shufflevector <2 x float> %5, <2 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
+  %7 = fmul <4 x float> undef, %6 ; <<4 x float>> [#uses=1]
+  %8 = fadd <4 x float> %7, undef ; <<4 x float>> [#uses=1]
+  %9 = fadd <4 x float> %8, undef ; <<4 x float>> [#uses=1]
+  %10 = shufflevector <4 x float> undef, <4 x float> %9, <4 x i32> <i32 0, i32 1, i32 2, i32 7> ; <<4 x float>> [#uses=1]
+  %11 = fmul <4 x float> %10, <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> ; <<4 x float>> [#uses=1]
+  %12 = shufflevector <4 x float> %11, <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef> ; <<4 x float>> [#uses=1]
+  %13 = shufflevector <4 x float> %12, <4 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
+  %14 = fmul <4 x float> %13, undef ; <<4 x float>> [#uses=1]
+  %15 = fadd <4 x float> undef, %14 ; <<4 x float>> [#uses=1]
+  %16 = shufflevector <4 x float> undef, <4 x float> %15, <4 x i32> <i32 0, i32 1, i32 6, i32 3> ; <<4 x float>> [#uses=1]
+  %17 = fmul <4 x float> %16, undef ; <<4 x float>> [#uses=1]
+  %18 = extractelement <4 x float> %17, i32 2 ; <float> [#uses=1]
+  store float %18, float* undef, align 4
+  br i1 undef, label %exit, label %bb14
+
+exit: ; preds = %bb.i19
+  unreachable
+
+bb14: ; preds = %bb6
+  ret i32 0
+}
+
+; This test crashes the coalescer because live variables were not updated properly.
+define <8 x i8> @t11(i8* %A1, i8* %A2, i8* %A3, i8* %A4, i8* %A5, i8* %A6, i8* %A7, i8* %A8, i8* %B) nounwind {
+  %tmp1d = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A4) ; <%struct.__neon_int8x8x3_t> [#uses=1]
+  %tmp2d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 0 ; <<8 x i8>> [#uses=1]
+  %tmp1f = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A6) ; <%struct.__neon_int8x8x3_t> [#uses=1]
+  %tmp2f = extractvalue %struct.__neon_int8x8x3_t %tmp1f, 0 ; <<8 x i8>> [#uses=1]
+  %tmp2bd = add <8 x i8> zeroinitializer, %tmp2d ; <<8 x i8>> [#uses=1]
+  %tmp2abcd = mul <8 x i8> zeroinitializer, %tmp2bd ; <<8 x i8>> [#uses=1]
+  %tmp2ef = sub <8 x i8> zeroinitializer, %tmp2f ; <<8 x i8>> [#uses=1]
+  %tmp2efgh = mul <8 x i8> %tmp2ef, undef ; <<8 x i8>> [#uses=2]
+  call void @llvm.arm.neon.vst3.v8i8(i8* %A2, <8 x i8> undef, <8 x i8> undef, <8 x i8> %tmp2efgh)
+  %tmp2 = sub <8 x i8> %tmp2efgh, %tmp2abcd ; <<8 x i8>> [#uses=1]
+  %tmp7 = mul <8 x i8> undef, %tmp2 ; <<8 x i8>> [#uses=1]
+  tail call void @llvm.arm.neon.vst3.v8i8(i8* %B, <8 x i8> undef, <8 x i8> undef, <8 x i8> %tmp7)
+  ret <8 x i8> undef
+}
+
+declare <4 x i32> @llvm.arm.neon.vld1.v4i32(i8*) nounwind readonly
+
+declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*) nounwind readonly
+
+declare <4 x i32> @llvm.arm.neon.vmovls.v4i32(<4 x i16>) nounwind readnone
+
+declare <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare void @llvm.arm.neon.vst1.v4i32(i8*, <4 x i32>) nounwind
+
+declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>) nounwind
+
+declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>) nounwind
+
+declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8*) nounwind readonly
+
+declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8*) nounwind readonly
+
+declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind readonly
+
+declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32) nounwind readonly
+
+declare void @llvm.arm.neon.vst2.v4i32(i8*, <4 x i32>, <4 x i32>) nounwind
+
+declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone
+
+declare void @llvm.trap() nounwind
diff --git a/test/CodeGen/ARM/spill-q.ll b/test/CodeGen/ARM/spill-q.ll
index 5ad7ecc..03de0c8 100644
--- a/test/CodeGen/ARM/spill-q.ll
+++ b/test/CodeGen/ARM/spill-q.ll
@@ -46,7 +46,8 @@ bb4: ; preds = %bb193, %entry
   %20 = shufflevector <2 x float> %19, <2 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
   %21 = fadd <4 x float> zeroinitializer, %20 ; <<4 x float>> [#uses=2]
   %22 = fcmp ogt <4 x float> %besterror.0.2264, %21 ; <<4 x i1>> [#uses=0]
-  br i1 undef, label %bb193, label %bb186
+  %tmp = extractelement <4 x i1> %22, i32 0
+  br i1 %tmp, label %bb193, label %bb186
 
 bb186: ; preds = %bb4
   br label %bb193
diff --git a/test/CodeGen/ARM/trap.ll b/test/CodeGen/ARM/trap.ll
new file mode 100644
index 0000000..763dff3
--- /dev/null
+++ b/test/CodeGen/ARM/trap.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -march=arm | FileCheck %s
+; rdar://7961298
+
+define arm_apcscc void @t() nounwind {
+entry:
+; CHECK: t:
+; CHECK: trap
+  call void @llvm.trap()
+  unreachable
+}
+
+declare void @llvm.trap() nounwind
diff --git a/test/CodeGen/ARM/vcgt.ll b/test/CodeGen/ARM/vcgt.ll
index 6b11ba5..194093c 100644
--- a/test/CodeGen/ARM/vcgt.ll
+++ b/test/CodeGen/ARM/vcgt.ll
@@ -158,5 +158,18 @@ define <4 x i32> @vacgtQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
   ret <4 x i32> %tmp3
 }
 
+; rdar://7923010
+define <4 x i32> @vcgt_zext(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK: vcgt_zext:
+;CHECK: vcgt.f32 q0
+;CHECK: vmov.i32 q1, #0x1
+;CHECK: vand q0, q0, q1
+  %tmp1 = load <4 x float>* %A
+  %tmp2 = load <4 x float>* %B
+  %tmp3 = fcmp ogt <4 x float> %tmp1, %tmp2
+  %tmp4 = zext <4 x i1> %tmp3 to <4 x i32>
+  ret <4 x i32> %tmp4
+}
+
 declare <2 x i32> @llvm.arm.neon.vacgtd(<2 x float>, <2 x float>) nounwind readnone
 declare <4 x i32> @llvm.arm.neon.vacgtq(<4 x float>, <4 x float>) nounwind readnone