diff options
Diffstat (limited to 'test/Transforms/LoopVectorize/ARM')
-rw-r--r--  test/Transforms/LoopVectorize/ARM/arm-unroll.ll    |  32
-rw-r--r--  test/Transforms/LoopVectorize/ARM/gcc-examples.ll  |  60
-rw-r--r--  test/Transforms/LoopVectorize/ARM/lit.local.cfg    |   6
-rw-r--r--  test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll | 114
-rw-r--r--  test/Transforms/LoopVectorize/ARM/width-detect.ll  |  52
5 files changed, 264 insertions, 0 deletions
Note: the line structure of this patch was restored from a whitespace-collapsed
copy. Each file section below carries exactly the number of added lines stated
in its hunk header. Indentation inside added lines is reconstructed, so the
blob ids on the "index" lines may not match the files this patch produces.
Contents: five new files under test/Transforms/LoopVectorize/ARM
(four .ll FileCheck tests and one lit.local.cfg).

diff --git a/test/Transforms/LoopVectorize/ARM/arm-unroll.ll b/test/Transforms/LoopVectorize/ARM/arm-unroll.ll
new file mode 100644
index 0000000..c8d307f
--- /dev/null
+++ b/test/Transforms/LoopVectorize/ARM/arm-unroll.ll
@@ -0,0 +1,32 @@
+; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S | FileCheck %s --check-prefix=SWIFT
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios3.0.0"
+
+;CHECK: @foo
+;CHECK: load <4 x i32>
+;CHECK-NOT: load <4 x i32>
+;CHECK: ret
+;SWIFT: @foo
+;SWIFT: load <4 x i32>
+;SWIFT: load <4 x i32>
+;SWIFT: ret
+define i32 @foo(i32* nocapture %A, i32 %n) nounwind readonly ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %i.02 = phi i32 [ %5, %.lr.ph ], [ 0, %0 ]
+  %sum.01 = phi i32 [ %4, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds i32* %A, i32 %i.02
+  %3 = load i32* %2, align 4
+  %4 = add nsw i32 %3, %sum.01
+  %5 = add nsw i32 %i.02, 1
+  %exitcond = icmp eq i32 %5, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %4, %.lr.ph ]
+  ret i32 %sum.0.lcssa
+}
diff --git a/test/Transforms/LoopVectorize/ARM/gcc-examples.ll b/test/Transforms/LoopVectorize/ARM/gcc-examples.ll
new file mode 100644
index 0000000..6a68e81
--- /dev/null
+++ b/test/Transforms/LoopVectorize/ARM/gcc-examples.ll
@@ -0,0 +1,60 @@
+; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S -dce | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios3.0.0"
+
+@b = common global [2048 x i32] zeroinitializer, align 16
+@c = common global [2048 x i32] zeroinitializer, align 16
+@a = common global [2048 x i32] zeroinitializer, align 16
+
+; Select VF = 8;
+;CHECK: @example1
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example1() nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv
+  %5 = load i32* %4, align 4
+  %6 = add nsw i32 %5, %3
+  %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  store i32 %6, i32* %7, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 256
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8                                       ; preds = %1
+  ret void
+}
+
+;CHECK: @example10b
+;CHECK: load <4 x i16>
+;CHECK: sext <4 x i16>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds i16* %sb, i64 %indvars.iv
+  %3 = load i16* %2, align 2
+  %4 = sext i16 %3 to i32
+  %5 = getelementptr inbounds i32* %ia, i64 %indvars.iv
+  store i32 %4, i32* %5, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %6, label %1
+
+; <label>:6                                       ; preds = %1
+  ret void
+}
+
diff --git a/test/Transforms/LoopVectorize/ARM/lit.local.cfg b/test/Transforms/LoopVectorize/ARM/lit.local.cfg
new file mode 100644
index 0000000..cb77b09
--- /dev/null
+++ b/test/Transforms/LoopVectorize/ARM/lit.local.cfg
@@ -0,0 +1,6 @@
+config.suffixes = ['.ll', '.c', '.cpp']
+
+targets = set(config.root.targets_to_build.split())
+if not 'ARM' in targets:
+    config.unsupported = True
+
diff --git a/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll b/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll
new file mode 100644
index 0000000..d2e3de2
--- /dev/null
+++ b/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll
@@ -0,0 +1,114 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s
+; To see the assembly output: llc -mcpu=cortex-a9 < %s | FileCheck --check-prefix=ASM %s
+; ASM lines below are only for reference, tests on that direction should go to tests/CodeGen/ARM
+
+; ModuleID = 'arm.ll'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
+target triple = "armv7--linux-gnueabihf"
+
+%T216 = type <2 x i16>
+%T232 = type <2 x i32>
+%T264 = type <2 x i64>
+
+%T416 = type <4 x i16>
+%T432 = type <4 x i32>
+%T464 = type <4 x i64>
+
+define void @direct(%T432* %loadaddr, %T432* %loadaddr2, %T432* %storeaddr) {
+; COST: function 'direct':
+  %v0 = load %T432* %loadaddr
+; ASM: vld1.64
+  %v1 = load %T432* %loadaddr2
+; ASM: vld1.64
+  %r3 = mul %T432 %v0, %v1
+; COST: cost of 2 for instruction: {{.*}} mul <4 x i32>
+; ASM: vmul.i32
+  store %T432 %r3, %T432* %storeaddr
+; ASM: vst1.64
+  ret void
+}
+
+define void @ups1632(%T416* %loadaddr, %T416* %loadaddr2, %T432* %storeaddr) {
+; COST: function 'ups1632':
+  %v0 = load %T416* %loadaddr
+; ASM: vldr
+  %v1 = load %T416* %loadaddr2
+; ASM: vldr
+  %r1 = sext %T416 %v0 to %T432
+  %r2 = sext %T416 %v1 to %T432
+; COST: cost of 0 for instruction: {{.*}} sext <4 x i16> {{.*}} to <4 x i32>
+  %r3 = mul %T432 %r1, %r2
+; COST: cost of 2 for instruction: {{.*}} mul <4 x i32>
+; ASM: vmull.s16
+  store %T432 %r3, %T432* %storeaddr
+; ASM: vst1.64
+  ret void
+}
+
+define void @upu1632(%T416* %loadaddr, %T416* %loadaddr2, %T432* %storeaddr) {
+; COST: function 'upu1632':
+  %v0 = load %T416* %loadaddr
+; ASM: vldr
+  %v1 = load %T416* %loadaddr2
+; ASM: vldr
+  %r1 = zext %T416 %v0 to %T432
+  %r2 = zext %T416 %v1 to %T432
+; COST: cost of 0 for instruction: {{.*}} zext <4 x i16> {{.*}} to <4 x i32>
+  %r3 = mul %T432 %r1, %r2
+; COST: cost of 2 for instruction: {{.*}} mul <4 x i32>
+; ASM: vmull.u16
+  store %T432 %r3, %T432* %storeaddr
+; ASM: vst1.64
+  ret void
+}
+
+define void @ups3264(%T232* %loadaddr, %T232* %loadaddr2, %T264* %storeaddr) {
+; COST: function 'ups3264':
+  %v0 = load %T232* %loadaddr
+; ASM: vldr
+  %v1 = load %T232* %loadaddr2
+; ASM: vldr
+  %r3 = mul %T232 %v0, %v1
+; ASM: vmul.i32
+; COST: cost of 1 for instruction: {{.*}} mul <2 x i32>
+  %st = sext %T232 %r3 to %T264
+; ASM: vmovl.s32
+; COST: cost of 1 for instruction: {{.*}} sext <2 x i32> {{.*}} to <2 x i64>
+  store %T264 %st, %T264* %storeaddr
+; ASM: vst1.64
+  ret void
+}
+
+define void @upu3264(%T232* %loadaddr, %T232* %loadaddr2, %T264* %storeaddr) {
+; COST: function 'upu3264':
+  %v0 = load %T232* %loadaddr
+; ASM: vldr
+  %v1 = load %T232* %loadaddr2
+; ASM: vldr
+  %r3 = mul %T232 %v0, %v1
+; ASM: vmul.i32
+; COST: cost of 1 for instruction: {{.*}} mul <2 x i32>
+  %st = zext %T232 %r3 to %T264
+; ASM: vmovl.u32
+; COST: cost of 1 for instruction: {{.*}} zext <2 x i32> {{.*}} to <2 x i64>
+  store %T264 %st, %T264* %storeaddr
+; ASM: vst1.64
+  ret void
+}
+
+define void @dn3216(%T432* %loadaddr, %T432* %loadaddr2, %T416* %storeaddr) {
+; COST: function 'dn3216':
+  %v0 = load %T432* %loadaddr
+; ASM: vld1.64
+  %v1 = load %T432* %loadaddr2
+; ASM: vld1.64
+  %r3 = mul %T432 %v0, %v1
+; ASM: vmul.i32
+; COST: cost of 2 for instruction: {{.*}} mul <4 x i32>
+  %st = trunc %T432 %r3 to %T416
+; ASM: vmovn.i32
+; COST: cost of 1 for instruction: {{.*}} trunc <4 x i32> {{.*}} to <4 x i16>
+  store %T416 %st, %T416* %storeaddr
+; ASM: vstr
+  ret void
+}
diff --git a/test/Transforms/LoopVectorize/ARM/width-detect.ll b/test/Transforms/LoopVectorize/ARM/width-detect.ll
new file mode 100644
index 0000000..c0795b6
--- /dev/null
+++ b/test/Transforms/LoopVectorize/ARM/width-detect.ll
@@ -0,0 +1,52 @@
+; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios3.0.0"
+
+;CHECK:foo_F64
+;CHECK: <2 x double>
+;CHECK:ret
+define double @foo_F64(double* nocapture %A, i32 %n) nounwind uwtable readonly ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %prod.01 = phi double [ %4, %.lr.ph ], [ 0.000000e+00, %0 ]
+  %2 = getelementptr inbounds double* %A, i64 %indvars.iv
+  %3 = load double* %2, align 8
+  %4 = fmul fast double %prod.01, %3
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  %prod.0.lcssa = phi double [ 0.000000e+00, %0 ], [ %4, %.lr.ph ]
+  ret double %prod.0.lcssa
+}
+
+;CHECK:foo_I8
+;CHECK: xor <16 x i8>
+;CHECK:ret
+define signext i8 @foo_I8(i8* nocapture %A, i32 %n) nounwind uwtable readonly ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %red.01 = phi i8 [ %4, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds i8* %A, i64 %indvars.iv
+  %3 = load i8* %2, align 1
+  %4 = xor i8 %3, %red.01
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  %red.0.lcssa = phi i8 [ 0, %0 ], [ %4, %.lr.ph ]
+  ret i8 %red.0.lcssa
+}
+
+