Diffstat (limited to 'test/CodeGen/ARM')
-rw-r--r--  test/CodeGen/ARM/2006-11-10-CycleInDAG.ll | 2
-rw-r--r--  test/CodeGen/ARM/2007-01-19-InfiniteLoop.ll | 2
-rw-r--r--  test/CodeGen/ARM/2007-03-07-CombinerCrash.ll | 2
-rw-r--r--  test/CodeGen/ARM/2007-03-13-InstrSched.ll | 4
-rw-r--r--  test/CodeGen/ARM/2007-03-21-JoinIntervalsCrash.ll | 2
-rw-r--r--  test/CodeGen/ARM/2007-03-26-RegScavengerAssert.ll | 2
-rw-r--r--  test/CodeGen/ARM/2007-03-27-RegScavengerAssert.ll | 2
-rw-r--r--  test/CodeGen/ARM/2007-03-30-RegScavengerAssert.ll | 2
-rw-r--r--  test/CodeGen/ARM/2007-04-02-RegScavengerAssert.ll | 2
-rw-r--r--  test/CodeGen/ARM/2007-04-03-PEIBug.ll | 2
-rw-r--r--  test/CodeGen/ARM/2007-04-03-UndefinedSymbol.ll | 2
-rw-r--r--  test/CodeGen/ARM/2007-04-30-CombinerCrash.ll | 2
-rw-r--r--  test/CodeGen/ARM/2007-05-03-BadPostIndexedLd.ll | 2
-rw-r--r--  test/CodeGen/ARM/2007-05-07-jumptoentry.ll | 2
-rw-r--r--  test/CodeGen/ARM/2007-05-07-tailmerge-1.ll | 8
-rw-r--r--  test/CodeGen/ARM/2007-05-09-tailmerge-2.ll | 8
-rw-r--r--  test/CodeGen/ARM/2007-05-14-InlineAsmCstCrash.ll | 2
-rw-r--r--  test/CodeGen/ARM/2007-05-14-RegScavengerAssert.ll | 2
-rw-r--r--  test/CodeGen/ARM/2007-05-22-tailmerge-3.ll | 16
-rw-r--r--  test/CodeGen/ARM/2007-05-23-BadPreIndexedStore.ll | 2
-rw-r--r--  test/CodeGen/ARM/2007-05-31-RegScavengerInfiniteLoop.ll | 2
-rw-r--r--  test/CodeGen/ARM/2007-08-15-ReuseBug.ll | 2
-rw-r--r--  test/CodeGen/ARM/2008-02-04-LocalRegAllocBug.ll | 2
-rw-r--r--  test/CodeGen/ARM/2008-02-29-RegAllocLocal.ll | 2
-rw-r--r--  test/CodeGen/ARM/2008-03-05-SxtInRegBug.ll | 2
-rw-r--r--  test/CodeGen/ARM/2008-03-07-RegScavengerAssert.ll | 2
-rw-r--r--  test/CodeGen/ARM/2008-04-04-ScavengerAssert.ll | 2
-rw-r--r--  test/CodeGen/ARM/2008-04-10-ScavengerAssert.ll | 2
-rw-r--r--  test/CodeGen/ARM/2008-04-11-PHIofImpDef.ll | 2
-rw-r--r--  test/CodeGen/ARM/2008-05-19-LiveIntervalsBug.ll | 2
-rw-r--r--  test/CodeGen/ARM/2008-05-19-ScavengerAssert.ll | 2
-rw-r--r--  test/CodeGen/ARM/2008-07-17-Fdiv.ll | 2
-rw-r--r--  test/CodeGen/ARM/2008-07-24-CodeGenPrepCrash.ll | 2
-rw-r--r--  test/CodeGen/ARM/2008-08-07-AsmPrintBug.ll | 2
-rw-r--r--  test/CodeGen/ARM/2008-09-14-CoalescerBug.ll | 2
-rw-r--r--  test/CodeGen/ARM/2008-09-17-CoalescerBug.ll | 2
-rw-r--r--  test/CodeGen/ARM/2008-11-18-ScavengerAssert.ll | 2
-rw-r--r--  test/CodeGen/ARM/2008-11-19-ScavengerAssert.ll | 2
-rw-r--r--  test/CodeGen/ARM/2009-02-16-SpillerBug.ll | 2
-rw-r--r--  test/CodeGen/ARM/2009-02-22-SoftenFloatVaArg.ll | 2
-rw-r--r--  test/CodeGen/ARM/2009-02-27-SpillerBug.ll | 2
-rw-r--r--  test/CodeGen/ARM/2009-03-07-SpillerBug.ll | 2
-rw-r--r--  test/CodeGen/ARM/2009-03-09-AddrModeBug.ll | 2
-rw-r--r--  test/CodeGen/ARM/2009-04-06-AsmModifier.ll | 2
-rw-r--r--  test/CodeGen/ARM/2009-04-08-AggregateAddr.ll | 2
-rw-r--r--  test/CodeGen/ARM/2009-04-08-FREM.ll | 2
-rw-r--r--  test/CodeGen/ARM/2009-04-08-FloatUndef.ll | 2
-rw-r--r--  test/CodeGen/ARM/2009-04-09-RegScavengerAsm.ll | 2
-rw-r--r--  test/CodeGen/ARM/2009-05-05-DAGCombineBug.ll | 2
-rw-r--r--  test/CodeGen/ARM/2009-05-07-RegAllocLocal.ll | 2
-rw-r--r--  test/CodeGen/ARM/2009-05-11-CodePlacementCrash.ll | 2
-rw-r--r--  test/CodeGen/ARM/2009-05-18-InlineAsmMem.ll | 4
-rw-r--r--  test/CodeGen/ARM/2009-06-02-ISelCrash.ll | 2
-rw-r--r--  test/CodeGen/ARM/2009-06-04-MissingLiveIn.ll | 2
-rw-r--r--  test/CodeGen/ARM/2009-06-12-RegScavengerAssert.ll | 2
-rw-r--r--  test/CodeGen/ARM/2009-06-15-RegScavengerAssert.ll | 2
-rw-r--r--  test/CodeGen/ARM/2009-06-19-RegScavengerAssert.ll | 2
-rw-r--r--  test/CodeGen/ARM/2009-06-22-CoalescerBug.ll | 2
-rw-r--r--  test/CodeGen/ARM/2009-06-30-RegScavengerAssert.ll | 2
-rw-r--r--  test/CodeGen/ARM/2009-06-30-RegScavengerAssert2.ll | 2
-rw-r--r--  test/CodeGen/ARM/2009-06-30-RegScavengerAssert3.ll | 2
-rw-r--r--  test/CodeGen/ARM/2009-06-30-RegScavengerAssert4.ll | 2
-rw-r--r--  test/CodeGen/ARM/2009-06-30-RegScavengerAssert5.ll | 2
-rw-r--r--  test/CodeGen/ARM/2009-07-01-CommuteBug.ll | 2
-rw-r--r--  test/CodeGen/ARM/2009-07-09-asm-p-constraint.ll | 7
-rw-r--r--  test/CodeGen/ARM/2009-07-18-RewriterBug.ll | 1323
-rw-r--r--  test/CodeGen/ARM/2009-07-22-ScavengerAssert.ll | 94
-rw-r--r--  test/CodeGen/ARM/2009-07-22-SchedulerAssert.ll | 95
-rw-r--r--  test/CodeGen/ARM/2009-07-29-VFP3Registers.ll | 108
-rw-r--r--  test/CodeGen/ARM/2009-08-02-RegScavengerAssert-Neon.ll | 29
-rw-r--r--  test/CodeGen/ARM/2009-08-04-RegScavengerAssert-2.ll | 33
-rw-r--r--  test/CodeGen/ARM/2009-08-04-RegScavengerAssert.ll | 25
-rw-r--r--  test/CodeGen/ARM/2009-08-15-RegScavenger-EarlyClobber.ll | 42
-rw-r--r--  test/CodeGen/ARM/2009-08-15-RegScavengerAssert.ll | 10
-rw-r--r--  test/CodeGen/ARM/2009-08-21-PostRAKill.ll | 40
-rw-r--r--  test/CodeGen/ARM/2009-08-21-PostRAKill2.ll | 38
-rw-r--r--  test/CodeGen/ARM/2009-08-21-PostRAKill3.ll | 31
-rw-r--r--  test/CodeGen/ARM/2009-08-21-PostRAKill4.ll | 26
-rw-r--r--  test/CodeGen/ARM/2009-08-23-linkerprivate.ll | 8
-rw-r--r--  test/CodeGen/ARM/2009-08-26-ScalarToVector.ll | 27
-rw-r--r--  test/CodeGen/ARM/2009-08-27-ScalarToVector.ll | 35
-rw-r--r--  test/CodeGen/ARM/2009-08-29-ExtractEltf32.ll | 25
-rw-r--r--  test/CodeGen/ARM/2009-08-29-TooLongSplat.ll | 23
-rw-r--r--  test/CodeGen/ARM/2009-08-31-LSDA-Name.ll | 103
-rw-r--r--  test/CodeGen/ARM/2009-08-31-TwoRegShuffle.ll | 9
-rw-r--r--  test/CodeGen/ARM/2009-09-01-PostRAProlog.ll | 106
-rw-r--r--  test/CodeGen/ARM/2009-09-09-AllOnes.ll | 10
-rw-r--r--  test/CodeGen/ARM/2009-09-09-fpcmp-ole.ll | 18
-rw-r--r--  test/CodeGen/ARM/2009-09-10-postdec.ll | 11
-rw-r--r--  test/CodeGen/ARM/2009-09-13-InvalidSubreg.ll | 61
-rw-r--r--  test/CodeGen/ARM/2009-09-13-InvalidSuperReg.ll | 41
-rw-r--r--  test/CodeGen/ARM/2009-09-20-LiveIntervalsBug.ll | 34
-rw-r--r--  test/CodeGen/ARM/2009-09-21-LiveVariablesBug.ll | 14
-rw-r--r--  test/CodeGen/ARM/2009-09-22-LiveVariablesBug.ll | 23
-rw-r--r--  test/CodeGen/ARM/2009-09-23-LiveVariablesBug.ll | 21
-rw-r--r--  test/CodeGen/ARM/2009-09-24-spill-align.ll | 17
-rw-r--r--  test/CodeGen/ARM/2009-09-27-CoalescerBug.ll | 24
-rw-r--r--  test/CodeGen/ARM/2009-09-28-LdStOptiBug.ll | 19
-rw-r--r--  test/CodeGen/ARM/addrmode.ll | 2
-rw-r--r--  test/CodeGen/ARM/aliases.ll | 3
-rw-r--r--  test/CodeGen/ARM/align.ll | 8
-rw-r--r--  test/CodeGen/ARM/alloca.ll | 4
-rw-r--r--  test/CodeGen/ARM/argaddr.ll | 2
-rw-r--r--  test/CodeGen/ARM/arguments-nosplit-double.ll | 2
-rw-r--r--  test/CodeGen/ARM/arguments-nosplit-i64.ll | 2
-rw-r--r--  test/CodeGen/ARM/arguments.ll | 4
-rw-r--r--  test/CodeGen/ARM/arguments2.ll | 4
-rw-r--r--  test/CodeGen/ARM/arguments3.ll | 4
-rw-r--r--  test/CodeGen/ARM/arguments4.ll | 4
-rw-r--r--  test/CodeGen/ARM/arguments5.ll | 4
-rw-r--r--  test/CodeGen/ARM/arguments6.ll | 4
-rw-r--r--  test/CodeGen/ARM/arguments7.ll | 4
-rw-r--r--  test/CodeGen/ARM/arguments8.ll | 4
-rw-r--r--  test/CodeGen/ARM/arguments_f64_backfill.ll | 2
-rw-r--r--  test/CodeGen/ARM/arm-asm.ll | 2
-rw-r--r--  test/CodeGen/ARM/arm-frameaddr.ll | 4
-rw-r--r--  test/CodeGen/ARM/arm-negative-stride.ll | 2
-rw-r--r--  test/CodeGen/ARM/bfc.ll | 19
-rw-r--r--  test/CodeGen/ARM/bic.ll | 2
-rw-r--r--  test/CodeGen/ARM/bits.ll | 2
-rw-r--r--  test/CodeGen/ARM/bx_fold.ll | 4
-rw-r--r--  test/CodeGen/ARM/call.ll | 6
-rw-r--r--  test/CodeGen/ARM/call_nolink.ll | 2
-rw-r--r--  test/CodeGen/ARM/carry.ll | 6
-rw-r--r--  test/CodeGen/ARM/clz.ll | 2
-rw-r--r--  test/CodeGen/ARM/compare-call.ll | 2
-rw-r--r--  test/CodeGen/ARM/constants.ll | 14
-rw-r--r--  test/CodeGen/ARM/cse-libcalls.ll | 2
-rw-r--r--  test/CodeGen/ARM/ctors_dtors.ll | 24
-rw-r--r--  test/CodeGen/ARM/div.ll | 2
-rw-r--r--  test/CodeGen/ARM/dyn-stackalloc.ll | 2
-rw-r--r--  test/CodeGen/ARM/extloadi1.ll | 2
-rw-r--r--  test/CodeGen/ARM/fabss.ll | 15
-rw-r--r--  test/CodeGen/ARM/fadds.ll | 12
-rw-r--r--  test/CodeGen/ARM/fcopysign.ll | 4
-rw-r--r--  test/CodeGen/ARM/fdivs.ll | 12
-rw-r--r--  test/CodeGen/ARM/fixunsdfdi.ll | 4
-rw-r--r--  test/CodeGen/ARM/fmacs.ll | 13
-rw-r--r--  test/CodeGen/ARM/fmdrr-fmrrd.ll | 4
-rw-r--r--  test/CodeGen/ARM/fmscs.ll | 13
-rw-r--r--  test/CodeGen/ARM/fmuls.ll | 12
-rw-r--r--  test/CodeGen/ARM/fnegs.ll | 25
-rw-r--r--  test/CodeGen/ARM/fnmacs.ll | 13
-rw-r--r--  test/CodeGen/ARM/fnmscs.ll | 24
-rw-r--r--  test/CodeGen/ARM/fnmul.ll | 4
-rw-r--r--  test/CodeGen/ARM/fnmuls.ll | 23
-rw-r--r--  test/CodeGen/ARM/formal.ll | 2
-rw-r--r--  test/CodeGen/ARM/fp.ll | 38
-rw-r--r--  test/CodeGen/ARM/fp_convert.ll | 49
-rw-r--r--  test/CodeGen/ARM/fparith.ll | 34
-rw-r--r--  test/CodeGen/ARM/fpcmp.ll | 30
-rw-r--r--  test/CodeGen/ARM/fpcmp_ueq.ll | 4
-rw-r--r--  test/CodeGen/ARM/fpconv.ll | 64
-rw-r--r--  test/CodeGen/ARM/fpmem.ll | 6
-rw-r--r--  test/CodeGen/ARM/fpow.ll | 2
-rw-r--r--  test/CodeGen/ARM/fpowi.ll | 2
-rw-r--r--  test/CodeGen/ARM/fptoint.ll | 4
-rw-r--r--  test/CodeGen/ARM/fsubs.ll | 10
-rw-r--r--  test/CodeGen/ARM/hardfloat_neon.ll | 13
-rw-r--r--  test/CodeGen/ARM/hello.ll | 8
-rw-r--r--  test/CodeGen/ARM/hidden-vis-2.ll | 5
-rw-r--r--  test/CodeGen/ARM/hidden-vis-3.ll | 9
-rw-r--r--  test/CodeGen/ARM/hidden-vis.ll | 19
-rw-r--r--  test/CodeGen/ARM/iabs.ll | 2
-rw-r--r--  test/CodeGen/ARM/ifcvt1.ll | 4
-rw-r--r--  test/CodeGen/ARM/ifcvt2.ll | 8
-rw-r--r--  test/CodeGen/ARM/ifcvt3.ll | 6
-rw-r--r--  test/CodeGen/ARM/ifcvt4.ll | 6
-rw-r--r--  test/CodeGen/ARM/ifcvt5.ll | 5
-rw-r--r--  test/CodeGen/ARM/ifcvt6.ll | 8
-rw-r--r--  test/CodeGen/ARM/ifcvt7.ll | 11
-rw-r--r--  test/CodeGen/ARM/ifcvt8.ll | 5
-rw-r--r--  test/CodeGen/ARM/ifcvt9.ll | 2
-rw-r--r--  test/CodeGen/ARM/illegal-vector-bitcast.ll | 3
-rw-r--r--  test/CodeGen/ARM/imm.ll | 2
-rw-r--r--  test/CodeGen/ARM/inlineasm-imm-arm.ll | 2
-rw-r--r--  test/CodeGen/ARM/inlineasm.ll | 2
-rw-r--r--  test/CodeGen/ARM/inlineasm2.ll | 2
-rw-r--r--  test/CodeGen/ARM/insn-sched1.ll | 4
-rw-r--r--  test/CodeGen/ARM/ispositive.ll | 2
-rw-r--r--  test/CodeGen/ARM/large-stack.ll | 2
-rw-r--r--  test/CodeGen/ARM/ldm.ll | 6
-rw-r--r--  test/CodeGen/ARM/ldr.ll | 10
-rw-r--r--  test/CodeGen/ARM/ldr_ext.ll | 31
-rw-r--r--  test/CodeGen/ARM/ldr_frame.ll | 2
-rw-r--r--  test/CodeGen/ARM/ldr_post.ll | 2
-rw-r--r--  test/CodeGen/ARM/ldr_pre.ll | 2
-rw-r--r--  test/CodeGen/ARM/ldrd.ll | 14
-rw-r--r--  test/CodeGen/ARM/load-global.ll | 12
-rw-r--r--  test/CodeGen/ARM/load.ll | 2
-rw-r--r--  test/CodeGen/ARM/long-setcc.ll | 2
-rw-r--r--  test/CodeGen/ARM/long.ll | 16
-rw-r--r--  test/CodeGen/ARM/long_shift.ll | 2
-rw-r--r--  test/CodeGen/ARM/lsr-code-insertion.ll | 4
-rw-r--r--  test/CodeGen/ARM/lsr-scale-addr-mode.ll | 2
-rw-r--r--  test/CodeGen/ARM/mem.ll | 4
-rw-r--r--  test/CodeGen/ARM/memcpy-inline.ll | 8
-rw-r--r--  test/CodeGen/ARM/memfunc.ll | 2
-rw-r--r--  test/CodeGen/ARM/mls.ll | 14
-rw-r--r--  test/CodeGen/ARM/mul.ll | 4
-rw-r--r--  test/CodeGen/ARM/mul_const.ll | 17
-rw-r--r--  test/CodeGen/ARM/mulhi.ll | 6
-rw-r--r--  test/CodeGen/ARM/mvn.ll | 2
-rw-r--r--  test/CodeGen/ARM/neon_arith1.ll | 2
-rw-r--r--  test/CodeGen/ARM/neon_ld1.ll | 6
-rw-r--r--  test/CodeGen/ARM/neon_ld2.ll | 6
-rw-r--r--  test/CodeGen/ARM/pack.ll | 4
-rw-r--r--  test/CodeGen/ARM/pr3502.ll | 2
-rw-r--r--  test/CodeGen/ARM/private.ll | 2
-rw-r--r--  test/CodeGen/ARM/remat.ll | 4
-rw-r--r--  test/CodeGen/ARM/ret0.ll | 2
-rw-r--r--  test/CodeGen/ARM/ret_arg1.ll | 2
-rw-r--r--  test/CodeGen/ARM/ret_arg2.ll | 2
-rw-r--r--  test/CodeGen/ARM/ret_arg3.ll | 2
-rw-r--r--  test/CodeGen/ARM/ret_arg4.ll | 2
-rw-r--r--  test/CodeGen/ARM/ret_arg5.ll | 2
-rw-r--r--  test/CodeGen/ARM/ret_f32_arg2.ll | 2
-rw-r--r--  test/CodeGen/ARM/ret_f32_arg5.ll | 2
-rw-r--r--  test/CodeGen/ARM/ret_f64_arg2.ll | 2
-rw-r--r--  test/CodeGen/ARM/ret_f64_arg_reg_split.ll | 2
-rw-r--r--  test/CodeGen/ARM/ret_f64_arg_split.ll | 2
-rw-r--r--  test/CodeGen/ARM/ret_f64_arg_stack.ll | 2
-rw-r--r--  test/CodeGen/ARM/ret_i128_arg2.ll | 2
-rw-r--r--  test/CodeGen/ARM/ret_i64_arg2.ll | 2
-rw-r--r--  test/CodeGen/ARM/ret_i64_arg3.ll | 2
-rw-r--r--  test/CodeGen/ARM/ret_i64_arg_split.ll | 2
-rw-r--r--  test/CodeGen/ARM/ret_void.ll | 2
-rw-r--r--  test/CodeGen/ARM/rev.ll | 4
-rw-r--r--  test/CodeGen/ARM/sbfx.ll | 37
-rw-r--r--  test/CodeGen/ARM/section.ll | 4
-rw-r--r--  test/CodeGen/ARM/select.ll | 27
-rw-r--r--  test/CodeGen/ARM/select_xform.ll | 2
-rw-r--r--  test/CodeGen/ARM/shifter_operand.ll | 4
-rw-r--r--  test/CodeGen/ARM/smul.ll | 10
-rw-r--r--  test/CodeGen/ARM/spill-q.ll | 57
-rw-r--r--  test/CodeGen/ARM/stack-frame.ll | 4
-rw-r--r--  test/CodeGen/ARM/stm.ll | 2
-rw-r--r--  test/CodeGen/ARM/str_post.ll | 4
-rw-r--r--  test/CodeGen/ARM/str_pre-2.ll | 4
-rw-r--r--  test/CodeGen/ARM/str_pre.ll | 2
-rw-r--r--  test/CodeGen/ARM/str_trunc.ll | 4
-rw-r--r--  test/CodeGen/ARM/sxt_rot.ll | 6
-rw-r--r--  test/CodeGen/ARM/t2-imm.ll | 9
-rw-r--r--  test/CodeGen/ARM/thread_pointer.ll | 2
-rw-r--r--  test/CodeGen/ARM/tls1.ll | 6
-rw-r--r--  test/CodeGen/ARM/tls2.ll | 6
-rw-r--r--  test/CodeGen/ARM/tls3.ll | 2
-rw-r--r--  test/CodeGen/ARM/trunc_ldr.ll | 4
-rw-r--r--  test/CodeGen/ARM/truncstore-dag-combine.ll | 4
-rw-r--r--  test/CodeGen/ARM/tst_teq.ll | 4
-rw-r--r--  test/CodeGen/ARM/uint64tof64.ll | 2
-rw-r--r--  test/CodeGen/ARM/unaligned_load_store.ll | 39
-rw-r--r--  test/CodeGen/ARM/unord.ll | 4
-rw-r--r--  test/CodeGen/ARM/uxt_rot.ll | 6
-rw-r--r--  test/CodeGen/ARM/uxtb.ll | 2
-rw-r--r--  test/CodeGen/ARM/vaba.ll | 100
-rw-r--r--  test/CodeGen/ARM/vabd.ll | 107
-rw-r--r--  test/CodeGen/ARM/vabs.ll | 85
-rw-r--r--  test/CodeGen/ARM/vadd.ll | 213
-rw-r--r--  test/CodeGen/ARM/vargs.ll | 2
-rw-r--r--  test/CodeGen/ARM/vargs_align.ll | 10
-rw-r--r--  test/CodeGen/ARM/vbits.ll | 507
-rw-r--r--  test/CodeGen/ARM/vbsl.ll | 20
-rw-r--r--  test/CodeGen/ARM/vceq.ll | 62
-rw-r--r--  test/CodeGen/ARM/vcge.ll | 126
-rw-r--r--  test/CodeGen/ARM/vcgt.ll | 126
-rw-r--r--  test/CodeGen/ARM/vcnt.ll | 119
-rw-r--r--  test/CodeGen/ARM/vcombine.ll | 36
-rw-r--r--  test/CodeGen/ARM/vcvt.ll | 97
-rw-r--r--  test/CodeGen/ARM/vdup.ll | 143
-rw-r--r--  test/CodeGen/ARM/vext.ll | 56
-rw-r--r--  test/CodeGen/ARM/vfcmp.ll | 101
-rw-r--r--  test/CodeGen/ARM/vfp.ll | 43
-rw-r--r--  test/CodeGen/ARM/vget_lane.ll | 146
-rw-r--r--  test/CodeGen/ARM/vhadd.ll | 156
-rw-r--r--  test/CodeGen/ARM/vhsub.ll | 32
-rw-r--r--  test/CodeGen/ARM/vicmp.ll | 88
-rw-r--r--  test/CodeGen/ARM/vld1.ll | 83
-rw-r--r--  test/CodeGen/ARM/vld2.ll | 113
-rw-r--r--  test/CodeGen/ARM/vld3.ll | 117
-rw-r--r--  test/CodeGen/ARM/vld4.ll | 117
-rw-r--r--  test/CodeGen/ARM/vldlane.ll | 328
-rw-r--r--  test/CodeGen/ARM/vminmax.ll | 293
-rw-r--r--  test/CodeGen/ARM/vmla.ll | 126
-rw-r--r--  test/CodeGen/ARM/vmls.ll | 126
-rw-r--r--  test/CodeGen/ARM/vmov.ll | 214
-rw-r--r--  test/CodeGen/ARM/vmul.ll | 190
-rw-r--r--  test/CodeGen/ARM/vneg.ll | 78
-rw-r--r--  test/CodeGen/ARM/vpadal.ll | 32
-rw-r--r--  test/CodeGen/ARM/vpadd.ll | 142
-rw-r--r--  test/CodeGen/ARM/vpminmax.ll | 147
-rw-r--r--  test/CodeGen/ARM/vqadd.ll | 42
-rw-r--r--  test/CodeGen/ARM/vqdmul.ll | 281
-rw-r--r--  test/CodeGen/ARM/vqshl.ll | 266
-rw-r--r--  test/CodeGen/ARM/vqshrn.ll | 113
-rw-r--r--  test/CodeGen/ARM/vqsub.ll | 42
-rw-r--r--  test/CodeGen/ARM/vrec.ll | 119
-rw-r--r--  test/CodeGen/ARM/vrev.ll | 113
-rw-r--r--  test/CodeGen/ARM/vshift.ll | 145
-rw-r--r--  test/CodeGen/ARM/vshiftins.ll | 42
-rw-r--r--  test/CodeGen/ARM/vshl.ll | 394
-rw-r--r--  test/CodeGen/ARM/vshll.ll | 29
-rw-r--r--  test/CodeGen/ARM/vshrn.ll | 39
-rw-r--r--  test/CodeGen/ARM/vsra.ll | 82
-rw-r--r--  test/CodeGen/ARM/vst1.ll | 93
-rw-r--r--  test/CodeGen/ARM/vst2.ll | 84
-rw-r--r--  test/CodeGen/ARM/vst3.ll | 88
-rw-r--r--  test/CodeGen/ARM/vst4.ll | 88
-rw-r--r--  test/CodeGen/ARM/vstlane.ll | 197
-rw-r--r--  test/CodeGen/ARM/vsub.ll | 213
-rw-r--r--  test/CodeGen/ARM/vtbl.ll | 109
-rw-r--r--  test/CodeGen/ARM/vtrn.ll | 97
-rw-r--r--  test/CodeGen/ARM/vuzp.ll | 75
-rw-r--r--  test/CodeGen/ARM/vzip.ll | 75
-rw-r--r--  test/CodeGen/ARM/weak.ll | 4
-rw-r--r--  test/CodeGen/ARM/weak2.ll | 2
316 files changed, 10336 insertions, 859 deletions
diff --git a/test/CodeGen/ARM/2006-11-10-CycleInDAG.ll b/test/CodeGen/ARM/2006-11-10-CycleInDAG.ll
index caa9a98..a0235f7 100644
--- a/test/CodeGen/ARM/2006-11-10-CycleInDAG.ll
+++ b/test/CodeGen/ARM/2006-11-10-CycleInDAG.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+v6
+; RUN: llc < %s -march=arm -mattr=+v6
%struct.layer_data = type { i32, [2048 x i8], i8*, [16 x i8], i32, i8*, i32, i32, [64 x i32], [64 x i32], [64 x i32], [64 x i32], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [12 x [64 x i16]] }
@ld = external global %struct.layer_data* ; <%struct.layer_data**> [#uses=1]
diff --git a/test/CodeGen/ARM/2007-01-19-InfiniteLoop.ll b/test/CodeGen/ARM/2007-01-19-InfiniteLoop.ll
index 6e11b16..81483cb 100644
--- a/test/CodeGen/ARM/2007-01-19-InfiniteLoop.ll
+++ b/test/CodeGen/ARM/2007-01-19-InfiniteLoop.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+v6,+vfp2
+; RUN: llc < %s -march=arm -mattr=+v6,+vfp2
@quant_coef = external global [6 x [4 x [4 x i32]]] ; <[6 x [4 x [4 x i32]]]*> [#uses=1]
@dequant_coef = external global [6 x [4 x [4 x i32]]] ; <[6 x [4 x [4 x i32]]]*> [#uses=1]
diff --git a/test/CodeGen/ARM/2007-03-07-CombinerCrash.ll b/test/CodeGen/ARM/2007-03-07-CombinerCrash.ll
index 7317e62..83b26d3 100644
--- a/test/CodeGen/ARM/2007-03-07-CombinerCrash.ll
+++ b/test/CodeGen/ARM/2007-03-07-CombinerCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin -mattr=+v6,+vfp2
+; RUN: llc < %s -mtriple=arm-apple-darwin -mattr=+v6,+vfp2
define fastcc i8* @read_sleb128(i8* %p, i32* %val) {
br label %bb
diff --git a/test/CodeGen/ARM/2007-03-13-InstrSched.ll b/test/CodeGen/ARM/2007-03-13-InstrSched.ll
index 07390ad..33f935e 100644
--- a/test/CodeGen/ARM/2007-03-13-InstrSched.ll
+++ b/test/CodeGen/ARM/2007-03-13-InstrSched.ll
@@ -1,6 +1,6 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin -relocation-model=pic \
+; RUN: llc < %s -mtriple=arm-apple-darwin -relocation-model=pic \
; RUN: -mattr=+v6 | grep r9
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin -relocation-model=pic \
+; RUN: llc < %s -mtriple=arm-apple-darwin -relocation-model=pic \
; RUN: -mattr=+v6 -arm-reserve-r9 -ifcvt-limit=0 -stats |& grep asm-printer
; | grep 35
diff --git a/test/CodeGen/ARM/2007-03-21-JoinIntervalsCrash.ll b/test/CodeGen/ARM/2007-03-21-JoinIntervalsCrash.ll
index 32daf83..b0953dc 100644
--- a/test/CodeGen/ARM/2007-03-21-JoinIntervalsCrash.ll
+++ b/test/CodeGen/ARM/2007-03-21-JoinIntervalsCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-linux-gnueabi
+; RUN: llc < %s -mtriple=arm-linux-gnueabi
; PR1257
%struct.CUMULATIVE_ARGS = type { i32, i32, i32, i32, i32, i32 }
diff --git a/test/CodeGen/ARM/2007-03-26-RegScavengerAssert.ll b/test/CodeGen/ARM/2007-03-26-RegScavengerAssert.ll
index 6d3f640..d741112 100644
--- a/test/CodeGen/ARM/2007-03-26-RegScavengerAssert.ll
+++ b/test/CodeGen/ARM/2007-03-26-RegScavengerAssert.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm
+; RUN: llc < %s -march=arm
; PR1266
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
diff --git a/test/CodeGen/ARM/2007-03-27-RegScavengerAssert.ll b/test/CodeGen/ARM/2007-03-27-RegScavengerAssert.ll
index f927ef4..e4635f5 100644
--- a/test/CodeGen/ARM/2007-03-27-RegScavengerAssert.ll
+++ b/test/CodeGen/ARM/2007-03-27-RegScavengerAssert.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mtriple=arm-linux-gnueabi
+; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi
; PR1279
%struct.rtx_def = type { i16, i8, i8, %struct.u }
diff --git a/test/CodeGen/ARM/2007-03-30-RegScavengerAssert.ll b/test/CodeGen/ARM/2007-03-30-RegScavengerAssert.ll
index 55d2993..ea27676 100644
--- a/test/CodeGen/ARM/2007-03-30-RegScavengerAssert.ll
+++ b/test/CodeGen/ARM/2007-03-30-RegScavengerAssert.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mtriple=arm-linux-gnueabi
+; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi
; PR1279
%struct.CUMULATIVE_ARGS = type { i32, i32, i32, i32, i32, i32 }
diff --git a/test/CodeGen/ARM/2007-04-02-RegScavengerAssert.ll b/test/CodeGen/ARM/2007-04-02-RegScavengerAssert.ll
index ef5a1ae..f24def3 100644
--- a/test/CodeGen/ARM/2007-04-02-RegScavengerAssert.ll
+++ b/test/CodeGen/ARM/2007-04-02-RegScavengerAssert.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mtriple=arm-apple-darwin
+; RUN: llc < %s -march=arm -mtriple=arm-apple-darwin
%struct.H_TBL = type { [17 x i8], [256 x i8], i32 }
%struct.Q_TBL = type { [64 x i16], i32 }
diff --git a/test/CodeGen/ARM/2007-04-03-PEIBug.ll b/test/CodeGen/ARM/2007-04-03-PEIBug.ll
index e412127..b543c57 100644
--- a/test/CodeGen/ARM/2007-04-03-PEIBug.ll
+++ b/test/CodeGen/ARM/2007-04-03-PEIBug.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm | not grep {add.*#0}
+; RUN: llc < %s -march=arm | not grep {add.*#0}
define i32 @foo() {
entry:
diff --git a/test/CodeGen/ARM/2007-04-03-UndefinedSymbol.ll b/test/CodeGen/ARM/2007-04-03-UndefinedSymbol.ll
index 42f5034..e001cde 100644
--- a/test/CodeGen/ARM/2007-04-03-UndefinedSymbol.ll
+++ b/test/CodeGen/ARM/2007-04-03-UndefinedSymbol.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin -relocation-model=pic | \
+; RUN: llc < %s -mtriple=arm-apple-darwin -relocation-model=pic | \
; RUN: not grep LPC9
%struct.B = type { i32 }
diff --git a/test/CodeGen/ARM/2007-04-30-CombinerCrash.ll b/test/CodeGen/ARM/2007-04-30-CombinerCrash.ll
index ec70a59..a89e937 100644
--- a/test/CodeGen/ARM/2007-04-30-CombinerCrash.ll
+++ b/test/CodeGen/ARM/2007-04-30-CombinerCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin -mattr=+v6,+vfp2
+; RUN: llc < %s -mtriple=arm-apple-darwin -mattr=+v6,+vfp2
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:64"
target triple = "arm-apple-darwin8"
diff --git a/test/CodeGen/ARM/2007-05-03-BadPostIndexedLd.ll b/test/CodeGen/ARM/2007-05-03-BadPostIndexedLd.ll
index f3f82bc..c73b679 100644
--- a/test/CodeGen/ARM/2007-05-03-BadPostIndexedLd.ll
+++ b/test/CodeGen/ARM/2007-05-03-BadPostIndexedLd.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin
+; RUN: llc < %s -mtriple=arm-apple-darwin
%struct.Connection = type { i32, [10 x i8], i32 }
%struct.IntChunk = type { %struct.cppobjtype, i32, i32*, i32 }
diff --git a/test/CodeGen/ARM/2007-05-07-jumptoentry.ll b/test/CodeGen/ARM/2007-05-07-jumptoentry.ll
index 11431be..26864f1 100644
--- a/test/CodeGen/ARM/2007-05-07-jumptoentry.ll
+++ b/test/CodeGen/ARM/2007-05-07-jumptoentry.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc | not grep 1_0
+; RUN: llc < %s | not grep 1_0
; This used to create an extra branch to 'entry', LBB1_0.
; ModuleID = 'bug.bc'
diff --git a/test/CodeGen/ARM/2007-05-07-tailmerge-1.ll b/test/CodeGen/ARM/2007-05-07-tailmerge-1.ll
index c3596e7..f2a8ee1 100644
--- a/test/CodeGen/ARM/2007-05-07-tailmerge-1.ll
+++ b/test/CodeGen/ARM/2007-05-07-tailmerge-1.ll
@@ -1,7 +1,7 @@
-; RUN: llvm-as < %s | llc -march=arm -enable-tail-merge | grep bl.*baz | count 1
-; RUN: llvm-as < %s | llc -march=arm -enable-tail-merge | grep bl.*quux | count 1
-; RUN: llvm-as < %s | llc -march=arm -enable-tail-merge -enable-eh | grep bl.*baz | count 1
-; RUN: llvm-as < %s | llc -march=arm -enable-tail-merge -enable-eh | grep bl.*quux | count 1
+; RUN: llc < %s -march=arm -enable-tail-merge | grep bl.*baz | count 1
+; RUN: llc < %s -march=arm -enable-tail-merge | grep bl.*quux | count 1
+; RUN: llc < %s -march=arm -enable-tail-merge -enable-eh | grep bl.*baz | count 1
+; RUN: llc < %s -march=arm -enable-tail-merge -enable-eh | grep bl.*quux | count 1
; Check that calls to baz and quux are tail-merged.
; PR1628
diff --git a/test/CodeGen/ARM/2007-05-09-tailmerge-2.ll b/test/CodeGen/ARM/2007-05-09-tailmerge-2.ll
index 41ab1e5..2758505 100644
--- a/test/CodeGen/ARM/2007-05-09-tailmerge-2.ll
+++ b/test/CodeGen/ARM/2007-05-09-tailmerge-2.ll
@@ -1,7 +1,7 @@
-; RUN: llvm-as < %s | llc -march=arm -enable-tail-merge | grep bl.*baz | count 1
-; RUN: llvm-as < %s | llc -march=arm -enable-tail-merge | grep bl.*quux | count 1
-; RUN: llvm-as < %s | llc -march=arm -enable-tail-merge -enable-eh | grep bl.*baz | count 1
-; RUN: llvm-as < %s | llc -march=arm -enable-tail-merge -enable-eh | grep bl.*quux | count 1
+; RUN: llc < %s -march=arm -enable-tail-merge | grep bl.*baz | count 1
+; RUN: llc < %s -march=arm -enable-tail-merge | grep bl.*quux | count 1
+; RUN: llc < %s -march=arm -enable-tail-merge -enable-eh | grep bl.*baz | count 1
+; RUN: llc < %s -march=arm -enable-tail-merge -enable-eh | grep bl.*quux | count 1
; Check that calls to baz and quux are tail-merged.
; PR1628
diff --git a/test/CodeGen/ARM/2007-05-14-InlineAsmCstCrash.ll b/test/CodeGen/ARM/2007-05-14-InlineAsmCstCrash.ll
index 58c5f89..b3b0769 100644
--- a/test/CodeGen/ARM/2007-05-14-InlineAsmCstCrash.ll
+++ b/test/CodeGen/ARM/2007-05-14-InlineAsmCstCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+v6
+; RUN: llc < %s -march=arm -mattr=+v6
define i32 @test3() {
tail call void asm sideeffect "/* number: ${0:c} */", "i"( i32 1 )
diff --git a/test/CodeGen/ARM/2007-05-14-RegScavengerAssert.ll b/test/CodeGen/ARM/2007-05-14-RegScavengerAssert.ll
index 430b368..7b15ded 100644
--- a/test/CodeGen/ARM/2007-05-14-RegScavengerAssert.ll
+++ b/test/CodeGen/ARM/2007-05-14-RegScavengerAssert.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mtriple=arm-linux-gnueabi
+; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi
; PR1406
%struct.AVClass = type { i8*, i8* (i8*)*, %struct.AVOption* }
diff --git a/test/CodeGen/ARM/2007-05-22-tailmerge-3.ll b/test/CodeGen/ARM/2007-05-22-tailmerge-3.ll
index 4c4a933..061bf5e 100644
--- a/test/CodeGen/ARM/2007-05-22-tailmerge-3.ll
+++ b/test/CodeGen/ARM/2007-05-22-tailmerge-3.ll
@@ -1,11 +1,11 @@
-; RUN: llvm-as < %s | llc -march=arm | grep bl.*baz | count 1
-; RUN: llvm-as < %s | llc -march=arm | grep bl.*quux | count 1
-; RUN: llvm-as < %s | llc -march=arm -enable-tail-merge=0 | grep bl.*baz | count 2
-; RUN: llvm-as < %s | llc -march=arm -enable-tail-merge=0 | grep bl.*quux | count 2
-; RUN: llvm-as < %s | llc -march=arm -enable-eh | grep bl.*baz | count 1
-; RUN: llvm-as < %s | llc -march=arm -enable-eh | grep bl.*quux | count 1
-; RUN: llvm-as < %s | llc -march=arm -enable-tail-merge=0 -enable-eh | grep bl.*baz | count 2
-; RUN: llvm-as < %s | llc -march=arm -enable-tail-merge=0 -enable-eh | grep bl.*quux | count 2
+; RUN: llc < %s -march=arm | grep bl.*baz | count 1
+; RUN: llc < %s -march=arm | grep bl.*quux | count 1
+; RUN: llc < %s -march=arm -enable-tail-merge=0 | grep bl.*baz | count 2
+; RUN: llc < %s -march=arm -enable-tail-merge=0 | grep bl.*quux | count 2
+; RUN: llc < %s -march=arm -enable-eh | grep bl.*baz | count 1
+; RUN: llc < %s -march=arm -enable-eh | grep bl.*quux | count 1
+; RUN: llc < %s -march=arm -enable-tail-merge=0 -enable-eh | grep bl.*baz | count 2
+; RUN: llc < %s -march=arm -enable-tail-merge=0 -enable-eh | grep bl.*quux | count 2
; Check that tail merging is the default on ARM, and that -enable-tail-merge=0 works.
; PR1628
diff --git a/test/CodeGen/ARM/2007-05-23-BadPreIndexedStore.ll b/test/CodeGen/ARM/2007-05-23-BadPreIndexedStore.ll
index de32a26..d2eb85d 100644
--- a/test/CodeGen/ARM/2007-05-23-BadPreIndexedStore.ll
+++ b/test/CodeGen/ARM/2007-05-23-BadPreIndexedStore.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm | not grep {str.*\\!}
+; RUN: llc < %s -march=arm | not grep {str.*\\!}
%struct.shape_edge_t = type { %struct.shape_edge_t*, %struct.shape_edge_t*, i32, i32, i32, i32 }
%struct.shape_path_t = type { %struct.shape_edge_t*, %struct.shape_edge_t*, i32, i32, i32, i32, i32, i32 }
diff --git a/test/CodeGen/ARM/2007-05-31-RegScavengerInfiniteLoop.ll b/test/CodeGen/ARM/2007-05-31-RegScavengerInfiniteLoop.ll
index d21a8f2..030486a 100644
--- a/test/CodeGen/ARM/2007-05-31-RegScavengerInfiniteLoop.ll
+++ b/test/CodeGen/ARM/2007-05-31-RegScavengerInfiniteLoop.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc
+; RUN: llc < %s
; PR1424
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
diff --git a/test/CodeGen/ARM/2007-08-15-ReuseBug.ll b/test/CodeGen/ARM/2007-08-15-ReuseBug.ll
index 3cfcdef..30b72e0 100644
--- a/test/CodeGen/ARM/2007-08-15-ReuseBug.ll
+++ b/test/CodeGen/ARM/2007-08-15-ReuseBug.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin -relocation-model=pic -mattr=+v6
+; RUN: llc < %s -mtriple=arm-apple-darwin -relocation-model=pic -mattr=+v6
; PR1609
%struct.FILE = type { i8*, i32, i32, i16, i16, %struct.__sbuf, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.__sbuf, %struct.__sFILEX*, i32, [3 x i8], [1 x i8], %struct.__sbuf, i32, i64 }
diff --git a/test/CodeGen/ARM/2008-02-04-LocalRegAllocBug.ll b/test/CodeGen/ARM/2008-02-04-LocalRegAllocBug.ll
index ec170f8..ff01506 100644
--- a/test/CodeGen/ARM/2008-02-04-LocalRegAllocBug.ll
+++ b/test/CodeGen/ARM/2008-02-04-LocalRegAllocBug.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-linux-gnueabi -regalloc=local
+; RUN: llc < %s -mtriple=arm-linux-gnueabi -regalloc=local
; PR1925
%struct.encode_aux_nearestmatch = type { i32*, i32*, i32*, i32*, i32, i32 }
diff --git a/test/CodeGen/ARM/2008-02-29-RegAllocLocal.ll b/test/CodeGen/ARM/2008-02-29-RegAllocLocal.ll
index b81d575..06bc987 100644
--- a/test/CodeGen/ARM/2008-02-29-RegAllocLocal.ll
+++ b/test/CodeGen/ARM/2008-02-29-RegAllocLocal.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin -regalloc=local
+; RUN: llc < %s -mtriple=arm-apple-darwin -regalloc=local
; PR1925
%"struct.kc::impl_Ccode_option" = type { %"struct.kc::impl_abstract_phylum" }
diff --git a/test/CodeGen/ARM/2008-03-05-SxtInRegBug.ll b/test/CodeGen/ARM/2008-03-05-SxtInRegBug.ll
index ca34275..a604c5c 100644
--- a/test/CodeGen/ARM/2008-03-05-SxtInRegBug.ll
+++ b/test/CodeGen/ARM/2008-03-05-SxtInRegBug.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+v6 | not grep 255
+; RUN: llc < %s -march=arm -mattr=+v6 | not grep 255
define i32 @main(i32 %argc, i8** %argv) {
entry:
diff --git a/test/CodeGen/ARM/2008-03-07-RegScavengerAssert.ll b/test/CodeGen/ARM/2008-03-07-RegScavengerAssert.ll
index 70f1774..78c6222 100644
--- a/test/CodeGen/ARM/2008-03-07-RegScavengerAssert.ll
+++ b/test/CodeGen/ARM/2008-03-07-RegScavengerAssert.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin -mattr=+v6,+vfp2
+; RUN: llc < %s -mtriple=arm-apple-darwin -mattr=+v6,+vfp2
@accum = external global { double, double } ; <{ double, double }*> [#uses=1]
@.str = external constant [4 x i8] ; <[4 x i8]*> [#uses=1]
diff --git a/test/CodeGen/ARM/2008-04-04-ScavengerAssert.ll b/test/CodeGen/ARM/2008-04-04-ScavengerAssert.ll
index 610f5ea..234c7b6 100644
--- a/test/CodeGen/ARM/2008-04-04-ScavengerAssert.ll
+++ b/test/CodeGen/ARM/2008-04-04-ScavengerAssert.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin
+; RUN: llc < %s -mtriple=arm-apple-darwin
@numBinsY = external global i32 ; <i32*> [#uses=1]
diff --git a/test/CodeGen/ARM/2008-04-10-ScavengerAssert.ll b/test/CodeGen/ARM/2008-04-10-ScavengerAssert.ll
index 80ccddf..77418be 100644
--- a/test/CodeGen/ARM/2008-04-10-ScavengerAssert.ll
+++ b/test/CodeGen/ARM/2008-04-10-ScavengerAssert.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin
+; RUN: llc < %s -mtriple=arm-apple-darwin
%struct.CONTENTBOX = type { i32, i32, i32, i32, i32 }
%struct.FILE = type { i8*, i32, i32, i16, i16, %struct.__sbuf, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.__sbuf, %struct.__sFILEX*, i32, [3 x i8], [1 x i8], %struct.__sbuf, i32, i64 }
diff --git a/test/CodeGen/ARM/2008-04-11-PHIofImpDef.ll b/test/CodeGen/ARM/2008-04-11-PHIofImpDef.ll
index 3cd757f..33bd4de 100644
--- a/test/CodeGen/ARM/2008-04-11-PHIofImpDef.ll
+++ b/test/CodeGen/ARM/2008-04-11-PHIofImpDef.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin
+; RUN: llc < %s -mtriple=arm-apple-darwin
declare void @foo(i8*, i8*, i32, i32, i32, i32, i32, i32, i32)
diff --git a/test/CodeGen/ARM/2008-05-19-LiveIntervalsBug.ll b/test/CodeGen/ARM/2008-05-19-LiveIntervalsBug.ll
index 035af08..71aa603 100644
--- a/test/CodeGen/ARM/2008-05-19-LiveIntervalsBug.ll
+++ b/test/CodeGen/ARM/2008-05-19-LiveIntervalsBug.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin
+; RUN: llc < %s -mtriple=arm-apple-darwin
%struct.BiContextType = type { i16, i8, i32 }
%struct.Bitstream = type { i32, i32, i8, i32, i32, i8, i8, i32, i32, i8*, i32 }
diff --git a/test/CodeGen/ARM/2008-05-19-ScavengerAssert.ll b/test/CodeGen/ARM/2008-05-19-ScavengerAssert.ll
index e98126b..aa61d86 100644
--- a/test/CodeGen/ARM/2008-05-19-ScavengerAssert.ll
+++ b/test/CodeGen/ARM/2008-05-19-ScavengerAssert.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin
+; RUN: llc < %s -mtriple=arm-apple-darwin
%struct.Decoders = type { i32**, i16***, i16****, i16***, i16**, i8**, i8** }
@decoders = external global %struct.Decoders ; <%struct.Decoders*> [#uses=1]
diff --git a/test/CodeGen/ARM/2008-07-17-Fdiv.ll b/test/CodeGen/ARM/2008-07-17-Fdiv.ll
index aa75970..4cb768e 100644
--- a/test/CodeGen/ARM/2008-07-17-Fdiv.ll
+++ b/test/CodeGen/ARM/2008-07-17-Fdiv.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm
+; RUN: llc < %s -march=arm
define float @f(float %a, float %b) nounwind {
%tmp = fdiv float %a, %b
diff --git a/test/CodeGen/ARM/2008-07-24-CodeGenPrepCrash.ll b/test/CodeGen/ARM/2008-07-24-CodeGenPrepCrash.ll
index 6ea75eb..83fde07 100644
--- a/test/CodeGen/ARM/2008-07-24-CodeGenPrepCrash.ll
+++ b/test/CodeGen/ARM/2008-07-24-CodeGenPrepCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm
+; RUN: llc < %s -march=arm
; PR2589
define void @main({ i32 }*) {
diff --git a/test/CodeGen/ARM/2008-08-07-AsmPrintBug.ll b/test/CodeGen/ARM/2008-08-07-AsmPrintBug.ll
index 0a79e86..adb0112 100644
--- a/test/CodeGen/ARM/2008-08-07-AsmPrintBug.ll
+++ b/test/CodeGen/ARM/2008-08-07-AsmPrintBug.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin -mattr=+v6 -relocation-model=pic | grep comm
+; RUN: llc < %s -mtriple=arm-apple-darwin -mattr=+v6 -relocation-model=pic | grep comm
%struct.FILE = type { i8*, i32, i32, i16, i16, %struct.__sbuf, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.__sbuf, %struct.__sFILEX*, i32, [3 x i8], [1 x i8], %struct.__sbuf, i32, i64 }
%struct.__gcov_var = type { %struct.FILE*, i32, i32, i32, i32, i32, i32, [1025 x i32] }
diff --git a/test/CodeGen/ARM/2008-09-14-CoalescerBug.ll b/test/CodeGen/ARM/2008-09-14-CoalescerBug.ll
index c601b90..5f9d9ae 100644
--- a/test/CodeGen/ARM/2008-09-14-CoalescerBug.ll
+++ b/test/CodeGen/ARM/2008-09-14-CoalescerBug.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin
+; RUN: llc < %s -mtriple=arm-apple-darwin
@"\01LC1" = external constant [288 x i8] ; <[288 x i8]*> [#uses=1]
diff --git a/test/CodeGen/ARM/2008-09-17-CoalescerBug.ll b/test/CodeGen/ARM/2008-09-17-CoalescerBug.ll
index b3ea6fc..d3bc3e1 100644
--- a/test/CodeGen/ARM/2008-09-17-CoalescerBug.ll
+++ b/test/CodeGen/ARM/2008-09-17-CoalescerBug.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin
+; RUN: llc < %s -mtriple=arm-apple-darwin
define void @gcov_exit() nounwind {
entry:
diff --git a/test/CodeGen/ARM/2008-11-18-ScavengerAssert.ll b/test/CodeGen/ARM/2008-11-18-ScavengerAssert.ll
index 164e964..601a516 100644
--- a/test/CodeGen/ARM/2008-11-18-ScavengerAssert.ll
+++ b/test/CodeGen/ARM/2008-11-18-ScavengerAssert.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+v6,+vfp2
+; RUN: llc < %s -march=arm -mattr=+v6,+vfp2
define hidden i64 @__muldi3(i64 %u, i64 %v) nounwind {
entry:
diff --git a/test/CodeGen/ARM/2008-11-19-ScavengerAssert.ll b/test/CodeGen/ARM/2008-11-19-ScavengerAssert.ll
index 3f17a51..35ca7b4 100644
--- a/test/CodeGen/ARM/2008-11-19-ScavengerAssert.ll
+++ b/test/CodeGen/ARM/2008-11-19-ScavengerAssert.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin9 -stats |& grep asm-printer | grep 164
+; RUN: llc < %s -mtriple=arm-apple-darwin9 -stats |& grep asm-printer | grep 154
%"struct.Adv5::Ekin<3>" = type <{ i8 }>
%"struct.Adv5::X::Energyflux<3>" = type { double }
diff --git a/test/CodeGen/ARM/2009-02-16-SpillerBug.ll b/test/CodeGen/ARM/2009-02-16-SpillerBug.ll
index 48e663d..4c0c59c 100644
--- a/test/CodeGen/ARM/2009-02-16-SpillerBug.ll
+++ b/test/CodeGen/ARM/2009-02-16-SpillerBug.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+v6,+vfp2
+; RUN: llc < %s -march=arm -mattr=+v6,+vfp2
target triple = "arm-apple-darwin9"
%struct.FILE_POS = type { i8, i8, i16, i32 }
diff --git a/test/CodeGen/ARM/2009-02-22-SoftenFloatVaArg.ll b/test/CodeGen/ARM/2009-02-22-SoftenFloatVaArg.ll
index d7befa0..a48f003 100644
--- a/test/CodeGen/ARM/2009-02-22-SoftenFloatVaArg.ll
+++ b/test/CodeGen/ARM/2009-02-22-SoftenFloatVaArg.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc
+; RUN: llc < %s
; PR3610
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-s0:0:64-f80:32:32"
target triple = "arm-elf"
diff --git a/test/CodeGen/ARM/2009-02-27-SpillerBug.ll b/test/CodeGen/ARM/2009-02-27-SpillerBug.ll
index bd5b719..bc5e602 100644
--- a/test/CodeGen/ARM/2009-02-27-SpillerBug.ll
+++ b/test/CodeGen/ARM/2009-02-27-SpillerBug.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+v6,+vfp2
+; RUN: llc < %s -march=arm -mattr=+v6,+vfp2
target triple = "arm-apple-darwin9"
@a = external global double ; <double*> [#uses=1]
diff --git a/test/CodeGen/ARM/2009-03-07-SpillerBug.ll b/test/CodeGen/ARM/2009-03-07-SpillerBug.ll
index 399ed30..0ec17ae 100644
--- a/test/CodeGen/ARM/2009-03-07-SpillerBug.ll
+++ b/test/CodeGen/ARM/2009-03-07-SpillerBug.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=armv6-apple-darwin9 -mattr=+vfp2
+; RUN: llc < %s -mtriple=armv6-apple-darwin9 -mattr=+vfp2
; rdar://6653182
%struct.ggBRDF = type { i32 (...)** }
diff --git a/test/CodeGen/ARM/2009-03-09-AddrModeBug.ll b/test/CodeGen/ARM/2009-03-09-AddrModeBug.ll
index 0ec6d7d..a1ce384 100644
--- a/test/CodeGen/ARM/2009-03-09-AddrModeBug.ll
+++ b/test/CodeGen/ARM/2009-03-09-AddrModeBug.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm
+; RUN: llc < %s -march=arm
%struct.hit_t = type { %struct.v_t, double }
%struct.node_t = type { %struct.hit_t, %struct.hit_t, i32 }
diff --git a/test/CodeGen/ARM/2009-04-06-AsmModifier.ll b/test/CodeGen/ARM/2009-04-06-AsmModifier.ll
index 11c05c6..3526722 100644
--- a/test/CodeGen/ARM/2009-04-06-AsmModifier.ll
+++ b/test/CodeGen/ARM/2009-04-06-AsmModifier.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm | grep {swi 107}
+; RUN: llc < %s -march=arm | grep {swi 107}
define i32 @_swilseek(i32) nounwind {
entry:
diff --git a/test/CodeGen/ARM/2009-04-08-AggregateAddr.ll b/test/CodeGen/ARM/2009-04-08-AggregateAddr.ll
index c00b1fb..f6b3d2c 100644
--- a/test/CodeGen/ARM/2009-04-08-AggregateAddr.ll
+++ b/test/CodeGen/ARM/2009-04-08-AggregateAddr.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm
+; RUN: llc < %s -march=arm
; PR3795
define fastcc void @_D3foo3fooFAriZv({ i32, { double, double }* } %d_arg, i32 %x_arg) {
diff --git a/test/CodeGen/ARM/2009-04-08-FREM.ll b/test/CodeGen/ARM/2009-04-08-FREM.ll
index c7e343c..99907fc 100644
--- a/test/CodeGen/ARM/2009-04-08-FREM.ll
+++ b/test/CodeGen/ARM/2009-04-08-FREM.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm
+; RUN: llc < %s -march=arm
declare i32 @printf(i8*, ...)
diff --git a/test/CodeGen/ARM/2009-04-08-FloatUndef.ll b/test/CodeGen/ARM/2009-04-08-FloatUndef.ll
index f394847..05d2f26 100644
--- a/test/CodeGen/ARM/2009-04-08-FloatUndef.ll
+++ b/test/CodeGen/ARM/2009-04-08-FloatUndef.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm
+; RUN: llc < %s -march=arm
define void @execute_shader(<4 x float>* %OUT, <4 x float>* %IN, <4 x float>* %CONST) {
entry:
diff --git a/test/CodeGen/ARM/2009-04-09-RegScavengerAsm.ll b/test/CodeGen/ARM/2009-04-09-RegScavengerAsm.ll
index 223fa0f..deb092b 100644
--- a/test/CodeGen/ARM/2009-04-09-RegScavengerAsm.ll
+++ b/test/CodeGen/ARM/2009-04-09-RegScavengerAsm.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm
+; RUN: llc < %s -march=arm
; PR3954
define void @foo(...) nounwind {
diff --git a/test/CodeGen/ARM/2009-05-05-DAGCombineBug.ll b/test/CodeGen/ARM/2009-05-05-DAGCombineBug.ll
index 2bca6e6..670d204 100644
--- a/test/CodeGen/ARM/2009-05-05-DAGCombineBug.ll
+++ b/test/CodeGen/ARM/2009-05-05-DAGCombineBug.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-linuxeabi-unknown-gnu -mattr=+v6
+; RUN: llc < %s -mtriple=arm-linuxeabi-unknown-gnu -mattr=+v6
; PR4166
%"byte[]" = type { i32, i8* }
diff --git a/test/CodeGen/ARM/2009-05-07-RegAllocLocal.ll b/test/CodeGen/ARM/2009-05-07-RegAllocLocal.ll
index d03b7ce..75610ff 100644
--- a/test/CodeGen/ARM/2009-05-07-RegAllocLocal.ll
+++ b/test/CodeGen/ARM/2009-05-07-RegAllocLocal.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=armv5-unknown-linux-gnueabi -O0 -regalloc=local
+; RUN: llc < %s -mtriple=armv5-unknown-linux-gnueabi -O0 -regalloc=local
; PR4100
@.str = external constant [30 x i8] ; <[30 x i8]*> [#uses=1]
diff --git a/test/CodeGen/ARM/2009-05-11-CodePlacementCrash.ll b/test/CodeGen/ARM/2009-05-11-CodePlacementCrash.ll
index 35d4306..7046fcc 100644
--- a/test/CodeGen/ARM/2009-05-11-CodePlacementCrash.ll
+++ b/test/CodeGen/ARM/2009-05-11-CodePlacementCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm
+; RUN: llc < %s -march=arm
%struct.List = type { %struct.List*, i32 }
@Node5 = external constant %struct.List ; <%struct.List*> [#uses=1]
@"\01LC" = external constant [7 x i8] ; <[7 x i8]*> [#uses=1]
diff --git a/test/CodeGen/ARM/2009-05-18-InlineAsmMem.ll b/test/CodeGen/ARM/2009-05-18-InlineAsmMem.ll
index f942c9f..1e2707f 100644
--- a/test/CodeGen/ARM/2009-05-18-InlineAsmMem.ll
+++ b/test/CodeGen/ARM/2009-05-18-InlineAsmMem.ll
@@ -1,7 +1,9 @@
-; RUN: llvm-as < %s | llc -march=arm | grep swp
+; RUN: llc < %s -march=arm | FileCheck %s
+; RUN: llc < %s -march=thumb | FileCheck %s
; PR4091
define void @foo(i32 %i, i32* %p) nounwind {
+;CHECK: swp r2, r0, [r1]
%asmtmp = call i32 asm sideeffect "swp $0, $2, $3", "=&r,=*m,r,*m,~{memory}"(i32* %p, i32 %i, i32* %p) nounwind
ret void
}
diff --git a/test/CodeGen/ARM/2009-06-02-ISelCrash.ll b/test/CodeGen/ARM/2009-06-02-ISelCrash.ll
index 7cd35b9..403e3f6 100644
--- a/test/CodeGen/ARM/2009-06-02-ISelCrash.ll
+++ b/test/CodeGen/ARM/2009-06-02-ISelCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin -relocation-model=pic -mattr=+v6,+vfp2
+; RUN: llc < %s -mtriple=arm-apple-darwin -relocation-model=pic -mattr=+v6,+vfp2
@"\01LC" = external constant [15 x i8] ; <[15 x i8]*> [#uses=1]
diff --git a/test/CodeGen/ARM/2009-06-04-MissingLiveIn.ll b/test/CodeGen/ARM/2009-06-04-MissingLiveIn.ll
index 5eaae7a..98e0023 100644
--- a/test/CodeGen/ARM/2009-06-04-MissingLiveIn.ll
+++ b/test/CodeGen/ARM/2009-06-04-MissingLiveIn.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin -mattr=+v6
+; RUN: llc < %s -mtriple=arm-apple-darwin -mattr=+v6
%struct.anon = type { i16, i16 }
%struct.cab_archive = type { i32, i16, i16, i16, i16, i8, %struct.cab_folder*, %struct.cab_file* }
diff --git a/test/CodeGen/ARM/2009-06-12-RegScavengerAssert.ll b/test/CodeGen/ARM/2009-06-12-RegScavengerAssert.ll
index 45b4bd4..27888d7 100644
--- a/test/CodeGen/ARM/2009-06-12-RegScavengerAssert.ll
+++ b/test/CodeGen/ARM/2009-06-12-RegScavengerAssert.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=armv6-apple-darwin
+; RUN: llc < %s -mtriple=armv6-apple-darwin
type { i32, i32, %struct.D_Sym**, [3 x %struct.D_Sym*] } ; type %0
type { i32, %struct.D_Reduction** } ; type %1
diff --git a/test/CodeGen/ARM/2009-06-15-RegScavengerAssert.ll b/test/CodeGen/ARM/2009-06-15-RegScavengerAssert.ll
index c715a18..a0f903b 100644
--- a/test/CodeGen/ARM/2009-06-15-RegScavengerAssert.ll
+++ b/test/CodeGen/ARM/2009-06-15-RegScavengerAssert.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=armv6-apple-darwin
+; RUN: llc < %s -mtriple=armv6-apple-darwin
%struct.term = type { i32, i32, i32 }
diff --git a/test/CodeGen/ARM/2009-06-19-RegScavengerAssert.ll b/test/CodeGen/ARM/2009-06-19-RegScavengerAssert.ll
index cbe2385..b56b6844 100644
--- a/test/CodeGen/ARM/2009-06-19-RegScavengerAssert.ll
+++ b/test/CodeGen/ARM/2009-06-19-RegScavengerAssert.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=armv6-eabi -mattr=+vfp2 -float-abi=hard
+; RUN: llc < %s -mtriple=armv6-eabi -mattr=+vfp2 -float-abi=hard
; PR4419
define float @__ieee754_acosf(float %x) nounwind {
diff --git a/test/CodeGen/ARM/2009-06-22-CoalescerBug.ll b/test/CodeGen/ARM/2009-06-22-CoalescerBug.ll
index 5c8d7b0..e068be7 100644
--- a/test/CodeGen/ARM/2009-06-22-CoalescerBug.ll
+++ b/test/CodeGen/ARM/2009-06-22-CoalescerBug.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=armv6-apple-darwin
+; RUN: llc < %s -mtriple=armv6-apple-darwin
%struct.rtunion = type { i64 }
%struct.rtx_def = type { i16, i8, i8, [1 x %struct.rtunion] }
diff --git a/test/CodeGen/ARM/2009-06-30-RegScavengerAssert.ll b/test/CodeGen/ARM/2009-06-30-RegScavengerAssert.ll
index 27cad7c..17efe00 100644
--- a/test/CodeGen/ARM/2009-06-30-RegScavengerAssert.ll
+++ b/test/CodeGen/ARM/2009-06-30-RegScavengerAssert.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mtriple=armv6-apple-darwin9
+; RUN: llc < %s -march=arm -mtriple=armv6-apple-darwin9
@nn = external global i32 ; <i32*> [#uses=1]
@al_len = external global i32 ; <i32*> [#uses=2]
diff --git a/test/CodeGen/ARM/2009-06-30-RegScavengerAssert2.ll b/test/CodeGen/ARM/2009-06-30-RegScavengerAssert2.ll
index 3a14d67..f520be3 100644
--- a/test/CodeGen/ARM/2009-06-30-RegScavengerAssert2.ll
+++ b/test/CodeGen/ARM/2009-06-30-RegScavengerAssert2.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mtriple=armv6-apple-darwin9
+; RUN: llc < %s -march=arm -mtriple=armv6-apple-darwin9
@no_mat = external global i32 ; <i32*> [#uses=1]
@no_mis = external global i32 ; <i32*> [#uses=2]
diff --git a/test/CodeGen/ARM/2009-06-30-RegScavengerAssert3.ll b/test/CodeGen/ARM/2009-06-30-RegScavengerAssert3.ll
index f94b59d..eee6ff9 100644
--- a/test/CodeGen/ARM/2009-06-30-RegScavengerAssert3.ll
+++ b/test/CodeGen/ARM/2009-06-30-RegScavengerAssert3.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mtriple=armv6-apple-darwin9
+; RUN: llc < %s -march=arm -mtriple=armv6-apple-darwin9
@JJ = external global i32* ; <i32**> [#uses=1]
diff --git a/test/CodeGen/ARM/2009-06-30-RegScavengerAssert4.ll b/test/CodeGen/ARM/2009-06-30-RegScavengerAssert4.ll
index bca7f79..93c92b1 100644
--- a/test/CodeGen/ARM/2009-06-30-RegScavengerAssert4.ll
+++ b/test/CodeGen/ARM/2009-06-30-RegScavengerAssert4.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mtriple=armv6-apple-darwin9
+; RUN: llc < %s -march=arm -mtriple=armv6-apple-darwin9
@r = external global i32 ; <i32*> [#uses=1]
@qr = external global i32 ; <i32*> [#uses=1]
diff --git a/test/CodeGen/ARM/2009-06-30-RegScavengerAssert5.ll b/test/CodeGen/ARM/2009-06-30-RegScavengerAssert5.ll
index 0c90592..277283d 100644
--- a/test/CodeGen/ARM/2009-06-30-RegScavengerAssert5.ll
+++ b/test/CodeGen/ARM/2009-06-30-RegScavengerAssert5.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mtriple=armv6-apple-darwin9
+; RUN: llc < %s -march=arm -mtriple=armv6-apple-darwin9
@XX = external global i32* ; <i32**> [#uses=1]
diff --git a/test/CodeGen/ARM/2009-07-01-CommuteBug.ll b/test/CodeGen/ARM/2009-07-01-CommuteBug.ll
index dfccefc..5c0e5fa 100644
--- a/test/CodeGen/ARM/2009-07-01-CommuteBug.ll
+++ b/test/CodeGen/ARM/2009-07-01-CommuteBug.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mtriple=armv6-apple-darwin9
+; RUN: llc < %s -march=arm -mtriple=armv6-apple-darwin9
@qr = external global i32 ; <i32*> [#uses=1]
@II = external global i32* ; <i32**> [#uses=1]
diff --git a/test/CodeGen/ARM/2009-07-09-asm-p-constraint.ll b/test/CodeGen/ARM/2009-07-09-asm-p-constraint.ll
new file mode 100644
index 0000000..e1e94b6
--- /dev/null
+++ b/test/CodeGen/ARM/2009-07-09-asm-p-constraint.ll
@@ -0,0 +1,7 @@
+; RUN: llc < %s -march=arm -mattr=+v6
+
+define void @test(i8* %x) nounwind {
+entry:
+ call void asm sideeffect "pld\09${0:a}", "r,~{cc}"(i8* %x) nounwind
+ ret void
+}
diff --git a/test/CodeGen/ARM/2009-07-18-RewriterBug.ll b/test/CodeGen/ARM/2009-07-18-RewriterBug.ll
new file mode 100644
index 0000000..ee93fde
--- /dev/null
+++ b/test/CodeGen/ARM/2009-07-18-RewriterBug.ll
@@ -0,0 +1,1323 @@
+; RUN: llc < %s -mtriple=armv6-apple-darwin10 -mattr=+vfp2 | grep fcmpezd | count 13
+
+ %struct.EDGE_PAIR = type { %struct.edge_rec*, %struct.edge_rec* }
+ %struct.VEC2 = type { double, double, double }
+ %struct.VERTEX = type { %struct.VEC2, %struct.VERTEX*, %struct.VERTEX* }
+ %struct.edge_rec = type { %struct.VERTEX*, %struct.edge_rec*, i32, i8* }
+@avail_edge = internal global %struct.edge_rec* null ; <%struct.edge_rec**> [#uses=6]
+@_2E_str7 = internal constant [21 x i8] c"ERROR: Only 1 point!\00", section "__TEXT,__cstring,cstring_literals", align 1 ; <[21 x i8]*> [#uses=1]
+@llvm.used = appending global [1 x i8*] [i8* bitcast (void (%struct.EDGE_PAIR*, %struct.VERTEX*, %struct.VERTEX*)* @build_delaunay to i8*)], section "llvm.metadata" ; <[1 x i8*]*> [#uses=0]
+
+define arm_apcscc void @build_delaunay(%struct.EDGE_PAIR* noalias nocapture sret %agg.result, %struct.VERTEX* %tree, %struct.VERTEX* %extra) nounwind {
+entry:
+ %delright = alloca %struct.EDGE_PAIR, align 8 ; <%struct.EDGE_PAIR*> [#uses=3]
+ %delleft = alloca %struct.EDGE_PAIR, align 8 ; <%struct.EDGE_PAIR*> [#uses=3]
+ %0 = icmp eq %struct.VERTEX* %tree, null ; <i1> [#uses=1]
+ br i1 %0, label %bb8, label %bb
+
+bb: ; preds = %entry
+ %1 = getelementptr %struct.VERTEX* %tree, i32 0, i32 2 ; <%struct.VERTEX**> [#uses=1]
+ %2 = load %struct.VERTEX** %1, align 4 ; <%struct.VERTEX*> [#uses=2]
+ %3 = icmp eq %struct.VERTEX* %2, null ; <i1> [#uses=1]
+ br i1 %3, label %bb7, label %bb1.i
+
+bb1.i: ; preds = %bb1.i, %bb
+ %tree_addr.0.i = phi %struct.VERTEX* [ %5, %bb1.i ], [ %tree, %bb ] ; <%struct.VERTEX*> [#uses=3]
+ %4 = getelementptr %struct.VERTEX* %tree_addr.0.i, i32 0, i32 1 ; <%struct.VERTEX**> [#uses=1]
+ %5 = load %struct.VERTEX** %4, align 4 ; <%struct.VERTEX*> [#uses=2]
+ %6 = icmp eq %struct.VERTEX* %5, null ; <i1> [#uses=1]
+ br i1 %6, label %get_low.exit, label %bb1.i
+
+get_low.exit: ; preds = %bb1.i
+ call arm_apcscc void @build_delaunay(%struct.EDGE_PAIR* noalias sret %delright, %struct.VERTEX* %2, %struct.VERTEX* %extra) nounwind
+ %7 = getelementptr %struct.VERTEX* %tree, i32 0, i32 1 ; <%struct.VERTEX**> [#uses=1]
+ %8 = load %struct.VERTEX** %7, align 4 ; <%struct.VERTEX*> [#uses=1]
+ call arm_apcscc void @build_delaunay(%struct.EDGE_PAIR* noalias sret %delleft, %struct.VERTEX* %8, %struct.VERTEX* %tree) nounwind
+ %9 = getelementptr %struct.EDGE_PAIR* %delleft, i32 0, i32 0 ; <%struct.edge_rec**> [#uses=1]
+ %10 = load %struct.edge_rec** %9, align 8 ; <%struct.edge_rec*> [#uses=2]
+ %11 = getelementptr %struct.EDGE_PAIR* %delleft, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ %12 = load %struct.edge_rec** %11, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %13 = getelementptr %struct.EDGE_PAIR* %delright, i32 0, i32 0 ; <%struct.edge_rec**> [#uses=1]
+ %14 = load %struct.edge_rec** %13, align 8 ; <%struct.edge_rec*> [#uses=1]
+ %15 = getelementptr %struct.EDGE_PAIR* %delright, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ %16 = load %struct.edge_rec** %15, align 4 ; <%struct.edge_rec*> [#uses=2]
+ br label %bb.i
+
+bb.i: ; preds = %bb4.i, %get_low.exit
+ %rdi_addr.0.i = phi %struct.edge_rec* [ %14, %get_low.exit ], [ %72, %bb4.i ] ; <%struct.edge_rec*> [#uses=2]
+ %ldi_addr.1.i = phi %struct.edge_rec* [ %12, %get_low.exit ], [ %ldi_addr.0.i, %bb4.i ] ; <%struct.edge_rec*> [#uses=3]
+ %17 = getelementptr %struct.edge_rec* %rdi_addr.0.i, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ %18 = load %struct.VERTEX** %17, align 4 ; <%struct.VERTEX*> [#uses=3]
+ %19 = ptrtoint %struct.edge_rec* %ldi_addr.1.i to i32 ; <i32> [#uses=1]
+ %20 = getelementptr %struct.VERTEX* %18, i32 0, i32 0, i32 0 ; <double*> [#uses=1]
+ %21 = load double* %20, align 4 ; <double> [#uses=3]
+ %22 = getelementptr %struct.VERTEX* %18, i32 0, i32 0, i32 1 ; <double*> [#uses=1]
+ %23 = load double* %22, align 4 ; <double> [#uses=3]
+ br label %bb2.i
+
+bb1.i1: ; preds = %bb2.i
+ %24 = ptrtoint %struct.edge_rec* %ldi_addr.0.i to i32 ; <i32> [#uses=2]
+ %25 = add i32 %24, 48 ; <i32> [#uses=1]
+ %26 = and i32 %25, 63 ; <i32> [#uses=1]
+ %27 = and i32 %24, -64 ; <i32> [#uses=1]
+ %28 = or i32 %26, %27 ; <i32> [#uses=1]
+ %29 = inttoptr i32 %28 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %30 = getelementptr %struct.edge_rec* %29, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ %31 = load %struct.edge_rec** %30, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %32 = ptrtoint %struct.edge_rec* %31 to i32 ; <i32> [#uses=2]
+ %33 = add i32 %32, 16 ; <i32> [#uses=1]
+ %34 = and i32 %33, 63 ; <i32> [#uses=1]
+ %35 = and i32 %32, -64 ; <i32> [#uses=1]
+ %36 = or i32 %34, %35 ; <i32> [#uses=2]
+ %37 = inttoptr i32 %36 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=2]
+ br label %bb2.i
+
+bb2.i: ; preds = %bb1.i1, %bb.i
+ %ldi_addr.1.pn.i = phi %struct.edge_rec* [ %ldi_addr.1.i, %bb.i ], [ %37, %bb1.i1 ] ; <%struct.edge_rec*> [#uses=1]
+ %.pn6.in.in.i = phi i32 [ %19, %bb.i ], [ %36, %bb1.i1 ] ; <i32> [#uses=1]
+ %ldi_addr.0.i = phi %struct.edge_rec* [ %ldi_addr.1.i, %bb.i ], [ %37, %bb1.i1 ] ; <%struct.edge_rec*> [#uses=4]
+ %.pn6.in.i = xor i32 %.pn6.in.in.i, 32 ; <i32> [#uses=1]
+ %.pn6.i = inttoptr i32 %.pn6.in.i to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %t1.0.in.i = getelementptr %struct.edge_rec* %ldi_addr.1.pn.i, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ %t2.0.in.i = getelementptr %struct.edge_rec* %.pn6.i, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ %t1.0.i = load %struct.VERTEX** %t1.0.in.i ; <%struct.VERTEX*> [#uses=2]
+ %t2.0.i = load %struct.VERTEX** %t2.0.in.i ; <%struct.VERTEX*> [#uses=2]
+ %38 = getelementptr %struct.VERTEX* %t1.0.i, i32 0, i32 0, i32 0 ; <double*> [#uses=1]
+ %39 = load double* %38, align 4 ; <double> [#uses=3]
+ %40 = getelementptr %struct.VERTEX* %t1.0.i, i32 0, i32 0, i32 1 ; <double*> [#uses=1]
+ %41 = load double* %40, align 4 ; <double> [#uses=3]
+ %42 = getelementptr %struct.VERTEX* %t2.0.i, i32 0, i32 0, i32 0 ; <double*> [#uses=1]
+ %43 = load double* %42, align 4 ; <double> [#uses=1]
+ %44 = getelementptr %struct.VERTEX* %t2.0.i, i32 0, i32 0, i32 1 ; <double*> [#uses=1]
+ %45 = load double* %44, align 4 ; <double> [#uses=1]
+ %46 = fsub double %39, %21 ; <double> [#uses=1]
+ %47 = fsub double %45, %23 ; <double> [#uses=1]
+ %48 = fmul double %46, %47 ; <double> [#uses=1]
+ %49 = fsub double %43, %21 ; <double> [#uses=1]
+ %50 = fsub double %41, %23 ; <double> [#uses=1]
+ %51 = fmul double %49, %50 ; <double> [#uses=1]
+ %52 = fsub double %48, %51 ; <double> [#uses=1]
+ %53 = fcmp ogt double %52, 0.000000e+00 ; <i1> [#uses=1]
+ br i1 %53, label %bb1.i1, label %bb3.i
+
+bb3.i: ; preds = %bb2.i
+ %54 = ptrtoint %struct.edge_rec* %rdi_addr.0.i to i32 ; <i32> [#uses=1]
+ %55 = xor i32 %54, 32 ; <i32> [#uses=3]
+ %56 = inttoptr i32 %55 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=2]
+ %57 = getelementptr %struct.edge_rec* %56, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ %58 = load %struct.VERTEX** %57, align 4 ; <%struct.VERTEX*> [#uses=2]
+ %59 = getelementptr %struct.VERTEX* %58, i32 0, i32 0, i32 0 ; <double*> [#uses=1]
+ %60 = load double* %59, align 4 ; <double> [#uses=1]
+ %61 = getelementptr %struct.VERTEX* %58, i32 0, i32 0, i32 1 ; <double*> [#uses=1]
+ %62 = load double* %61, align 4 ; <double> [#uses=1]
+ %63 = fsub double %60, %39 ; <double> [#uses=1]
+ %64 = fsub double %23, %41 ; <double> [#uses=1]
+ %65 = fmul double %63, %64 ; <double> [#uses=1]
+ %66 = fsub double %21, %39 ; <double> [#uses=1]
+ %67 = fsub double %62, %41 ; <double> [#uses=1]
+ %68 = fmul double %66, %67 ; <double> [#uses=1]
+ %69 = fsub double %65, %68 ; <double> [#uses=1]
+ %70 = fcmp ogt double %69, 0.000000e+00 ; <i1> [#uses=1]
+ br i1 %70, label %bb4.i, label %bb5.i
+
+bb4.i: ; preds = %bb3.i
+ %71 = getelementptr %struct.edge_rec* %56, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ %72 = load %struct.edge_rec** %71, align 4 ; <%struct.edge_rec*> [#uses=1]
+ br label %bb.i
+
+bb5.i: ; preds = %bb3.i
+ %73 = add i32 %55, 48 ; <i32> [#uses=1]
+ %74 = and i32 %73, 63 ; <i32> [#uses=1]
+ %75 = and i32 %55, -64 ; <i32> [#uses=1]
+ %76 = or i32 %74, %75 ; <i32> [#uses=1]
+ %77 = inttoptr i32 %76 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %78 = getelementptr %struct.edge_rec* %77, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ %79 = load %struct.edge_rec** %78, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %80 = ptrtoint %struct.edge_rec* %79 to i32 ; <i32> [#uses=2]
+ %81 = add i32 %80, 16 ; <i32> [#uses=1]
+ %82 = and i32 %81, 63 ; <i32> [#uses=1]
+ %83 = and i32 %80, -64 ; <i32> [#uses=1]
+ %84 = or i32 %82, %83 ; <i32> [#uses=1]
+ %85 = inttoptr i32 %84 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %86 = getelementptr %struct.edge_rec* %ldi_addr.0.i, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ %87 = load %struct.VERTEX** %86, align 4 ; <%struct.VERTEX*> [#uses=1]
+ %88 = call arm_apcscc %struct.edge_rec* @alloc_edge() nounwind ; <%struct.edge_rec*> [#uses=6]
+ %89 = getelementptr %struct.edge_rec* %88, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=4]
+ store %struct.edge_rec* %88, %struct.edge_rec** %89, align 4
+ %90 = getelementptr %struct.edge_rec* %88, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=2]
+ store %struct.VERTEX* %18, %struct.VERTEX** %90, align 4
+ %91 = ptrtoint %struct.edge_rec* %88 to i32 ; <i32> [#uses=5]
+ %92 = add i32 %91, 16 ; <i32> [#uses=2]
+ %93 = inttoptr i32 %92 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=2]
+ %94 = add i32 %91, 48 ; <i32> [#uses=1]
+ %95 = inttoptr i32 %94 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=2]
+ %96 = getelementptr %struct.edge_rec* %93, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ store %struct.edge_rec* %95, %struct.edge_rec** %96, align 4
+ %97 = add i32 %91, 32 ; <i32> [#uses=1]
+ %98 = inttoptr i32 %97 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=3]
+ %99 = getelementptr %struct.edge_rec* %98, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ store %struct.edge_rec* %98, %struct.edge_rec** %99, align 4
+ %100 = getelementptr %struct.edge_rec* %98, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ store %struct.VERTEX* %87, %struct.VERTEX** %100, align 4
+ %101 = getelementptr %struct.edge_rec* %95, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ store %struct.edge_rec* %93, %struct.edge_rec** %101, align 4
+ %102 = load %struct.edge_rec** %89, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %103 = ptrtoint %struct.edge_rec* %102 to i32 ; <i32> [#uses=2]
+ %104 = add i32 %103, 16 ; <i32> [#uses=1]
+ %105 = and i32 %104, 63 ; <i32> [#uses=1]
+ %106 = and i32 %103, -64 ; <i32> [#uses=1]
+ %107 = or i32 %105, %106 ; <i32> [#uses=1]
+ %108 = inttoptr i32 %107 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %109 = getelementptr %struct.edge_rec* %85, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=3]
+ %110 = load %struct.edge_rec** %109, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %111 = ptrtoint %struct.edge_rec* %110 to i32 ; <i32> [#uses=2]
+ %112 = add i32 %111, 16 ; <i32> [#uses=1]
+ %113 = and i32 %112, 63 ; <i32> [#uses=1]
+ %114 = and i32 %111, -64 ; <i32> [#uses=1]
+ %115 = or i32 %113, %114 ; <i32> [#uses=1]
+ %116 = inttoptr i32 %115 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %117 = getelementptr %struct.edge_rec* %116, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=2]
+ %118 = load %struct.edge_rec** %117, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %119 = getelementptr %struct.edge_rec* %108, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=2]
+ %120 = load %struct.edge_rec** %119, align 4 ; <%struct.edge_rec*> [#uses=1]
+ store %struct.edge_rec* %118, %struct.edge_rec** %119, align 4
+ store %struct.edge_rec* %120, %struct.edge_rec** %117, align 4
+ %121 = load %struct.edge_rec** %89, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %122 = load %struct.edge_rec** %109, align 4 ; <%struct.edge_rec*> [#uses=1]
+ store %struct.edge_rec* %121, %struct.edge_rec** %109, align 4
+ store %struct.edge_rec* %122, %struct.edge_rec** %89, align 4
+ %123 = xor i32 %91, 32 ; <i32> [#uses=1]
+ %124 = inttoptr i32 %123 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=3]
+ %125 = getelementptr %struct.edge_rec* %124, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=3]
+ %126 = load %struct.edge_rec** %125, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %127 = ptrtoint %struct.edge_rec* %126 to i32 ; <i32> [#uses=2]
+ %128 = add i32 %127, 16 ; <i32> [#uses=1]
+ %129 = and i32 %128, 63 ; <i32> [#uses=1]
+ %130 = and i32 %127, -64 ; <i32> [#uses=1]
+ %131 = or i32 %129, %130 ; <i32> [#uses=1]
+ %132 = inttoptr i32 %131 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %133 = getelementptr %struct.edge_rec* %ldi_addr.0.i, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=3]
+ %134 = load %struct.edge_rec** %133, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %135 = ptrtoint %struct.edge_rec* %134 to i32 ; <i32> [#uses=2]
+ %136 = add i32 %135, 16 ; <i32> [#uses=1]
+ %137 = and i32 %136, 63 ; <i32> [#uses=1]
+ %138 = and i32 %135, -64 ; <i32> [#uses=1]
+ %139 = or i32 %137, %138 ; <i32> [#uses=1]
+ %140 = inttoptr i32 %139 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %141 = getelementptr %struct.edge_rec* %140, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=2]
+ %142 = load %struct.edge_rec** %141, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %143 = getelementptr %struct.edge_rec* %132, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=2]
+ %144 = load %struct.edge_rec** %143, align 4 ; <%struct.edge_rec*> [#uses=1]
+ store %struct.edge_rec* %142, %struct.edge_rec** %143, align 4
+ store %struct.edge_rec* %144, %struct.edge_rec** %141, align 4
+ %145 = load %struct.edge_rec** %125, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %146 = load %struct.edge_rec** %133, align 4 ; <%struct.edge_rec*> [#uses=2]
+ store %struct.edge_rec* %145, %struct.edge_rec** %133, align 4
+ store %struct.edge_rec* %146, %struct.edge_rec** %125, align 4
+ %147 = and i32 %92, 63 ; <i32> [#uses=1]
+ %148 = and i32 %91, -64 ; <i32> [#uses=1]
+ %149 = or i32 %147, %148 ; <i32> [#uses=1]
+ %150 = inttoptr i32 %149 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %151 = getelementptr %struct.edge_rec* %150, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ %152 = load %struct.edge_rec** %151, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %153 = ptrtoint %struct.edge_rec* %152 to i32 ; <i32> [#uses=2]
+ %154 = add i32 %153, 16 ; <i32> [#uses=1]
+ %155 = and i32 %154, 63 ; <i32> [#uses=1]
+ %156 = and i32 %153, -64 ; <i32> [#uses=1]
+ %157 = or i32 %155, %156 ; <i32> [#uses=1]
+ %158 = inttoptr i32 %157 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %159 = load %struct.VERTEX** %90, align 4 ; <%struct.VERTEX*> [#uses=1]
+ %160 = getelementptr %struct.edge_rec* %124, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ %161 = load %struct.VERTEX** %160, align 4 ; <%struct.VERTEX*> [#uses=1]
+ %162 = getelementptr %struct.edge_rec* %16, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ %163 = load %struct.VERTEX** %162, align 4 ; <%struct.VERTEX*> [#uses=1]
+ %164 = icmp eq %struct.VERTEX* %163, %159 ; <i1> [#uses=1]
+ %rdo_addr.0.i = select i1 %164, %struct.edge_rec* %88, %struct.edge_rec* %16 ; <%struct.edge_rec*> [#uses=3]
+ %165 = getelementptr %struct.edge_rec* %10, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ %166 = load %struct.VERTEX** %165, align 4 ; <%struct.VERTEX*> [#uses=1]
+ %167 = icmp eq %struct.VERTEX* %166, %161 ; <i1> [#uses=1]
+ %ldo_addr.0.ph.i = select i1 %167, %struct.edge_rec* %124, %struct.edge_rec* %10 ; <%struct.edge_rec*> [#uses=3]
+ br label %bb9.i
+
+bb9.i: ; preds = %bb25.i, %bb24.i, %bb5.i
+ %lcand.2.i = phi %struct.edge_rec* [ %146, %bb5.i ], [ %lcand.1.i, %bb24.i ], [ %739, %bb25.i ] ; <%struct.edge_rec*> [#uses=5]
+ %rcand.2.i = phi %struct.edge_rec* [ %158, %bb5.i ], [ %666, %bb24.i ], [ %rcand.1.i, %bb25.i ] ; <%struct.edge_rec*> [#uses=5]
+ %basel.0.i = phi %struct.edge_rec* [ %88, %bb5.i ], [ %595, %bb24.i ], [ %716, %bb25.i ] ; <%struct.edge_rec*> [#uses=2]
+ %168 = getelementptr %struct.edge_rec* %lcand.2.i, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ %169 = load %struct.edge_rec** %168, align 4 ; <%struct.edge_rec*> [#uses=3]
+ %170 = getelementptr %struct.edge_rec* %basel.0.i, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=3]
+ %171 = load %struct.VERTEX** %170, align 4 ; <%struct.VERTEX*> [#uses=4]
+ %172 = ptrtoint %struct.edge_rec* %basel.0.i to i32 ; <i32> [#uses=3]
+ %173 = xor i32 %172, 32 ; <i32> [#uses=1]
+ %174 = inttoptr i32 %173 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=2]
+ %175 = getelementptr %struct.edge_rec* %174, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=3]
+ %176 = load %struct.VERTEX** %175, align 4 ; <%struct.VERTEX*> [#uses=3]
+ %177 = ptrtoint %struct.edge_rec* %169 to i32 ; <i32> [#uses=1]
+ %178 = xor i32 %177, 32 ; <i32> [#uses=1]
+ %179 = inttoptr i32 %178 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %180 = getelementptr %struct.edge_rec* %179, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ %181 = load %struct.VERTEX** %180, align 4 ; <%struct.VERTEX*> [#uses=2]
+ %182 = getelementptr %struct.VERTEX* %171, i32 0, i32 0, i32 0 ; <double*> [#uses=2]
+ %183 = load double* %182, align 4 ; <double> [#uses=2]
+ %184 = getelementptr %struct.VERTEX* %171, i32 0, i32 0, i32 1 ; <double*> [#uses=2]
+ %185 = load double* %184, align 4 ; <double> [#uses=2]
+ %186 = getelementptr %struct.VERTEX* %181, i32 0, i32 0, i32 0 ; <double*> [#uses=1]
+ %187 = load double* %186, align 4 ; <double> [#uses=1]
+ %188 = getelementptr %struct.VERTEX* %181, i32 0, i32 0, i32 1 ; <double*> [#uses=1]
+ %189 = load double* %188, align 4 ; <double> [#uses=1]
+ %190 = getelementptr %struct.VERTEX* %176, i32 0, i32 0, i32 0 ; <double*> [#uses=1]
+ %191 = load double* %190, align 4 ; <double> [#uses=2]
+ %192 = getelementptr %struct.VERTEX* %176, i32 0, i32 0, i32 1 ; <double*> [#uses=1]
+ %193 = load double* %192, align 4 ; <double> [#uses=2]
+ %194 = fsub double %183, %191 ; <double> [#uses=1]
+ %195 = fsub double %189, %193 ; <double> [#uses=1]
+ %196 = fmul double %194, %195 ; <double> [#uses=1]
+ %197 = fsub double %187, %191 ; <double> [#uses=1]
+ %198 = fsub double %185, %193 ; <double> [#uses=1]
+ %199 = fmul double %197, %198 ; <double> [#uses=1]
+ %200 = fsub double %196, %199 ; <double> [#uses=1]
+ %201 = fcmp ogt double %200, 0.000000e+00 ; <i1> [#uses=1]
+ br i1 %201, label %bb10.i, label %bb13.i
+
+bb10.i: ; preds = %bb9.i
+ %202 = getelementptr %struct.VERTEX* %171, i32 0, i32 0, i32 2 ; <double*> [#uses=1]
+ %avail_edge.promoted25 = load %struct.edge_rec** @avail_edge ; <%struct.edge_rec*> [#uses=1]
+ br label %bb12.i
+
+bb11.i: ; preds = %bb12.i
+ %203 = ptrtoint %struct.edge_rec* %lcand.0.i to i32 ; <i32> [#uses=3]
+ %204 = add i32 %203, 16 ; <i32> [#uses=1]
+ %205 = and i32 %204, 63 ; <i32> [#uses=1]
+ %206 = and i32 %203, -64 ; <i32> [#uses=3]
+ %207 = or i32 %205, %206 ; <i32> [#uses=1]
+ %208 = inttoptr i32 %207 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %209 = getelementptr %struct.edge_rec* %208, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ %210 = load %struct.edge_rec** %209, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %211 = ptrtoint %struct.edge_rec* %210 to i32 ; <i32> [#uses=2]
+ %212 = add i32 %211, 16 ; <i32> [#uses=1]
+ %213 = and i32 %212, 63 ; <i32> [#uses=1]
+ %214 = and i32 %211, -64 ; <i32> [#uses=1]
+ %215 = or i32 %213, %214 ; <i32> [#uses=1]
+ %216 = inttoptr i32 %215 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %217 = getelementptr %struct.edge_rec* %lcand.0.i, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=3]
+ %218 = load %struct.edge_rec** %217, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %219 = ptrtoint %struct.edge_rec* %218 to i32 ; <i32> [#uses=2]
+ %220 = add i32 %219, 16 ; <i32> [#uses=1]
+ %221 = and i32 %220, 63 ; <i32> [#uses=1]
+ %222 = and i32 %219, -64 ; <i32> [#uses=1]
+ %223 = or i32 %221, %222 ; <i32> [#uses=1]
+ %224 = inttoptr i32 %223 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %225 = getelementptr %struct.edge_rec* %216, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=3]
+ %226 = load %struct.edge_rec** %225, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %227 = ptrtoint %struct.edge_rec* %226 to i32 ; <i32> [#uses=2]
+ %228 = add i32 %227, 16 ; <i32> [#uses=1]
+ %229 = and i32 %228, 63 ; <i32> [#uses=1]
+ %230 = and i32 %227, -64 ; <i32> [#uses=1]
+ %231 = or i32 %229, %230 ; <i32> [#uses=1]
+ %232 = inttoptr i32 %231 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %233 = getelementptr %struct.edge_rec* %232, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=2]
+ %234 = load %struct.edge_rec** %233, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %235 = getelementptr %struct.edge_rec* %224, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=2]
+ %236 = load %struct.edge_rec** %235, align 4 ; <%struct.edge_rec*> [#uses=1]
+ store %struct.edge_rec* %234, %struct.edge_rec** %235, align 4
+ store %struct.edge_rec* %236, %struct.edge_rec** %233, align 4
+ %237 = load %struct.edge_rec** %217, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %238 = load %struct.edge_rec** %225, align 4 ; <%struct.edge_rec*> [#uses=1]
+ store %struct.edge_rec* %237, %struct.edge_rec** %225, align 4
+ store %struct.edge_rec* %238, %struct.edge_rec** %217, align 4
+ %239 = xor i32 %203, 32 ; <i32> [#uses=2]
+ %240 = add i32 %239, 16 ; <i32> [#uses=1]
+ %241 = and i32 %240, 63 ; <i32> [#uses=1]
+ %242 = or i32 %241, %206 ; <i32> [#uses=1]
+ %243 = inttoptr i32 %242 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %244 = getelementptr %struct.edge_rec* %243, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ %245 = load %struct.edge_rec** %244, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %246 = ptrtoint %struct.edge_rec* %245 to i32 ; <i32> [#uses=2]
+ %247 = add i32 %246, 16 ; <i32> [#uses=1]
+ %248 = and i32 %247, 63 ; <i32> [#uses=1]
+ %249 = and i32 %246, -64 ; <i32> [#uses=1]
+ %250 = or i32 %248, %249 ; <i32> [#uses=1]
+ %251 = inttoptr i32 %250 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %252 = inttoptr i32 %239 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %253 = getelementptr %struct.edge_rec* %252, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=3]
+ %254 = load %struct.edge_rec** %253, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %255 = ptrtoint %struct.edge_rec* %254 to i32 ; <i32> [#uses=2]
+ %256 = add i32 %255, 16 ; <i32> [#uses=1]
+ %257 = and i32 %256, 63 ; <i32> [#uses=1]
+ %258 = and i32 %255, -64 ; <i32> [#uses=1]
+ %259 = or i32 %257, %258 ; <i32> [#uses=1]
+ %260 = inttoptr i32 %259 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %261 = getelementptr %struct.edge_rec* %251, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=3]
+ %262 = load %struct.edge_rec** %261, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %263 = ptrtoint %struct.edge_rec* %262 to i32 ; <i32> [#uses=2]
+ %264 = add i32 %263, 16 ; <i32> [#uses=1]
+ %265 = and i32 %264, 63 ; <i32> [#uses=1]
+ %266 = and i32 %263, -64 ; <i32> [#uses=1]
+ %267 = or i32 %265, %266 ; <i32> [#uses=1]
+ %268 = inttoptr i32 %267 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %269 = getelementptr %struct.edge_rec* %268, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=2]
+ %270 = load %struct.edge_rec** %269, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %271 = getelementptr %struct.edge_rec* %260, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=2]
+ %272 = load %struct.edge_rec** %271, align 4 ; <%struct.edge_rec*> [#uses=1]
+ store %struct.edge_rec* %270, %struct.edge_rec** %271, align 4
+ store %struct.edge_rec* %272, %struct.edge_rec** %269, align 4
+ %273 = load %struct.edge_rec** %253, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %274 = load %struct.edge_rec** %261, align 4 ; <%struct.edge_rec*> [#uses=1]
+ store %struct.edge_rec* %273, %struct.edge_rec** %261, align 4
+ store %struct.edge_rec* %274, %struct.edge_rec** %253, align 4
+ %275 = inttoptr i32 %206 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=2]
+ %276 = getelementptr %struct.edge_rec* %275, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ store %struct.edge_rec* %avail_edge.tmp.026, %struct.edge_rec** %276, align 4
+ %277 = getelementptr %struct.edge_rec* %t.0.i, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ %278 = load %struct.edge_rec** %277, align 4 ; <%struct.edge_rec*> [#uses=2]
+ %.pre.i = load double* %182, align 4 ; <double> [#uses=1]
+ %.pre22.i = load double* %184, align 4 ; <double> [#uses=1]
+ br label %bb12.i
+
+bb12.i: ; preds = %bb11.i, %bb10.i
+ %avail_edge.tmp.026 = phi %struct.edge_rec* [ %avail_edge.promoted25, %bb10.i ], [ %275, %bb11.i ] ; <%struct.edge_rec*> [#uses=2]
+ %279 = phi double [ %.pre22.i, %bb11.i ], [ %185, %bb10.i ] ; <double> [#uses=3]
+ %280 = phi double [ %.pre.i, %bb11.i ], [ %183, %bb10.i ] ; <double> [#uses=3]
+ %lcand.0.i = phi %struct.edge_rec* [ %lcand.2.i, %bb10.i ], [ %t.0.i, %bb11.i ] ; <%struct.edge_rec*> [#uses=3]
+ %t.0.i = phi %struct.edge_rec* [ %169, %bb10.i ], [ %278, %bb11.i ] ; <%struct.edge_rec*> [#uses=4]
+ %.pn5.in.in.in.i = phi %struct.edge_rec* [ %lcand.2.i, %bb10.i ], [ %t.0.i, %bb11.i ] ; <%struct.edge_rec*> [#uses=1]
+ %.pn4.in.in.in.i = phi %struct.edge_rec* [ %169, %bb10.i ], [ %278, %bb11.i ] ; <%struct.edge_rec*> [#uses=1]
+ %lcand.2.pn.i = phi %struct.edge_rec* [ %lcand.2.i, %bb10.i ], [ %t.0.i, %bb11.i ] ; <%struct.edge_rec*> [#uses=1]
+ %.pn5.in.in.i = ptrtoint %struct.edge_rec* %.pn5.in.in.in.i to i32 ; <i32> [#uses=1]
+ %.pn4.in.in.i = ptrtoint %struct.edge_rec* %.pn4.in.in.in.i to i32 ; <i32> [#uses=1]
+ %.pn5.in.i = xor i32 %.pn5.in.in.i, 32 ; <i32> [#uses=1]
+ %.pn4.in.i = xor i32 %.pn4.in.in.i, 32 ; <i32> [#uses=1]
+ %.pn5.i = inttoptr i32 %.pn5.in.i to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %.pn4.i = inttoptr i32 %.pn4.in.i to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %v1.0.in.i = getelementptr %struct.edge_rec* %.pn5.i, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ %v2.0.in.i = getelementptr %struct.edge_rec* %.pn4.i, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ %v3.0.in.i = getelementptr %struct.edge_rec* %lcand.2.pn.i, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ %v1.0.i = load %struct.VERTEX** %v1.0.in.i ; <%struct.VERTEX*> [#uses=3]
+ %v2.0.i = load %struct.VERTEX** %v2.0.in.i ; <%struct.VERTEX*> [#uses=3]
+ %v3.0.i = load %struct.VERTEX** %v3.0.in.i ; <%struct.VERTEX*> [#uses=3]
+ %281 = load double* %202, align 4 ; <double> [#uses=3]
+ %282 = getelementptr %struct.VERTEX* %v1.0.i, i32 0, i32 0, i32 0 ; <double*> [#uses=1]
+ %283 = load double* %282, align 4 ; <double> [#uses=1]
+ %284 = fsub double %283, %280 ; <double> [#uses=2]
+ %285 = getelementptr %struct.VERTEX* %v1.0.i, i32 0, i32 0, i32 1 ; <double*> [#uses=1]
+ %286 = load double* %285, align 4 ; <double> [#uses=1]
+ %287 = fsub double %286, %279 ; <double> [#uses=2]
+ %288 = getelementptr %struct.VERTEX* %v1.0.i, i32 0, i32 0, i32 2 ; <double*> [#uses=1]
+ %289 = load double* %288, align 4 ; <double> [#uses=1]
+ %290 = getelementptr %struct.VERTEX* %v2.0.i, i32 0, i32 0, i32 0 ; <double*> [#uses=1]
+ %291 = load double* %290, align 4 ; <double> [#uses=1]
+ %292 = fsub double %291, %280 ; <double> [#uses=2]
+ %293 = getelementptr %struct.VERTEX* %v2.0.i, i32 0, i32 0, i32 1 ; <double*> [#uses=1]
+ %294 = load double* %293, align 4 ; <double> [#uses=1]
+ %295 = fsub double %294, %279 ; <double> [#uses=2]
+ %296 = getelementptr %struct.VERTEX* %v2.0.i, i32 0, i32 0, i32 2 ; <double*> [#uses=1]
+ %297 = load double* %296, align 4 ; <double> [#uses=1]
+ %298 = getelementptr %struct.VERTEX* %v3.0.i, i32 0, i32 0, i32 0 ; <double*> [#uses=1]
+ %299 = load double* %298, align 4 ; <double> [#uses=1]
+ %300 = fsub double %299, %280 ; <double> [#uses=2]
+ %301 = getelementptr %struct.VERTEX* %v3.0.i, i32 0, i32 0, i32 1 ; <double*> [#uses=1]
+ %302 = load double* %301, align 4 ; <double> [#uses=1]
+ %303 = fsub double %302, %279 ; <double> [#uses=2]
+ %304 = getelementptr %struct.VERTEX* %v3.0.i, i32 0, i32 0, i32 2 ; <double*> [#uses=1]
+ %305 = load double* %304, align 4 ; <double> [#uses=1]
+ %306 = fsub double %289, %281 ; <double> [#uses=1]
+ %307 = fmul double %292, %303 ; <double> [#uses=1]
+ %308 = fmul double %295, %300 ; <double> [#uses=1]
+ %309 = fsub double %307, %308 ; <double> [#uses=1]
+ %310 = fmul double %306, %309 ; <double> [#uses=1]
+ %311 = fsub double %297, %281 ; <double> [#uses=1]
+ %312 = fmul double %300, %287 ; <double> [#uses=1]
+ %313 = fmul double %303, %284 ; <double> [#uses=1]
+ %314 = fsub double %312, %313 ; <double> [#uses=1]
+ %315 = fmul double %311, %314 ; <double> [#uses=1]
+ %316 = fadd double %315, %310 ; <double> [#uses=1]
+ %317 = fsub double %305, %281 ; <double> [#uses=1]
+ %318 = fmul double %284, %295 ; <double> [#uses=1]
+ %319 = fmul double %287, %292 ; <double> [#uses=1]
+ %320 = fsub double %318, %319 ; <double> [#uses=1]
+ %321 = fmul double %317, %320 ; <double> [#uses=1]
+ %322 = fadd double %321, %316 ; <double> [#uses=1]
+ %323 = fcmp ogt double %322, 0.000000e+00 ; <i1> [#uses=1]
+ br i1 %323, label %bb11.i, label %bb13.loopexit.i
+
+bb13.loopexit.i: ; preds = %bb12.i
+ store %struct.edge_rec* %avail_edge.tmp.026, %struct.edge_rec** @avail_edge
+ %.pre23.i = load %struct.VERTEX** %170, align 4 ; <%struct.VERTEX*> [#uses=1]
+ %.pre24.i = load %struct.VERTEX** %175, align 4 ; <%struct.VERTEX*> [#uses=1]
+ br label %bb13.i
+
+bb13.i: ; preds = %bb13.loopexit.i, %bb9.i
+ %324 = phi %struct.VERTEX* [ %.pre24.i, %bb13.loopexit.i ], [ %176, %bb9.i ] ; <%struct.VERTEX*> [#uses=4]
+ %325 = phi %struct.VERTEX* [ %.pre23.i, %bb13.loopexit.i ], [ %171, %bb9.i ] ; <%struct.VERTEX*> [#uses=3]
+ %lcand.1.i = phi %struct.edge_rec* [ %lcand.0.i, %bb13.loopexit.i ], [ %lcand.2.i, %bb9.i ] ; <%struct.edge_rec*> [#uses=3]
+ %326 = ptrtoint %struct.edge_rec* %rcand.2.i to i32 ; <i32> [#uses=2]
+ %327 = add i32 %326, 16 ; <i32> [#uses=1]
+ %328 = and i32 %327, 63 ; <i32> [#uses=1]
+ %329 = and i32 %326, -64 ; <i32> [#uses=1]
+ %330 = or i32 %328, %329 ; <i32> [#uses=1]
+ %331 = inttoptr i32 %330 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %332 = getelementptr %struct.edge_rec* %331, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ %333 = load %struct.edge_rec** %332, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %334 = ptrtoint %struct.edge_rec* %333 to i32 ; <i32> [#uses=2]
+ %335 = add i32 %334, 16 ; <i32> [#uses=1]
+ %336 = and i32 %335, 63 ; <i32> [#uses=1]
+ %337 = and i32 %334, -64 ; <i32> [#uses=1]
+ %338 = or i32 %336, %337 ; <i32> [#uses=3]
+ %339 = xor i32 %338, 32 ; <i32> [#uses=1]
+ %340 = inttoptr i32 %339 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %341 = getelementptr %struct.edge_rec* %340, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ %342 = load %struct.VERTEX** %341, align 4 ; <%struct.VERTEX*> [#uses=2]
+ %343 = getelementptr %struct.VERTEX* %325, i32 0, i32 0, i32 0 ; <double*> [#uses=1]
+ %344 = load double* %343, align 4 ; <double> [#uses=1]
+ %345 = getelementptr %struct.VERTEX* %325, i32 0, i32 0, i32 1 ; <double*> [#uses=1]
+ %346 = load double* %345, align 4 ; <double> [#uses=1]
+ %347 = getelementptr %struct.VERTEX* %342, i32 0, i32 0, i32 0 ; <double*> [#uses=1]
+ %348 = load double* %347, align 4 ; <double> [#uses=1]
+ %349 = getelementptr %struct.VERTEX* %342, i32 0, i32 0, i32 1 ; <double*> [#uses=1]
+ %350 = load double* %349, align 4 ; <double> [#uses=1]
+ %351 = getelementptr %struct.VERTEX* %324, i32 0, i32 0, i32 0 ; <double*> [#uses=2]
+ %352 = load double* %351, align 4 ; <double> [#uses=3]
+ %353 = getelementptr %struct.VERTEX* %324, i32 0, i32 0, i32 1 ; <double*> [#uses=2]
+ %354 = load double* %353, align 4 ; <double> [#uses=3]
+ %355 = fsub double %344, %352 ; <double> [#uses=1]
+ %356 = fsub double %350, %354 ; <double> [#uses=1]
+ %357 = fmul double %355, %356 ; <double> [#uses=1]
+ %358 = fsub double %348, %352 ; <double> [#uses=1]
+ %359 = fsub double %346, %354 ; <double> [#uses=1]
+ %360 = fmul double %358, %359 ; <double> [#uses=1]
+ %361 = fsub double %357, %360 ; <double> [#uses=1]
+ %362 = fcmp ogt double %361, 0.000000e+00 ; <i1> [#uses=1]
+ br i1 %362, label %bb14.i, label %bb17.i
+
+bb14.i: ; preds = %bb13.i
+ %363 = getelementptr %struct.VERTEX* %324, i32 0, i32 0, i32 2 ; <double*> [#uses=1]
+ %avail_edge.promoted = load %struct.edge_rec** @avail_edge ; <%struct.edge_rec*> [#uses=1]
+ br label %bb16.i
+
+bb15.i: ; preds = %bb16.i
+ %364 = ptrtoint %struct.edge_rec* %rcand.0.i to i32 ; <i32> [#uses=3]
+ %365 = add i32 %364, 16 ; <i32> [#uses=1]
+ %366 = and i32 %365, 63 ; <i32> [#uses=1]
+ %367 = and i32 %364, -64 ; <i32> [#uses=3]
+ %368 = or i32 %366, %367 ; <i32> [#uses=1]
+ %369 = inttoptr i32 %368 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %370 = getelementptr %struct.edge_rec* %369, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ %371 = load %struct.edge_rec** %370, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %372 = ptrtoint %struct.edge_rec* %371 to i32 ; <i32> [#uses=2]
+ %373 = add i32 %372, 16 ; <i32> [#uses=1]
+ %374 = and i32 %373, 63 ; <i32> [#uses=1]
+ %375 = and i32 %372, -64 ; <i32> [#uses=1]
+ %376 = or i32 %374, %375 ; <i32> [#uses=1]
+ %377 = inttoptr i32 %376 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %378 = getelementptr %struct.edge_rec* %rcand.0.i, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=3]
+ %379 = load %struct.edge_rec** %378, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %380 = ptrtoint %struct.edge_rec* %379 to i32 ; <i32> [#uses=2]
+ %381 = add i32 %380, 16 ; <i32> [#uses=1]
+ %382 = and i32 %381, 63 ; <i32> [#uses=1]
+ %383 = and i32 %380, -64 ; <i32> [#uses=1]
+ %384 = or i32 %382, %383 ; <i32> [#uses=1]
+ %385 = inttoptr i32 %384 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %386 = getelementptr %struct.edge_rec* %377, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=3]
+ %387 = load %struct.edge_rec** %386, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %388 = ptrtoint %struct.edge_rec* %387 to i32 ; <i32> [#uses=2]
+ %389 = add i32 %388, 16 ; <i32> [#uses=1]
+ %390 = and i32 %389, 63 ; <i32> [#uses=1]
+ %391 = and i32 %388, -64 ; <i32> [#uses=1]
+ %392 = or i32 %390, %391 ; <i32> [#uses=1]
+ %393 = inttoptr i32 %392 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %394 = getelementptr %struct.edge_rec* %393, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=2]
+ %395 = load %struct.edge_rec** %394, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %396 = getelementptr %struct.edge_rec* %385, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=2]
+ %397 = load %struct.edge_rec** %396, align 4 ; <%struct.edge_rec*> [#uses=1]
+ store %struct.edge_rec* %395, %struct.edge_rec** %396, align 4
+ store %struct.edge_rec* %397, %struct.edge_rec** %394, align 4
+ %398 = load %struct.edge_rec** %378, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %399 = load %struct.edge_rec** %386, align 4 ; <%struct.edge_rec*> [#uses=1]
+ store %struct.edge_rec* %398, %struct.edge_rec** %386, align 4
+ store %struct.edge_rec* %399, %struct.edge_rec** %378, align 4
+ %400 = xor i32 %364, 32 ; <i32> [#uses=2]
+ %401 = add i32 %400, 16 ; <i32> [#uses=1]
+ %402 = and i32 %401, 63 ; <i32> [#uses=1]
+ %403 = or i32 %402, %367 ; <i32> [#uses=1]
+ %404 = inttoptr i32 %403 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %405 = getelementptr %struct.edge_rec* %404, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ %406 = load %struct.edge_rec** %405, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %407 = ptrtoint %struct.edge_rec* %406 to i32 ; <i32> [#uses=2]
+ %408 = add i32 %407, 16 ; <i32> [#uses=1]
+ %409 = and i32 %408, 63 ; <i32> [#uses=1]
+ %410 = and i32 %407, -64 ; <i32> [#uses=1]
+ %411 = or i32 %409, %410 ; <i32> [#uses=1]
+ %412 = inttoptr i32 %411 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %413 = inttoptr i32 %400 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %414 = getelementptr %struct.edge_rec* %413, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=3]
+ %415 = load %struct.edge_rec** %414, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %416 = ptrtoint %struct.edge_rec* %415 to i32 ; <i32> [#uses=2]
+ %417 = add i32 %416, 16 ; <i32> [#uses=1]
+ %418 = and i32 %417, 63 ; <i32> [#uses=1]
+ %419 = and i32 %416, -64 ; <i32> [#uses=1]
+ %420 = or i32 %418, %419 ; <i32> [#uses=1]
+ %421 = inttoptr i32 %420 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %422 = getelementptr %struct.edge_rec* %412, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=3]
+ %423 = load %struct.edge_rec** %422, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %424 = ptrtoint %struct.edge_rec* %423 to i32 ; <i32> [#uses=2]
+ %425 = add i32 %424, 16 ; <i32> [#uses=1]
+ %426 = and i32 %425, 63 ; <i32> [#uses=1]
+ %427 = and i32 %424, -64 ; <i32> [#uses=1]
+ %428 = or i32 %426, %427 ; <i32> [#uses=1]
+ %429 = inttoptr i32 %428 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %430 = getelementptr %struct.edge_rec* %429, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=2]
+ %431 = load %struct.edge_rec** %430, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %432 = getelementptr %struct.edge_rec* %421, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=2]
+ %433 = load %struct.edge_rec** %432, align 4 ; <%struct.edge_rec*> [#uses=1]
+ store %struct.edge_rec* %431, %struct.edge_rec** %432, align 4
+ store %struct.edge_rec* %433, %struct.edge_rec** %430, align 4
+ %434 = load %struct.edge_rec** %414, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %435 = load %struct.edge_rec** %422, align 4 ; <%struct.edge_rec*> [#uses=1]
+ store %struct.edge_rec* %434, %struct.edge_rec** %422, align 4
+ store %struct.edge_rec* %435, %struct.edge_rec** %414, align 4
+ %436 = inttoptr i32 %367 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=2]
+ %437 = getelementptr %struct.edge_rec* %436, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ store %struct.edge_rec* %avail_edge.tmp.0, %struct.edge_rec** %437, align 4
+ %438 = add i32 %t.1.in.i, 16 ; <i32> [#uses=1]
+ %439 = and i32 %438, 63 ; <i32> [#uses=1]
+ %440 = and i32 %t.1.in.i, -64 ; <i32> [#uses=1]
+ %441 = or i32 %439, %440 ; <i32> [#uses=1]
+ %442 = inttoptr i32 %441 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %443 = getelementptr %struct.edge_rec* %442, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ %444 = load %struct.edge_rec** %443, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %445 = ptrtoint %struct.edge_rec* %444 to i32 ; <i32> [#uses=2]
+ %446 = add i32 %445, 16 ; <i32> [#uses=1]
+ %447 = and i32 %446, 63 ; <i32> [#uses=1]
+ %448 = and i32 %445, -64 ; <i32> [#uses=1]
+ %449 = or i32 %447, %448 ; <i32> [#uses=2]
+ %.pre25.i = load double* %351, align 4 ; <double> [#uses=1]
+ %.pre26.i = load double* %353, align 4 ; <double> [#uses=1]
+ br label %bb16.i
+
+bb16.i: ; preds = %bb15.i, %bb14.i
+ %avail_edge.tmp.0 = phi %struct.edge_rec* [ %avail_edge.promoted, %bb14.i ], [ %436, %bb15.i ] ; <%struct.edge_rec*> [#uses=2]
+ %450 = phi double [ %.pre26.i, %bb15.i ], [ %354, %bb14.i ] ; <double> [#uses=3]
+ %451 = phi double [ %.pre25.i, %bb15.i ], [ %352, %bb14.i ] ; <double> [#uses=3]
+ %rcand.0.i = phi %struct.edge_rec* [ %rcand.2.i, %bb14.i ], [ %t.1.i, %bb15.i ] ; <%struct.edge_rec*> [#uses=3]
+ %t.1.in.i = phi i32 [ %338, %bb14.i ], [ %449, %bb15.i ] ; <i32> [#uses=3]
+ %.pn3.in.in.i = phi i32 [ %338, %bb14.i ], [ %449, %bb15.i ] ; <i32> [#uses=1]
+ %.pn.in.in.in.i = phi %struct.edge_rec* [ %rcand.2.i, %bb14.i ], [ %t.1.i, %bb15.i ] ; <%struct.edge_rec*> [#uses=1]
+ %rcand.2.pn.i = phi %struct.edge_rec* [ %rcand.2.i, %bb14.i ], [ %t.1.i, %bb15.i ] ; <%struct.edge_rec*> [#uses=1]
+ %t.1.i = inttoptr i32 %t.1.in.i to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=3]
+ %.pn.in.in.i = ptrtoint %struct.edge_rec* %.pn.in.in.in.i to i32 ; <i32> [#uses=1]
+ %.pn3.in.i = xor i32 %.pn3.in.in.i, 32 ; <i32> [#uses=1]
+ %.pn.in.i = xor i32 %.pn.in.in.i, 32 ; <i32> [#uses=1]
+ %.pn3.i = inttoptr i32 %.pn3.in.i to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %.pn.i = inttoptr i32 %.pn.in.i to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %v1.1.in.i = getelementptr %struct.edge_rec* %.pn3.i, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ %v2.1.in.i = getelementptr %struct.edge_rec* %.pn.i, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ %v3.1.in.i = getelementptr %struct.edge_rec* %rcand.2.pn.i, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ %v1.1.i = load %struct.VERTEX** %v1.1.in.i ; <%struct.VERTEX*> [#uses=3]
+ %v2.1.i = load %struct.VERTEX** %v2.1.in.i ; <%struct.VERTEX*> [#uses=3]
+ %v3.1.i = load %struct.VERTEX** %v3.1.in.i ; <%struct.VERTEX*> [#uses=3]
+ %452 = load double* %363, align 4 ; <double> [#uses=3]
+ %453 = getelementptr %struct.VERTEX* %v1.1.i, i32 0, i32 0, i32 0 ; <double*> [#uses=1]
+ %454 = load double* %453, align 4 ; <double> [#uses=1]
+ %455 = fsub double %454, %451 ; <double> [#uses=2]
+ %456 = getelementptr %struct.VERTEX* %v1.1.i, i32 0, i32 0, i32 1 ; <double*> [#uses=1]
+ %457 = load double* %456, align 4 ; <double> [#uses=1]
+ %458 = fsub double %457, %450 ; <double> [#uses=2]
+ %459 = getelementptr %struct.VERTEX* %v1.1.i, i32 0, i32 0, i32 2 ; <double*> [#uses=1]
+ %460 = load double* %459, align 4 ; <double> [#uses=1]
+ %461 = getelementptr %struct.VERTEX* %v2.1.i, i32 0, i32 0, i32 0 ; <double*> [#uses=1]
+ %462 = load double* %461, align 4 ; <double> [#uses=1]
+ %463 = fsub double %462, %451 ; <double> [#uses=2]
+ %464 = getelementptr %struct.VERTEX* %v2.1.i, i32 0, i32 0, i32 1 ; <double*> [#uses=1]
+ %465 = load double* %464, align 4 ; <double> [#uses=1]
+ %466 = fsub double %465, %450 ; <double> [#uses=2]
+ %467 = getelementptr %struct.VERTEX* %v2.1.i, i32 0, i32 0, i32 2 ; <double*> [#uses=1]
+ %468 = load double* %467, align 4 ; <double> [#uses=1]
+ %469 = getelementptr %struct.VERTEX* %v3.1.i, i32 0, i32 0, i32 0 ; <double*> [#uses=1]
+ %470 = load double* %469, align 4 ; <double> [#uses=1]
+ %471 = fsub double %470, %451 ; <double> [#uses=2]
+ %472 = getelementptr %struct.VERTEX* %v3.1.i, i32 0, i32 0, i32 1 ; <double*> [#uses=1]
+ %473 = load double* %472, align 4 ; <double> [#uses=1]
+ %474 = fsub double %473, %450 ; <double> [#uses=2]
+ %475 = getelementptr %struct.VERTEX* %v3.1.i, i32 0, i32 0, i32 2 ; <double*> [#uses=1]
+ %476 = load double* %475, align 4 ; <double> [#uses=1]
+ %477 = fsub double %460, %452 ; <double> [#uses=1]
+ %478 = fmul double %463, %474 ; <double> [#uses=1]
+ %479 = fmul double %466, %471 ; <double> [#uses=1]
+ %480 = fsub double %478, %479 ; <double> [#uses=1]
+ %481 = fmul double %477, %480 ; <double> [#uses=1]
+ %482 = fsub double %468, %452 ; <double> [#uses=1]
+ %483 = fmul double %471, %458 ; <double> [#uses=1]
+ %484 = fmul double %474, %455 ; <double> [#uses=1]
+ %485 = fsub double %483, %484 ; <double> [#uses=1]
+ %486 = fmul double %482, %485 ; <double> [#uses=1]
+ %487 = fadd double %486, %481 ; <double> [#uses=1]
+ %488 = fsub double %476, %452 ; <double> [#uses=1]
+ %489 = fmul double %455, %466 ; <double> [#uses=1]
+ %490 = fmul double %458, %463 ; <double> [#uses=1]
+ %491 = fsub double %489, %490 ; <double> [#uses=1]
+ %492 = fmul double %488, %491 ; <double> [#uses=1]
+ %493 = fadd double %492, %487 ; <double> [#uses=1]
+ %494 = fcmp ogt double %493, 0.000000e+00 ; <i1> [#uses=1]
+ br i1 %494, label %bb15.i, label %bb17.loopexit.i
+
+bb17.loopexit.i: ; preds = %bb16.i
+ store %struct.edge_rec* %avail_edge.tmp.0, %struct.edge_rec** @avail_edge
+ %.pre27.i = load %struct.VERTEX** %170, align 4 ; <%struct.VERTEX*> [#uses=1]
+ %.pre28.i = load %struct.VERTEX** %175, align 4 ; <%struct.VERTEX*> [#uses=1]
+ br label %bb17.i
+
+bb17.i: ; preds = %bb17.loopexit.i, %bb13.i
+ %495 = phi %struct.VERTEX* [ %.pre28.i, %bb17.loopexit.i ], [ %324, %bb13.i ] ; <%struct.VERTEX*> [#uses=3]
+ %496 = phi %struct.VERTEX* [ %.pre27.i, %bb17.loopexit.i ], [ %325, %bb13.i ] ; <%struct.VERTEX*> [#uses=3]
+ %rcand.1.i = phi %struct.edge_rec* [ %rcand.0.i, %bb17.loopexit.i ], [ %rcand.2.i, %bb13.i ] ; <%struct.edge_rec*> [#uses=3]
+ %497 = ptrtoint %struct.edge_rec* %lcand.1.i to i32 ; <i32> [#uses=1]
+ %498 = xor i32 %497, 32 ; <i32> [#uses=1]
+ %499 = inttoptr i32 %498 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=2]
+ %500 = getelementptr %struct.edge_rec* %499, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ %501 = load %struct.VERTEX** %500, align 4 ; <%struct.VERTEX*> [#uses=4]
+ %502 = getelementptr %struct.VERTEX* %496, i32 0, i32 0, i32 0 ; <double*> [#uses=1]
+ %503 = load double* %502, align 4 ; <double> [#uses=1]
+ %504 = getelementptr %struct.VERTEX* %496, i32 0, i32 0, i32 1 ; <double*> [#uses=1]
+ %505 = load double* %504, align 4 ; <double> [#uses=1]
+ %506 = getelementptr %struct.VERTEX* %501, i32 0, i32 0, i32 0 ; <double*> [#uses=1]
+ %507 = load double* %506, align 4 ; <double> [#uses=2]
+ %508 = getelementptr %struct.VERTEX* %501, i32 0, i32 0, i32 1 ; <double*> [#uses=1]
+ %509 = load double* %508, align 4 ; <double> [#uses=2]
+ %510 = getelementptr %struct.VERTEX* %495, i32 0, i32 0, i32 0 ; <double*> [#uses=1]
+ %511 = load double* %510, align 4 ; <double> [#uses=3]
+ %512 = getelementptr %struct.VERTEX* %495, i32 0, i32 0, i32 1 ; <double*> [#uses=1]
+ %513 = load double* %512, align 4 ; <double> [#uses=3]
+ %514 = fsub double %503, %511 ; <double> [#uses=2]
+ %515 = fsub double %509, %513 ; <double> [#uses=1]
+ %516 = fmul double %514, %515 ; <double> [#uses=1]
+ %517 = fsub double %507, %511 ; <double> [#uses=1]
+ %518 = fsub double %505, %513 ; <double> [#uses=2]
+ %519 = fmul double %517, %518 ; <double> [#uses=1]
+ %520 = fsub double %516, %519 ; <double> [#uses=1]
+ %521 = fcmp ogt double %520, 0.000000e+00 ; <i1> [#uses=2]
+ %522 = ptrtoint %struct.edge_rec* %rcand.1.i to i32 ; <i32> [#uses=3]
+ %523 = xor i32 %522, 32 ; <i32> [#uses=1]
+ %524 = inttoptr i32 %523 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %525 = getelementptr %struct.edge_rec* %524, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ %526 = load %struct.VERTEX** %525, align 4 ; <%struct.VERTEX*> [#uses=4]
+ %527 = getelementptr %struct.VERTEX* %526, i32 0, i32 0, i32 0 ; <double*> [#uses=1]
+ %528 = load double* %527, align 4 ; <double> [#uses=4]
+ %529 = getelementptr %struct.VERTEX* %526, i32 0, i32 0, i32 1 ; <double*> [#uses=1]
+ %530 = load double* %529, align 4 ; <double> [#uses=4]
+ %531 = fsub double %530, %513 ; <double> [#uses=1]
+ %532 = fmul double %514, %531 ; <double> [#uses=1]
+ %533 = fsub double %528, %511 ; <double> [#uses=1]
+ %534 = fmul double %533, %518 ; <double> [#uses=1]
+ %535 = fsub double %532, %534 ; <double> [#uses=1]
+ %536 = fcmp ogt double %535, 0.000000e+00 ; <i1> [#uses=2]
+ %537 = or i1 %536, %521 ; <i1> [#uses=1]
+ br i1 %537, label %bb21.i, label %do_merge.exit
+
+bb21.i: ; preds = %bb17.i
+ %538 = getelementptr %struct.edge_rec* %lcand.1.i, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ %539 = load %struct.VERTEX** %538, align 4 ; <%struct.VERTEX*> [#uses=3]
+ %540 = getelementptr %struct.edge_rec* %rcand.1.i, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ %541 = load %struct.VERTEX** %540, align 4 ; <%struct.VERTEX*> [#uses=3]
+ br i1 %521, label %bb22.i, label %bb24.i
+
+bb22.i: ; preds = %bb21.i
+ br i1 %536, label %bb23.i, label %bb25.i
+
+bb23.i: ; preds = %bb22.i
+ %542 = getelementptr %struct.VERTEX* %526, i32 0, i32 0, i32 2 ; <double*> [#uses=1]
+ %543 = load double* %542, align 4 ; <double> [#uses=3]
+ %544 = fsub double %507, %528 ; <double> [#uses=2]
+ %545 = fsub double %509, %530 ; <double> [#uses=2]
+ %546 = getelementptr %struct.VERTEX* %501, i32 0, i32 0, i32 2 ; <double*> [#uses=1]
+ %547 = load double* %546, align 4 ; <double> [#uses=1]
+ %548 = getelementptr %struct.VERTEX* %539, i32 0, i32 0, i32 0 ; <double*> [#uses=1]
+ %549 = load double* %548, align 4 ; <double> [#uses=1]
+ %550 = fsub double %549, %528 ; <double> [#uses=2]
+ %551 = getelementptr %struct.VERTEX* %539, i32 0, i32 0, i32 1 ; <double*> [#uses=1]
+ %552 = load double* %551, align 4 ; <double> [#uses=1]
+ %553 = fsub double %552, %530 ; <double> [#uses=2]
+ %554 = getelementptr %struct.VERTEX* %539, i32 0, i32 0, i32 2 ; <double*> [#uses=1]
+ %555 = load double* %554, align 4 ; <double> [#uses=1]
+ %556 = getelementptr %struct.VERTEX* %541, i32 0, i32 0, i32 0 ; <double*> [#uses=1]
+ %557 = load double* %556, align 4 ; <double> [#uses=1]
+ %558 = fsub double %557, %528 ; <double> [#uses=2]
+ %559 = getelementptr %struct.VERTEX* %541, i32 0, i32 0, i32 1 ; <double*> [#uses=1]
+ %560 = load double* %559, align 4 ; <double> [#uses=1]
+ %561 = fsub double %560, %530 ; <double> [#uses=2]
+ %562 = getelementptr %struct.VERTEX* %541, i32 0, i32 0, i32 2 ; <double*> [#uses=1]
+ %563 = load double* %562, align 4 ; <double> [#uses=1]
+ %564 = fsub double %547, %543 ; <double> [#uses=1]
+ %565 = fmul double %550, %561 ; <double> [#uses=1]
+ %566 = fmul double %553, %558 ; <double> [#uses=1]
+ %567 = fsub double %565, %566 ; <double> [#uses=1]
+ %568 = fmul double %564, %567 ; <double> [#uses=1]
+ %569 = fsub double %555, %543 ; <double> [#uses=1]
+ %570 = fmul double %558, %545 ; <double> [#uses=1]
+ %571 = fmul double %561, %544 ; <double> [#uses=1]
+ %572 = fsub double %570, %571 ; <double> [#uses=1]
+ %573 = fmul double %569, %572 ; <double> [#uses=1]
+ %574 = fadd double %573, %568 ; <double> [#uses=1]
+ %575 = fsub double %563, %543 ; <double> [#uses=1]
+ %576 = fmul double %544, %553 ; <double> [#uses=1]
+ %577 = fmul double %545, %550 ; <double> [#uses=1]
+ %578 = fsub double %576, %577 ; <double> [#uses=1]
+ %579 = fmul double %575, %578 ; <double> [#uses=1]
+ %580 = fadd double %579, %574 ; <double> [#uses=1]
+ %581 = fcmp ogt double %580, 0.000000e+00 ; <i1> [#uses=1]
+ br i1 %581, label %bb24.i, label %bb25.i
+
+bb24.i: ; preds = %bb23.i, %bb21.i
+ %582 = add i32 %522, 48 ; <i32> [#uses=1]
+ %583 = and i32 %582, 63 ; <i32> [#uses=1]
+ %584 = and i32 %522, -64 ; <i32> [#uses=1]
+ %585 = or i32 %583, %584 ; <i32> [#uses=1]
+ %586 = inttoptr i32 %585 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %587 = getelementptr %struct.edge_rec* %586, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ %588 = load %struct.edge_rec** %587, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %589 = ptrtoint %struct.edge_rec* %588 to i32 ; <i32> [#uses=2]
+ %590 = add i32 %589, 16 ; <i32> [#uses=1]
+ %591 = and i32 %590, 63 ; <i32> [#uses=1]
+ %592 = and i32 %589, -64 ; <i32> [#uses=1]
+ %593 = or i32 %591, %592 ; <i32> [#uses=1]
+ %594 = inttoptr i32 %593 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %595 = call arm_apcscc %struct.edge_rec* @alloc_edge() nounwind ; <%struct.edge_rec*> [#uses=5]
+ %596 = getelementptr %struct.edge_rec* %595, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=4]
+ store %struct.edge_rec* %595, %struct.edge_rec** %596, align 4
+ %597 = getelementptr %struct.edge_rec* %595, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ store %struct.VERTEX* %526, %struct.VERTEX** %597, align 4
+ %598 = ptrtoint %struct.edge_rec* %595 to i32 ; <i32> [#uses=5]
+ %599 = add i32 %598, 16 ; <i32> [#uses=1]
+ %600 = inttoptr i32 %599 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=2]
+ %601 = add i32 %598, 48 ; <i32> [#uses=1]
+ %602 = inttoptr i32 %601 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=2]
+ %603 = getelementptr %struct.edge_rec* %600, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ store %struct.edge_rec* %602, %struct.edge_rec** %603, align 4
+ %604 = add i32 %598, 32 ; <i32> [#uses=1]
+ %605 = inttoptr i32 %604 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=3]
+ %606 = getelementptr %struct.edge_rec* %605, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ store %struct.edge_rec* %605, %struct.edge_rec** %606, align 4
+ %607 = getelementptr %struct.edge_rec* %605, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ store %struct.VERTEX* %495, %struct.VERTEX** %607, align 4
+ %608 = getelementptr %struct.edge_rec* %602, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ store %struct.edge_rec* %600, %struct.edge_rec** %608, align 4
+ %609 = load %struct.edge_rec** %596, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %610 = ptrtoint %struct.edge_rec* %609 to i32 ; <i32> [#uses=2]
+ %611 = add i32 %610, 16 ; <i32> [#uses=1]
+ %612 = and i32 %611, 63 ; <i32> [#uses=1]
+ %613 = and i32 %610, -64 ; <i32> [#uses=1]
+ %614 = or i32 %612, %613 ; <i32> [#uses=1]
+ %615 = inttoptr i32 %614 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %616 = getelementptr %struct.edge_rec* %594, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=3]
+ %617 = load %struct.edge_rec** %616, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %618 = ptrtoint %struct.edge_rec* %617 to i32 ; <i32> [#uses=2]
+ %619 = add i32 %618, 16 ; <i32> [#uses=1]
+ %620 = and i32 %619, 63 ; <i32> [#uses=1]
+ %621 = and i32 %618, -64 ; <i32> [#uses=1]
+ %622 = or i32 %620, %621 ; <i32> [#uses=1]
+ %623 = inttoptr i32 %622 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %624 = getelementptr %struct.edge_rec* %623, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=2]
+ %625 = load %struct.edge_rec** %624, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %626 = getelementptr %struct.edge_rec* %615, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=2]
+ %627 = load %struct.edge_rec** %626, align 4 ; <%struct.edge_rec*> [#uses=1]
+ store %struct.edge_rec* %625, %struct.edge_rec** %626, align 4
+ store %struct.edge_rec* %627, %struct.edge_rec** %624, align 4
+ %628 = load %struct.edge_rec** %596, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %629 = load %struct.edge_rec** %616, align 4 ; <%struct.edge_rec*> [#uses=1]
+ store %struct.edge_rec* %628, %struct.edge_rec** %616, align 4
+ store %struct.edge_rec* %629, %struct.edge_rec** %596, align 4
+ %630 = xor i32 %598, 32 ; <i32> [#uses=2]
+ %631 = inttoptr i32 %630 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %632 = getelementptr %struct.edge_rec* %631, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=3]
+ %633 = load %struct.edge_rec** %632, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %634 = ptrtoint %struct.edge_rec* %633 to i32 ; <i32> [#uses=2]
+ %635 = add i32 %634, 16 ; <i32> [#uses=1]
+ %636 = and i32 %635, 63 ; <i32> [#uses=1]
+ %637 = and i32 %634, -64 ; <i32> [#uses=1]
+ %638 = or i32 %636, %637 ; <i32> [#uses=1]
+ %639 = inttoptr i32 %638 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %640 = getelementptr %struct.edge_rec* %174, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=3]
+ %641 = load %struct.edge_rec** %640, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %642 = ptrtoint %struct.edge_rec* %641 to i32 ; <i32> [#uses=2]
+ %643 = add i32 %642, 16 ; <i32> [#uses=1]
+ %644 = and i32 %643, 63 ; <i32> [#uses=1]
+ %645 = and i32 %642, -64 ; <i32> [#uses=1]
+ %646 = or i32 %644, %645 ; <i32> [#uses=1]
+ %647 = inttoptr i32 %646 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %648 = getelementptr %struct.edge_rec* %647, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=2]
+ %649 = load %struct.edge_rec** %648, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %650 = getelementptr %struct.edge_rec* %639, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=2]
+ %651 = load %struct.edge_rec** %650, align 4 ; <%struct.edge_rec*> [#uses=1]
+ store %struct.edge_rec* %649, %struct.edge_rec** %650, align 4
+ store %struct.edge_rec* %651, %struct.edge_rec** %648, align 4
+ %652 = load %struct.edge_rec** %632, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %653 = load %struct.edge_rec** %640, align 4 ; <%struct.edge_rec*> [#uses=1]
+ store %struct.edge_rec* %652, %struct.edge_rec** %640, align 4
+ store %struct.edge_rec* %653, %struct.edge_rec** %632, align 4
+ %654 = add i32 %630, 48 ; <i32> [#uses=1]
+ %655 = and i32 %654, 63 ; <i32> [#uses=1]
+ %656 = and i32 %598, -64 ; <i32> [#uses=1]
+ %657 = or i32 %655, %656 ; <i32> [#uses=1]
+ %658 = inttoptr i32 %657 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %659 = getelementptr %struct.edge_rec* %658, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ %660 = load %struct.edge_rec** %659, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %661 = ptrtoint %struct.edge_rec* %660 to i32 ; <i32> [#uses=2]
+ %662 = add i32 %661, 16 ; <i32> [#uses=1]
+ %663 = and i32 %662, 63 ; <i32> [#uses=1]
+ %664 = and i32 %661, -64 ; <i32> [#uses=1]
+ %665 = or i32 %663, %664 ; <i32> [#uses=1]
+ %666 = inttoptr i32 %665 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ br label %bb9.i
+
+bb25.i: ; preds = %bb23.i, %bb22.i
+ %667 = add i32 %172, 16 ; <i32> [#uses=1]
+ %668 = and i32 %667, 63 ; <i32> [#uses=1]
+ %669 = and i32 %172, -64 ; <i32> [#uses=1]
+ %670 = or i32 %668, %669 ; <i32> [#uses=1]
+ %671 = inttoptr i32 %670 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %672 = getelementptr %struct.edge_rec* %671, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ %673 = load %struct.edge_rec** %672, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %674 = ptrtoint %struct.edge_rec* %673 to i32 ; <i32> [#uses=2]
+ %675 = add i32 %674, 16 ; <i32> [#uses=1]
+ %676 = and i32 %675, 63 ; <i32> [#uses=1]
+ %677 = and i32 %674, -64 ; <i32> [#uses=1]
+ %678 = or i32 %676, %677 ; <i32> [#uses=1]
+ %679 = inttoptr i32 %678 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %680 = call arm_apcscc %struct.edge_rec* @alloc_edge() nounwind ; <%struct.edge_rec*> [#uses=4]
+ %681 = getelementptr %struct.edge_rec* %680, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=5]
+ store %struct.edge_rec* %680, %struct.edge_rec** %681, align 4
+ %682 = getelementptr %struct.edge_rec* %680, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ store %struct.VERTEX* %501, %struct.VERTEX** %682, align 4
+ %683 = ptrtoint %struct.edge_rec* %680 to i32 ; <i32> [#uses=4]
+ %684 = add i32 %683, 16 ; <i32> [#uses=1]
+ %685 = inttoptr i32 %684 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=2]
+ %686 = add i32 %683, 48 ; <i32> [#uses=1]
+ %687 = inttoptr i32 %686 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=2]
+ %688 = getelementptr %struct.edge_rec* %685, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ store %struct.edge_rec* %687, %struct.edge_rec** %688, align 4
+ %689 = add i32 %683, 32 ; <i32> [#uses=1]
+ %690 = inttoptr i32 %689 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=3]
+ %691 = getelementptr %struct.edge_rec* %690, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ store %struct.edge_rec* %690, %struct.edge_rec** %691, align 4
+ %692 = getelementptr %struct.edge_rec* %690, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ store %struct.VERTEX* %496, %struct.VERTEX** %692, align 4
+ %693 = getelementptr %struct.edge_rec* %687, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ store %struct.edge_rec* %685, %struct.edge_rec** %693, align 4
+ %694 = load %struct.edge_rec** %681, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %695 = ptrtoint %struct.edge_rec* %694 to i32 ; <i32> [#uses=2]
+ %696 = add i32 %695, 16 ; <i32> [#uses=1]
+ %697 = and i32 %696, 63 ; <i32> [#uses=1]
+ %698 = and i32 %695, -64 ; <i32> [#uses=1]
+ %699 = or i32 %697, %698 ; <i32> [#uses=1]
+ %700 = inttoptr i32 %699 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %701 = getelementptr %struct.edge_rec* %499, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=3]
+ %702 = load %struct.edge_rec** %701, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %703 = ptrtoint %struct.edge_rec* %702 to i32 ; <i32> [#uses=2]
+ %704 = add i32 %703, 16 ; <i32> [#uses=1]
+ %705 = and i32 %704, 63 ; <i32> [#uses=1]
+ %706 = and i32 %703, -64 ; <i32> [#uses=1]
+ %707 = or i32 %705, %706 ; <i32> [#uses=1]
+ %708 = inttoptr i32 %707 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %709 = getelementptr %struct.edge_rec* %708, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=2]
+ %710 = load %struct.edge_rec** %709, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %711 = getelementptr %struct.edge_rec* %700, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=2]
+ %712 = load %struct.edge_rec** %711, align 4 ; <%struct.edge_rec*> [#uses=1]
+ store %struct.edge_rec* %710, %struct.edge_rec** %711, align 4
+ store %struct.edge_rec* %712, %struct.edge_rec** %709, align 4
+ %713 = load %struct.edge_rec** %681, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %714 = load %struct.edge_rec** %701, align 4 ; <%struct.edge_rec*> [#uses=1]
+ store %struct.edge_rec* %713, %struct.edge_rec** %701, align 4
+ store %struct.edge_rec* %714, %struct.edge_rec** %681, align 4
+ %715 = xor i32 %683, 32 ; <i32> [#uses=1]
+ %716 = inttoptr i32 %715 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=2]
+ %717 = getelementptr %struct.edge_rec* %716, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=3]
+ %718 = load %struct.edge_rec** %717, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %719 = ptrtoint %struct.edge_rec* %718 to i32 ; <i32> [#uses=2]
+ %720 = add i32 %719, 16 ; <i32> [#uses=1]
+ %721 = and i32 %720, 63 ; <i32> [#uses=1]
+ %722 = and i32 %719, -64 ; <i32> [#uses=1]
+ %723 = or i32 %721, %722 ; <i32> [#uses=1]
+ %724 = inttoptr i32 %723 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %725 = getelementptr %struct.edge_rec* %679, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=3]
+ %726 = load %struct.edge_rec** %725, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %727 = ptrtoint %struct.edge_rec* %726 to i32 ; <i32> [#uses=2]
+ %728 = add i32 %727, 16 ; <i32> [#uses=1]
+ %729 = and i32 %728, 63 ; <i32> [#uses=1]
+ %730 = and i32 %727, -64 ; <i32> [#uses=1]
+ %731 = or i32 %729, %730 ; <i32> [#uses=1]
+ %732 = inttoptr i32 %731 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %733 = getelementptr %struct.edge_rec* %732, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=2]
+ %734 = load %struct.edge_rec** %733, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %735 = getelementptr %struct.edge_rec* %724, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=2]
+ %736 = load %struct.edge_rec** %735, align 4 ; <%struct.edge_rec*> [#uses=1]
+ store %struct.edge_rec* %734, %struct.edge_rec** %735, align 4
+ store %struct.edge_rec* %736, %struct.edge_rec** %733, align 4
+ %737 = load %struct.edge_rec** %717, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %738 = load %struct.edge_rec** %725, align 4 ; <%struct.edge_rec*> [#uses=1]
+ store %struct.edge_rec* %737, %struct.edge_rec** %725, align 4
+ store %struct.edge_rec* %738, %struct.edge_rec** %717, align 4
+ %739 = load %struct.edge_rec** %681, align 4 ; <%struct.edge_rec*> [#uses=1]
+ br label %bb9.i
+
+do_merge.exit: ; preds = %bb17.i
+ %740 = getelementptr %struct.edge_rec* %ldo_addr.0.ph.i, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ %741 = load %struct.VERTEX** %740, align 4 ; <%struct.VERTEX*> [#uses=1]
+ %742 = icmp eq %struct.VERTEX* %741, %tree_addr.0.i ; <i1> [#uses=1]
+ br i1 %742, label %bb5.loopexit, label %bb2
+
+bb2: ; preds = %bb2, %do_merge.exit
+ %ldo.07 = phi %struct.edge_rec* [ %747, %bb2 ], [ %ldo_addr.0.ph.i, %do_merge.exit ] ; <%struct.edge_rec*> [#uses=1]
+ %743 = ptrtoint %struct.edge_rec* %ldo.07 to i32 ; <i32> [#uses=1]
+ %744 = xor i32 %743, 32 ; <i32> [#uses=1]
+ %745 = inttoptr i32 %744 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %746 = getelementptr %struct.edge_rec* %745, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ %747 = load %struct.edge_rec** %746, align 4 ; <%struct.edge_rec*> [#uses=3]
+ %748 = getelementptr %struct.edge_rec* %747, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ %749 = load %struct.VERTEX** %748, align 4 ; <%struct.VERTEX*> [#uses=1]
+ %750 = icmp eq %struct.VERTEX* %749, %tree_addr.0.i ; <i1> [#uses=1]
+ br i1 %750, label %bb5.loopexit, label %bb2
+
+bb4: ; preds = %bb5.loopexit, %bb4
+ %rdo.05 = phi %struct.edge_rec* [ %755, %bb4 ], [ %rdo_addr.0.i, %bb5.loopexit ] ; <%struct.edge_rec*> [#uses=1]
+ %751 = getelementptr %struct.edge_rec* %rdo.05, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ %752 = load %struct.edge_rec** %751, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %753 = ptrtoint %struct.edge_rec* %752 to i32 ; <i32> [#uses=1]
+ %754 = xor i32 %753, 32 ; <i32> [#uses=1]
+ %755 = inttoptr i32 %754 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=3]
+ %756 = getelementptr %struct.edge_rec* %755, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ %757 = load %struct.VERTEX** %756, align 4 ; <%struct.VERTEX*> [#uses=1]
+ %758 = icmp eq %struct.VERTEX* %757, %extra ; <i1> [#uses=1]
+ br i1 %758, label %bb6, label %bb4
+
+bb5.loopexit: ; preds = %bb2, %do_merge.exit
+ %ldo.0.lcssa = phi %struct.edge_rec* [ %ldo_addr.0.ph.i, %do_merge.exit ], [ %747, %bb2 ] ; <%struct.edge_rec*> [#uses=1]
+ %759 = getelementptr %struct.edge_rec* %rdo_addr.0.i, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ %760 = load %struct.VERTEX** %759, align 4 ; <%struct.VERTEX*> [#uses=1]
+ %761 = icmp eq %struct.VERTEX* %760, %extra ; <i1> [#uses=1]
+ br i1 %761, label %bb6, label %bb4
+
+bb6: ; preds = %bb5.loopexit, %bb4
+ %rdo.0.lcssa = phi %struct.edge_rec* [ %rdo_addr.0.i, %bb5.loopexit ], [ %755, %bb4 ] ; <%struct.edge_rec*> [#uses=1]
+ %tmp16 = ptrtoint %struct.edge_rec* %ldo.0.lcssa to i32 ; <i32> [#uses=1]
+ %tmp4 = ptrtoint %struct.edge_rec* %rdo.0.lcssa to i32 ; <i32> [#uses=1]
+ br label %bb15
+
+bb7: ; preds = %bb
+ %762 = getelementptr %struct.VERTEX* %tree, i32 0, i32 1 ; <%struct.VERTEX**> [#uses=1]
+ %763 = load %struct.VERTEX** %762, align 4 ; <%struct.VERTEX*> [#uses=4]
+ %764 = icmp eq %struct.VERTEX* %763, null ; <i1> [#uses=1]
+ %765 = call arm_apcscc %struct.edge_rec* @alloc_edge() nounwind ; <%struct.edge_rec*> [#uses=5]
+ %766 = getelementptr %struct.edge_rec* %765, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=4]
+ store %struct.edge_rec* %765, %struct.edge_rec** %766, align 4
+ %767 = getelementptr %struct.edge_rec* %765, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=3]
+ br i1 %764, label %bb10, label %bb11
+
+bb8: ; preds = %entry
+ %768 = call arm_apcscc i32 @puts(i8* getelementptr ([21 x i8]* @_2E_str7, i32 0, i32 0)) nounwind ; <i32> [#uses=0]
+ call arm_apcscc void @exit(i32 -1) noreturn nounwind
+ unreachable
+
+bb10: ; preds = %bb7
+ store %struct.VERTEX* %tree, %struct.VERTEX** %767, align 4
+ %769 = ptrtoint %struct.edge_rec* %765 to i32 ; <i32> [#uses=5]
+ %770 = add i32 %769, 16 ; <i32> [#uses=1]
+ %771 = inttoptr i32 %770 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=2]
+ %772 = add i32 %769, 48 ; <i32> [#uses=1]
+ %773 = inttoptr i32 %772 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=2]
+ %774 = getelementptr %struct.edge_rec* %771, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ store %struct.edge_rec* %773, %struct.edge_rec** %774, align 4
+ %775 = add i32 %769, 32 ; <i32> [#uses=1]
+ %776 = inttoptr i32 %775 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=3]
+ %777 = getelementptr %struct.edge_rec* %776, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ store %struct.edge_rec* %776, %struct.edge_rec** %777, align 4
+ %778 = getelementptr %struct.edge_rec* %776, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ store %struct.VERTEX* %extra, %struct.VERTEX** %778, align 4
+ %779 = getelementptr %struct.edge_rec* %773, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ store %struct.edge_rec* %771, %struct.edge_rec** %779, align 4
+ %780 = xor i32 %769, 32 ; <i32> [#uses=1]
+ br label %bb15
+
+bb11: ; preds = %bb7
+ store %struct.VERTEX* %763, %struct.VERTEX** %767, align 4
+ %781 = ptrtoint %struct.edge_rec* %765 to i32 ; <i32> [#uses=6]
+ %782 = add i32 %781, 16 ; <i32> [#uses=1]
+ %783 = inttoptr i32 %782 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=2]
+ %784 = add i32 %781, 48 ; <i32> [#uses=1]
+ %785 = inttoptr i32 %784 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=2]
+ %786 = getelementptr %struct.edge_rec* %783, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ store %struct.edge_rec* %785, %struct.edge_rec** %786, align 4
+ %787 = add i32 %781, 32 ; <i32> [#uses=1]
+ %788 = inttoptr i32 %787 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=3]
+ %789 = getelementptr %struct.edge_rec* %788, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ store %struct.edge_rec* %788, %struct.edge_rec** %789, align 4
+ %790 = getelementptr %struct.edge_rec* %788, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ store %struct.VERTEX* %tree, %struct.VERTEX** %790, align 4
+ %791 = getelementptr %struct.edge_rec* %785, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ store %struct.edge_rec* %783, %struct.edge_rec** %791, align 4
+ %792 = call arm_apcscc %struct.edge_rec* @alloc_edge() nounwind ; <%struct.edge_rec*> [#uses=4]
+ %793 = getelementptr %struct.edge_rec* %792, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=4]
+ store %struct.edge_rec* %792, %struct.edge_rec** %793, align 4
+ %794 = getelementptr %struct.edge_rec* %792, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ store %struct.VERTEX* %tree, %struct.VERTEX** %794, align 4
+ %795 = ptrtoint %struct.edge_rec* %792 to i32 ; <i32> [#uses=5]
+ %796 = add i32 %795, 16 ; <i32> [#uses=1]
+ %797 = inttoptr i32 %796 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=2]
+ %798 = add i32 %795, 48 ; <i32> [#uses=2]
+ %799 = inttoptr i32 %798 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=2]
+ %800 = getelementptr %struct.edge_rec* %797, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ store %struct.edge_rec* %799, %struct.edge_rec** %800, align 4
+ %801 = add i32 %795, 32 ; <i32> [#uses=1]
+ %802 = inttoptr i32 %801 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=3]
+ %803 = getelementptr %struct.edge_rec* %802, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ store %struct.edge_rec* %802, %struct.edge_rec** %803, align 4
+ %804 = getelementptr %struct.edge_rec* %802, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ store %struct.VERTEX* %extra, %struct.VERTEX** %804, align 4
+ %805 = getelementptr %struct.edge_rec* %799, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ store %struct.edge_rec* %797, %struct.edge_rec** %805, align 4
+ %806 = xor i32 %781, 32 ; <i32> [#uses=1]
+ %807 = inttoptr i32 %806 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %808 = getelementptr %struct.edge_rec* %807, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=3]
+ %809 = load %struct.edge_rec** %808, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %810 = ptrtoint %struct.edge_rec* %809 to i32 ; <i32> [#uses=2]
+ %811 = add i32 %810, 16 ; <i32> [#uses=1]
+ %812 = and i32 %811, 63 ; <i32> [#uses=1]
+ %813 = and i32 %810, -64 ; <i32> [#uses=1]
+ %814 = or i32 %812, %813 ; <i32> [#uses=1]
+ %815 = inttoptr i32 %814 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %816 = load %struct.edge_rec** %793, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %817 = ptrtoint %struct.edge_rec* %816 to i32 ; <i32> [#uses=2]
+ %818 = add i32 %817, 16 ; <i32> [#uses=1]
+ %819 = and i32 %818, 63 ; <i32> [#uses=1]
+ %820 = and i32 %817, -64 ; <i32> [#uses=1]
+ %821 = or i32 %819, %820 ; <i32> [#uses=1]
+ %822 = inttoptr i32 %821 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %823 = getelementptr %struct.edge_rec* %822, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=2]
+ %824 = load %struct.edge_rec** %823, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %825 = getelementptr %struct.edge_rec* %815, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=2]
+ %826 = load %struct.edge_rec** %825, align 4 ; <%struct.edge_rec*> [#uses=1]
+ store %struct.edge_rec* %824, %struct.edge_rec** %825, align 4
+ store %struct.edge_rec* %826, %struct.edge_rec** %823, align 4
+ %827 = load %struct.edge_rec** %808, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %828 = load %struct.edge_rec** %793, align 4 ; <%struct.edge_rec*> [#uses=1]
+ store %struct.edge_rec* %827, %struct.edge_rec** %793, align 4
+ store %struct.edge_rec* %828, %struct.edge_rec** %808, align 4
+ %829 = xor i32 %795, 32 ; <i32> [#uses=3]
+ %830 = inttoptr i32 %829 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %831 = getelementptr %struct.edge_rec* %830, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ %832 = load %struct.VERTEX** %831, align 4 ; <%struct.VERTEX*> [#uses=1]
+ %833 = and i32 %798, 63 ; <i32> [#uses=1]
+ %834 = and i32 %795, -64 ; <i32> [#uses=1]
+ %835 = or i32 %833, %834 ; <i32> [#uses=1]
+ %836 = inttoptr i32 %835 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %837 = getelementptr %struct.edge_rec* %836, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ %838 = load %struct.edge_rec** %837, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %839 = ptrtoint %struct.edge_rec* %838 to i32 ; <i32> [#uses=2]
+ %840 = add i32 %839, 16 ; <i32> [#uses=1]
+ %841 = and i32 %840, 63 ; <i32> [#uses=1]
+ %842 = and i32 %839, -64 ; <i32> [#uses=1]
+ %843 = or i32 %841, %842 ; <i32> [#uses=1]
+ %844 = inttoptr i32 %843 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %845 = load %struct.VERTEX** %767, align 4 ; <%struct.VERTEX*> [#uses=1]
+ %846 = call arm_apcscc %struct.edge_rec* @alloc_edge() nounwind ; <%struct.edge_rec*> [#uses=4]
+ %847 = getelementptr %struct.edge_rec* %846, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=7]
+ store %struct.edge_rec* %846, %struct.edge_rec** %847, align 4
+ %848 = getelementptr %struct.edge_rec* %846, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ store %struct.VERTEX* %832, %struct.VERTEX** %848, align 4
+ %849 = ptrtoint %struct.edge_rec* %846 to i32 ; <i32> [#uses=6]
+ %850 = add i32 %849, 16 ; <i32> [#uses=2]
+ %851 = inttoptr i32 %850 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=2]
+ %852 = add i32 %849, 48 ; <i32> [#uses=1]
+ %853 = inttoptr i32 %852 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=2]
+ %854 = getelementptr %struct.edge_rec* %851, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ store %struct.edge_rec* %853, %struct.edge_rec** %854, align 4
+ %855 = add i32 %849, 32 ; <i32> [#uses=1]
+ %856 = inttoptr i32 %855 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=3]
+ %857 = getelementptr %struct.edge_rec* %856, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ store %struct.edge_rec* %856, %struct.edge_rec** %857, align 4
+ %858 = getelementptr %struct.edge_rec* %856, i32 0, i32 0 ; <%struct.VERTEX**> [#uses=1]
+ store %struct.VERTEX* %845, %struct.VERTEX** %858, align 4
+ %859 = getelementptr %struct.edge_rec* %853, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ store %struct.edge_rec* %851, %struct.edge_rec** %859, align 4
+ %860 = load %struct.edge_rec** %847, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %861 = ptrtoint %struct.edge_rec* %860 to i32 ; <i32> [#uses=2]
+ %862 = add i32 %861, 16 ; <i32> [#uses=1]
+ %863 = and i32 %862, 63 ; <i32> [#uses=1]
+ %864 = and i32 %861, -64 ; <i32> [#uses=1]
+ %865 = or i32 %863, %864 ; <i32> [#uses=1]
+ %866 = inttoptr i32 %865 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %867 = getelementptr %struct.edge_rec* %844, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=3]
+ %868 = load %struct.edge_rec** %867, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %869 = ptrtoint %struct.edge_rec* %868 to i32 ; <i32> [#uses=2]
+ %870 = add i32 %869, 16 ; <i32> [#uses=1]
+ %871 = and i32 %870, 63 ; <i32> [#uses=1]
+ %872 = and i32 %869, -64 ; <i32> [#uses=1]
+ %873 = or i32 %871, %872 ; <i32> [#uses=1]
+ %874 = inttoptr i32 %873 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %875 = getelementptr %struct.edge_rec* %874, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=2]
+ %876 = load %struct.edge_rec** %875, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %877 = getelementptr %struct.edge_rec* %866, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=2]
+ %878 = load %struct.edge_rec** %877, align 4 ; <%struct.edge_rec*> [#uses=1]
+ store %struct.edge_rec* %876, %struct.edge_rec** %877, align 4
+ store %struct.edge_rec* %878, %struct.edge_rec** %875, align 4
+ %879 = load %struct.edge_rec** %847, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %880 = load %struct.edge_rec** %867, align 4 ; <%struct.edge_rec*> [#uses=1]
+ store %struct.edge_rec* %879, %struct.edge_rec** %867, align 4
+ store %struct.edge_rec* %880, %struct.edge_rec** %847, align 4
+ %881 = xor i32 %849, 32 ; <i32> [#uses=3]
+ %882 = inttoptr i32 %881 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %883 = getelementptr %struct.edge_rec* %882, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=6]
+ %884 = load %struct.edge_rec** %883, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %885 = ptrtoint %struct.edge_rec* %884 to i32 ; <i32> [#uses=2]
+ %886 = add i32 %885, 16 ; <i32> [#uses=1]
+ %887 = and i32 %886, 63 ; <i32> [#uses=1]
+ %888 = and i32 %885, -64 ; <i32> [#uses=1]
+ %889 = or i32 %887, %888 ; <i32> [#uses=1]
+ %890 = inttoptr i32 %889 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %891 = load %struct.edge_rec** %766, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %892 = ptrtoint %struct.edge_rec* %891 to i32 ; <i32> [#uses=2]
+ %893 = add i32 %892, 16 ; <i32> [#uses=1]
+ %894 = and i32 %893, 63 ; <i32> [#uses=1]
+ %895 = and i32 %892, -64 ; <i32> [#uses=1]
+ %896 = or i32 %894, %895 ; <i32> [#uses=1]
+ %897 = inttoptr i32 %896 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %898 = getelementptr %struct.edge_rec* %897, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=2]
+ %899 = load %struct.edge_rec** %898, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %900 = getelementptr %struct.edge_rec* %890, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=2]
+ %901 = load %struct.edge_rec** %900, align 4 ; <%struct.edge_rec*> [#uses=1]
+ store %struct.edge_rec* %899, %struct.edge_rec** %900, align 4
+ store %struct.edge_rec* %901, %struct.edge_rec** %898, align 4
+ %902 = load %struct.edge_rec** %883, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %903 = load %struct.edge_rec** %766, align 4 ; <%struct.edge_rec*> [#uses=1]
+ store %struct.edge_rec* %902, %struct.edge_rec** %766, align 4
+ store %struct.edge_rec* %903, %struct.edge_rec** %883, align 4
+ %904 = getelementptr %struct.VERTEX* %763, i32 0, i32 0, i32 0 ; <double*> [#uses=1]
+ %905 = load double* %904, align 4 ; <double> [#uses=2]
+ %906 = getelementptr %struct.VERTEX* %763, i32 0, i32 0, i32 1 ; <double*> [#uses=1]
+ %907 = load double* %906, align 4 ; <double> [#uses=2]
+ %908 = getelementptr %struct.VERTEX* %extra, i32 0, i32 0, i32 0 ; <double*> [#uses=1]
+ %909 = load double* %908, align 4 ; <double> [#uses=3]
+ %910 = getelementptr %struct.VERTEX* %extra, i32 0, i32 0, i32 1 ; <double*> [#uses=1]
+ %911 = load double* %910, align 4 ; <double> [#uses=3]
+ %912 = getelementptr %struct.VERTEX* %tree, i32 0, i32 0, i32 0 ; <double*> [#uses=1]
+ %913 = load double* %912, align 4 ; <double> [#uses=3]
+ %914 = getelementptr %struct.VERTEX* %tree, i32 0, i32 0, i32 1 ; <double*> [#uses=1]
+ %915 = load double* %914, align 4 ; <double> [#uses=3]
+ %916 = fsub double %905, %913 ; <double> [#uses=1]
+ %917 = fsub double %911, %915 ; <double> [#uses=1]
+ %918 = fmul double %916, %917 ; <double> [#uses=1]
+ %919 = fsub double %909, %913 ; <double> [#uses=1]
+ %920 = fsub double %907, %915 ; <double> [#uses=1]
+ %921 = fmul double %919, %920 ; <double> [#uses=1]
+ %922 = fsub double %918, %921 ; <double> [#uses=1]
+ %923 = fcmp ogt double %922, 0.000000e+00 ; <i1> [#uses=1]
+ br i1 %923, label %bb15, label %bb13
+
+bb13: ; preds = %bb11
+ %924 = fsub double %905, %909 ; <double> [#uses=1]
+ %925 = fsub double %915, %911 ; <double> [#uses=1]
+ %926 = fmul double %924, %925 ; <double> [#uses=1]
+ %927 = fsub double %913, %909 ; <double> [#uses=1]
+ %928 = fsub double %907, %911 ; <double> [#uses=1]
+ %929 = fmul double %927, %928 ; <double> [#uses=1]
+ %930 = fsub double %926, %929 ; <double> [#uses=1]
+ %931 = fcmp ogt double %930, 0.000000e+00 ; <i1> [#uses=1]
+ br i1 %931, label %bb15, label %bb14
+
+bb14: ; preds = %bb13
+ %932 = and i32 %850, 63 ; <i32> [#uses=1]
+ %933 = and i32 %849, -64 ; <i32> [#uses=3]
+ %934 = or i32 %932, %933 ; <i32> [#uses=1]
+ %935 = inttoptr i32 %934 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %936 = getelementptr %struct.edge_rec* %935, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ %937 = load %struct.edge_rec** %936, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %938 = ptrtoint %struct.edge_rec* %937 to i32 ; <i32> [#uses=2]
+ %939 = add i32 %938, 16 ; <i32> [#uses=1]
+ %940 = and i32 %939, 63 ; <i32> [#uses=1]
+ %941 = and i32 %938, -64 ; <i32> [#uses=1]
+ %942 = or i32 %940, %941 ; <i32> [#uses=1]
+ %943 = inttoptr i32 %942 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %944 = load %struct.edge_rec** %847, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %945 = ptrtoint %struct.edge_rec* %944 to i32 ; <i32> [#uses=2]
+ %946 = add i32 %945, 16 ; <i32> [#uses=1]
+ %947 = and i32 %946, 63 ; <i32> [#uses=1]
+ %948 = and i32 %945, -64 ; <i32> [#uses=1]
+ %949 = or i32 %947, %948 ; <i32> [#uses=1]
+ %950 = inttoptr i32 %949 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %951 = getelementptr %struct.edge_rec* %943, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=3]
+ %952 = load %struct.edge_rec** %951, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %953 = ptrtoint %struct.edge_rec* %952 to i32 ; <i32> [#uses=2]
+ %954 = add i32 %953, 16 ; <i32> [#uses=1]
+ %955 = and i32 %954, 63 ; <i32> [#uses=1]
+ %956 = and i32 %953, -64 ; <i32> [#uses=1]
+ %957 = or i32 %955, %956 ; <i32> [#uses=1]
+ %958 = inttoptr i32 %957 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %959 = getelementptr %struct.edge_rec* %958, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=2]
+ %960 = load %struct.edge_rec** %959, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %961 = getelementptr %struct.edge_rec* %950, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=2]
+ %962 = load %struct.edge_rec** %961, align 4 ; <%struct.edge_rec*> [#uses=1]
+ store %struct.edge_rec* %960, %struct.edge_rec** %961, align 4
+ store %struct.edge_rec* %962, %struct.edge_rec** %959, align 4
+ %963 = load %struct.edge_rec** %847, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %964 = load %struct.edge_rec** %951, align 4 ; <%struct.edge_rec*> [#uses=1]
+ store %struct.edge_rec* %963, %struct.edge_rec** %951, align 4
+ store %struct.edge_rec* %964, %struct.edge_rec** %847, align 4
+ %965 = add i32 %881, 16 ; <i32> [#uses=1]
+ %966 = and i32 %965, 63 ; <i32> [#uses=1]
+ %967 = or i32 %966, %933 ; <i32> [#uses=1]
+ %968 = inttoptr i32 %967 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %969 = getelementptr %struct.edge_rec* %968, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ %970 = load %struct.edge_rec** %969, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %971 = ptrtoint %struct.edge_rec* %970 to i32 ; <i32> [#uses=2]
+ %972 = add i32 %971, 16 ; <i32> [#uses=1]
+ %973 = and i32 %972, 63 ; <i32> [#uses=1]
+ %974 = and i32 %971, -64 ; <i32> [#uses=1]
+ %975 = or i32 %973, %974 ; <i32> [#uses=1]
+ %976 = inttoptr i32 %975 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %977 = load %struct.edge_rec** %883, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %978 = ptrtoint %struct.edge_rec* %977 to i32 ; <i32> [#uses=2]
+ %979 = add i32 %978, 16 ; <i32> [#uses=1]
+ %980 = and i32 %979, 63 ; <i32> [#uses=1]
+ %981 = and i32 %978, -64 ; <i32> [#uses=1]
+ %982 = or i32 %980, %981 ; <i32> [#uses=1]
+ %983 = inttoptr i32 %982 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %984 = getelementptr %struct.edge_rec* %976, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=3]
+ %985 = load %struct.edge_rec** %984, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %986 = ptrtoint %struct.edge_rec* %985 to i32 ; <i32> [#uses=2]
+ %987 = add i32 %986, 16 ; <i32> [#uses=1]
+ %988 = and i32 %987, 63 ; <i32> [#uses=1]
+ %989 = and i32 %986, -64 ; <i32> [#uses=1]
+ %990 = or i32 %988, %989 ; <i32> [#uses=1]
+ %991 = inttoptr i32 %990 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=1]
+ %992 = getelementptr %struct.edge_rec* %991, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=2]
+ %993 = load %struct.edge_rec** %992, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %994 = getelementptr %struct.edge_rec* %983, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=2]
+ %995 = load %struct.edge_rec** %994, align 4 ; <%struct.edge_rec*> [#uses=1]
+ store %struct.edge_rec* %993, %struct.edge_rec** %994, align 4
+ store %struct.edge_rec* %995, %struct.edge_rec** %992, align 4
+ %996 = load %struct.edge_rec** %883, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %997 = load %struct.edge_rec** %984, align 4 ; <%struct.edge_rec*> [#uses=1]
+ store %struct.edge_rec* %996, %struct.edge_rec** %984, align 4
+ store %struct.edge_rec* %997, %struct.edge_rec** %883, align 4
+ %998 = inttoptr i32 %933 to %struct.edge_rec* ; <%struct.edge_rec*> [#uses=2]
+ %999 = load %struct.edge_rec** @avail_edge, align 4 ; <%struct.edge_rec*> [#uses=1]
+ %1000 = getelementptr %struct.edge_rec* %998, i32 0, i32 1 ; <%struct.edge_rec**> [#uses=1]
+ store %struct.edge_rec* %999, %struct.edge_rec** %1000, align 4
+ store %struct.edge_rec* %998, %struct.edge_rec** @avail_edge, align 4
+ br label %bb15
+
+bb15: ; preds = %bb14, %bb13, %bb11, %bb10, %bb6
+ %retval.1.0 = phi i32 [ %780, %bb10 ], [ %829, %bb13 ], [ %829, %bb14 ], [ %tmp4, %bb6 ], [ %849, %bb11 ] ; <i32> [#uses=1]
+ %retval.0.0 = phi i32 [ %769, %bb10 ], [ %781, %bb13 ], [ %781, %bb14 ], [ %tmp16, %bb6 ], [ %881, %bb11 ] ; <i32> [#uses=1]
+ %agg.result162 = bitcast %struct.EDGE_PAIR* %agg.result to i64* ; <i64*> [#uses=1]
+ %1001 = zext i32 %retval.0.0 to i64 ; <i64> [#uses=1]
+ %1002 = zext i32 %retval.1.0 to i64 ; <i64> [#uses=1]
+ %1003 = shl i64 %1002, 32 ; <i64> [#uses=1]
+ %1004 = or i64 %1003, %1001 ; <i64> [#uses=1]
+ store i64 %1004, i64* %agg.result162, align 4
+ ret void
+}
+
+declare arm_apcscc i32 @puts(i8* nocapture) nounwind
+
+declare arm_apcscc void @exit(i32) noreturn nounwind
+
+declare arm_apcscc %struct.edge_rec* @alloc_edge() nounwind
diff --git a/test/CodeGen/ARM/2009-07-22-ScavengerAssert.ll b/test/CodeGen/ARM/2009-07-22-ScavengerAssert.ll
new file mode 100644
index 0000000..b4b989b
--- /dev/null
+++ b/test/CodeGen/ARM/2009-07-22-ScavengerAssert.ll
@@ -0,0 +1,94 @@
+; RUN: llc < %s -mtriple=armv6-apple-darwin10
+
+ %struct.cli_ac_alt = type { i8, i8*, i16, i16, %struct.cli_ac_alt* }
+ %struct.cli_ac_node = type { i8, i8, %struct.cli_ac_patt*, %struct.cli_ac_node**, %struct.cli_ac_node* }
+ %struct.cli_ac_patt = type { i16*, i16*, i16, i16, i8, i32, i32, i8*, i8*, i32, i16, i16, i16, i16, %struct.cli_ac_alt**, i8, i16, %struct.cli_ac_patt*, %struct.cli_ac_patt* }
+ %struct.cli_bm_patt = type { i8*, i8*, i16, i16, i8*, i8*, i8, %struct.cli_bm_patt*, i16 }
+ %struct.cli_matcher = type { i16, i8, i8*, %struct.cli_bm_patt**, i32*, i32, i8, i8, %struct.cli_ac_node*, %struct.cli_ac_node**, %struct.cli_ac_patt**, i32, i32, i32 }
+
+declare arm_apcscc i32 @strlen(i8* nocapture) nounwind readonly
+
+define arm_apcscc i32 @cli_ac_addsig(%struct.cli_matcher* nocapture %root, i8* %virname, i8* %hexsig, i32 %sigid, i16 zeroext %parts, i16 zeroext %partno, i16 zeroext %type, i32 %mindist, i32 %maxdist, i8* %offset, i8 zeroext %target) nounwind {
+entry:
+ br i1 undef, label %bb126, label %bb1
+
+bb1: ; preds = %entry
+ br i1 undef, label %cli_calloc.exit.thread, label %cli_calloc.exit
+
+cli_calloc.exit.thread: ; preds = %bb1
+ ret i32 -114
+
+cli_calloc.exit: ; preds = %bb1
+ store i16 %parts, i16* undef, align 4
+ br i1 undef, label %bb52, label %bb4
+
+bb4: ; preds = %cli_calloc.exit
+ br i1 undef, label %bb.i, label %bb1.i3
+
+bb.i: ; preds = %bb4
+ unreachable
+
+bb1.i3: ; preds = %bb4
+ br i1 undef, label %bb2.i4, label %cli_strdup.exit
+
+bb2.i4: ; preds = %bb1.i3
+ ret i32 -114
+
+cli_strdup.exit: ; preds = %bb1.i3
+ br i1 undef, label %cli_calloc.exit54.thread, label %cli_calloc.exit54
+
+cli_calloc.exit54.thread: ; preds = %cli_strdup.exit
+ ret i32 -114
+
+cli_calloc.exit54: ; preds = %cli_strdup.exit
+ br label %bb45
+
+cli_calloc.exit70.thread: ; preds = %bb45
+ unreachable
+
+cli_calloc.exit70: ; preds = %bb45
+ br i1 undef, label %bb.i83, label %bb1.i84
+
+bb.i83: ; preds = %cli_calloc.exit70
+ unreachable
+
+bb1.i84: ; preds = %cli_calloc.exit70
+ br i1 undef, label %bb2.i85, label %bb17
+
+bb2.i85: ; preds = %bb1.i84
+ unreachable
+
+bb17: ; preds = %bb1.i84
+ br i1 undef, label %bb22, label %bb.nph
+
+bb.nph: ; preds = %bb17
+ br label %bb18
+
+bb18: ; preds = %bb18, %bb.nph
+ br i1 undef, label %bb18, label %bb22
+
+bb22: ; preds = %bb18, %bb17
+ br i1 undef, label %bb25, label %bb43.preheader
+
+bb43.preheader: ; preds = %bb22
+ br i1 undef, label %bb28, label %bb45
+
+bb25: ; preds = %bb22
+ unreachable
+
+bb28: ; preds = %bb43.preheader
+ unreachable
+
+bb45: ; preds = %bb43.preheader, %cli_calloc.exit54
+ br i1 undef, label %cli_calloc.exit70.thread, label %cli_calloc.exit70
+
+bb52: ; preds = %cli_calloc.exit
+ %0 = load i16* undef, align 4 ; <i16> [#uses=1]
+ %1 = icmp eq i16 %0, 0 ; <i1> [#uses=1]
+ %iftmp.20.0 = select i1 %1, i8* %hexsig, i8* null ; <i8*> [#uses=1]
+ %2 = tail call arm_apcscc i32 @strlen(i8* %iftmp.20.0) nounwind readonly ; <i32> [#uses=0]
+ unreachable
+
+bb126: ; preds = %entry
+ ret i32 -117
+}
diff --git a/test/CodeGen/ARM/2009-07-22-SchedulerAssert.ll b/test/CodeGen/ARM/2009-07-22-SchedulerAssert.ll
new file mode 100644
index 0000000..24f4990
--- /dev/null
+++ b/test/CodeGen/ARM/2009-07-22-SchedulerAssert.ll
@@ -0,0 +1,95 @@
+; RUN: llc < %s -march=arm
+
+ %struct.cli_ac_alt = type { i8, i8*, i16, i16, %struct.cli_ac_alt* }
+ %struct.cli_ac_node = type { i8, i8, %struct.cli_ac_patt*, %struct.cli_ac_node**, %struct.cli_ac_node* }
+ %struct.cli_ac_patt = type { i16*, i16*, i16, i16, i8, i32, i32, i8*, i8*, i32, i16, i16, i16, i16, %struct.cli_ac_alt**, i8, i16, %struct.cli_ac_patt*, %struct.cli_ac_patt* }
+ %struct.cli_bm_patt = type { i8*, i8*, i16, i16, i8*, i8*, i8, %struct.cli_bm_patt*, i16 }
+ %struct.cli_matcher = type { i16, i8, i8*, %struct.cli_bm_patt**, i32*, i32, i8, i8, %struct.cli_ac_node*, %struct.cli_ac_node**, %struct.cli_ac_patt**, i32, i32, i32 }
+
+define arm_apcscc i32 @cli_ac_addsig(%struct.cli_matcher* nocapture %root, i8* %virname, i8* %hexsig, i32 %sigid, i16 zeroext %parts, i16 zeroext %partno, i16 zeroext %type, i32 %mindist, i32 %maxdist, i8* %offset, i8 zeroext %target) nounwind {
+entry:
+ br i1 undef, label %bb126, label %bb1
+
+bb1: ; preds = %entry
+ br i1 undef, label %cli_calloc.exit.thread, label %cli_calloc.exit
+
+cli_calloc.exit.thread: ; preds = %bb1
+ ret i32 -114
+
+cli_calloc.exit: ; preds = %bb1
+ br i1 undef, label %bb52, label %bb4
+
+bb4: ; preds = %cli_calloc.exit
+ br i1 undef, label %bb.i, label %bb1.i3
+
+bb.i: ; preds = %bb4
+ unreachable
+
+bb1.i3: ; preds = %bb4
+ br i1 undef, label %bb2.i4, label %cli_strdup.exit
+
+bb2.i4: ; preds = %bb1.i3
+ ret i32 -114
+
+cli_strdup.exit: ; preds = %bb1.i3
+ br i1 undef, label %cli_calloc.exit54.thread, label %cli_calloc.exit54
+
+cli_calloc.exit54.thread: ; preds = %cli_strdup.exit
+ ret i32 -114
+
+cli_calloc.exit54: ; preds = %cli_strdup.exit
+ br label %bb45
+
+cli_calloc.exit70.thread: ; preds = %bb45
+ unreachable
+
+cli_calloc.exit70: ; preds = %bb45
+ br i1 undef, label %bb.i83, label %bb1.i84
+
+bb.i83: ; preds = %cli_calloc.exit70
+ unreachable
+
+bb1.i84: ; preds = %cli_calloc.exit70
+ br i1 undef, label %bb2.i85, label %bb17
+
+bb2.i85: ; preds = %bb1.i84
+ unreachable
+
+bb17: ; preds = %bb1.i84
+ br i1 undef, label %bb22, label %bb.nph
+
+bb.nph: ; preds = %bb17
+ br label %bb18
+
+bb18: ; preds = %bb18, %bb.nph
+ br i1 undef, label %bb18, label %bb22
+
+bb22: ; preds = %bb18, %bb17
+ %0 = getelementptr i8* null, i32 10 ; <i8*> [#uses=1]
+ %1 = bitcast i8* %0 to i16* ; <i16*> [#uses=1]
+ %2 = load i16* %1, align 2 ; <i16> [#uses=1]
+ %3 = add i16 %2, 1 ; <i16> [#uses=1]
+ %4 = zext i16 %3 to i32 ; <i32> [#uses=1]
+ %5 = mul i32 %4, 3 ; <i32> [#uses=1]
+ %6 = add i32 %5, -1 ; <i32> [#uses=1]
+ %7 = icmp eq i32 %6, undef ; <i1> [#uses=1]
+ br i1 %7, label %bb25, label %bb43.preheader
+
+bb43.preheader: ; preds = %bb22
+ br i1 undef, label %bb28, label %bb45
+
+bb25: ; preds = %bb22
+ unreachable
+
+bb28: ; preds = %bb43.preheader
+ unreachable
+
+bb45: ; preds = %bb43.preheader, %cli_calloc.exit54
+ br i1 undef, label %cli_calloc.exit70.thread, label %cli_calloc.exit70
+
+bb52: ; preds = %cli_calloc.exit
+ unreachable
+
+bb126: ; preds = %entry
+ ret i32 -117
+}
diff --git a/test/CodeGen/ARM/2009-07-29-VFP3Registers.ll b/test/CodeGen/ARM/2009-07-29-VFP3Registers.ll
new file mode 100644
index 0000000..e1d19d1
--- /dev/null
+++ b/test/CodeGen/ARM/2009-07-29-VFP3Registers.ll
@@ -0,0 +1,108 @@
+; RUN: llc < %s -mtriple=armv7-apple-darwin10 -mattr=+vfp3
+
+@a = external global double ; <double*> [#uses=1]
+
+declare double @llvm.exp.f64(double) nounwind readonly
+
+define arm_apcscc void @findratio(double* nocapture %res1, double* nocapture %res2) nounwind {
+entry:
+ br label %bb
+
+bb: ; preds = %bb, %entry
+ br i1 undef, label %bb28, label %bb
+
+bb28: ; preds = %bb
+ %0 = load double* @a, align 4 ; <double> [#uses=2]
+ %1 = fadd double %0, undef ; <double> [#uses=2]
+ br i1 undef, label %bb59, label %bb60
+
+bb59: ; preds = %bb28
+ %2 = fsub double -0.000000e+00, undef ; <double> [#uses=2]
+ br label %bb61
+
+bb60: ; preds = %bb28
+ %3 = tail call double @llvm.exp.f64(double undef) nounwind ; <double> [#uses=1]
+ %4 = fsub double -0.000000e+00, %3 ; <double> [#uses=2]
+ %5 = fsub double -0.000000e+00, undef ; <double> [#uses=1]
+ %6 = fsub double -0.000000e+00, undef ; <double> [#uses=1]
+ br label %bb61
+
+bb61: ; preds = %bb60, %bb59
+ %.pn201 = phi double [ undef, %bb59 ], [ undef, %bb60 ] ; <double> [#uses=1]
+ %.pn111 = phi double [ undef, %bb59 ], [ undef, %bb60 ] ; <double> [#uses=1]
+ %.pn452 = phi double [ undef, %bb59 ], [ undef, %bb60 ] ; <double> [#uses=1]
+ %.pn85 = phi double [ undef, %bb59 ], [ undef, %bb60 ] ; <double> [#uses=1]
+ %.pn238 = phi double [ 0.000000e+00, %bb59 ], [ 0.000000e+00, %bb60 ] ; <double> [#uses=1]
+ %.pn39 = phi double [ undef, %bb59 ], [ undef, %bb60 ] ; <double> [#uses=1]
+ %.pn230 = phi double [ undef, %bb59 ], [ undef, %bb60 ] ; <double> [#uses=1]
+ %.pn228 = phi double [ 0.000000e+00, %bb59 ], [ undef, %bb60 ] ; <double> [#uses=1]
+ %.pn224 = phi double [ undef, %bb59 ], [ undef, %bb60 ] ; <double> [#uses=1]
+ %.pn222 = phi double [ 0.000000e+00, %bb59 ], [ undef, %bb60 ] ; <double> [#uses=1]
+ %.pn218 = phi double [ %2, %bb59 ], [ %4, %bb60 ] ; <double> [#uses=1]
+ %.pn214 = phi double [ 0.000000e+00, %bb59 ], [ undef, %bb60 ] ; <double> [#uses=1]
+ %.pn212 = phi double [ %2, %bb59 ], [ %4, %bb60 ] ; <double> [#uses=1]
+ %.pn213 = phi double [ undef, %bb59 ], [ undef, %bb60 ] ; <double> [#uses=1]
+ %.pn210 = phi double [ undef, %bb59 ], [ %5, %bb60 ] ; <double> [#uses=1]
+ %.pn202 = phi double [ undef, %bb59 ], [ %6, %bb60 ] ; <double> [#uses=0]
+ %.pn390 = fdiv double %.pn452, undef ; <double> [#uses=0]
+ %.pn145 = fdiv double %.pn238, %1 ; <double> [#uses=0]
+ %.pn138 = fdiv double %.pn230, undef ; <double> [#uses=1]
+ %.pn139 = fdiv double %.pn228, undef ; <double> [#uses=1]
+ %.pn134 = fdiv double %.pn224, %0 ; <double> [#uses=1]
+ %.pn135 = fdiv double %.pn222, %1 ; <double> [#uses=1]
+ %.pn133 = fdiv double %.pn218, undef ; <double> [#uses=0]
+ %.pn128 = fdiv double %.pn214, undef ; <double> [#uses=1]
+ %.pn129 = fdiv double %.pn212, %.pn213 ; <double> [#uses=1]
+ %.pn126 = fdiv double %.pn210, undef ; <double> [#uses=0]
+ %.pn54.in = fmul double undef, %.pn201 ; <double> [#uses=1]
+ %.pn42.in = fmul double undef, undef ; <double> [#uses=1]
+ %.pn76 = fsub double %.pn138, %.pn139 ; <double> [#uses=1]
+ %.pn74 = fsub double %.pn134, %.pn135 ; <double> [#uses=1]
+ %.pn70 = fsub double %.pn128, %.pn129 ; <double> [#uses=1]
+ %.pn54 = fdiv double %.pn54.in, 6.000000e+00 ; <double> [#uses=1]
+ %.pn64 = fmul double undef, 0x3FE5555555555555 ; <double> [#uses=1]
+ %.pn65 = fmul double undef, undef ; <double> [#uses=1]
+ %.pn50 = fmul double undef, %.pn111 ; <double> [#uses=0]
+ %.pn42 = fdiv double %.pn42.in, 6.000000e+00 ; <double> [#uses=1]
+ %.pn40 = fmul double undef, %.pn85 ; <double> [#uses=0]
+ %.pn56 = fadd double %.pn76, undef ; <double> [#uses=1]
+ %.pn57 = fmul double %.pn74, undef ; <double> [#uses=1]
+ %.pn36 = fadd double undef, undef ; <double> [#uses=1]
+ %.pn37 = fmul double %.pn70, undef ; <double> [#uses=1]
+ %.pn33 = fmul double undef, 0x3FC5555555555555 ; <double> [#uses=1]
+ %.pn29 = fsub double %.pn64, %.pn65 ; <double> [#uses=1]
+ %.pn21 = fadd double undef, undef ; <double> [#uses=1]
+ %.pn27 = fmul double undef, 0x3FC5555555555555 ; <double> [#uses=1]
+ %.pn11 = fadd double %.pn56, %.pn57 ; <double> [#uses=1]
+ %.pn32 = fmul double %.pn54, undef ; <double> [#uses=1]
+ %.pn26 = fmul double %.pn42, undef ; <double> [#uses=1]
+ %.pn15 = fmul double 0.000000e+00, %.pn39 ; <double> [#uses=1]
+ %.pn7 = fadd double %.pn36, %.pn37 ; <double> [#uses=1]
+ %.pn30 = fsub double %.pn32, %.pn33 ; <double> [#uses=1]
+ %.pn28 = fadd double %.pn30, 0.000000e+00 ; <double> [#uses=1]
+ %.pn24 = fsub double %.pn28, %.pn29 ; <double> [#uses=1]
+ %.pn22 = fsub double %.pn26, %.pn27 ; <double> [#uses=1]
+ %.pn20 = fadd double %.pn24, undef ; <double> [#uses=1]
+ %.pn18 = fadd double %.pn22, 0.000000e+00 ; <double> [#uses=1]
+ %.pn16 = fsub double %.pn20, %.pn21 ; <double> [#uses=1]
+ %.pn14 = fsub double %.pn18, undef ; <double> [#uses=1]
+ %.pn12 = fadd double %.pn16, undef ; <double> [#uses=1]
+ %.pn10 = fadd double %.pn14, %.pn15 ; <double> [#uses=1]
+ %.pn8 = fsub double %.pn12, undef ; <double> [#uses=1]
+ %.pn6 = fsub double %.pn10, %.pn11 ; <double> [#uses=1]
+ %.pn4 = fadd double %.pn8, undef ; <double> [#uses=1]
+ %.pn2 = fadd double %.pn6, %.pn7 ; <double> [#uses=1]
+ %N1.0 = fsub double %.pn4, undef ; <double> [#uses=1]
+ %D1.0 = fsub double %.pn2, undef ; <double> [#uses=2]
+ br i1 undef, label %bb62, label %bb64
+
+bb62: ; preds = %bb61
+ %7 = fadd double %D1.0, undef ; <double> [#uses=1]
+ br label %bb64
+
+bb64: ; preds = %bb62, %bb61
+ %.pn = phi double [ undef, %bb62 ], [ %N1.0, %bb61 ] ; <double> [#uses=1]
+ %.pn1 = phi double [ %7, %bb62 ], [ %D1.0, %bb61 ] ; <double> [#uses=1]
+ %x.1 = fdiv double %.pn, %.pn1 ; <double> [#uses=0]
+ ret void
+}
diff --git a/test/CodeGen/ARM/2009-08-02-RegScavengerAssert-Neon.ll b/test/CodeGen/ARM/2009-08-02-RegScavengerAssert-Neon.ll
new file mode 100644
index 0000000..2d4e58d
--- /dev/null
+++ b/test/CodeGen/ARM/2009-08-02-RegScavengerAssert-Neon.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -march=arm -mattr=+neon
+; PR4657
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:64"
+target triple = "armv7-apple-darwin9"
+
+define arm_apcscc <4 x i32> @scale(<4 x i32> %v, i32 %f) nounwind {
+entry:
+ %v_addr = alloca <4 x i32> ; <<4 x i32>*> [#uses=2]
+ %f_addr = alloca i32 ; <i32*> [#uses=2]
+ %retval = alloca <4 x i32> ; <<4 x i32>*> [#uses=2]
+ %0 = alloca <4 x i32> ; <<4 x i32>*> [#uses=2]
+ %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
+ store <4 x i32> %v, <4 x i32>* %v_addr
+ store i32 %f, i32* %f_addr
+ %1 = load <4 x i32>* %v_addr, align 16 ; <<4 x i32>> [#uses=1]
+ %2 = load i32* %f_addr, align 4 ; <i32> [#uses=1]
+ %3 = insertelement <4 x i32> undef, i32 %2, i32 0 ; <<4 x i32>> [#uses=1]
+ %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer ; <<4 x i32>> [#uses=1]
+ %5 = mul <4 x i32> %1, %4 ; <<4 x i32>> [#uses=1]
+ store <4 x i32> %5, <4 x i32>* %0, align 16
+ %6 = load <4 x i32>* %0, align 16 ; <<4 x i32>> [#uses=1]
+ store <4 x i32> %6, <4 x i32>* %retval, align 16
+ br label %return
+
+return: ; preds = %entry
+ %retval1 = load <4 x i32>* %retval ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %retval1
+}
diff --git a/test/CodeGen/ARM/2009-08-04-RegScavengerAssert-2.ll b/test/CodeGen/ARM/2009-08-04-RegScavengerAssert-2.ll
new file mode 100644
index 0000000..65ffed2
--- /dev/null
+++ b/test/CodeGen/ARM/2009-08-04-RegScavengerAssert-2.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -mtriple=armv6-elf
+; PR4528
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:64"
+target triple = "armv6-elf"
+
+define arm_aapcscc i32 @file_read_actor(i32* nocapture %desc, i32* %page, i32 %offset, i32 %size) nounwind optsize {
+entry:
+ br i1 undef, label %fault_in_pages_writeable.exit, label %bb5.i
+
+bb5.i: ; preds = %entry
+ %asmtmp.i = tail call i32 asm sideeffect "1:\09strbt\09$1,[$2]\0A2:\0A\09.section .fixup,\22ax\22\0A\09.align\092\0A3:\09mov\09$0, $3\0A\09b\092b\0A\09.previous\0A\09.section __ex_table,\22a\22\0A\09.align\093\0A\09.long\091b, 3b\0A\09.previous", "=r,r,r,i,0,~{cc}"(i8 0, i32 undef, i32 -14, i32 0) nounwind ; <i32> [#uses=1]
+ %0 = icmp eq i32 %asmtmp.i, 0 ; <i1> [#uses=1]
+ br i1 %0, label %bb6.i, label %fault_in_pages_writeable.exit
+
+bb6.i: ; preds = %bb5.i
+ br i1 undef, label %fault_in_pages_writeable.exit, label %bb7.i
+
+bb7.i: ; preds = %bb6.i
+ unreachable
+
+fault_in_pages_writeable.exit: ; preds = %bb6.i, %bb5.i, %entry
+ br i1 undef, label %bb2, label %bb3
+
+bb2: ; preds = %fault_in_pages_writeable.exit
+ unreachable
+
+bb3: ; preds = %fault_in_pages_writeable.exit
+ %1 = tail call arm_aapcscc i32 @__copy_to_user(i8* undef, i8* undef, i32 undef) nounwind ; <i32> [#uses=0]
+ unreachable
+}
+
+declare arm_aapcscc i32 @__copy_to_user(i8*, i8*, i32)
diff --git a/test/CodeGen/ARM/2009-08-04-RegScavengerAssert.ll b/test/CodeGen/ARM/2009-08-04-RegScavengerAssert.ll
new file mode 100644
index 0000000..9e5372a
--- /dev/null
+++ b/test/CodeGen/ARM/2009-08-04-RegScavengerAssert.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -mtriple=armv6-elf
+; PR4528
+
+define arm_aapcscc i32 @file_read_actor(i32 %desc, i32 %page, i32 %offset, i32 %size) nounwind optsize {
+entry:
+ br i1 undef, label %fault_in_pages_writeable.exit, label %bb5.i
+
+bb5.i: ; preds = %entry
+ %asmtmp.i = tail call i32 asm sideeffect "1:\09strbt\09$1,[$2]\0A2:\0A\09.section .fixup,\22ax\22\0A\09.align\092\0A3:\09mov\09$0, $3\0A\09b\092b\0A\09.previous\0A\09.section __ex_table,\22a\22\0A\09.align\093\0A\09.long\091b, 3b\0A\09.previous", "=r,r,r,i,0,~{cc}"(i8 0, i32 undef, i32 -14, i32 0) nounwind ; <i32> [#uses=1]
+ br label %fault_in_pages_writeable.exit
+
+fault_in_pages_writeable.exit: ; preds = %bb5.i, %entry
+ %0 = phi i32 [ 0, %entry ], [ %asmtmp.i, %bb5.i ] ; <i32> [#uses=1]
+ %1 = icmp eq i32 %0, 0 ; <i1> [#uses=1]
+ br i1 %1, label %bb2, label %bb3
+
+bb2: ; preds = %fault_in_pages_writeable.exit
+ unreachable
+
+bb3: ; preds = %fault_in_pages_writeable.exit
+ %2 = tail call arm_aapcscc i32 @__copy_to_user(i8* undef, i8* undef, i32 undef) nounwind ; <i32> [#uses=0]
+ unreachable
+}
+
+declare arm_aapcscc i32 @__copy_to_user(i8*, i8*, i32)
diff --git a/test/CodeGen/ARM/2009-08-15-RegScavenger-EarlyClobber.ll b/test/CodeGen/ARM/2009-08-15-RegScavenger-EarlyClobber.ll
new file mode 100644
index 0000000..18d68f7
--- /dev/null
+++ b/test/CodeGen/ARM/2009-08-15-RegScavenger-EarlyClobber.ll
@@ -0,0 +1,42 @@
+; RUN: llc < %s -march=arm
+; PR4528
+
+; Inline asm is allowed to contain operands "=&r", "0".
+
+%struct.device_dma_parameters = type { i32, i32 }
+%struct.iovec = type { i8*, i32 }
+
+define arm_aapcscc i32 @generic_segment_checks(%struct.iovec* nocapture %iov, i32* nocapture %nr_segs, i32* nocapture %count, i32 %access_flags) nounwind optsize {
+entry:
+ br label %bb8
+
+bb: ; preds = %bb8
+ br i1 undef, label %bb10, label %bb2
+
+bb2: ; preds = %bb
+ %asmtmp = tail call %struct.device_dma_parameters asm "adds $1, $2, $3; sbcccs $1, $1, $0; movcc $0, #0", "=&r,=&r,r,Ir,0,~{cc}"(i8* undef, i32 undef, i32 0) nounwind; <%struct.device_dma_parameters> [#uses=1]
+ %asmresult = extractvalue %struct.device_dma_parameters %asmtmp, 0; <i32> [#uses=1]
+ %0 = icmp eq i32 %asmresult, 0 ; <i1> [#uses=1]
+ br i1 %0, label %bb7, label %bb4
+
+bb4: ; preds = %bb2
+ br i1 undef, label %bb10, label %bb9
+
+bb7: ; preds = %bb2
+ %1 = add i32 %2, 1 ; <i32> [#uses=1]
+ br label %bb8
+
+bb8: ; preds = %bb7, %entry
+ %2 = phi i32 [ 0, %entry ], [ %1, %bb7 ] ; <i32> [#uses=3]
+ %scevgep22 = getelementptr %struct.iovec* %iov, i32 %2, i32 0; <i8**> [#uses=0]
+ %3 = load i32* %nr_segs, align 4 ; <i32> [#uses=1]
+ %4 = icmp ult i32 %2, %3 ; <i1> [#uses=1]
+ br i1 %4, label %bb, label %bb9
+
+bb9: ; preds = %bb8, %bb4
+ store i32 undef, i32* %count, align 4
+ ret i32 0
+
+bb10: ; preds = %bb4, %bb
+ ret i32 0
+}
diff --git a/test/CodeGen/ARM/2009-08-15-RegScavengerAssert.ll b/test/CodeGen/ARM/2009-08-15-RegScavengerAssert.ll
new file mode 100644
index 0000000..a46482c
--- /dev/null
+++ b/test/CodeGen/ARM/2009-08-15-RegScavengerAssert.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s -march=arm
+; PR4716
+
+define arm_aapcscc void @_start() nounwind naked {
+entry:
+ tail call arm_aapcscc void @exit(i32 undef) noreturn nounwind
+ unreachable
+}
+
+declare arm_aapcscc void @exit(i32) noreturn nounwind
diff --git a/test/CodeGen/ARM/2009-08-21-PostRAKill.ll b/test/CodeGen/ARM/2009-08-21-PostRAKill.ll
new file mode 100644
index 0000000..84915c4
--- /dev/null
+++ b/test/CodeGen/ARM/2009-08-21-PostRAKill.ll
@@ -0,0 +1,40 @@
+; RUN: llc < %s -march=arm -mattr=+vfp2 -post-RA-scheduler -mcpu=cortex-a8
+
+; ModuleID = '<stdin>'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:64"
+target triple = "armv7-apple-darwin9"
+
+%struct.tree = type { i32, double, double, %struct.tree*, %struct.tree*, %struct.tree*, %struct.tree* }
+@g = common global %struct.tree* null
+
+define arm_apcscc %struct.tree* @tsp(%struct.tree* %t, i32 %nproc) nounwind {
+entry:
+ %t.idx51.val.i = load double* null ; <double> [#uses=1]
+ br i1 undef, label %bb4.i, label %bb.i
+
+bb.i: ; preds = %entry
+ unreachable
+
+bb4.i: ; preds = %entry
+ %0 = load %struct.tree** @g, align 4 ; <%struct.tree*> [#uses=2]
+ %.idx45.i = getelementptr %struct.tree* %0, i32 0, i32 1 ; <double*> [#uses=1]
+ %.idx45.val.i = load double* %.idx45.i ; <double> [#uses=1]
+ %.idx46.i = getelementptr %struct.tree* %0, i32 0, i32 2 ; <double*> [#uses=1]
+ %.idx46.val.i = load double* %.idx46.i ; <double> [#uses=1]
+ %1 = fsub double 0.000000e+00, %.idx45.val.i ; <double> [#uses=2]
+ %2 = fmul double %1, %1 ; <double> [#uses=1]
+ %3 = fsub double %t.idx51.val.i, %.idx46.val.i ; <double> [#uses=2]
+ %4 = fmul double %3, %3 ; <double> [#uses=1]
+ %5 = fadd double %2, %4 ; <double> [#uses=1]
+ %6 = tail call double @llvm.sqrt.f64(double %5) nounwind ; <double> [#uses=1]
+ br i1 undef, label %bb7.i4, label %bb6.i
+
+bb6.i: ; preds = %bb4.i
+ br label %bb7.i4
+
+bb7.i4: ; preds = %bb6.i, %bb4.i
+ %tton1.0.i = phi double [ %6, %bb6.i ], [ undef, %bb4.i ] ; <double> [#uses=0]
+ unreachable
+}
+
+declare double @llvm.sqrt.f64(double) nounwind readonly
diff --git a/test/CodeGen/ARM/2009-08-21-PostRAKill2.ll b/test/CodeGen/ARM/2009-08-21-PostRAKill2.ll
new file mode 100644
index 0000000..a21ffc3
--- /dev/null
+++ b/test/CodeGen/ARM/2009-08-21-PostRAKill2.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -asm-verbose=false -O3 -relocation-model=pic -disable-fp-elim -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -post-RA-scheduler
+
+; ModuleID = '<stdin>'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:64"
+target triple = "armv7-apple-darwin9"
+
+%struct.anon = type { [3 x double], double, %struct.node*, [64 x %struct.bnode*], [64 x %struct.bnode*] }
+%struct.bnode = type { i16, double, [3 x double], i32, i32, [3 x double], [3 x double], [3 x double], double, %struct.bnode*, %struct.bnode* }
+%struct.icstruct = type { [3 x i32], i16 }
+%struct.node = type { i16, double, [3 x double], i32, i32 }
+
+declare arm_apcscc double @floor(double) nounwind readnone
+
+define void @intcoord(%struct.icstruct* noalias nocapture sret %agg.result, i1 %a, double %b) {
+entry:
+ br i1 %a, label %bb3, label %bb1
+
+bb1: ; preds = %entry
+ unreachable
+
+bb3: ; preds = %entry
+ br i1 %a, label %bb7, label %bb5
+
+bb5: ; preds = %bb3
+ unreachable
+
+bb7: ; preds = %bb3
+ br i1 %a, label %bb11, label %bb9
+
+bb9: ; preds = %bb7
+ %0 = tail call arm_apcscc double @floor(double %b) nounwind readnone ; <double> [#uses=0]
+ br label %bb11
+
+bb11: ; preds = %bb9, %bb7
+ %1 = getelementptr %struct.icstruct* %agg.result, i32 0, i32 0, i32 0 ; <i32*> [#uses=1]
+ store i32 0, i32* %1
+ ret void
+}
diff --git a/test/CodeGen/ARM/2009-08-21-PostRAKill3.ll b/test/CodeGen/ARM/2009-08-21-PostRAKill3.ll
new file mode 100644
index 0000000..e3d8ea6
--- /dev/null
+++ b/test/CodeGen/ARM/2009-08-21-PostRAKill3.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -asm-verbose=false -O3 -relocation-model=pic -disable-fp-elim -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -post-RA-scheduler
+
+; ModuleID = '<stdin>'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:64"
+target triple = "armv7-apple-darwin9"
+
+%struct.Hosp = type { i32, i32, i32, %struct.List, %struct.List, %struct.List, %struct.List }
+%struct.List = type { %struct.List*, %struct.Patient*, %struct.List* }
+%struct.Patient = type { i32, i32, i32, %struct.Village* }
+%struct.Village = type { [4 x %struct.Village*], %struct.Village*, %struct.List, %struct.Hosp, i32, i32 }
+
+define arm_apcscc %struct.Village* @alloc_tree(i32 %level, i32 %label, %struct.Village* %back, i1 %p) nounwind {
+entry:
+ br i1 %p, label %bb8, label %bb1
+
+bb1: ; preds = %entry
+ %0 = malloc %struct.Village ; <%struct.Village*> [#uses=3]
+ %exp2 = call double @ldexp(double 1.000000e+00, i32 %level) nounwind ; <double> [#uses=1]
+ %.c = fptosi double %exp2 to i32 ; <i32> [#uses=1]
+ store i32 %.c, i32* null
+ %1 = getelementptr %struct.Village* %0, i32 0, i32 3, i32 6, i32 0 ; <%struct.List**> [#uses=1]
+ store %struct.List* null, %struct.List** %1
+ %2 = getelementptr %struct.Village* %0, i32 0, i32 3, i32 6, i32 2 ; <%struct.List**> [#uses=1]
+ store %struct.List* null, %struct.List** %2
+ ret %struct.Village* %0
+
+bb8: ; preds = %entry
+ ret %struct.Village* null
+}
+
+declare double @ldexp(double, i32)
diff --git a/test/CodeGen/ARM/2009-08-21-PostRAKill4.ll b/test/CodeGen/ARM/2009-08-21-PostRAKill4.ll
new file mode 100644
index 0000000..9123377
--- /dev/null
+++ b/test/CodeGen/ARM/2009-08-21-PostRAKill4.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -asm-verbose=false -O3 -relocation-model=pic -disable-fp-elim -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -post-RA-scheduler
+
+; ModuleID = '<stdin>'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:64"
+target triple = "armv7-apple-darwin9"
+
+@.str = external constant [36 x i8], align 1 ; <[36 x i8]*> [#uses=0]
+@.str1 = external constant [31 x i8], align 1 ; <[31 x i8]*> [#uses=1]
+@.str2 = external constant [4 x i8], align 1 ; <[4 x i8]*> [#uses=1]
+
+declare arm_apcscc i32 @getUnknown(i32, ...) nounwind
+
+declare void @llvm.va_start(i8*) nounwind
+
+declare void @llvm.va_end(i8*) nounwind
+
+declare arm_apcscc i32 @printf(i8* nocapture, ...) nounwind
+
+define arm_apcscc i32 @main() nounwind {
+entry:
+ %0 = tail call arm_apcscc i32 (i8*, ...)* @printf(i8* getelementptr ([31 x i8]* @.str1, i32 0, i32 0), i32 1, i32 1, i32 1, i32 1, i32 1, i32 1) nounwind ; <i32> [#uses=0]
+ %1 = tail call arm_apcscc i32 (i8*, ...)* @printf(i8* getelementptr ([31 x i8]* @.str1, i32 0, i32 0), i32 -128, i32 116, i32 116, i32 -3852, i32 -31232, i32 -1708916736) nounwind ; <i32> [#uses=0]
+ %2 = tail call arm_apcscc i32 (i32, ...)* @getUnknown(i32 undef, i32 116, i32 116, i32 -3852, i32 -31232, i32 30556, i32 -1708916736) nounwind ; <i32> [#uses=1]
+ %3 = tail call arm_apcscc i32 (i8*, ...)* @printf(i8* getelementptr ([4 x i8]* @.str2, i32 0, i32 0), i32 %2) nounwind ; <i32> [#uses=0]
+ ret i32 0
+}
diff --git a/test/CodeGen/ARM/2009-08-23-linkerprivate.ll b/test/CodeGen/ARM/2009-08-23-linkerprivate.ll
new file mode 100644
index 0000000..0fad533
--- /dev/null
+++ b/test/CodeGen/ARM/2009-08-23-linkerprivate.ll
@@ -0,0 +1,8 @@
+; RUN: llc < %s -march=arm -mtriple=arm-apple-darwin | FileCheck %s
+
+; ModuleID = '/Volumes/MacOS9/tests/WebKit/JavaScriptCore/profiler/ProfilerServer.mm'
+
+@"\01l_objc_msgSend_fixup_alloc" = linker_private hidden global i32 0, section "__DATA, __objc_msgrefs, coalesced", align 16 ; <i32*> [#uses=0]
+
+; CHECK: .globl l_objc_msgSend_fixup_alloc
+; CHECK: .weak_definition l_objc_msgSend_fixup_alloc
diff --git a/test/CodeGen/ARM/2009-08-26-ScalarToVector.ll b/test/CodeGen/ARM/2009-08-26-ScalarToVector.ll
new file mode 100644
index 0000000..c6ef256
--- /dev/null
+++ b/test/CodeGen/ARM/2009-08-26-ScalarToVector.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -mattr=+neon | not grep fldmfdd
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
+target triple = "thumbv7-elf"
+
+%bar = type { float, float, float }
+%baz = type { i32, [16 x %bar], [16 x float], [16 x i32], i8 }
+%foo = type { <4 x float> }
+%quux = type { i32 (...)**, %baz*, i32 }
+%quuz = type { %quux, i32, %bar, [128 x i8], [16 x %foo], %foo, %foo, %foo }
+
+declare <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+define arm_apcscc void @_ZN6squish10ClusterFit9Compress3EPv(%quuz* %this, i8* %block) {
+entry:
+ %0 = lshr <4 x i32> zeroinitializer, <i32 31, i32 31, i32 31, i32 31> ; <<4 x i32>> [#uses=1]
+ %1 = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; <<2 x i32>> [#uses=1]
+ %2 = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> undef, <2 x i32> %1) nounwind ; <<2 x i32>> [#uses=1]
+ %3 = extractelement <2 x i32> %2, i32 0 ; <i32> [#uses=1]
+ %not..i = icmp eq i32 %3, undef ; <i1> [#uses=1]
+ br i1 %not..i, label %return, label %bb221
+
+bb221: ; preds = %bb221, %entry
+ br label %bb221
+
+return: ; preds = %entry
+ ret void
+}
diff --git a/test/CodeGen/ARM/2009-08-27-ScalarToVector.ll b/test/CodeGen/ARM/2009-08-27-ScalarToVector.ll
new file mode 100644
index 0000000..bc5bfe9
--- /dev/null
+++ b/test/CodeGen/ARM/2009-08-27-ScalarToVector.ll
@@ -0,0 +1,35 @@
+; RUN: llc < %s -mattr=+neon | not grep fldmfdd
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
+target triple = "thumbv7-elf"
+
+%bar = type { float, float, float }
+%baz = type { i32, [16 x %bar], [16 x float], [16 x i32], i8 }
+%foo = type { <4 x float> }
+%quux = type { i32 (...)**, %baz*, i32 }
+%quuz = type { %quux, i32, %bar, [128 x i8], [16 x %foo], %foo, %foo, %foo }
+
+define arm_apcscc void @aaaa(%quuz* %this, i8* %block) {
+entry:
+ br i1 undef, label %bb.nph269, label %bb201
+
+bb.nph269: ; preds = %entry
+ br label %bb12
+
+bb12: ; preds = %bb194, %bb.nph269
+ %0 = fmul <4 x float> undef, undef ; <<4 x float>> [#uses=1]
+ %1 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 2, i32 3> ; <<2 x float>> [#uses=1]
+ %2 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
+ %3 = fadd <4 x float> undef, %2 ; <<4 x float>> [#uses=1]
+ br i1 undef, label %bb194, label %bb186
+
+bb186: ; preds = %bb12
+ br label %bb194
+
+bb194: ; preds = %bb186, %bb12
+ %besterror.0.0 = phi <4 x float> [ %3, %bb186 ], [ undef, %bb12 ] ; <<4 x float>> [#uses=0]
+ %indvar.next294 = add i32 undef, 1 ; <i32> [#uses=0]
+ br label %bb12
+
+bb201: ; preds = %entry
+ ret void
+}
diff --git a/test/CodeGen/ARM/2009-08-29-ExtractEltf32.ll b/test/CodeGen/ARM/2009-08-29-ExtractEltf32.ll
new file mode 100644
index 0000000..d5178b4
--- /dev/null
+++ b/test/CodeGen/ARM/2009-08-29-ExtractEltf32.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -mattr=+neon
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
+target triple = "thumbv7-elf"
+
+define arm_apcscc void @foo() nounwind {
+entry:
+ %0 = tail call <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float> undef, <2 x float> undef) nounwind ; <<2 x float>> [#uses=1]
+ %tmp28 = extractelement <2 x float> %0, i32 0 ; <float> [#uses=1]
+ %1 = fcmp une float %tmp28, 4.900000e+01 ; <i1> [#uses=1]
+ br i1 %1, label %bb, label %bb7
+
+bb: ; preds = %entry
+ unreachable
+
+bb7: ; preds = %entry
+ br i1 undef, label %bb8, label %bb9
+
+bb8: ; preds = %bb7
+ unreachable
+
+bb9: ; preds = %bb7
+ ret void
+}
+
+declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone
diff --git a/test/CodeGen/ARM/2009-08-29-TooLongSplat.ll b/test/CodeGen/ARM/2009-08-29-TooLongSplat.ll
new file mode 100644
index 0000000..266fce6
--- /dev/null
+++ b/test/CodeGen/ARM/2009-08-29-TooLongSplat.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -mattr=+neon
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
+target triple = "thumbv7-elf"
+
+define arm_apcscc void @aaa() nounwind {
+entry:
+ %0 = fmul <4 x float> undef, <float 1.000000e+00, float 1.000000e+01, float 1.000000e+02, float 0x3EB0C6F7A0000000> ; <<4 x float>> [#uses=1]
+ %tmp31 = extractelement <4 x float> %0, i32 0 ; <float> [#uses=1]
+ %1 = fpext float %tmp31 to double ; <double> [#uses=1]
+ %2 = fsub double 1.000000e+00, %1 ; <double> [#uses=1]
+ %3 = fdiv double %2, 1.000000e+00 ; <double> [#uses=1]
+ %4 = tail call double @fabs(double %3) nounwind readnone ; <double> [#uses=1]
+ %5 = fcmp ogt double %4, 1.000000e-05 ; <i1> [#uses=1]
+ br i1 %5, label %bb, label %bb7
+
+bb: ; preds = %entry
+ unreachable
+
+bb7: ; preds = %entry
+ unreachable
+}
+
+declare double @fabs(double)
diff --git a/test/CodeGen/ARM/2009-08-31-LSDA-Name.ll b/test/CodeGen/ARM/2009-08-31-LSDA-Name.ll
new file mode 100644
index 0000000..b6cf880
--- /dev/null
+++ b/test/CodeGen/ARM/2009-08-31-LSDA-Name.ll
@@ -0,0 +1,103 @@
+; RUN: llc < %s -mtriple=arm-apple-darwin9 -march=arm | FileCheck %s
+
+%struct.A = type { i32* }
+
+define arm_apcscc void @"\01-[MyFunction Name:]"() {
+entry:
+ %save_filt.1 = alloca i32 ; <i32*> [#uses=2]
+ %save_eptr.0 = alloca i8* ; <i8**> [#uses=2]
+ %a = alloca %struct.A ; <%struct.A*> [#uses=3]
+ %eh_exception = alloca i8* ; <i8**> [#uses=5]
+ %eh_selector = alloca i32 ; <i32*> [#uses=3]
+ %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
+ call arm_apcscc void @_ZN1AC1Ev(%struct.A* %a)
+ invoke arm_apcscc void @_Z3barv()
+ to label %invcont unwind label %lpad
+
+invcont: ; preds = %entry
+ call arm_apcscc void @_ZN1AD1Ev(%struct.A* %a) nounwind
+ br label %return
+
+bb: ; preds = %ppad
+ %eh_select = load i32* %eh_selector ; <i32> [#uses=1]
+ store i32 %eh_select, i32* %save_filt.1, align 4
+ %eh_value = load i8** %eh_exception ; <i8*> [#uses=1]
+ store i8* %eh_value, i8** %save_eptr.0, align 4
+ call arm_apcscc void @_ZN1AD1Ev(%struct.A* %a) nounwind
+ %0 = load i8** %save_eptr.0, align 4 ; <i8*> [#uses=1]
+ store i8* %0, i8** %eh_exception, align 4
+ %1 = load i32* %save_filt.1, align 4 ; <i32> [#uses=1]
+ store i32 %1, i32* %eh_selector, align 4
+ br label %Unwind
+
+return: ; preds = %invcont
+ ret void
+
+lpad: ; preds = %entry
+ %eh_ptr = call i8* @llvm.eh.exception() ; <i8*> [#uses=1]
+ store i8* %eh_ptr, i8** %eh_exception
+ %eh_ptr1 = load i8** %eh_exception ; <i8*> [#uses=1]
+ %eh_select2 = call i32 (i8*, i8*, ...)* @llvm.eh.selector.i32(i8* %eh_ptr1, i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*), i32 0) ; <i32> [#uses=1]
+ store i32 %eh_select2, i32* %eh_selector
+ br label %ppad
+
+ppad: ; preds = %lpad
+ br label %bb
+
+Unwind: ; preds = %bb
+ %eh_ptr3 = load i8** %eh_exception ; <i8*> [#uses=1]
+ call arm_apcscc void @_Unwind_SjLj_Resume(i8* %eh_ptr3)
+ unreachable
+}
+
+define linkonce_odr arm_apcscc void @_ZN1AC1Ev(%struct.A* %this) {
+entry:
+ %this_addr = alloca %struct.A* ; <%struct.A**> [#uses=2]
+ %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
+ store %struct.A* %this, %struct.A** %this_addr
+ %0 = call arm_apcscc i8* @_Znwm(i32 4) ; <i8*> [#uses=1]
+ %1 = bitcast i8* %0 to i32* ; <i32*> [#uses=1]
+ %2 = load %struct.A** %this_addr, align 4 ; <%struct.A*> [#uses=1]
+ %3 = getelementptr inbounds %struct.A* %2, i32 0, i32 0 ; <i32**> [#uses=1]
+ store i32* %1, i32** %3, align 4
+ br label %return
+
+return: ; preds = %entry
+ ret void
+}
+
+declare arm_apcscc i8* @_Znwm(i32)
+
+define linkonce_odr arm_apcscc void @_ZN1AD1Ev(%struct.A* %this) nounwind {
+entry:
+ %this_addr = alloca %struct.A* ; <%struct.A**> [#uses=2]
+ %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
+ store %struct.A* %this, %struct.A** %this_addr
+ %0 = load %struct.A** %this_addr, align 4 ; <%struct.A*> [#uses=1]
+ %1 = getelementptr inbounds %struct.A* %0, i32 0, i32 0 ; <i32**> [#uses=1]
+ %2 = load i32** %1, align 4 ; <i32*> [#uses=1]
+ %3 = bitcast i32* %2 to i8* ; <i8*> [#uses=1]
+ call arm_apcscc void @_ZdlPv(i8* %3) nounwind
+ br label %bb
+
+bb: ; preds = %entry
+ br label %return
+
+return: ; preds = %bb
+ ret void
+}
+;CHECK: L_LSDA_1:
+
+declare arm_apcscc void @_ZdlPv(i8*) nounwind
+
+declare arm_apcscc void @_Z3barv()
+
+declare i8* @llvm.eh.exception() nounwind
+
+declare i32 @llvm.eh.selector.i32(i8*, i8*, ...) nounwind
+
+declare i32 @llvm.eh.typeid.for.i32(i8*) nounwind
+
+declare arm_apcscc i32 @__gxx_personality_sj0(...)
+
+declare arm_apcscc void @_Unwind_SjLj_Resume(i8*)
diff --git a/test/CodeGen/ARM/2009-08-31-TwoRegShuffle.ll b/test/CodeGen/ARM/2009-08-31-TwoRegShuffle.ll
new file mode 100644
index 0000000..e1e60e6
--- /dev/null
+++ b/test/CodeGen/ARM/2009-08-31-TwoRegShuffle.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; pr4843
+define <4 x i16> @v2regbug(<4 x i16>* %B) nounwind {
+;CHECK: v2regbug:
+;CHECK: vzip.16
+ %tmp1 = load <4 x i16>* %B
+ %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32><i32 0, i32 0, i32 1, i32 1>
+ ret <4 x i16> %tmp2
+}
diff --git a/test/CodeGen/ARM/2009-09-01-PostRAProlog.ll b/test/CodeGen/ARM/2009-09-01-PostRAProlog.ll
new file mode 100644
index 0000000..bf91fe0
--- /dev/null
+++ b/test/CodeGen/ARM/2009-09-01-PostRAProlog.ll
@@ -0,0 +1,106 @@
+; RUN: llc -asm-verbose=false -O3 -relocation-model=pic -disable-fp-elim -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
+target triple = "thumbv7-apple-darwin9"
+
+@history = internal global [2 x [56 x i32]] [[56 x i32] [i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 0, i32 1, i32 2, i32 4, i32 2, i32 1, i32 0, i32 -1, i32 1, i32 3, i32 5, i32 7, i32 5, i32 3, i32 1, i32 -1, i32 2, i32 5, i32 8, i32 10, i32 8, i32 5, i32 2, i32 -1, i32 2, i32 5, i32 8, i32 10, i32 8, i32 5, i32 2, i32 -1, i32 1, i32 3, i32 5, i32 7, i32 5, i32 3, i32 1, i32 -1, i32 0, i32 1, i32 2, i32 4, i32 2, i32 1, i32 0], [56 x i32] [i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 0, i32 1, i32 2, i32 4, i32 2, i32 1, i32 0, i32 -1, i32 1, i32 3, i32 5, i32 7, i32 5, i32 3, i32 1, i32 -1, i32 2, i32 5, i32 8, i32 10, i32 8, i32 5, i32 2, i32 -1, i32 2, i32 5, i32 8, i32 10, i32 8, i32 5, i32 2, i32 -1, i32 1, i32 3, i32 5, i32 7, i32 5, i32 3, i32 1, i32 -1, i32 0, i32 1, i32 2, i32 4, i32 2, i32 1, i32 0]] ; <[2 x [56 x i32]]*> [#uses=3]
+@nodes = internal global i64 0 ; <i64*> [#uses=4]
+@.str = private constant [9 x i8] c"##-<=>+#\00", align 1 ; <[9 x i8]*> [#uses=2]
+@.str1 = private constant [6 x i8] c"%c%d\0A\00", align 1 ; <[6 x i8]*> [#uses=1]
+@.str2 = private constant [16 x i8] c"Fhourstones 2.0\00", align 1 ; <[16 x i8]*> [#uses=1]
+@.str3 = private constant [54 x i8] c"Using %d transposition table entries with %d probes.\0A\00", align 1 ; <[54 x i8]*> [#uses=1]
+@.str4 = private constant [31 x i8] c"Solving %d-ply position after \00", align 1 ; <[31 x i8]*> [#uses=1]
+@.str5 = private constant [7 x i8] c" . . .\00", align 1 ; <[7 x i8]*> [#uses=1]
+@.str6 = private constant [28 x i8] c"score = %d (%c) work = %d\0A\00", align 1 ; <[28 x i8]*> [#uses=1]
+@.str7 = private constant [36 x i8] c"%lu pos / %lu msec = %.1f Kpos/sec\0A\00", align 1 ; <[36 x i8]*> [#uses=1]
+@plycnt = internal global i32 0 ; <i32*> [#uses=21]
+@dias = internal global [19 x i32] zeroinitializer ; <[19 x i32]*> [#uses=43]
+@columns = internal global [128 x i32] zeroinitializer ; <[128 x i32]*> [#uses=18]
+@height = internal global [128 x i32] zeroinitializer ; <[128 x i32]*> [#uses=21]
+@rows = internal global [8 x i32] zeroinitializer ; <[8 x i32]*> [#uses=20]
+@colthr = internal global [128 x i32] zeroinitializer ; <[128 x i32]*> [#uses=5]
+@moves = internal global [44 x i32] zeroinitializer ; <[44 x i32]*> [#uses=9]
+@.str8 = private constant [3 x i8] c"%d\00", align 1 ; <[3 x i8]*> [#uses=1]
+@he = internal global i8* null ; <i8**> [#uses=9]
+@hits = internal global i64 0 ; <i64*> [#uses=8]
+@posed = internal global i64 0 ; <i64*> [#uses=7]
+@ht = internal global i32* null ; <i32**> [#uses=5]
+@.str16 = private constant [19 x i8] c"store rate = %.3f\0A\00", align 1 ; <[19 x i8]*> [#uses=1]
+@.str117 = private constant [45 x i8] c"- %5.3f < %5.3f = %5.3f > %5.3f + %5.3f\0A\00", align 1 ; <[45 x i8]*> [#uses=1]
+@.str218 = private constant [6 x i8] c"%7d%c\00", align 1 ; <[6 x i8]*> [#uses=1]
+@.str319 = private constant [30 x i8] c"Failed to allocate %u bytes.\0A\00", align 1 ; <[30 x i8]*> [#uses=1]
+
+declare arm_apcscc i32 @puts(i8* nocapture) nounwind
+
+declare arm_apcscc i32 @getchar() nounwind
+
+define internal arm_apcscc i32 @transpose() nounwind readonly {
+; CHECK: push
+entry:
+ %0 = load i32* getelementptr inbounds ([128 x i32]* @columns, i32 0, i32 1), align 4 ; <i32> [#uses=1]
+ %1 = shl i32 %0, 7 ; <i32> [#uses=1]
+ %2 = load i32* getelementptr inbounds ([128 x i32]* @columns, i32 0, i32 2), align 4 ; <i32> [#uses=1]
+ %3 = or i32 %1, %2 ; <i32> [#uses=1]
+ %4 = shl i32 %3, 7 ; <i32> [#uses=1]
+ %5 = load i32* getelementptr inbounds ([128 x i32]* @columns, i32 0, i32 3), align 4 ; <i32> [#uses=1]
+ %6 = or i32 %4, %5 ; <i32> [#uses=3]
+ %7 = load i32* getelementptr inbounds ([128 x i32]* @columns, i32 0, i32 7), align 4 ; <i32> [#uses=1]
+ %8 = shl i32 %7, 7 ; <i32> [#uses=1]
+ %9 = load i32* getelementptr inbounds ([128 x i32]* @columns, i32 0, i32 6), align 4 ; <i32> [#uses=1]
+ %10 = or i32 %8, %9 ; <i32> [#uses=1]
+ %11 = shl i32 %10, 7 ; <i32> [#uses=1]
+ %12 = load i32* getelementptr inbounds ([128 x i32]* @columns, i32 0, i32 5), align 4 ; <i32> [#uses=1]
+ %13 = or i32 %11, %12 ; <i32> [#uses=3]
+ %14 = icmp ugt i32 %6, %13 ; <i1> [#uses=2]
+ %.pn2.in.i = select i1 %14, i32 %6, i32 %13 ; <i32> [#uses=1]
+ %.pn1.in.i = select i1 %14, i32 %13, i32 %6 ; <i32> [#uses=1]
+ %.pn2.i = shl i32 %.pn2.in.i, 7 ; <i32> [#uses=1]
+ %.pn3.i = load i32* getelementptr inbounds ([128 x i32]* @columns, i32 0, i32 4) ; <i32> [#uses=1]
+ %.pn.in.in.i = or i32 %.pn2.i, %.pn3.i ; <i32> [#uses=1]
+ %.pn.in.i = zext i32 %.pn.in.in.i to i64 ; <i64> [#uses=1]
+ %.pn.i = shl i64 %.pn.in.i, 21 ; <i64> [#uses=1]
+ %.pn1.i = zext i32 %.pn1.in.i to i64 ; <i64> [#uses=1]
+ %iftmp.22.0.i = or i64 %.pn.i, %.pn1.i ; <i64> [#uses=2]
+ %15 = lshr i64 %iftmp.22.0.i, 17 ; <i64> [#uses=1]
+ %16 = trunc i64 %15 to i32 ; <i32> [#uses=2]
+ %17 = urem i64 %iftmp.22.0.i, 1050011 ; <i64> [#uses=1]
+ %18 = trunc i64 %17 to i32 ; <i32> [#uses=1]
+ %19 = urem i32 %16, 179 ; <i32> [#uses=1]
+ %20 = or i32 %19, 131072 ; <i32> [#uses=1]
+ %21 = load i32** @ht, align 4 ; <i32*> [#uses=1]
+ br label %bb5
+
+bb: ; preds = %bb5
+ %22 = getelementptr inbounds i32* %21, i32 %x.0 ; <i32*> [#uses=1]
+ %23 = load i32* %22, align 4 ; <i32> [#uses=1]
+ %24 = icmp eq i32 %23, %16 ; <i1> [#uses=1]
+ br i1 %24, label %bb1, label %bb2
+
+bb1: ; preds = %bb
+ %25 = load i8** @he, align 4 ; <i8*> [#uses=1]
+ %26 = getelementptr inbounds i8* %25, i32 %x.0 ; <i8*> [#uses=1]
+ %27 = load i8* %26, align 1 ; <i8> [#uses=1]
+ %28 = sext i8 %27 to i32 ; <i32> [#uses=1]
+ ret i32 %28
+
+bb2: ; preds = %bb
+ %29 = add nsw i32 %20, %x.0 ; <i32> [#uses=3]
+ %30 = add i32 %29, -1050011 ; <i32> [#uses=1]
+ %31 = icmp sgt i32 %29, 1050010 ; <i1> [#uses=1]
+ %. = select i1 %31, i32 %30, i32 %29 ; <i32> [#uses=1]
+ %32 = add i32 %33, 1 ; <i32> [#uses=1]
+ br label %bb5
+
+bb5: ; preds = %bb2, %entry
+ %33 = phi i32 [ 0, %entry ], [ %32, %bb2 ] ; <i32> [#uses=2]
+ %x.0 = phi i32 [ %18, %entry ], [ %., %bb2 ] ; <i32> [#uses=3]
+ %34 = icmp sgt i32 %33, 7 ; <i1> [#uses=1]
+ br i1 %34, label %bb7, label %bb
+
+bb7: ; preds = %bb5
+ ret i32 -128
+}
+
+declare arm_apcscc noalias i8* @calloc(i32, i32) nounwind
+
+declare void @llvm.memset.i64(i8* nocapture, i8, i64, i32) nounwind
diff --git a/test/CodeGen/ARM/2009-09-09-AllOnes.ll b/test/CodeGen/ARM/2009-09-09-AllOnes.ll
new file mode 100644
index 0000000..f654a16
--- /dev/null
+++ b/test/CodeGen/ARM/2009-09-09-AllOnes.ll
@@ -0,0 +1,10 @@
+; RUN: llc -mattr=+neon < %s
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
+target triple = "thumbv7-elf"
+
+define arm_apcscc void @foo() {
+entry:
+ %0 = insertelement <4 x i32> undef, i32 -1, i32 3
+ store <4 x i32> %0, <4 x i32>* undef, align 16
+ unreachable
+}
diff --git a/test/CodeGen/ARM/2009-09-09-fpcmp-ole.ll b/test/CodeGen/ARM/2009-09-09-fpcmp-ole.ll
new file mode 100644
index 0000000..98cab9a
--- /dev/null
+++ b/test/CodeGen/ARM/2009-09-09-fpcmp-ole.ll
@@ -0,0 +1,18 @@
+; RUN: llc -O1 -march=arm -mattr=+vfp2 < %s | FileCheck %s
+; pr4939
+
+define void @test(double* %x, double* %y) nounwind {
+ %1 = load double* %x, align 4
+ %2 = load double* %y, align 4
+ %3 = fsub double -0.000000e+00, %1
+ %4 = fcmp ugt double %2, %3
+ br i1 %4, label %bb1, label %bb2
+
+bb1:
+;CHECK: fstdhi
+ store double %1, double* %y, align 4
+ br label %bb2
+
+bb2:
+ ret void
+}
diff --git a/test/CodeGen/ARM/2009-09-10-postdec.ll b/test/CodeGen/ARM/2009-09-10-postdec.ll
new file mode 100644
index 0000000..10653b5
--- /dev/null
+++ b/test/CodeGen/ARM/2009-09-10-postdec.ll
@@ -0,0 +1,11 @@
+; RUN: llc -march=arm < %s | FileCheck %s
+; Radar 7213850
+
+define i32 @test(i8* %d, i32 %x, i32 %y) nounwind {
+ %1 = ptrtoint i8* %d to i32
+;CHECK: sub
+ %2 = sub i32 %x, %1
+ %3 = add nsw i32 %2, %y
+ store i8 0, i8* %d, align 1
+ ret i32 %3
+}
diff --git a/test/CodeGen/ARM/2009-09-13-InvalidSubreg.ll b/test/CodeGen/ARM/2009-09-13-InvalidSubreg.ll
new file mode 100644
index 0000000..13adb24
--- /dev/null
+++ b/test/CodeGen/ARM/2009-09-13-InvalidSubreg.ll
@@ -0,0 +1,61 @@
+; RUN: llc -mattr=+neon < %s
+; PR4965
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
+target triple = "armv7-eabi"
+
+%struct.fr = type { [6 x %struct.pl] }
+%struct.obb = type { %"struct.m4", %"struct.p3" }
+%struct.pl = type { %"struct.p3" }
+%"struct.m4" = type { %"struct.p3", %"struct.p3", %"struct.p3", %"struct.p3" }
+%"struct.p3" = type { <4 x float> }
+
+declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone
+
+define arm_aapcs_vfpcc i8 @foo(%struct.fr* nocapture %this, %struct.obb* %box) nounwind {
+entry:
+ %val.i.i = load <4 x float>* undef ; <<4 x float>> [#uses=1]
+ %val2.i.i = load <4 x float>* null ; <<4 x float>> [#uses=1]
+ %elt3.i.i = getelementptr inbounds %struct.obb* %box, i32 0, i32 0, i32 2, i32 0 ; <<4 x float>*> [#uses=1]
+ %val4.i.i = load <4 x float>* %elt3.i.i ; <<4 x float>> [#uses=1]
+ %0 = shufflevector <2 x float> undef, <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; <<4 x float>> [#uses=1]
+ %1 = fadd <4 x float> undef, zeroinitializer ; <<4 x float>> [#uses=1]
+ br label %bb33
+
+bb: ; preds = %bb33
+ %2 = fmul <4 x float> %val.i.i, undef ; <<4 x float>> [#uses=1]
+ %3 = fmul <4 x float> %val2.i.i, undef ; <<4 x float>> [#uses=1]
+ %4 = fadd <4 x float> %3, %2 ; <<4 x float>> [#uses=1]
+ %5 = fmul <4 x float> %val4.i.i, undef ; <<4 x float>> [#uses=1]
+ %6 = fadd <4 x float> %5, %4 ; <<4 x float>> [#uses=1]
+ %7 = bitcast <4 x float> %6 to <4 x i32> ; <<4 x i32>> [#uses=1]
+ %8 = and <4 x i32> %7, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648> ; <<4 x i32>> [#uses=1]
+ %9 = or <4 x i32> %8, undef ; <<4 x i32>> [#uses=1]
+ %10 = bitcast <4 x i32> %9 to <4 x float> ; <<4 x float>> [#uses=1]
+ %11 = shufflevector <4 x float> %10, <4 x float> undef, <2 x i32> <i32 0, i32 1> ; <<2 x float>> [#uses=1]
+ %12 = shufflevector <2 x float> %11, <2 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
+ %13 = fmul <4 x float> undef, %12 ; <<4 x float>> [#uses=1]
+ %14 = fmul <4 x float> %0, undef ; <<4 x float>> [#uses=1]
+ %15 = fadd <4 x float> %14, %13 ; <<4 x float>> [#uses=1]
+ %16 = fadd <4 x float> undef, %15 ; <<4 x float>> [#uses=1]
+ %17 = fadd <4 x float> %1, %16 ; <<4 x float>> [#uses=1]
+ %18 = fmul <4 x float> zeroinitializer, %17 ; <<4 x float>> [#uses=1]
+ %19 = insertelement <4 x float> %18, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=2]
+ %20 = shufflevector <4 x float> %19, <4 x float> undef, <2 x i32> <i32 0, i32 1> ; <<2 x float>> [#uses=1]
+ %21 = shufflevector <4 x float> %19, <4 x float> undef, <2 x i32> <i32 2, i32 3> ; <<2 x float>> [#uses=1]
+ %22 = tail call <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float> %20, <2 x float> %21) nounwind ; <<2 x float>> [#uses=2]
+ %23 = tail call <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float> %22, <2 x float> %22) nounwind ; <<2 x float>> [#uses=2]
+ %24 = shufflevector <2 x float> %23, <2 x float> %23, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
+ %25 = fadd <4 x float> %24, zeroinitializer ; <<4 x float>> [#uses=1]
+ %tmp46 = extractelement <4 x float> %25, i32 0 ; <float> [#uses=1]
+ %26 = fcmp olt float %tmp46, 0.000000e+00 ; <i1> [#uses=1]
+ br i1 %26, label %bb41, label %bb33
+
+bb33: ; preds = %bb, %entry
+ br i1 undef, label %bb34, label %bb
+
+bb34: ; preds = %bb33
+ ret i8 undef
+
+bb41: ; preds = %bb
+ ret i8 1
+}
diff --git a/test/CodeGen/ARM/2009-09-13-InvalidSuperReg.ll b/test/CodeGen/ARM/2009-09-13-InvalidSuperReg.ll
new file mode 100644
index 0000000..758b59a
--- /dev/null
+++ b/test/CodeGen/ARM/2009-09-13-InvalidSuperReg.ll
@@ -0,0 +1,41 @@
+; RUN: llc < %s -march=arm -mattr=+neon -mcpu=cortex-a9
+
+define arm_aapcs_vfpcc <4 x float> @foo(i8* nocapture %pBuffer, i32 %numItems) nounwind {
+ %1 = ptrtoint i8* %pBuffer to i32
+
+ %lsr.iv2641 = inttoptr i32 %1 to float*
+ %tmp29 = add i32 %1, 4
+ %tmp2930 = inttoptr i32 %tmp29 to float*
+ %tmp31 = add i32 %1, 8
+ %tmp3132 = inttoptr i32 %tmp31 to float*
+ %tmp33 = add i32 %1, 12
+ %tmp3334 = inttoptr i32 %tmp33 to float*
+ %tmp35 = add i32 %1, 16
+ %tmp3536 = inttoptr i32 %tmp35 to float*
+ %tmp37 = add i32 %1, 20
+ %tmp3738 = inttoptr i32 %tmp37 to float*
+ %tmp39 = add i32 %1, 24
+ %tmp3940 = inttoptr i32 %tmp39 to float*
+ %2 = load float* %lsr.iv2641, align 4
+ %3 = load float* %tmp2930, align 4
+ %4 = load float* %tmp3132, align 4
+ %5 = load float* %tmp3334, align 4
+ %6 = load float* %tmp3536, align 4
+ %7 = load float* %tmp3738, align 4
+ %8 = load float* %tmp3940, align 4
+ %9 = insertelement <4 x float> undef, float %6, i32 0
+ %10 = shufflevector <4 x float> %9, <4 x float> undef, <4 x i32> zeroinitializer
+ %11 = insertelement <4 x float> %10, float %7, i32 1
+ %12 = insertelement <4 x float> %11, float %8, i32 2
+ %13 = insertelement <4 x float> undef, float %2, i32 0
+ %14 = shufflevector <4 x float> %13, <4 x float> undef, <4 x i32> zeroinitializer
+ %15 = insertelement <4 x float> %14, float %3, i32 1
+ %16 = insertelement <4 x float> %15, float %4, i32 2
+ %17 = insertelement <4 x float> %16, float %5, i32 3
+ %18 = fsub <4 x float> zeroinitializer, %12
+ %19 = shufflevector <4 x float> %18, <4 x float> undef, <4 x i32> zeroinitializer
+ %20 = shufflevector <4 x float> %17, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+ %21 = shufflevector <2 x float> %20, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+
+ ret <4 x float> %21
+}
diff --git a/test/CodeGen/ARM/2009-09-20-LiveIntervalsBug.ll b/test/CodeGen/ARM/2009-09-20-LiveIntervalsBug.ll
new file mode 100644
index 0000000..980f8ce
--- /dev/null
+++ b/test/CodeGen/ARM/2009-09-20-LiveIntervalsBug.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -mtriple=arm-eabi -mattr=+neon -mcpu=cortex-a9
+
+; PR4986
+
+define arm_aapcs_vfpcc void @foo(i8* nocapture %pBuffer, i32 %numItems) nounwind {
+entry:
+ br i1 undef, label %return, label %bb.preheader
+
+bb.preheader: ; preds = %entry
+ br label %bb
+
+bb: ; preds = %bb, %bb.preheader
+ %0 = shufflevector <4 x float> zeroinitializer, <4 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
+ %1 = insertelement <4 x float> %0, float undef, i32 1 ; <<4 x float>> [#uses=1]
+ %2 = insertelement <4 x float> %1, float undef, i32 2 ; <<4 x float>> [#uses=1]
+ %3 = insertelement <4 x float> %2, float undef, i32 3 ; <<4 x float>> [#uses=1]
+ %4 = fmul <4 x float> undef, %3 ; <<4 x float>> [#uses=1]
+ %5 = extractelement <4 x float> %4, i32 3 ; <float> [#uses=1]
+ store float %5, float* undef, align 4
+ br i1 undef, label %return, label %bb
+
+return: ; preds = %bb, %entry
+ ret void
+}
+
+define arm_aapcs_vfpcc <4 x float> @bar(i8* nocapture %pBuffer, i32 %numItems) nounwind {
+ %1 = shufflevector <4 x float> zeroinitializer, <4 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
+ %2 = insertelement <4 x float> %1, float undef, i32 1 ; <<4 x float>> [#uses=1]
+ %3 = insertelement <4 x float> %2, float undef, i32 2 ; <<4 x float>> [#uses=1]
+ %4 = insertelement <4 x float> %3, float undef, i32 3 ; <<4 x float>> [#uses=1]
+ %5 = shufflevector <4 x float> %4, <4 x float> undef, <2 x i32> <i32 0, i32 1> ; <<2 x float>> [#uses=1]
+ %6 = shufflevector <2 x float> %5, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x float>> [#uses=1]
+ ret <4 x float> %6
+}
diff --git a/test/CodeGen/ARM/2009-09-21-LiveVariablesBug.ll b/test/CodeGen/ARM/2009-09-21-LiveVariablesBug.ll
new file mode 100644
index 0000000..aace475
--- /dev/null
+++ b/test/CodeGen/ARM/2009-09-21-LiveVariablesBug.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mattr=+neon
+
+; PR5024
+
+%bar = type { <4 x float> }
+%foo = type { %bar, %bar, %bar, %bar }
+
+declare arm_aapcs_vfpcc <4 x float> @bbb(%bar*) nounwind
+
+define arm_aapcs_vfpcc void @aaa(%foo* noalias sret %agg.result, %foo* %tfrm) nounwind {
+entry:
+ %0 = call arm_aapcs_vfpcc <4 x float> @bbb(%bar* undef) nounwind ; <<4 x float>> [#uses=0]
+ ret void
+}
diff --git a/test/CodeGen/ARM/2009-09-22-LiveVariablesBug.ll b/test/CodeGen/ARM/2009-09-22-LiveVariablesBug.ll
new file mode 100644
index 0000000..30931a2
--- /dev/null
+++ b/test/CodeGen/ARM/2009-09-22-LiveVariablesBug.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mattr=+neon
+
+; PR5024
+
+%bar = type { %foo, %foo }
+%foo = type { <4 x float> }
+
+declare arm_aapcs_vfpcc float @aaa(%foo* nocapture) nounwind readonly
+
+declare arm_aapcs_vfpcc %bar* @bbb(%bar*, <4 x float>, <4 x float>) nounwind
+
+define arm_aapcs_vfpcc void @ccc(i8* nocapture %pBuffer, i32 %numItems) nounwind {
+entry:
+ br i1 undef, label %return, label %bb.nph
+
+bb.nph: ; preds = %entry
+ %0 = call arm_aapcs_vfpcc %bar* @bbb(%bar* undef, <4 x float> undef, <4 x float> undef) nounwind ; <%bar*> [#uses=0]
+ %1 = call arm_aapcs_vfpcc float @aaa(%foo* undef) nounwind ; <float> [#uses=0]
+ unreachable
+
+return: ; preds = %entry
+ ret void
+}
diff --git a/test/CodeGen/ARM/2009-09-23-LiveVariablesBug.ll b/test/CodeGen/ARM/2009-09-23-LiveVariablesBug.ll
new file mode 100644
index 0000000..2ff479b
--- /dev/null
+++ b/test/CodeGen/ARM/2009-09-23-LiveVariablesBug.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mattr=+neon
+
+; PR5024
+
+%struct.1 = type { %struct.4, %struct.4 }
+%struct.4 = type { <4 x float> }
+
+define arm_aapcs_vfpcc %struct.1* @hhh3(%struct.1* %this, <4 x float> %lenation.0, <4 x float> %legalation.0) nounwind {
+entry:
+ %0 = call arm_aapcs_vfpcc %struct.4* @sss1(%struct.4* undef, float 0.000000e+00) nounwind ; <%struct.4*> [#uses=0]
+ %1 = call arm_aapcs_vfpcc %struct.4* @qqq1(%struct.4* null, float 5.000000e-01) nounwind ; <%struct.4*> [#uses=0]
+ %val92 = load <4 x float>* null ; <<4 x float>> [#uses=1]
+ %2 = call arm_aapcs_vfpcc %struct.4* @zzz2(%struct.4* undef, <4 x float> %val92) nounwind ; <%struct.4*> [#uses=0]
+ ret %struct.1* %this
+}
+
+declare arm_aapcs_vfpcc %struct.4* @qqq1(%struct.4*, float) nounwind
+
+declare arm_aapcs_vfpcc %struct.4* @sss1(%struct.4*, float) nounwind
+
+declare arm_aapcs_vfpcc %struct.4* @zzz2(%struct.4*, <4 x float>) nounwind
diff --git a/test/CodeGen/ARM/2009-09-24-spill-align.ll b/test/CodeGen/ARM/2009-09-24-spill-align.ll
new file mode 100644
index 0000000..6281775
--- /dev/null
+++ b/test/CodeGen/ARM/2009-09-24-spill-align.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; pr4926
+
+define arm_apcscc void @test_vget_lanep16() nounwind {
+entry:
+ %arg0_poly16x4_t = alloca <4 x i16> ; <<4 x i16>*> [#uses=1]
+ %out_poly16_t = alloca i16 ; <i16*> [#uses=1]
+ %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
+; CHECK: fldd
+ %0 = load <4 x i16>* %arg0_poly16x4_t, align 8 ; <<4 x i16>> [#uses=1]
+ %1 = extractelement <4 x i16> %0, i32 1 ; <i16> [#uses=1]
+ store i16 %1, i16* %out_poly16_t, align 2
+ br label %return
+
+return: ; preds = %entry
+ ret void
+}
diff --git a/test/CodeGen/ARM/2009-09-27-CoalescerBug.ll b/test/CodeGen/ARM/2009-09-27-CoalescerBug.ll
new file mode 100644
index 0000000..ea2693a
--- /dev/null
+++ b/test/CodeGen/ARM/2009-09-27-CoalescerBug.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s -mtriple=armv7-eabi -mcpu=cortex-a8
+; PR5055
+
+module asm ".globl\09__aeabi_f2lz"
+module asm ".set\09__aeabi_f2lz, __fixsfdi"
+module asm ""
+
+define arm_aapcs_vfpcc i64 @__fixsfdi(float %a) nounwind {
+entry:
+ %0 = fcmp olt float %a, 0.000000e+00 ; <i1> [#uses=1]
+ br i1 %0, label %bb, label %bb1
+
+bb: ; preds = %entry
+ %1 = fsub float -0.000000e+00, %a ; <float> [#uses=1]
+ %2 = tail call arm_aapcs_vfpcc i64 @__fixunssfdi(float %1) nounwind ; <i64> [#uses=1]
+ %3 = sub i64 0, %2 ; <i64> [#uses=1]
+ ret i64 %3
+
+bb1: ; preds = %entry
+ %4 = tail call arm_aapcs_vfpcc i64 @__fixunssfdi(float %a) nounwind ; <i64> [#uses=1]
+ ret i64 %4
+}
+
+declare arm_aapcs_vfpcc i64 @__fixunssfdi(float)
diff --git a/test/CodeGen/ARM/2009-09-28-LdStOptiBug.ll b/test/CodeGen/ARM/2009-09-28-LdStOptiBug.ll
new file mode 100644
index 0000000..53bd668
--- /dev/null
+++ b/test/CodeGen/ARM/2009-09-28-LdStOptiBug.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -mtriple=armv5-unknown-linux-gnueabi -mcpu=arm10tdmi | FileCheck %s
+; PR4687
+
+%0 = type { double, double }
+
+define arm_aapcscc void @foo(%0* noalias nocapture sret %agg.result, double %x.0, double %y.0) nounwind {
+; CHECK: foo:
+; CHECK: bl __adddf3
+; CHECK-NOT: strd
+; CHECK: mov
+ %x76 = fmul double %y.0, 0.000000e+00 ; <double> [#uses=1]
+ %x77 = fadd double %y.0, 0.000000e+00 ; <double> [#uses=1]
+ %tmpr = fadd double %x.0, %x76 ; <double> [#uses=1]
+ %agg.result.0 = getelementptr %0* %agg.result, i32 0, i32 0 ; <double*> [#uses=1]
+ store double %tmpr, double* %agg.result.0, align 8
+ %agg.result.1 = getelementptr %0* %agg.result, i32 0, i32 1 ; <double*> [#uses=1]
+ store double %x77, double* %agg.result.1, align 8
+ ret void
+}
diff --git a/test/CodeGen/ARM/addrmode.ll b/test/CodeGen/ARM/addrmode.ll
index a3832c0..9ccff07 100644
--- a/test/CodeGen/ARM/addrmode.ll
+++ b/test/CodeGen/ARM/addrmode.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -stats |& grep asm-printer | grep 4
+; RUN: llc < %s -march=arm -stats |& grep asm-printer | grep 4
define i32 @t1(i32 %a) {
%b = mul i32 %a, 9
diff --git a/test/CodeGen/ARM/aliases.ll b/test/CodeGen/ARM/aliases.ll
index 70b2c4d..b2c0314 100644
--- a/test/CodeGen/ARM/aliases.ll
+++ b/test/CodeGen/ARM/aliases.ll
@@ -1,5 +1,4 @@
-; RUN: llvm-as < %s | \
-; RUN: llc -mtriple=arm-linux-gnueabi -o %t -f
+; RUN: llc < %s -mtriple=arm-linux-gnueabi -o %t
; RUN: grep set %t | count 5
; RUN: grep globl %t | count 4
; RUN: grep weak %t | count 1
diff --git a/test/CodeGen/ARM/align.ll b/test/CodeGen/ARM/align.ll
index bb336ce..d73abe6a 100644
--- a/test/CodeGen/ARM/align.ll
+++ b/test/CodeGen/ARM/align.ll
@@ -1,9 +1,9 @@
-; RUN: llvm-as < %s | llc -march=arm | grep align.*1 | count 1
-; RUN: llvm-as < %s | llc -mtriple=arm-linux-gnueabi | \
+; RUN: llc < %s -march=arm | grep align.*1 | count 1
+; RUN: llc < %s -mtriple=arm-linux-gnueabi | \
; RUN: grep align.*2 | count 2
-; RUN: llvm-as < %s | llc -mtriple=arm-linux-gnueabi | \
+; RUN: llc < %s -mtriple=arm-linux-gnueabi | \
; RUN: grep align.*3 | count 2
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin | \
+; RUN: llc < %s -mtriple=arm-apple-darwin | \
; RUN: grep align.*2 | count 4
@a = global i1 true
diff --git a/test/CodeGen/ARM/alloca.ll b/test/CodeGen/ARM/alloca.ll
index f7e450f..15cf677 100644
--- a/test/CodeGen/ARM/alloca.ll
+++ b/test/CodeGen/ARM/alloca.ll
@@ -1,6 +1,6 @@
-; RUN: llvm-as < %s | llc -march=arm -mtriple=arm-linux-gnu | \
+; RUN: llc < %s -march=arm -mtriple=arm-linux-gnu | \
; RUN: grep {mov r11, sp}
-; RUN: llvm-as < %s | llc -march=arm -mtriple=arm-linux-gnu | \
+; RUN: llc < %s -march=arm -mtriple=arm-linux-gnu | \
; RUN: grep {mov sp, r11}
define void @f(i32 %a) {
diff --git a/test/CodeGen/ARM/argaddr.ll b/test/CodeGen/ARM/argaddr.ll
index 080827d..116a32f 100644
--- a/test/CodeGen/ARM/argaddr.ll
+++ b/test/CodeGen/ARM/argaddr.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm
+; RUN: llc < %s -march=arm
define void @f(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
entry:
diff --git a/test/CodeGen/ARM/arguments-nosplit-double.ll b/test/CodeGen/ARM/arguments-nosplit-double.ll
index 57ff95c..770e41d 100644
--- a/test/CodeGen/ARM/arguments-nosplit-double.ll
+++ b/test/CodeGen/ARM/arguments-nosplit-double.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-linux-gnueabi | not grep r3
+; RUN: llc < %s -mtriple=arm-linux-gnueabi | not grep r3
; PR4059
define i32 @f(i64 %z, i32 %a, double %b) {
diff --git a/test/CodeGen/ARM/arguments-nosplit-i64.ll b/test/CodeGen/ARM/arguments-nosplit-i64.ll
index 5464674..815edfd 100644
--- a/test/CodeGen/ARM/arguments-nosplit-i64.ll
+++ b/test/CodeGen/ARM/arguments-nosplit-i64.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-linux-gnueabi | not grep r3
+; RUN: llc < %s -mtriple=arm-linux-gnueabi | not grep r3
; PR4058
define i32 @f(i64 %z, i32 %a, i64 %b) {
diff --git a/test/CodeGen/ARM/arguments.ll b/test/CodeGen/ARM/arguments.ll
index 833e22d..ad5b2d6 100644
--- a/test/CodeGen/ARM/arguments.ll
+++ b/test/CodeGen/ARM/arguments.ll
@@ -1,6 +1,6 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-linux-gnueabi | \
+; RUN: llc < %s -mtriple=arm-linux-gnueabi | \
; RUN: grep {mov r0, r2} | count 1
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin | \
+; RUN: llc < %s -mtriple=arm-apple-darwin | \
; RUN: grep {mov r0, r1} | count 1
define i32 @f(i32 %a, i64 %b) {
diff --git a/test/CodeGen/ARM/arguments2.ll b/test/CodeGen/ARM/arguments2.ll
index eb7e45b..a515ad7 100644
--- a/test/CodeGen/ARM/arguments2.ll
+++ b/test/CodeGen/ARM/arguments2.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-linux-gnueabi
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin
+; RUN: llc < %s -mtriple=arm-linux-gnueabi
+; RUN: llc < %s -mtriple=arm-apple-darwin
define i32 @f(i32 %a, i128 %b) {
%tmp = call i32 @g(i128 %b)
diff --git a/test/CodeGen/ARM/arguments3.ll b/test/CodeGen/ARM/arguments3.ll
index 97c0405..58f64c6 100644
--- a/test/CodeGen/ARM/arguments3.ll
+++ b/test/CodeGen/ARM/arguments3.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-linux-gnueabi
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin
+; RUN: llc < %s -mtriple=arm-linux-gnueabi
+; RUN: llc < %s -mtriple=arm-apple-darwin
define i64 @f(i32 %a, i128 %b) {
%tmp = call i64 @g(i128 %b)
diff --git a/test/CodeGen/ARM/arguments4.ll b/test/CodeGen/ARM/arguments4.ll
index 63ba64b..f5f4207 100644
--- a/test/CodeGen/ARM/arguments4.ll
+++ b/test/CodeGen/ARM/arguments4.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-linux-gnueabi
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin
+; RUN: llc < %s -mtriple=arm-linux-gnueabi
+; RUN: llc < %s -mtriple=arm-apple-darwin
define float @f(i32 %a, i128 %b) {
%tmp = call float @g(i128 %b)
diff --git a/test/CodeGen/ARM/arguments5.ll b/test/CodeGen/ARM/arguments5.ll
index 2000ff7..388a8eb 100644
--- a/test/CodeGen/ARM/arguments5.ll
+++ b/test/CodeGen/ARM/arguments5.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-linux-gnueabi
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin
+; RUN: llc < %s -mtriple=arm-linux-gnueabi
+; RUN: llc < %s -mtriple=arm-apple-darwin
define double @f(i32 %a, i128 %b) {
%tmp = call double @g(i128 %b)
diff --git a/test/CodeGen/ARM/arguments6.ll b/test/CodeGen/ARM/arguments6.ll
index a18c621..3f757fee 100644
--- a/test/CodeGen/ARM/arguments6.ll
+++ b/test/CodeGen/ARM/arguments6.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-linux-gnueabi
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin
+; RUN: llc < %s -mtriple=arm-linux-gnueabi
+; RUN: llc < %s -mtriple=arm-apple-darwin
define i128 @f(i32 %a, i128 %b) {
%tmp = call i128 @g(i128 %b)
diff --git a/test/CodeGen/ARM/arguments7.ll b/test/CodeGen/ARM/arguments7.ll
index 489ffd4..038e417 100644
--- a/test/CodeGen/ARM/arguments7.ll
+++ b/test/CodeGen/ARM/arguments7.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-linux-gnueabi
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin
+; RUN: llc < %s -mtriple=arm-linux-gnueabi
+; RUN: llc < %s -mtriple=arm-apple-darwin
define double @f(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, double %b) {
%tmp = call double @g(i32 %a2, i32 %a3, i32 %a4, i32 %a5, double %b)
diff --git a/test/CodeGen/ARM/arguments8.ll b/test/CodeGen/ARM/arguments8.ll
index 5ff7e09..6999a4d 100644
--- a/test/CodeGen/ARM/arguments8.ll
+++ b/test/CodeGen/ARM/arguments8.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-linux-gnueabi
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin
+; RUN: llc < %s -mtriple=arm-linux-gnueabi
+; RUN: llc < %s -mtriple=arm-apple-darwin
define i64 @f(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i64 %b) {
%tmp = call i64 @g(i32 %a2, i32 %a3, i32 %a4, i32 %a5, i64 %b)
diff --git a/test/CodeGen/ARM/arguments_f64_backfill.ll b/test/CodeGen/ARM/arguments_f64_backfill.ll
index 07d928a..690f488 100644
--- a/test/CodeGen/ARM/arguments_f64_backfill.ll
+++ b/test/CodeGen/ARM/arguments_f64_backfill.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-linux-gnueabi -mattr=+vfp2 -float-abi=hard | grep {fcpys s0, s1}
+; RUN: llc < %s -mtriple=arm-linux-gnueabi -mattr=+vfp2 -float-abi=hard | grep {fcpys s0, s1}
define float @f(float %z, double %a, float %b) {
%tmp = call float @g(float %b)
diff --git a/test/CodeGen/ARM/arm-asm.ll b/test/CodeGen/ARM/arm-asm.ll
index b260b13..2e35e39 100644
--- a/test/CodeGen/ARM/arm-asm.ll
+++ b/test/CodeGen/ARM/arm-asm.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm
+; RUN: llc < %s -march=arm
define void @frame_dummy() {
entry:
diff --git a/test/CodeGen/ARM/arm-frameaddr.ll b/test/CodeGen/ARM/arm-frameaddr.ll
index f1e4c2a..2739860 100644
--- a/test/CodeGen/ARM/arm-frameaddr.ll
+++ b/test/CodeGen/ARM/arm-frameaddr.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin | grep mov | grep r7
-; RUN: llvm-as < %s | llc -mtriple=arm-linux-gnueabi | grep mov | grep r11
+; RUN: llc < %s -mtriple=arm-apple-darwin | grep mov | grep r7
+; RUN: llc < %s -mtriple=arm-linux-gnueabi | grep mov | grep r11
; PR4344
; PR4416
diff --git a/test/CodeGen/ARM/arm-negative-stride.ll b/test/CodeGen/ARM/arm-negative-stride.ll
index 553c2fb..c4b4ec6 100644
--- a/test/CodeGen/ARM/arm-negative-stride.ll
+++ b/test/CodeGen/ARM/arm-negative-stride.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm | grep {str r1, \\\[r.*, -r.*, lsl #2\}
+; RUN: llc < %s -march=arm | grep {str r1, \\\[r.*, -r.*, lsl #2\}
define void @test(i32* %P, i32 %A, i32 %i) nounwind {
entry:
diff --git a/test/CodeGen/ARM/bfc.ll b/test/CodeGen/ARM/bfc.ll
new file mode 100644
index 0000000..53392de
--- /dev/null
+++ b/test/CodeGen/ARM/bfc.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -march=arm -mattr=+v6t2 | grep "bfc " | count 3
+
+; 4278190095 = 0xff00000f
+define i32 @f1(i32 %a) {
+ %tmp = and i32 %a, 4278190095
+ ret i32 %tmp
+}
+
+; 4286578688 = 0xff800000
+define i32 @f2(i32 %a) {
+ %tmp = and i32 %a, 4286578688
+ ret i32 %tmp
+}
+
+; 4095 = 0x00000fff
+define i32 @f3(i32 %a) {
+ %tmp = and i32 %a, 4095
+ ret i32 %tmp
+}
diff --git a/test/CodeGen/ARM/bic.ll b/test/CodeGen/ARM/bic.ll
index b4ea433..b16dcc6 100644
--- a/test/CodeGen/ARM/bic.ll
+++ b/test/CodeGen/ARM/bic.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm | grep {bic\\W*r\[0-9\]*,\\W*r\[0-9\]*,\\W*r\[0-9\]*} | count 2
+; RUN: llc < %s -march=arm | grep {bic\\W*r\[0-9\]*,\\W*r\[0-9\]*,\\W*r\[0-9\]*} | count 2
define i32 @f1(i32 %a, i32 %b) {
%tmp = xor i32 %b, 4294967295
diff --git a/test/CodeGen/ARM/bits.ll b/test/CodeGen/ARM/bits.ll
index 0ac4f9a..9e94efe 100644
--- a/test/CodeGen/ARM/bits.ll
+++ b/test/CodeGen/ARM/bits.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm > %t
+; RUN: llc < %s -march=arm > %t
; RUN: grep and %t | count 1
; RUN: grep orr %t | count 1
; RUN: grep eor %t | count 1
diff --git a/test/CodeGen/ARM/bx_fold.ll b/test/CodeGen/ARM/bx_fold.ll
index 437b318..0e3e070 100644
--- a/test/CodeGen/ARM/bx_fold.ll
+++ b/test/CodeGen/ARM/bx_fold.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llc -march=arm
-; RUN: llvm-as < %s | llc -march=arm | not grep bx
+; RUN: llc < %s -march=arm
+; RUN: llc < %s -march=arm | not grep bx
define void @test(i32 %Ptr, i8* %L) {
entry:
diff --git a/test/CodeGen/ARM/call.ll b/test/CodeGen/ARM/call.ll
index 6b19665..52246c3 100644
--- a/test/CodeGen/ARM/call.ll
+++ b/test/CodeGen/ARM/call.ll
@@ -1,6 +1,6 @@
-; RUN: llvm-as < %s | llc -march=arm | grep {mov lr, pc}
-; RUN: llvm-as < %s | llc -march=arm -mattr=+v5t | grep blx
-; RUN: llvm-as < %s | llc -march=arm -mtriple=arm-linux-gnueabi\
+; RUN: llc < %s -march=arm | grep {mov lr, pc}
+; RUN: llc < %s -march=arm -mattr=+v5t | grep blx
+; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi\
; RUN: -relocation-model=pic | grep {PLT}
@t = weak global i32 ()* null ; <i32 ()**> [#uses=1]
diff --git a/test/CodeGen/ARM/call_nolink.ll b/test/CodeGen/ARM/call_nolink.ll
index 1af6fad..efe29d8 100644
--- a/test/CodeGen/ARM/call_nolink.ll
+++ b/test/CodeGen/ARM/call_nolink.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mtriple=arm-linux-gnueabi | \
+; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi | \
; RUN: not grep {bx lr}
%struct.anon = type { i32 (i32, i32, i32)*, i32, i32, [3 x i32], i8*, i8*, i8* }
diff --git a/test/CodeGen/ARM/carry.ll b/test/CodeGen/ARM/carry.ll
index 3bf2dc0..294de5f 100644
--- a/test/CodeGen/ARM/carry.ll
+++ b/test/CodeGen/ARM/carry.ll
@@ -1,6 +1,6 @@
-; RUN: llvm-as < %s | llc -march=arm | grep "subs r" | count 2
-; RUN: llvm-as < %s | llc -march=arm | grep "adc r"
-; RUN: llvm-as < %s | llc -march=arm | grep "sbc r" | count 2
+; RUN: llc < %s -march=arm | grep "subs r" | count 2
+; RUN: llc < %s -march=arm | grep "adc r"
+; RUN: llc < %s -march=arm | grep "sbc r" | count 2
define i64 @f1(i64 %a, i64 %b) {
entry:
diff --git a/test/CodeGen/ARM/clz.ll b/test/CodeGen/ARM/clz.ll
index 389fb2c..d2235c9 100644
--- a/test/CodeGen/ARM/clz.ll
+++ b/test/CodeGen/ARM/clz.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+v5t | grep clz
+; RUN: llc < %s -march=arm -mattr=+v5t | grep clz
declare i32 @llvm.ctlz.i32(i32)
diff --git a/test/CodeGen/ARM/compare-call.ll b/test/CodeGen/ARM/compare-call.ll
index fcb8b17..5f3ed1d 100644
--- a/test/CodeGen/ARM/compare-call.ll
+++ b/test/CodeGen/ARM/compare-call.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+v6,+vfp2 | \
+; RUN: llc < %s -march=arm -mattr=+v6,+vfp2 | \
; RUN: grep fcmpes
define void @test3(float* %glob, i32 %X) {
diff --git a/test/CodeGen/ARM/constants.ll b/test/CodeGen/ARM/constants.ll
index 095157b5..e2d8ddc 100644
--- a/test/CodeGen/ARM/constants.ll
+++ b/test/CodeGen/ARM/constants.ll
@@ -1,13 +1,13 @@
-; RUN: llvm-as < %s | llc -march=arm | \
+; RUN: llc < %s -march=arm | \
; RUN: grep {mov r0, #0} | count 1
-; RUN: llvm-as < %s | llc -march=arm | \
+; RUN: llc < %s -march=arm | \
; RUN: grep {mov r0, #255$} | count 1
-; RUN: llvm-as < %s | llc -march=arm -asm-verbose | \
+; RUN: llc < %s -march=arm -asm-verbose | \
; RUN: grep {mov r0.*256} | count 1
-; RUN: llvm-as < %s | llc -march=arm -asm-verbose | grep {orr.*256} | count 1
-; RUN: llvm-as < %s | llc -march=arm -asm-verbose | grep {mov r0, .*-1073741761} | count 1
-; RUN: llvm-as < %s | llc -march=arm -asm-verbose | grep {mov r0, .*1008} | count 1
-; RUN: llvm-as < %s | llc -march=arm | grep {cmp r0, #1, 16} | count 1
+; RUN: llc < %s -march=arm -asm-verbose | grep {orr.*256} | count 1
+; RUN: llc < %s -march=arm -asm-verbose | grep {mov r0, .*-1073741761} | count 1
+; RUN: llc < %s -march=arm -asm-verbose | grep {mov r0, .*1008} | count 1
+; RUN: llc < %s -march=arm | grep {cmp r0, #1, 16} | count 1
define i32 @f1() {
ret i32 0
diff --git a/test/CodeGen/ARM/cse-libcalls.ll b/test/CodeGen/ARM/cse-libcalls.ll
index 4f4091a..0dcf9dd 100644
--- a/test/CodeGen/ARM/cse-libcalls.ll
+++ b/test/CodeGen/ARM/cse-libcalls.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm | grep {bl.\*__ltdf} | count 1
+; RUN: llc < %s -march=arm | grep {bl.\*__ltdf} | count 1
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin8"
diff --git a/test/CodeGen/ARM/ctors_dtors.ll b/test/CodeGen/ARM/ctors_dtors.ll
index 5caa5b1..fb94626 100644
--- a/test/CodeGen/ARM/ctors_dtors.ll
+++ b/test/CodeGen/ARM/ctors_dtors.ll
@@ -1,15 +1,15 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin | \
-; RUN: grep {\\.mod_init_func}
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin | \
-; RUN: grep {\\.mod_term_func}
-; RUN: llvm-as < %s | llc -mtriple=arm-linux-gnu | \
-; RUN: grep {\\.section \\.ctors,"aw",.progbits}
-; RUN: llvm-as < %s | llc -mtriple=arm-linux-gnu | \
-; RUN: grep {\\.section \\.dtors,"aw",.progbits}
-; RUN: llvm-as < %s | llc -mtriple=arm-linux-gnueabi | \
-; RUN: grep {\\.section \\.init_array,"aw",.init_array}
-; RUN: llvm-as < %s | llc -mtriple=arm-linux-gnueabi | \
-; RUN: grep {\\.section \\.fini_array,"aw",.fini_array}
+; RUN: llc < %s -mtriple=arm-apple-darwin | FileCheck %s -check-prefix=DARWIN
+; RUN: llc < %s -mtriple=arm-linux-gnu | FileCheck %s -check-prefix=ELF
+; RUN: llc < %s -mtriple=arm-linux-gnueabi | FileCheck %s -check-prefix=GNUEABI
+
+; DARWIN: .section __DATA,__mod_init_func,mod_init_funcs
+; DARWIN: .section __DATA,__mod_term_func,mod_term_funcs
+
+; ELF: .section .ctors,"aw",%progbits
+; ELF: .section .dtors,"aw",%progbits
+
+; GNUEABI: .section .init_array,"aw",%init_array
+; GNUEABI: .section .fini_array,"aw",%fini_array
@llvm.global_ctors = appending global [1 x { i32, void ()* }] [ { i32, void ()* } { i32 65535, void ()* @__mf_init } ] ; <[1 x { i32, void ()* }]*> [#uses=0]
@llvm.global_dtors = appending global [1 x { i32, void ()* }] [ { i32, void ()* } { i32 65535, void ()* @__mf_fini } ] ; <[1 x { i32, void ()* }]*> [#uses=0]
diff --git a/test/CodeGen/ARM/div.ll b/test/CodeGen/ARM/div.ll
index 1085ec7..2f724e7 100644
--- a/test/CodeGen/ARM/div.ll
+++ b/test/CodeGen/ARM/div.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm > %t
+; RUN: llc < %s -march=arm > %t
; RUN: grep __divsi3 %t
; RUN: grep __udivsi3 %t
; RUN: grep __modsi3 %t
diff --git a/test/CodeGen/ARM/dyn-stackalloc.ll b/test/CodeGen/ARM/dyn-stackalloc.ll
index e0cd4e1..92e2d13 100644
--- a/test/CodeGen/ARM/dyn-stackalloc.ll
+++ b/test/CodeGen/ARM/dyn-stackalloc.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm
+; RUN: llc < %s -march=arm
%struct.state = type { i32, %struct.info*, float**, i32, i32, i32, i32, i32, i32, i32, i32, i32, i64, i64, i64, i64, i64, i64, i8* }
%struct.info = type { i32, i32, i32, i32, i32, i32, i32, i8* }
diff --git a/test/CodeGen/ARM/extloadi1.ll b/test/CodeGen/ARM/extloadi1.ll
index 2e9041c..dc45ce7 100644
--- a/test/CodeGen/ARM/extloadi1.ll
+++ b/test/CodeGen/ARM/extloadi1.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm
+; RUN: llc < %s -march=arm
@handler_installed.6144.b = external global i1 ; <i1*> [#uses=1]
define void @__mf_sigusr1_respond() {
diff --git a/test/CodeGen/ARM/fabss.ll b/test/CodeGen/ARM/fabss.ll
new file mode 100644
index 0000000..5690a01
--- /dev/null
+++ b/test/CodeGen/ARM/fabss.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -march=arm -mattr=+vfp2 | grep -E {fabss\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=1 | grep -E {vabs.f32\\W*d\[0-9\]+,\\W*d\[0-9\]+} | count 1
+; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=0 | grep -E {fabss\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 | grep -E {vabs.f32\\W*d\[0-9\]+,\\W*d\[0-9\]+} | count 1
+; RUN: llc < %s -march=arm -mcpu=cortex-a9 | grep -E {fabss\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+
+define float @test(float %a, float %b) {
+entry:
+ %dum = fadd float %a, %b
+ %0 = tail call float @fabsf(float %dum)
+ %dum1 = fadd float %0, %b
+ ret float %dum1
+}
+
+declare float @fabsf(float)
diff --git a/test/CodeGen/ARM/fadds.ll b/test/CodeGen/ARM/fadds.ll
new file mode 100644
index 0000000..a01f868
--- /dev/null
+++ b/test/CodeGen/ARM/fadds.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -march=arm -mattr=+vfp2 | grep -E {fadds\\W*s\[0-9\]+,\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=1 | grep -E {vadd.f32\\W*d\[0-9\]+,\\W*d\[0-9\]+,\\W*d\[0-9\]+} | count 1
+; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=0 | grep -E {fadds\\W*s\[0-9\]+,\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 | grep -E {vadd.f32\\W*d\[0-9\]+,\\W*d\[0-9\]+,\\W*d\[0-9\]+} | count 1
+; RUN: llc < %s -march=arm -mcpu=cortex-a9 | grep -E {fadds\\W*s\[0-9\]+,\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+
+define float @test(float %a, float %b) {
+entry:
+ %0 = fadd float %a, %b
+ ret float %0
+}
+
diff --git a/test/CodeGen/ARM/fcopysign.ll b/test/CodeGen/ARM/fcopysign.ll
index 66acda9..bf7c305 100644
--- a/test/CodeGen/ARM/fcopysign.ll
+++ b/test/CodeGen/ARM/fcopysign.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llc -march=arm | grep bic | count 2
-; RUN: llvm-as < %s | llc -march=arm -mattr=+v6,+vfp2 | \
+; RUN: llc < %s -march=arm | grep bic | count 2
+; RUN: llc < %s -march=arm -mattr=+v6,+vfp2 | \
; RUN: grep fneg | count 2
define float @test1(float %x, double %y) {
diff --git a/test/CodeGen/ARM/fdivs.ll b/test/CodeGen/ARM/fdivs.ll
new file mode 100644
index 0000000..2af250d
--- /dev/null
+++ b/test/CodeGen/ARM/fdivs.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -march=arm -mattr=+vfp2 | grep -E {fdivs\\W*s\[0-9\]+,\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=1 | grep -E {fdivs\\W*s\[0-9\]+,\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=0 | grep -E {fdivs\\W*s\[0-9\]+,\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 | grep -E {fdivs\\W*s\[0-9\]+,\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+; RUN: llc < %s -march=arm -mcpu=cortex-a9 | grep -E {fdivs\\W*s\[0-9\]+,\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+
+define float @test(float %a, float %b) {
+entry:
+ %0 = fdiv float %a, %b
+ ret float %0
+}
+
diff --git a/test/CodeGen/ARM/fixunsdfdi.ll b/test/CodeGen/ARM/fixunsdfdi.ll
index 777a3d6..ebf1d84 100644
--- a/test/CodeGen/ARM/fixunsdfdi.ll
+++ b/test/CodeGen/ARM/fixunsdfdi.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+vfp2
-; RUN: llvm-as < %s | llc -march=arm -mattr=vfp2 | not grep fstd
+; RUN: llc < %s -march=arm -mattr=+vfp2
+; RUN: llc < %s -march=arm -mattr=vfp2 | not grep fstd
define hidden i64 @__fixunsdfdi(double %x) nounwind readnone {
entry:
diff --git a/test/CodeGen/ARM/fmacs.ll b/test/CodeGen/ARM/fmacs.ll
new file mode 100644
index 0000000..1a1cd07
--- /dev/null
+++ b/test/CodeGen/ARM/fmacs.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -march=arm -mattr=+vfp2 | grep -E {fmacs\\W*s\[0-9\]+,\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=1 | grep -E {vmla.f32\\W*d\[0-9\]+,\\W*d\[0-9\]+,\\W*d\[0-9\]+} | count 1
+; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=0 | grep -E {fmacs\\W*s\[0-9\]+,\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 | grep -E {vmla.f32\\W*d\[0-9\]+,\\W*d\[0-9\]+,\\W*d\[0-9\]+} | count 1
+; RUN: llc < %s -march=arm -mcpu=cortex-a9 | grep -E {fmacs\\W*s\[0-9\]+,\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+
+define float @test(float %acc, float %a, float %b) {
+entry:
+ %0 = fmul float %a, %b
+ %1 = fadd float %acc, %0
+ ret float %1
+}
+
diff --git a/test/CodeGen/ARM/fmdrr-fmrrd.ll b/test/CodeGen/ARM/fmdrr-fmrrd.ll
index 315e623..eb72faf 100644
--- a/test/CodeGen/ARM/fmdrr-fmrrd.ll
+++ b/test/CodeGen/ARM/fmdrr-fmrrd.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=vfp2 | not grep fmdrr
-; RUN: llvm-as < %s | llc -march=arm -mattr=vfp2 | not grep fmrrd
+; RUN: llc < %s -march=arm -mattr=vfp2 | not grep fmdrr
+; RUN: llc < %s -march=arm -mattr=vfp2 | not grep fmrrd
; naive codegen for this is:
; _i:
diff --git a/test/CodeGen/ARM/fmscs.ll b/test/CodeGen/ARM/fmscs.ll
new file mode 100644
index 0000000..c6e6d40
--- /dev/null
+++ b/test/CodeGen/ARM/fmscs.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -march=arm -mattr=+vfp2 | grep -E {fmscs\\W*s\[0-9\]+,\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=1 | grep -E {fmscs\\W*s\[0-9\]+,\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=0 | grep -E {fmscs\\W*s\[0-9\]+,\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 | grep -E {fmscs\\W*s\[0-9\]+,\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+; RUN: llc < %s -march=arm -mcpu=cortex-a9 | grep -E {fmscs\\W*s\[0-9\]+,\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+
+define float @test(float %acc, float %a, float %b) {
+entry:
+ %0 = fmul float %a, %b
+ %1 = fsub float %0, %acc
+ ret float %1
+}
+
diff --git a/test/CodeGen/ARM/fmuls.ll b/test/CodeGen/ARM/fmuls.ll
new file mode 100644
index 0000000..cb5dade
--- /dev/null
+++ b/test/CodeGen/ARM/fmuls.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -march=arm -mattr=+vfp2 | grep -E {fmuls\\W*s\[0-9\]+,\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=1 | grep -E {vmul.f32\\W*d\[0-9\]+,\\W*d\[0-9\]+,\\W*d\[0-9\]+} | count 1
+; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=0 | grep -E {fmuls\\W*s\[0-9\]+,\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 | grep -E {vmul.f32\\W*d\[0-9\]+,\\W*d\[0-9\]+,\\W*d\[0-9\]+} | count 1
+; RUN: llc < %s -march=arm -mcpu=cortex-a9 | grep -E {fmuls\\W*s\[0-9\]+,\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+
+define float @test(float %a, float %b) {
+entry:
+ %0 = fmul float %a, %b
+ ret float %0
+}
+
diff --git a/test/CodeGen/ARM/fnegs.ll b/test/CodeGen/ARM/fnegs.ll
new file mode 100644
index 0000000..7da443d
--- /dev/null
+++ b/test/CodeGen/ARM/fnegs.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -march=arm -mattr=+vfp2 | grep -E {fnegs\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 2
+; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=1 | grep -E {vneg.f32\\W*d\[0-9\]+,\\W*d\[0-9\]+} | count 2
+; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=0 | grep -E {fnegs\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 2
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 | grep -E {vneg.f32\\W*d\[0-9\]+,\\W*d\[0-9\]+} | count 2
+; RUN: llc < %s -march=arm -mcpu=cortex-a9 | grep -E {fnegs\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 2
+
+define float @test1(float* %a) {
+entry:
+ %0 = load float* %a, align 4 ; <float> [#uses=2]
+ %1 = fsub float -0.000000e+00, %0 ; <float> [#uses=2]
+ %2 = fpext float %1 to double ; <double> [#uses=1]
+ %3 = fcmp olt double %2, 1.234000e+00 ; <i1> [#uses=1]
+ %retval = select i1 %3, float %1, float %0 ; <float> [#uses=1]
+ ret float %retval
+}
+
+define float @test2(float* %a) {
+entry:
+ %0 = load float* %a, align 4 ; <float> [#uses=2]
+ %1 = fmul float -1.000000e+00, %0 ; <float> [#uses=2]
+ %2 = fpext float %1 to double ; <double> [#uses=1]
+ %3 = fcmp olt double %2, 1.234000e+00 ; <i1> [#uses=1]
+ %retval = select i1 %3, float %1, float %0 ; <float> [#uses=1]
+ ret float %retval
+}
diff --git a/test/CodeGen/ARM/fnmacs.ll b/test/CodeGen/ARM/fnmacs.ll
new file mode 100644
index 0000000..e57bbbb
--- /dev/null
+++ b/test/CodeGen/ARM/fnmacs.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -march=arm -mattr=+vfp2 | grep -E {fnmacs\\W*s\[0-9\]+,\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=1 | grep -E {vmls.f32\\W*d\[0-9\]+,\\W*d\[0-9\]+,\\W*d\[0-9\]+} | count 1
+; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=0 | grep -E {fnmacs\\W*s\[0-9\]+,\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 | grep -E {vmls.f32\\W*d\[0-9\]+,\\W*d\[0-9\]+,\\W*d\[0-9\]+} | count 1
+; RUN: llc < %s -march=arm -mcpu=cortex-a9 | grep -E {fnmacs\\W*s\[0-9\]+,\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+
+define float @test(float %acc, float %a, float %b) {
+entry:
+ %0 = fmul float %a, %b
+ %1 = fsub float %acc, %0
+ ret float %1
+}
+
diff --git a/test/CodeGen/ARM/fnmscs.ll b/test/CodeGen/ARM/fnmscs.ll
new file mode 100644
index 0000000..3ae437d
--- /dev/null
+++ b/test/CodeGen/ARM/fnmscs.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s
+; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=1 | FileCheck %s
+; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=0 | FileCheck %s
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s
+
+define float @test1(float %acc, float %a, float %b) nounwind {
+; CHECK: fnmscs s2, s1, s0
+entry:
+ %0 = fmul float %a, %b
+ %1 = fsub float -0.0, %0
+ %2 = fsub float %1, %acc
+ ret float %2
+}
+
+define float @test2(float %acc, float %a, float %b) nounwind {
+; CHECK: fnmscs s2, s1, s0
+entry:
+ %0 = fmul float %a, %b
+ %1 = fmul float -1.0, %0
+ %2 = fsub float %1, %acc
+ ret float %2
+}
+
diff --git a/test/CodeGen/ARM/fnmul.ll b/test/CodeGen/ARM/fnmul.ll
index 7bbda2d..613b347 100644
--- a/test/CodeGen/ARM/fnmul.ll
+++ b/test/CodeGen/ARM/fnmul.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+v6,+vfp2 | grep fnmuld
-; RUN: llvm-as < %s | llc -march=arm -mattr=+v6,+vfp2 -enable-sign-dependent-rounding-fp-math | grep fmul
+; RUN: llc < %s -march=arm -mattr=+v6,+vfp2 | grep fnmuld
+; RUN: llc < %s -march=arm -mattr=+v6,+vfp2 -enable-sign-dependent-rounding-fp-math | grep fmul
define double @t1(double %a, double %b) {
diff --git a/test/CodeGen/ARM/fnmuls.ll b/test/CodeGen/ARM/fnmuls.ll
new file mode 100644
index 0000000..efd87d2
--- /dev/null
+++ b/test/CodeGen/ARM/fnmuls.ll
@@ -0,0 +1,23 @@
+; XFAIL: *
+; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s
+; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=1 | FileCheck %s
+; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=0 | FileCheck %s
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s
+
+define float @test1(float %a, float %b) nounwind {
+; CHECK: fnmscs s2, s1, s0
+entry:
+ %0 = fmul float %a, %b
+ %1 = fsub float -0.0, %0
+ ret float %1
+}
+
+define float @test2(float %a, float %b) nounwind {
+; CHECK: fnmscs s2, s1, s0
+entry:
+ %0 = fmul float %a, %b
+ %1 = fmul float -1.0, %0
+ ret float %1
+}
+
diff --git a/test/CodeGen/ARM/formal.ll b/test/CodeGen/ARM/formal.ll
index 6d6d108..4ac10ba 100644
--- a/test/CodeGen/ARM/formal.ll
+++ b/test/CodeGen/ARM/formal.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+vfp2
+; RUN: llc < %s -march=arm -mattr=+vfp2
declare void @bar(i64 %x, i64 %y)
diff --git a/test/CodeGen/ARM/fp.ll b/test/CodeGen/ARM/fp.ll
index ba199db..4e4ef72 100644
--- a/test/CodeGen/ARM/fp.ll
+++ b/test/CodeGen/ARM/fp.ll
@@ -1,55 +1,71 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+vfp2 > %t
-; RUN: grep fmsr %t | count 4
-; RUN: grep fsitos %t
-; RUN: grep fmrs %t | count 2
-; RUN: grep fsitod %t
-; RUN: grep fmrrd %t | count 3
-; RUN: not grep fmdrr %t
-; RUN: grep fldd %t
-; RUN: grep fuitod %t
-; RUN: grep fuitos %t
-; RUN: grep 1065353216 %t
+; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s
define float @f(i32 %a) {
+;CHECK: f:
+;CHECK: fmsr
+;CHECK-NEXT: fsitos
+;CHECK-NEXT: fmrs
entry:
%tmp = sitofp i32 %a to float ; <float> [#uses=1]
ret float %tmp
}
define double @g(i32 %a) {
+;CHECK: g:
+;CHECK: fmsr
+;CHECK-NEXT: fsitod
+;CHECK-NEXT: fmrrd
entry:
%tmp = sitofp i32 %a to double ; <double> [#uses=1]
ret double %tmp
}
define double @uint_to_double(i32 %a) {
+;CHECK: uint_to_double:
+;CHECK: fmsr
+;CHECK-NEXT: fuitod
+;CHECK-NEXT: fmrrd
entry:
%tmp = uitofp i32 %a to double ; <double> [#uses=1]
ret double %tmp
}
define float @uint_to_float(i32 %a) {
+;CHECK: uint_to_float:
+;CHECK: fmsr
+;CHECK-NEXT: fuitos
+;CHECK-NEXT: fmrs
entry:
%tmp = uitofp i32 %a to float ; <float> [#uses=1]
ret float %tmp
}
define double @h(double* %v) {
+;CHECK: h:
+;CHECK: fldd
+;CHECK-NEXT: fmrrd
entry:
%tmp = load double* %v ; <double> [#uses=1]
ret double %tmp
}
define float @h2() {
+;CHECK: h2:
+;CHECK: 1065353216
entry:
ret float 1.000000e+00
}
define double @f2(double %a) {
+;CHECK: f2:
+;CHECK-NOT: fmdrr
ret double %a
}
define void @f3() {
+;CHECK: f3:
+;CHECK-NOT: fmdrr
+;CHECK: f4
entry:
%tmp = call double @f5( ) ; <double> [#uses=1]
call void @f4( double %tmp )
diff --git a/test/CodeGen/ARM/fp_convert.ll b/test/CodeGen/ARM/fp_convert.ll
new file mode 100644
index 0000000..9ce2ac5
--- /dev/null
+++ b/test/CodeGen/ARM/fp_convert.ll
@@ -0,0 +1,49 @@
+; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2
+; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=1 | FileCheck %s -check-prefix=NEON
+; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=0 | FileCheck %s -check-prefix=VFP2
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -check-prefix=NEON
+; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s -check-prefix=VFP2
+
+define i32 @test1(float %a, float %b) {
+; VFP2: test1:
+; VFP2: ftosizs s0, s0
+; NEON: test1:
+; NEON: vcvt.s32.f32 d0, d0
+entry:
+ %0 = fadd float %a, %b
+ %1 = fptosi float %0 to i32
+ ret i32 %1
+}
+
+define i32 @test2(float %a, float %b) {
+; VFP2: test2:
+; VFP2: ftouizs s0, s0
+; NEON: test2:
+; NEON: vcvt.u32.f32 d0, d0
+entry:
+ %0 = fadd float %a, %b
+ %1 = fptoui float %0 to i32
+ ret i32 %1
+}
+
+define float @test3(i32 %a, i32 %b) {
+; VFP2: test3:
+; VFP2: fuitos s0, s0
+; NEON: test3:
+; NEON: vcvt.f32.u32 d0, d0
+entry:
+ %0 = add i32 %a, %b
+ %1 = uitofp i32 %0 to float
+ ret float %1
+}
+
+define float @test4(i32 %a, i32 %b) {
+; VFP2: test4:
+; VFP2: fsitos s0, s0
+; NEON: test4:
+; NEON: vcvt.f32.s32 d0, d0
+entry:
+ %0 = add i32 %a, %b
+ %1 = sitofp i32 %0 to float
+ ret float %1
+}
diff --git a/test/CodeGen/ARM/fparith.ll b/test/CodeGen/ARM/fparith.ll
index 568a6c4..ebeeb18 100644
--- a/test/CodeGen/ARM/fparith.ll
+++ b/test/CodeGen/ARM/fparith.ll
@@ -1,74 +1,88 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+vfp2 > %t
-; RUN: grep fadds %t
-; RUN: grep faddd %t
-; RUN: grep fmuls %t
-; RUN: grep fmuld %t
-; RUN: grep eor %t
-; RUN: grep fnegd %t
-; RUN: grep fdivs %t
-; RUN: grep fdivd %t
+; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s
define float @f1(float %a, float %b) {
+;CHECK: f1:
+;CHECK: fadds
entry:
%tmp = fadd float %a, %b ; <float> [#uses=1]
ret float %tmp
}
define double @f2(double %a, double %b) {
+;CHECK: f2:
+;CHECK: faddd
entry:
%tmp = fadd double %a, %b ; <double> [#uses=1]
ret double %tmp
}
define float @f3(float %a, float %b) {
+;CHECK: f3:
+;CHECK: fmuls
entry:
%tmp = fmul float %a, %b ; <float> [#uses=1]
ret float %tmp
}
define double @f4(double %a, double %b) {
+;CHECK: f4:
+;CHECK: fmuld
entry:
%tmp = fmul double %a, %b ; <double> [#uses=1]
ret double %tmp
}
define float @f5(float %a, float %b) {
+;CHECK: f5:
+;CHECK: fsubs
entry:
%tmp = fsub float %a, %b ; <float> [#uses=1]
ret float %tmp
}
define double @f6(double %a, double %b) {
+;CHECK: f6:
+;CHECK: fsubd
entry:
%tmp = fsub double %a, %b ; <double> [#uses=1]
ret double %tmp
}
define float @f7(float %a) {
+;CHECK: f7:
+;CHECK: eor
entry:
%tmp1 = fsub float -0.000000e+00, %a ; <float> [#uses=1]
ret float %tmp1
}
define double @f8(double %a) {
+;CHECK: f8:
+;CHECK: fnegd
entry:
%tmp1 = fsub double -0.000000e+00, %a ; <double> [#uses=1]
ret double %tmp1
}
define float @f9(float %a, float %b) {
+;CHECK: f9:
+;CHECK: fdivs
entry:
%tmp1 = fdiv float %a, %b ; <float> [#uses=1]
ret float %tmp1
}
define double @f10(double %a, double %b) {
+;CHECK: f10:
+;CHECK: fdivd
entry:
%tmp1 = fdiv double %a, %b ; <double> [#uses=1]
ret double %tmp1
}
define float @f11(float %a) {
+;CHECK: f11:
+;CHECK: bic
entry:
%tmp1 = call float @fabsf( float %a ) ; <float> [#uses=1]
ret float %tmp1
@@ -77,6 +91,8 @@ entry:
declare float @fabsf(float)
define double @f12(double %a) {
+;CHECK: f12:
+;CHECK: fabsd
entry:
%tmp1 = call double @fabs( double %a ) ; <double> [#uses=1]
ret double %tmp1
diff --git a/test/CodeGen/ARM/fpcmp.ll b/test/CodeGen/ARM/fpcmp.ll
index ce0f4029..2c9591c 100644
--- a/test/CodeGen/ARM/fpcmp.ll
+++ b/test/CodeGen/ARM/fpcmp.ll
@@ -1,13 +1,9 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+vfp2 > %t
-; RUN: grep movmi %t
-; RUN: grep moveq %t
-; RUN: grep movgt %t
-; RUN: grep movge %t
-; RUN: grep movne %t
-; RUN: grep fcmped %t | count 1
-; RUN: grep fcmpes %t | count 6
+; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s
define i32 @f1(float %a) {
+;CHECK: f1:
+;CHECK: fcmpes
+;CHECK: movmi
entry:
%tmp = fcmp olt float %a, 1.000000e+00 ; <i1> [#uses=1]
%tmp1 = zext i1 %tmp to i32 ; <i32> [#uses=1]
@@ -15,6 +11,9 @@ entry:
}
define i32 @f2(float %a) {
+;CHECK: f2:
+;CHECK: fcmpes
+;CHECK: moveq
entry:
%tmp = fcmp oeq float %a, 1.000000e+00 ; <i1> [#uses=1]
%tmp2 = zext i1 %tmp to i32 ; <i32> [#uses=1]
@@ -22,6 +21,9 @@ entry:
}
define i32 @f3(float %a) {
+;CHECK: f3:
+;CHECK: fcmpes
+;CHECK: movgt
entry:
%tmp = fcmp ogt float %a, 1.000000e+00 ; <i1> [#uses=1]
%tmp3 = zext i1 %tmp to i32 ; <i32> [#uses=1]
@@ -29,6 +31,9 @@ entry:
}
define i32 @f4(float %a) {
+;CHECK: f4:
+;CHECK: fcmpes
+;CHECK: movge
entry:
%tmp = fcmp oge float %a, 1.000000e+00 ; <i1> [#uses=1]
%tmp4 = zext i1 %tmp to i32 ; <i32> [#uses=1]
@@ -36,6 +41,9 @@ entry:
}
define i32 @f5(float %a) {
+;CHECK: f5:
+;CHECK: fcmpes
+;CHECK: movls
entry:
%tmp = fcmp ole float %a, 1.000000e+00 ; <i1> [#uses=1]
%tmp5 = zext i1 %tmp to i32 ; <i32> [#uses=1]
@@ -43,6 +51,9 @@ entry:
}
define i32 @f6(float %a) {
+;CHECK: f6:
+;CHECK: fcmpes
+;CHECK: movne
entry:
%tmp = fcmp une float %a, 1.000000e+00 ; <i1> [#uses=1]
%tmp6 = zext i1 %tmp to i32 ; <i32> [#uses=1]
@@ -50,6 +61,9 @@ entry:
}
define i32 @g1(double %a) {
+;CHECK: g1:
+;CHECK: fcmped
+;CHECK: movmi
entry:
%tmp = fcmp olt double %a, 1.000000e+00 ; <i1> [#uses=1]
%tmp7 = zext i1 %tmp to i32 ; <i32> [#uses=1]
diff --git a/test/CodeGen/ARM/fpcmp_ueq.ll b/test/CodeGen/ARM/fpcmp_ueq.ll
index 3e749af..67f70e9 100644
--- a/test/CodeGen/ARM/fpcmp_ueq.ll
+++ b/test/CodeGen/ARM/fpcmp_ueq.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llc -march=arm | grep moveq
-; RUN: llvm-as < %s | llc -march=arm -mattr=+vfp2 | grep movvs
+; RUN: llc < %s -march=arm | grep moveq
+; RUN: llc < %s -march=arm -mattr=+vfp2 | grep movvs
define i32 @f7(float %a, float %b) {
entry:
diff --git a/test/CodeGen/ARM/fpconv.ll b/test/CodeGen/ARM/fpconv.ll
index 218b25f..ee3c338 100644
--- a/test/CodeGen/ARM/fpconv.ll
+++ b/test/CodeGen/ARM/fpconv.ll
@@ -1,81 +1,101 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+vfp2 > %t
-; RUN: grep fcvtsd %t
-; RUN: grep fcvtds %t
-; RUN: grep ftosizs %t
-; RUN: grep ftouizs %t
-; RUN: grep ftosizd %t
-; RUN: grep ftouizd %t
-; RUN: grep fsitos %t
-; RUN: grep fsitod %t
-; RUN: grep fuitos %t
-; RUN: grep fuitod %t
-; RUN: llvm-as < %s | llc -march=arm > %t
-; RUN: grep truncdfsf2 %t
-; RUN: grep extendsfdf2 %t
-; RUN: grep fixsfsi %t
-; RUN: grep fixunssfsi %t
-; RUN: grep fixdfsi %t
-; RUN: grep fixunsdfsi %t
-; RUN: grep floatsisf %t
-; RUN: grep floatsidf %t
-; RUN: grep floatunsisf %t
-; RUN: grep floatunsidf %t
+; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s --check-prefix=CHECK-VFP
+; RUN: llc < %s -march=arm | FileCheck %s
define float @f1(double %x) {
+;CHECK-VFP: f1:
+;CHECK-VFP: fcvtsd
+;CHECK: f1:
+;CHECK: truncdfsf2
entry:
%tmp1 = fptrunc double %x to float ; <float> [#uses=1]
ret float %tmp1
}
define double @f2(float %x) {
+;CHECK-VFP: f2:
+;CHECK-VFP: fcvtds
+;CHECK: f2:
+;CHECK: extendsfdf2
entry:
%tmp1 = fpext float %x to double ; <double> [#uses=1]
ret double %tmp1
}
define i32 @f3(float %x) {
+;CHECK-VFP: f3:
+;CHECK-VFP: ftosizs
+;CHECK: f3:
+;CHECK: fixsfsi
entry:
%tmp = fptosi float %x to i32 ; <i32> [#uses=1]
ret i32 %tmp
}
define i32 @f4(float %x) {
+;CHECK-VFP: f4:
+;CHECK-VFP: ftouizs
+;CHECK: f4:
+;CHECK: fixunssfsi
entry:
%tmp = fptoui float %x to i32 ; <i32> [#uses=1]
ret i32 %tmp
}
define i32 @f5(double %x) {
+;CHECK-VFP: f5:
+;CHECK-VFP: ftosizd
+;CHECK: f5:
+;CHECK: fixdfsi
entry:
%tmp = fptosi double %x to i32 ; <i32> [#uses=1]
ret i32 %tmp
}
define i32 @f6(double %x) {
+;CHECK-VFP: f6:
+;CHECK-VFP: ftouizd
+;CHECK: f6:
+;CHECK: fixunsdfsi
entry:
%tmp = fptoui double %x to i32 ; <i32> [#uses=1]
ret i32 %tmp
}
define float @f7(i32 %a) {
+;CHECK-VFP: f7:
+;CHECK-VFP: fsitos
+;CHECK: f7:
+;CHECK: floatsisf
entry:
%tmp = sitofp i32 %a to float ; <float> [#uses=1]
ret float %tmp
}
define double @f8(i32 %a) {
+;CHECK-VFP: f8:
+;CHECK-VFP: fsitod
+;CHECK: f8:
+;CHECK: floatsidf
entry:
%tmp = sitofp i32 %a to double ; <double> [#uses=1]
ret double %tmp
}
define float @f9(i32 %a) {
+;CHECK-VFP: f9:
+;CHECK-VFP: fuitos
+;CHECK: f9:
+;CHECK: floatunsisf
entry:
%tmp = uitofp i32 %a to float ; <float> [#uses=1]
ret float %tmp
}
define double @f10(i32 %a) {
+;CHECK-VFP: f10:
+;CHECK-VFP: fuitod
+;CHECK: f10:
+;CHECK: floatunsidf
entry:
%tmp = uitofp i32 %a to double ; <double> [#uses=1]
ret double %tmp
diff --git a/test/CodeGen/ARM/fpmem.ll b/test/CodeGen/ARM/fpmem.ll
index 13653bb..fa897bf 100644
--- a/test/CodeGen/ARM/fpmem.ll
+++ b/test/CodeGen/ARM/fpmem.ll
@@ -1,8 +1,8 @@
-; RUN: llvm-as < %s | llc -march=arm | \
+; RUN: llc < %s -march=arm | \
; RUN: grep {mov r0, #0} | count 1
-; RUN: llvm-as < %s | llc -march=arm -mattr=+vfp2 | \
+; RUN: llc < %s -march=arm -mattr=+vfp2 | \
; RUN: grep {flds.*\\\[} | count 1
-; RUN: llvm-as < %s | llc -march=arm -mattr=+vfp2 | \
+; RUN: llc < %s -march=arm -mattr=+vfp2 | \
; RUN: grep {fsts.*\\\[} | count 1
define float @f1(float %a) {
diff --git a/test/CodeGen/ARM/fpow.ll b/test/CodeGen/ARM/fpow.ll
index 461a2c9..6d48792 100644
--- a/test/CodeGen/ARM/fpow.ll
+++ b/test/CodeGen/ARM/fpow.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm
+; RUN: llc < %s -march=arm
define double @t(double %x, double %y) nounwind optsize {
entry:
diff --git a/test/CodeGen/ARM/fpowi.ll b/test/CodeGen/ARM/fpowi.ll
index ab09fff..174106b 100644
--- a/test/CodeGen/ARM/fpowi.ll
+++ b/test/CodeGen/ARM/fpowi.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-linux-gnueabi | grep powidf2
+; RUN: llc < %s -mtriple=arm-linux-gnueabi | grep powidf2
; PR1287
; ModuleID = '<stdin>'
diff --git a/test/CodeGen/ARM/fptoint.ll b/test/CodeGen/ARM/fptoint.ll
index 41168ac..0d270b0 100644
--- a/test/CodeGen/ARM/fptoint.ll
+++ b/test/CodeGen/ARM/fptoint.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+v6,+vfp2 | grep fmrs | count 1
-; RUN: llvm-as < %s | llc -march=arm -mattr=+v6,+vfp2 | not grep fmrrd
+; RUN: llc < %s -march=arm -mattr=+v6,+vfp2 | grep fmrs | count 1
+; RUN: llc < %s -march=arm -mattr=+v6,+vfp2 | not grep fmrrd
@i = weak global i32 0 ; <i32*> [#uses=2]
@u = weak global i32 0 ; <i32*> [#uses=2]
diff --git a/test/CodeGen/ARM/fsubs.ll b/test/CodeGen/ARM/fsubs.ll
new file mode 100644
index 0000000..060dd46
--- /dev/null
+++ b/test/CodeGen/ARM/fsubs.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s -march=arm -mattr=+vfp2 | grep -E {fsubs\\W*s\[0-9\]+,\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=1 | grep -E {vsub.f32\\W*d\[0-9\]+,\\W*d\[0-9\]+,\\W*d\[0-9\]+} | count 1
+; RUN: llc < %s -march=arm -mattr=+neon -arm-use-neon-fp=0 | grep -E {fsubs\\W*s\[0-9\]+,\\W*s\[0-9\]+,\\W*s\[0-9\]+} | count 1
+
+define float @test(float %a, float %b) {
+entry:
+ %0 = fsub float %a, %b
+ ret float %0
+}
+
diff --git a/test/CodeGen/ARM/hardfloat_neon.ll b/test/CodeGen/ARM/hardfloat_neon.ll
new file mode 100644
index 0000000..4abf04b
--- /dev/null
+++ b/test/CodeGen/ARM/hardfloat_neon.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -mtriple=arm-linux-gnueabi -mattr=+neon -float-abi=hard
+
+define <16 x i8> @vmulQi8_reg(<16 x i8> %A, <16 x i8> %B) nounwind {
+ %tmp1 = mul <16 x i8> %A, %B
+ ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @f(<16 x i8> %a, <16 x i8> %b) {
+ %tmp = call <16 x i8> @g(<16 x i8> %b)
+ ret <16 x i8> %tmp
+}
+
+declare <16 x i8> @g(<16 x i8>)
diff --git a/test/CodeGen/ARM/hello.ll b/test/CodeGen/ARM/hello.ll
index 16231da..ccdc7bf 100644
--- a/test/CodeGen/ARM/hello.ll
+++ b/test/CodeGen/ARM/hello.ll
@@ -1,8 +1,8 @@
-; RUN: llvm-as < %s | llc -march=arm
-; RUN: llvm-as < %s | llc -mtriple=arm-linux-gnueabi | grep mov | count 1
-; RUN: llvm-as < %s | llc -mtriple=arm-linux-gnu --disable-fp-elim | \
+; RUN: llc < %s -march=arm
+; RUN: llc < %s -mtriple=arm-linux-gnueabi | grep mov | count 1
+; RUN: llc < %s -mtriple=arm-linux-gnu --disable-fp-elim | \
; RUN: grep mov | count 3
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin | grep mov | count 2
+; RUN: llc < %s -mtriple=arm-apple-darwin | grep mov | count 2
@str = internal constant [12 x i8] c"Hello World\00"
diff --git a/test/CodeGen/ARM/hidden-vis-2.ll b/test/CodeGen/ARM/hidden-vis-2.ll
index 6cf69aa..90f5308 100644
--- a/test/CodeGen/ARM/hidden-vis-2.ll
+++ b/test/CodeGen/ARM/hidden-vis-2.ll
@@ -1,9 +1,12 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin | grep ldr | count 2
+; RUN: llc < %s -mtriple=arm-apple-darwin | FileCheck %s
@x = weak hidden global i32 0 ; <i32*> [#uses=1]
define i32 @t() nounwind readonly {
entry:
+; CHECK: t:
+; CHECK: ldr
+; CHECK-NEXT: ldr
%0 = load i32* @x, align 4 ; <i32> [#uses=1]
ret i32 %0
}
diff --git a/test/CodeGen/ARM/hidden-vis-3.ll b/test/CodeGen/ARM/hidden-vis-3.ll
index 4477f2a..3bd710a 100644
--- a/test/CodeGen/ARM/hidden-vis-3.ll
+++ b/test/CodeGen/ARM/hidden-vis-3.ll
@@ -1,12 +1,15 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin | grep ldr | count 6
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin | grep non_lazy_ptr
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin | grep long | count 4
+; RUN: llc < %s -mtriple=arm-apple-darwin9 | FileCheck %s
@x = external hidden global i32 ; <i32*> [#uses=1]
@y = extern_weak hidden global i32 ; <i32*> [#uses=1]
define i32 @t() nounwind readonly {
entry:
+; CHECK: LCPI1_0:
+; CHECK-NEXT: .long _x
+; CHECK: LCPI1_1:
+; CHECK-NEXT: .long _y
+
%0 = load i32* @x, align 4 ; <i32> [#uses=1]
%1 = load i32* @y, align 4 ; <i32> [#uses=1]
%2 = add i32 %1, %0 ; <i32> [#uses=1]
diff --git a/test/CodeGen/ARM/hidden-vis.ll b/test/CodeGen/ARM/hidden-vis.ll
index 93f81ec..3544ae8 100644
--- a/test/CodeGen/ARM/hidden-vis.ll
+++ b/test/CodeGen/ARM/hidden-vis.ll
@@ -1,18 +1,23 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin | \
-; RUN: grep .private_extern | count 2
+; RUN: llc < %s -mtriple=arm-linux | FileCheck %s -check-prefix=LINUX
+; RUN: llc < %s -mtriple=arm-apple-darwin | FileCheck %s -check-prefix=DARWIN
-%struct.Person = type { i32 }
@a = hidden global i32 0
@b = external global i32
+define weak hidden void @t1() nounwind {
+; LINUX: .hidden t1
+; LINUX: t1:
-define weak hidden void @_ZN6Person13privateMethodEv(%struct.Person* %this) {
+; DARWIN: .private_extern _t1
+; DARWIN: t1:
ret void
}
-declare void @function(i32)
+define weak void @t2() nounwind {
+; LINUX: t2:
+; LINUX: .hidden a
-define weak void @_ZN6PersonC1Ei(%struct.Person* %this, i32 %_c) {
+; DARWIN: t2:
+; DARWIN: .private_extern _a
ret void
}
-
diff --git a/test/CodeGen/ARM/iabs.ll b/test/CodeGen/ARM/iabs.ll
index ede6d74..1054f27 100644
--- a/test/CodeGen/ARM/iabs.ll
+++ b/test/CodeGen/ARM/iabs.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -stats |& \
+; RUN: llc < %s -march=arm -stats |& \
; RUN: grep {3 .*Number of machine instrs printed}
;; Integer absolute value, should produce something as good as: ARM:
diff --git a/test/CodeGen/ARM/ifcvt1.ll b/test/CodeGen/ARM/ifcvt1.ll
index 7d42955..e6aa044 100644
--- a/test/CodeGen/ARM/ifcvt1.ll
+++ b/test/CodeGen/ARM/ifcvt1.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llc -march=arm
-; RUN: llvm-as < %s | llc -march=arm | grep bx | count 1
+; RUN: llc < %s -march=arm
+; RUN: llc < %s -march=arm | grep bx | count 1
define i32 @t1(i32 %a, i32 %b) {
%tmp2 = icmp eq i32 %a, 0
diff --git a/test/CodeGen/ARM/ifcvt2.ll b/test/CodeGen/ARM/ifcvt2.ll
index 3942061..ce57d73 100644
--- a/test/CodeGen/ARM/ifcvt2.ll
+++ b/test/CodeGen/ARM/ifcvt2.ll
@@ -1,7 +1,7 @@
-; RUN: llvm-as < %s | llc -march=arm
-; RUN: llvm-as < %s | llc -march=arm | grep bxlt | count 1
-; RUN: llvm-as < %s | llc -march=arm | grep bxgt | count 1
-; RUN: llvm-as < %s | llc -march=arm | grep bxge | count 1
+; RUN: llc < %s -march=arm
+; RUN: llc < %s -march=arm | grep bxlt | count 1
+; RUN: llc < %s -march=arm | grep bxgt | count 1
+; RUN: llc < %s -march=arm | grep bxge | count 1
define i32 @t1(i32 %a, i32 %b, i32 %c, i32 %d) {
%tmp2 = icmp sgt i32 %c, 10
diff --git a/test/CodeGen/ARM/ifcvt3.ll b/test/CodeGen/ARM/ifcvt3.ll
index 620bcbe..f7ebac6 100644
--- a/test/CodeGen/ARM/ifcvt3.ll
+++ b/test/CodeGen/ARM/ifcvt3.ll
@@ -1,6 +1,6 @@
-; RUN: llvm-as < %s | llc -march=arm
-; RUN: llvm-as < %s | llc -march=arm | grep cmpne | count 1
-; RUN: llvm-as < %s | llc -march=arm | grep bx | count 2
+; RUN: llc < %s -march=arm
+; RUN: llc < %s -march=arm | grep cmpne | count 1
+; RUN: llc < %s -march=arm | grep bx | count 2
define i32 @t1(i32 %a, i32 %b, i32 %c, i32 %d) {
switch i32 %c, label %cond_next [
diff --git a/test/CodeGen/ARM/ifcvt4.ll b/test/CodeGen/ARM/ifcvt4.ll
index ce5a679..f28c61b 100644
--- a/test/CodeGen/ARM/ifcvt4.ll
+++ b/test/CodeGen/ARM/ifcvt4.ll
@@ -1,6 +1,6 @@
-; RUN: llvm-as < %s | llc -march=arm
-; RUN: llvm-as < %s | llc -march=arm | grep subgt | count 1
-; RUN: llvm-as < %s | llc -march=arm | grep suble | count 1
+; RUN: llc < %s -march=arm
+; RUN: llc < %s -march=arm | grep subgt | count 1
+; RUN: llc < %s -march=arm | grep suble | count 1
; FIXME: Check for # of unconditional branch after adding branch folding post ifcvt.
define i32 @t(i32 %a, i32 %b) {
diff --git a/test/CodeGen/ARM/ifcvt5.ll b/test/CodeGen/ARM/ifcvt5.ll
index f8d4f82..e9145ac 100644
--- a/test/CodeGen/ARM/ifcvt5.ll
+++ b/test/CodeGen/ARM/ifcvt5.ll
@@ -1,5 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm
-; RUN: llvm-as < %s | llc -march=arm | grep blge | count 1
+; RUN: llc < %s -mtriple=armv7-apple-darwin | FileCheck %s
@x = external global i32* ; <i32**> [#uses=1]
@@ -11,6 +10,8 @@ entry:
}
define void @t1(i32 %a, i32 %b) {
+; CHECK: t1:
+; CHECK: ldmltfd sp!, {r7, pc}
entry:
%tmp1 = icmp sgt i32 %a, 10 ; <i1> [#uses=1]
br i1 %tmp1, label %cond_true, label %UnifiedReturnBlock
diff --git a/test/CodeGen/ARM/ifcvt6.ll b/test/CodeGen/ARM/ifcvt6.ll
index 63c4a08..5824115 100644
--- a/test/CodeGen/ARM/ifcvt6.ll
+++ b/test/CodeGen/ARM/ifcvt6.ll
@@ -1,10 +1,6 @@
-; RUN: llvm-as < %s | \
-; RUN: llc -march=arm -mtriple=arm-apple-darwin
-; RUN: llvm-as < %s | \
-; RUN: llc -march=arm -mtriple=arm-apple-darwin | \
+; RUN: llc < %s -march=arm -mtriple=arm-apple-darwin | \
; RUN: grep cmpne | count 1
-; RUN: llvm-as < %s | \
-; RUN: llc -march=arm -mtriple=arm-apple-darwin | \
+; RUN: llc < %s -march=arm -mtriple=arm-apple-darwin | \
; RUN: grep ldmhi | count 1
define void @foo(i32 %X, i32 %Y) {
diff --git a/test/CodeGen/ARM/ifcvt7.ll b/test/CodeGen/ARM/ifcvt7.ll
index 6bb4b56..f9cf88f 100644
--- a/test/CodeGen/ARM/ifcvt7.ll
+++ b/test/CodeGen/ARM/ifcvt7.ll
@@ -1,13 +1,8 @@
-; RUN: llvm-as < %s | \
-; RUN: llc -march=arm -mtriple=arm-apple-darwin
-; RUN: llvm-as < %s | \
-; RUN: llc -march=arm -mtriple=arm-apple-darwin | \
+; RUN: llc < %s -march=arm -mtriple=arm-apple-darwin | \
; RUN: grep cmpeq | count 1
-; RUN: llvm-as < %s | \
-; RUN: llc -march=arm -mtriple=arm-apple-darwin | \
+; RUN: llc < %s -march=arm -mtriple=arm-apple-darwin | \
; RUN: grep moveq | count 1
-; RUN: llvm-as < %s | \
-; RUN: llc -march=arm -mtriple=arm-apple-darwin | \
+; RUN: llc < %s -march=arm -mtriple=arm-apple-darwin | \
; RUN: grep ldmeq | count 1
; FIXME: Need post-ifcvt branch folding to get rid of the extra br at end of BB1.
diff --git a/test/CodeGen/ARM/ifcvt8.ll b/test/CodeGen/ARM/ifcvt8.ll
index 85bd8c7..6cb8e7b 100644
--- a/test/CodeGen/ARM/ifcvt8.ll
+++ b/test/CodeGen/ARM/ifcvt8.ll
@@ -1,7 +1,4 @@
-; RUN: llvm-as < %s | \
-; RUN: llc -march=arm -mtriple=arm-apple-darwin
-; RUN: llvm-as < %s | \
-; RUN: llc -march=arm -mtriple=arm-apple-darwin | \
+; RUN: llc < %s -march=arm -mtriple=arm-apple-darwin | \
; RUN: grep ldmne | count 1
%struct.SString = type { i8*, i32, i32 }
diff --git a/test/CodeGen/ARM/ifcvt9.ll b/test/CodeGen/ARM/ifcvt9.ll
index bbd2f2e..05bdc45 100644
--- a/test/CodeGen/ARM/ifcvt9.ll
+++ b/test/CodeGen/ARM/ifcvt9.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm
+; RUN: llc < %s -march=arm
define fastcc void @t() nounwind {
entry:
diff --git a/test/CodeGen/ARM/illegal-vector-bitcast.ll b/test/CodeGen/ARM/illegal-vector-bitcast.ll
index ad24eb5..febe6f5 100644
--- a/test/CodeGen/ARM/illegal-vector-bitcast.ll
+++ b/test/CodeGen/ARM/illegal-vector-bitcast.ll
@@ -1,4 +1,5 @@
-; RUN: llvm-as < %s | llc -march=arm
+; RUN: llc < %s -march=arm
+; RUN: llc < %s -mtriple=arm-linux
define void @foo(<8 x float>* %f, <8 x float>* %g, <4 x i64>* %y)
{
diff --git a/test/CodeGen/ARM/imm.ll b/test/CodeGen/ARM/imm.ll
index 998adba..6f25f9d 100644
--- a/test/CodeGen/ARM/imm.ll
+++ b/test/CodeGen/ARM/imm.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm | not grep CPI
+; RUN: llc < %s -march=arm | not grep CPI
define i32 @test1(i32 %A) {
%B = add i32 %A, -268435441 ; <i32> [#uses=1]
diff --git a/test/CodeGen/ARM/inlineasm-imm-arm.ll b/test/CodeGen/ARM/inlineasm-imm-arm.ll
index 2ceceae..45dfcf0 100644
--- a/test/CodeGen/ARM/inlineasm-imm-arm.ll
+++ b/test/CodeGen/ARM/inlineasm-imm-arm.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm
+; RUN: llc < %s -march=arm
; Test ARM-mode "I" constraint, for any Data Processing immediate.
define i32 @testI(i32 %x) {
diff --git a/test/CodeGen/ARM/inlineasm.ll b/test/CodeGen/ARM/inlineasm.ll
index 2f7332a..d522348 100644
--- a/test/CodeGen/ARM/inlineasm.ll
+++ b/test/CodeGen/ARM/inlineasm.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+v6
+; RUN: llc < %s -march=arm -mattr=+v6
define i32 @test1(i32 %tmp54) {
%tmp56 = tail call i32 asm "uxtb16 $0,$1", "=r,r"( i32 %tmp54 ) ; <i32> [#uses=1]
diff --git a/test/CodeGen/ARM/inlineasm2.ll b/test/CodeGen/ARM/inlineasm2.ll
index 69394eb..a99bccf 100644
--- a/test/CodeGen/ARM/inlineasm2.ll
+++ b/test/CodeGen/ARM/inlineasm2.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+vfp2
+; RUN: llc < %s -march=arm -mattr=+vfp2
define double @__ieee754_sqrt(double %x) {
%tmp2 = tail call double asm "fsqrtd ${0:P}, ${1:P}", "=w,w"( double %x )
diff --git a/test/CodeGen/ARM/insn-sched1.ll b/test/CodeGen/ARM/insn-sched1.ll
index f203443..59f0d53 100644
--- a/test/CodeGen/ARM/insn-sched1.ll
+++ b/test/CodeGen/ARM/insn-sched1.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+v6
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin -mattr=+v6 |\
+; RUN: llc < %s -march=arm -mattr=+v6
+; RUN: llc < %s -mtriple=arm-apple-darwin -mattr=+v6 |\
; RUN: grep mov | count 3
define i32 @test(i32 %x) {
diff --git a/test/CodeGen/ARM/ispositive.ll b/test/CodeGen/ARM/ispositive.ll
index 7e8eb42..5116ac8 100644
--- a/test/CodeGen/ARM/ispositive.ll
+++ b/test/CodeGen/ARM/ispositive.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm | grep {mov r0, r0, lsr #31}
+; RUN: llc < %s -march=arm | grep {mov r0, r0, lsr #31}
define i32 @test1(i32 %X) {
entry:
diff --git a/test/CodeGen/ARM/large-stack.ll b/test/CodeGen/ARM/large-stack.ll
index b1738a4..ddf0f0e 100644
--- a/test/CodeGen/ARM/large-stack.ll
+++ b/test/CodeGen/ARM/large-stack.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm
+; RUN: llc < %s -march=arm
define void @test1() {
%tmp = alloca [ 64 x i32 ] , align 4
diff --git a/test/CodeGen/ARM/ldm.ll b/test/CodeGen/ARM/ldm.ll
index 6a05457..774b3c0 100644
--- a/test/CodeGen/ARM/ldm.ll
+++ b/test/CodeGen/ARM/ldm.ll
@@ -1,8 +1,8 @@
-; RUN: llvm-as < %s | llc -march=arm | \
+; RUN: llc < %s -march=arm | \
; RUN: grep ldmia | count 2
-; RUN: llvm-as < %s | llc -march=arm | \
+; RUN: llc < %s -march=arm | \
; RUN: grep ldmib | count 1
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin | \
+; RUN: llc < %s -mtriple=arm-apple-darwin | \
; RUN: grep {ldmfd sp\!} | count 3
@X = external global [0 x i32] ; <[0 x i32]*> [#uses=5]
diff --git a/test/CodeGen/ARM/ldr.ll b/test/CodeGen/ARM/ldr.ll
index ea99655..954fb5b 100644
--- a/test/CodeGen/ARM/ldr.ll
+++ b/test/CodeGen/ARM/ldr.ll
@@ -1,8 +1,8 @@
-; RUN: llvm-as < %s | llc -march=arm | grep {ldr r0} | count 7
-; RUN: llvm-as < %s | llc -march=arm | grep mov | grep 1
-; RUN: llvm-as < %s | llc -march=arm | not grep mvn
-; RUN: llvm-as < %s | llc -march=arm | grep ldr | grep lsl
-; RUN: llvm-as < %s | llc -march=arm | grep ldr | grep lsr
+; RUN: llc < %s -march=arm | grep {ldr r0} | count 7
+; RUN: llc < %s -march=arm | grep mov | grep 1
+; RUN: llc < %s -march=arm | not grep mvn
+; RUN: llc < %s -march=arm | grep ldr | grep lsl
+; RUN: llc < %s -march=arm | grep ldr | grep lsr
define i32 @f1(i32* %v) {
entry:
diff --git a/test/CodeGen/ARM/ldr_ext.ll b/test/CodeGen/ARM/ldr_ext.ll
index b99c721..d29eb02 100644
--- a/test/CodeGen/ARM/ldr_ext.ll
+++ b/test/CodeGen/ARM/ldr_ext.ll
@@ -1,27 +1,36 @@
-; RUN: llvm-as < %s | llc -march=arm | grep ldrb | count 1
-; RUN: llvm-as < %s | llc -march=arm | grep ldrh | count 1
-; RUN: llvm-as < %s | llc -march=arm | grep ldrsb | count 1
-; RUN: llvm-as < %s | llc -march=arm | grep ldrsh | count 1
+; RUN: llc < %s -march=arm | FileCheck %s
-define i32 @test1(i8* %v.pntr.s0.u1) {
- %tmp.u = load i8* %v.pntr.s0.u1
+define i32 @test1(i8* %t1) nounwind {
+; CHECK: ldrb
+ %tmp.u = load i8* %t1
%tmp1.s = zext i8 %tmp.u to i32
ret i32 %tmp1.s
}
-define i32 @test2(i16* %v.pntr.s0.u1) {
- %tmp.u = load i16* %v.pntr.s0.u1
+define i32 @test2(i16* %t1) nounwind {
+; CHECK: ldrh
+ %tmp.u = load i16* %t1
%tmp1.s = zext i16 %tmp.u to i32
ret i32 %tmp1.s
}
-define i32 @test3(i8* %v.pntr.s1.u0) {
- %tmp.s = load i8* %v.pntr.s1.u0
+define i32 @test3(i8* %t0) nounwind {
+; CHECK: ldrsb
+ %tmp.s = load i8* %t0
%tmp1.s = sext i8 %tmp.s to i32
ret i32 %tmp1.s
}
-define i32 @test4() {
+define i32 @test4(i16* %t0) nounwind {
+; CHECK: ldrsh
+ %tmp.s = load i16* %t0
+ %tmp1.s = sext i16 %tmp.s to i32
+ ret i32 %tmp1.s
+}
+
+define i32 @test5() nounwind {
+; CHECK: mov r0, #0
+; CHECK: ldrsh
%tmp.s = load i16* null
%tmp1.s = sext i16 %tmp.s to i32
ret i32 %tmp1.s
diff --git a/test/CodeGen/ARM/ldr_frame.ll b/test/CodeGen/ARM/ldr_frame.ll
index 4431506..a3abdb6 100644
--- a/test/CodeGen/ARM/ldr_frame.ll
+++ b/test/CodeGen/ARM/ldr_frame.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm | not grep mov
+; RUN: llc < %s -march=arm | not grep mov
define i32 @f1() {
%buf = alloca [32 x i32], align 4
diff --git a/test/CodeGen/ARM/ldr_post.ll b/test/CodeGen/ARM/ldr_post.ll
index 0491563..97a48e1 100644
--- a/test/CodeGen/ARM/ldr_post.ll
+++ b/test/CodeGen/ARM/ldr_post.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm | \
+; RUN: llc < %s -march=arm | \
; RUN: grep {ldr.*\\\[.*\],} | count 1
define i32 @test(i32 %a, i32 %b, i32 %c) {
diff --git a/test/CodeGen/ARM/ldr_pre.ll b/test/CodeGen/ARM/ldr_pre.ll
index 7e44742..7c44284 100644
--- a/test/CodeGen/ARM/ldr_pre.ll
+++ b/test/CodeGen/ARM/ldr_pre.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm | \
+; RUN: llc < %s -march=arm | \
; RUN: grep {ldr.*\\!} | count 2
define i32* @test1(i32* %X, i32* %dest) {
diff --git a/test/CodeGen/ARM/ldrd.ll b/test/CodeGen/ARM/ldrd.ll
index f1bee05..8f7ae55 100644
--- a/test/CodeGen/ARM/ldrd.ll
+++ b/test/CodeGen/ARM/ldrd.ll
@@ -1,12 +1,20 @@
-; RUN: llvm-as < %s | llc -mtriple=armv6-apple-darwin | grep ldrd
-; RUN: llvm-as < %s | llc -mtriple=armv5-apple-darwin | not grep ldrd
-; RUN: llvm-as < %s | llc -mtriple=armv6-eabi | not grep ldrd
+; RUN: llc < %s -mtriple=armv6-apple-darwin | FileCheck %s -check-prefix=V6
+; RUN: llc < %s -mtriple=armv5-apple-darwin | FileCheck %s -check-prefix=V5
+; RUN: llc < %s -mtriple=armv6-eabi | FileCheck %s -check-prefix=EABI
; rdar://r6949835
@b = external global i64*
define i64 @t(i64 %a) nounwind readonly {
entry:
+;V6: ldrd r2, [r2]
+
+;V5: ldr r3, [r2]
+;V5-NEXT: ldr r2, [r2, #+4]
+
+;EABI: ldr r3, [r2]
+;EABI-NEXT: ldr r2, [r2, #+4]
+
%0 = load i64** @b, align 4
%1 = load i64* %0, align 4
%2 = mul i64 %1, %a
diff --git a/test/CodeGen/ARM/load-global.ll b/test/CodeGen/ARM/load-global.ll
index 8896ead..56a4a47 100644
--- a/test/CodeGen/ARM/load-global.ll
+++ b/test/CodeGen/ARM/load-global.ll
@@ -1,14 +1,10 @@
-; RUN: llvm-as < %s | \
-; RUN: llc -mtriple=arm-apple-darwin -relocation-model=static | \
+; RUN: llc < %s -mtriple=arm-apple-darwin -relocation-model=static | \
; RUN: not grep {L_G\$non_lazy_ptr}
-; RUN: llvm-as < %s | \
-; RUN: llc -mtriple=arm-apple-darwin -relocation-model=dynamic-no-pic | \
+; RUN: llc < %s -mtriple=arm-apple-darwin -relocation-model=dynamic-no-pic | \
; RUN: grep {L_G\$non_lazy_ptr} | count 2
-; RUN: llvm-as < %s | \
-; RUN: llc -mtriple=arm-apple-darwin -relocation-model=pic | \
+; RUN: llc < %s -mtriple=arm-apple-darwin -relocation-model=pic | \
; RUN: grep {ldr.*pc} | count 1
-; RUN: llvm-as < %s | \
-; RUN: llc -mtriple=arm-linux-gnueabi -relocation-model=pic | \
+; RUN: llc < %s -mtriple=arm-linux-gnueabi -relocation-model=pic | \
; RUN: grep {GOT} | count 1
@G = external global i32
diff --git a/test/CodeGen/ARM/load.ll b/test/CodeGen/ARM/load.ll
index 0509732..253b0e1 100644
--- a/test/CodeGen/ARM/load.ll
+++ b/test/CodeGen/ARM/load.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm > %t
+; RUN: llc < %s -march=arm > %t
; RUN: grep ldrsb %t
; RUN: grep ldrb %t
; RUN: grep ldrsh %t
diff --git a/test/CodeGen/ARM/long-setcc.ll b/test/CodeGen/ARM/long-setcc.ll
index 4bab330..c76a5e4 100644
--- a/test/CodeGen/ARM/long-setcc.ll
+++ b/test/CodeGen/ARM/long-setcc.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm | grep cmp | count 1
+; RUN: llc < %s -march=arm | grep cmp | count 1
define i1 @t1(i64 %x) {
diff --git a/test/CodeGen/ARM/long.ll b/test/CodeGen/ARM/long.ll
index fe0ee54..2fcaac0 100644
--- a/test/CodeGen/ARM/long.ll
+++ b/test/CodeGen/ARM/long.ll
@@ -1,13 +1,13 @@
-; RUN: llvm-as < %s | llc -march=arm -asm-verbose | \
+; RUN: llc < %s -march=arm -asm-verbose | \
; RUN: grep -- {-2147483648} | count 3
-; RUN: llvm-as < %s | llc -march=arm | grep mvn | count 3
-; RUN: llvm-as < %s | llc -march=arm | grep adds | count 1
-; RUN: llvm-as < %s | llc -march=arm | grep adc | count 1
-; RUN: llvm-as < %s | llc -march=arm | grep {subs } | count 1
-; RUN: llvm-as < %s | llc -march=arm | grep sbc | count 1
-; RUN: llvm-as < %s | llc -march=arm | \
+; RUN: llc < %s -march=arm | grep mvn | count 3
+; RUN: llc < %s -march=arm | grep adds | count 1
+; RUN: llc < %s -march=arm | grep adc | count 1
+; RUN: llc < %s -march=arm | grep {subs } | count 1
+; RUN: llc < %s -march=arm | grep sbc | count 1
+; RUN: llc < %s -march=arm | \
; RUN: grep smull | count 1
-; RUN: llvm-as < %s | llc -march=arm | \
+; RUN: llc < %s -march=arm | \
; RUN: grep umull | count 1
define i64 @f1() {
diff --git a/test/CodeGen/ARM/long_shift.ll b/test/CodeGen/ARM/long_shift.ll
index 55d0cdc..057b5f0 100644
--- a/test/CodeGen/ARM/long_shift.ll
+++ b/test/CodeGen/ARM/long_shift.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm > %t
+; RUN: llc < %s -march=arm > %t
; RUN: grep rrx %t | count 1
; RUN: grep __ashldi3 %t
; RUN: grep __ashrdi3 %t
diff --git a/test/CodeGen/ARM/lsr-code-insertion.ll b/test/CodeGen/ARM/lsr-code-insertion.ll
index 3881e91..507ec2c 100644
--- a/test/CodeGen/ARM/lsr-code-insertion.ll
+++ b/test/CodeGen/ARM/lsr-code-insertion.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llc -stats |& grep {39.*Number of machine instrs printed}
-; RUN: llvm-as < %s | llc -stats |& grep {.*Number of re-materialization}
+; RUN: llc < %s -stats |& grep {40.*Number of machine instrs printed}
+; RUN: llc < %s -stats |& grep {.*Number of re-materialization}
; This test really wants to check that the resultant "cond_true" block only
; has a single store in it, and that cond_true55 only has code to materialize
; the constant and do a store. We do *not* want something like this:
diff --git a/test/CodeGen/ARM/lsr-scale-addr-mode.ll b/test/CodeGen/ARM/lsr-scale-addr-mode.ll
index 02902f2..8130019 100644
--- a/test/CodeGen/ARM/lsr-scale-addr-mode.ll
+++ b/test/CodeGen/ARM/lsr-scale-addr-mode.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm | grep lsl | grep -F {lsl #2\]}
+; RUN: llc < %s -march=arm | grep lsl | grep -F {lsl #2\]}
; Should use scaled addressing mode.
define void @sintzero(i32* %a) nounwind {
diff --git a/test/CodeGen/ARM/mem.ll b/test/CodeGen/ARM/mem.ll
index e983165..f46c7a5 100644
--- a/test/CodeGen/ARM/mem.ll
+++ b/test/CodeGen/ARM/mem.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llc -march=arm | grep strb
-; RUN: llvm-as < %s | llc -march=arm | grep strh
+; RUN: llc < %s -march=arm | grep strb
+; RUN: llc < %s -march=arm | grep strh
define void @f1() {
entry:
diff --git a/test/CodeGen/ARM/memcpy-inline.ll b/test/CodeGen/ARM/memcpy-inline.ll
index 4bf0b4f..ed20c32 100644
--- a/test/CodeGen/ARM/memcpy-inline.ll
+++ b/test/CodeGen/ARM/memcpy-inline.ll
@@ -1,7 +1,7 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin | grep ldmia
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin | grep stmia
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin | grep ldrb
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin | grep ldrh
+; RUN: llc < %s -mtriple=arm-apple-darwin | grep ldmia
+; RUN: llc < %s -mtriple=arm-apple-darwin | grep stmia
+; RUN: llc < %s -mtriple=arm-apple-darwin | grep ldrb
+; RUN: llc < %s -mtriple=arm-apple-darwin | grep ldrh
%struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 }
@src = external global %struct.x
diff --git a/test/CodeGen/ARM/memfunc.ll b/test/CodeGen/ARM/memfunc.ll
index 0b58bf6..41d5944 100644
--- a/test/CodeGen/ARM/memfunc.ll
+++ b/test/CodeGen/ARM/memfunc.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm
+; RUN: llc < %s -march=arm
define void @f() {
entry:
diff --git a/test/CodeGen/ARM/mls.ll b/test/CodeGen/ARM/mls.ll
new file mode 100644
index 0000000..85407fa
--- /dev/null
+++ b/test/CodeGen/ARM/mls.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -march=arm -mattr=+v6t2 | grep {mls\\W*r\[0-9\],\\W*r\[0-9\],\\W*r\[0-9\],\\W*r\[0-9\]} | count 1
+
+define i32 @f1(i32 %a, i32 %b, i32 %c) {
+ %tmp1 = mul i32 %a, %b
+ %tmp2 = sub i32 %c, %tmp1
+ ret i32 %tmp2
+}
+
+; sub doesn't commute, so no mls for this one
+define i32 @f2(i32 %a, i32 %b, i32 %c) {
+ %tmp1 = mul i32 %a, %b
+ %tmp2 = sub i32 %tmp1, %c
+ ret i32 %tmp2
+}
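
The RUN line above expects exactly one mls: ARM's MLS computes Rd = Ra - Rn*Rm (accumulator minus product), so the c - a*b form in f1 maps directly onto it, while the a*b - c form in f2 does not, as the comment notes. A small C sketch of that distinction, purely illustrative:

#include <assert.h>
#include <stdint.h>

/* Semantics of ARM MLS: accumulator minus product. */
static uint32_t mls(uint32_t n, uint32_t m, uint32_t a) { return a - n * m; }

int main(void) {
  uint32_t a = 7, b = 9, c = 100;
  assert(mls(a, b, c) == c - a * b);  /* f1: the pattern MLS implements */
  assert(mls(a, b, c) != a * b - c);  /* f2: operands of the sub reversed, so no MLS */
  return 0;
}
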
diff --git a/test/CodeGen/ARM/mul.ll b/test/CodeGen/ARM/mul.ll
index 3543b5d..466a802 100644
--- a/test/CodeGen/ARM/mul.ll
+++ b/test/CodeGen/ARM/mul.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llc -march=arm | grep mul | count 2
-; RUN: llvm-as < %s | llc -march=arm | grep lsl | count 2
+; RUN: llc < %s -march=arm | grep mul | count 2
+; RUN: llc < %s -march=arm | grep lsl | count 2
define i32 @f1(i32 %u) {
%tmp = mul i32 %u, %u
diff --git a/test/CodeGen/ARM/mul_const.ll b/test/CodeGen/ARM/mul_const.ll
new file mode 100644
index 0000000..93188cd
--- /dev/null
+++ b/test/CodeGen/ARM/mul_const.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -march=arm | FileCheck %s
+
+define i32 @t1(i32 %v) nounwind readnone {
+entry:
+; CHECK: t1:
+; CHECK: add r0, r0, r0, lsl #3
+ %0 = mul i32 %v, 9
+ ret i32 %0
+}
+
+define i32 @t2(i32 %v) nounwind readnone {
+entry:
+; CHECK: t2:
+; CHECK: rsb r0, r0, r0, lsl #3
+ %0 = mul i32 %v, 7
+ ret i32 %0
+}
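
The CHECK lines rely on simple shift identities for the strength-reduced multiplies: 9*v = (v << 3) + v gives the add with lsl #3, and 7*v = (v << 3) - v gives the rsb (reverse subtract: operand2 minus Rn) with lsl #3. A minimal C sketch of those identities, for illustration:

#include <assert.h>
#include <stdint.h>

/* The forms the test expects llc to emit for mul-by-9 and mul-by-7. */
static uint32_t mul9(uint32_t v) { return (v << 3) + v; } /* add r0, r0, r0, lsl #3 */
static uint32_t mul7(uint32_t v) { return (v << 3) - v; } /* rsb r0, r0, r0, lsl #3 */

int main(void) {
  for (uint32_t v = 0; v < 1000; ++v) {
    assert(mul9(v) == 9 * v);
    assert(mul7(v) == 7 * v);
  }
  return 0;
}
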
diff --git a/test/CodeGen/ARM/mulhi.ll b/test/CodeGen/ARM/mulhi.ll
index de75e96..148f291 100644
--- a/test/CodeGen/ARM/mulhi.ll
+++ b/test/CodeGen/ARM/mulhi.ll
@@ -1,7 +1,7 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+v6
-; RUN: llvm-as < %s | llc -march=arm -mattr=+v6 | \
+; RUN: llc < %s -march=arm -mattr=+v6
+; RUN: llc < %s -march=arm -mattr=+v6 | \
; RUN: grep smmul | count 1
-; RUN: llvm-as < %s | llc -march=arm | grep umull | count 1
+; RUN: llc < %s -march=arm | grep umull | count 1
define i32 @smulhi(i32 %x, i32 %y) {
%tmp = sext i32 %x to i64 ; <i64> [#uses=1]
diff --git a/test/CodeGen/ARM/mvn.ll b/test/CodeGen/ARM/mvn.ll
index a7ef907..571c21a 100644
--- a/test/CodeGen/ARM/mvn.ll
+++ b/test/CodeGen/ARM/mvn.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm | grep mvn | count 8
+; RUN: llc < %s -march=arm | grep mvn | count 8
define i32 @f1() {
entry:
diff --git a/test/CodeGen/ARM/neon_arith1.ll b/test/CodeGen/ARM/neon_arith1.ll
index 18b516f..5892737 100644
--- a/test/CodeGen/ARM/neon_arith1.ll
+++ b/test/CodeGen/ARM/neon_arith1.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep vadd
+; RUN: llc < %s -march=arm -mattr=+neon | grep vadd
define <8 x i8> @t_i8x8(<8 x i8> %a, <8 x i8> %b) nounwind {
entry:
diff --git a/test/CodeGen/ARM/neon_ld1.ll b/test/CodeGen/ARM/neon_ld1.ll
index 8901ba1..2796dec 100644
--- a/test/CodeGen/ARM/neon_ld1.ll
+++ b/test/CodeGen/ARM/neon_ld1.ll
@@ -1,6 +1,6 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep fldd | count 4
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep fstd
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep fmrrd
+; RUN: llc < %s -march=arm -mattr=+neon | grep fldd | count 4
+; RUN: llc < %s -march=arm -mattr=+neon | grep fstd
+; RUN: llc < %s -march=arm -mattr=+neon | grep fmrrd
define void @t1(<2 x i32>* %r, <4 x i16>* %a, <4 x i16>* %b) nounwind {
entry:
diff --git a/test/CodeGen/ARM/neon_ld2.ll b/test/CodeGen/ARM/neon_ld2.ll
index a26904a..547bab7 100644
--- a/test/CodeGen/ARM/neon_ld2.ll
+++ b/test/CodeGen/ARM/neon_ld2.ll
@@ -1,6 +1,6 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep vldmia | count 4
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep vstmia | count 1
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep fmrrd | count 2
+; RUN: llc < %s -march=arm -mattr=+neon | grep vldmia | count 4
+; RUN: llc < %s -march=arm -mattr=+neon | grep vstmia | count 1
+; RUN: llc < %s -march=arm -mattr=+neon | grep fmrrd | count 2
define void @t1(<4 x i32>* %r, <2 x i64>* %a, <2 x i64>* %b) nounwind {
entry:
diff --git a/test/CodeGen/ARM/pack.ll b/test/CodeGen/ARM/pack.ll
index 151beac..1e2e7aa 100644
--- a/test/CodeGen/ARM/pack.ll
+++ b/test/CodeGen/ARM/pack.ll
@@ -1,6 +1,6 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+v6 | \
+; RUN: llc < %s -march=arm -mattr=+v6 | \
; RUN: grep pkhbt | count 5
-; RUN: llvm-as < %s | llc -march=arm -mattr=+v6 | \
+; RUN: llc < %s -march=arm -mattr=+v6 | \
; RUN: grep pkhtb | count 4
define i32 @test1(i32 %X, i32 %Y) {
diff --git a/test/CodeGen/ARM/pr3502.ll b/test/CodeGen/ARM/pr3502.ll
index dee3fc4..606d969 100644
--- a/test/CodeGen/ARM/pr3502.ll
+++ b/test/CodeGen/ARM/pr3502.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-none-linux-gnueabi
+; RUN: llc < %s -mtriple=arm-none-linux-gnueabi
;pr3502
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
diff --git a/test/CodeGen/ARM/private.ll b/test/CodeGen/ARM/private.ll
index e5eeccb..03376a4 100644
--- a/test/CodeGen/ARM/private.ll
+++ b/test/CodeGen/ARM/private.ll
@@ -1,6 +1,6 @@
; Test to make sure that the 'private' is used correctly.
;
-; RUN: llvm-as < %s | llc -mtriple=arm-linux-gnueabi > %t
+; RUN: llc < %s -mtriple=arm-linux-gnueabi > %t
; RUN: grep .Lfoo: %t
; RUN: egrep bl.*\.Lfoo %t
; RUN: grep .Lbaz: %t
diff --git a/test/CodeGen/ARM/remat.ll b/test/CodeGen/ARM/remat.ll
index 454d36b..ba9699e 100644
--- a/test/CodeGen/ARM/remat.ll
+++ b/test/CodeGen/ARM/remat.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin -stats -info-output-file - | grep "Number of re-materialization" | grep 2
+; RUN: llc < %s -mtriple=arm-apple-darwin
+; RUN: llc < %s -mtriple=arm-apple-darwin -stats -info-output-file - | grep "Number of re-materialization" | grep 4
%struct.CONTENTBOX = type { i32, i32, i32, i32, i32 }
%struct.LOCBOX = type { i32, i32, i32, i32 }
diff --git a/test/CodeGen/ARM/ret0.ll b/test/CodeGen/ARM/ret0.ll
index 792b169..5c312eb 100644
--- a/test/CodeGen/ARM/ret0.ll
+++ b/test/CodeGen/ARM/ret0.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm
+; RUN: llc < %s -march=arm
define i32 @test() {
ret i32 0
diff --git a/test/CodeGen/ARM/ret_arg1.ll b/test/CodeGen/ARM/ret_arg1.ll
index 48a1fda..1ab947b 100644
--- a/test/CodeGen/ARM/ret_arg1.ll
+++ b/test/CodeGen/ARM/ret_arg1.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm
+; RUN: llc < %s -march=arm
define i32 @test(i32 %a1) {
ret i32 %a1
diff --git a/test/CodeGen/ARM/ret_arg2.ll b/test/CodeGen/ARM/ret_arg2.ll
index a74870f..84477d0 100644
--- a/test/CodeGen/ARM/ret_arg2.ll
+++ b/test/CodeGen/ARM/ret_arg2.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm
+; RUN: llc < %s -march=arm
define i32 @test(i32 %a1, i32 %a2) {
ret i32 %a2
diff --git a/test/CodeGen/ARM/ret_arg3.ll b/test/CodeGen/ARM/ret_arg3.ll
index 9210e7b..f7f9057 100644
--- a/test/CodeGen/ARM/ret_arg3.ll
+++ b/test/CodeGen/ARM/ret_arg3.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm
+; RUN: llc < %s -march=arm
define i32 @test(i32 %a1, i32 %a2, i32 %a3) {
ret i32 %a3
}
diff --git a/test/CodeGen/ARM/ret_arg4.ll b/test/CodeGen/ARM/ret_arg4.ll
index a9c66e9..f7b3e4a 100644
--- a/test/CodeGen/ARM/ret_arg4.ll
+++ b/test/CodeGen/ARM/ret_arg4.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm
+; RUN: llc < %s -march=arm
define i32 @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
ret i32 %a4
diff --git a/test/CodeGen/ARM/ret_arg5.ll b/test/CodeGen/ARM/ret_arg5.ll
index 620a017..c4f9fb5 100644
--- a/test/CodeGen/ARM/ret_arg5.ll
+++ b/test/CodeGen/ARM/ret_arg5.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm
+; RUN: llc < %s -march=arm
define i32 @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5) {
ret i32 %a5
diff --git a/test/CodeGen/ARM/ret_f32_arg2.ll b/test/CodeGen/ARM/ret_f32_arg2.ll
index 287d92b..2bafea6 100644
--- a/test/CodeGen/ARM/ret_f32_arg2.ll
+++ b/test/CodeGen/ARM/ret_f32_arg2.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+vfp2
+; RUN: llc < %s -march=arm -mattr=+vfp2
define float @test_f32(float %a1, float %a2) {
ret float %a2
diff --git a/test/CodeGen/ARM/ret_f32_arg5.ll b/test/CodeGen/ARM/ret_f32_arg5.ll
index 3418be9..c6ce60e 100644
--- a/test/CodeGen/ARM/ret_f32_arg5.ll
+++ b/test/CodeGen/ARM/ret_f32_arg5.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+vfp2
+; RUN: llc < %s -march=arm -mattr=+vfp2
define float @test_f32_arg5(float %a1, float %a2, float %a3, float %a4, float %a5) {
ret float %a5
diff --git a/test/CodeGen/ARM/ret_f64_arg2.ll b/test/CodeGen/ARM/ret_f64_arg2.ll
index 66848d5..386e85f 100644
--- a/test/CodeGen/ARM/ret_f64_arg2.ll
+++ b/test/CodeGen/ARM/ret_f64_arg2.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+vfp2
+; RUN: llc < %s -march=arm -mattr=+vfp2
define double @test_f64(double %a1, double %a2) {
ret double %a2
diff --git a/test/CodeGen/ARM/ret_f64_arg_reg_split.ll b/test/CodeGen/ARM/ret_f64_arg_reg_split.ll
index 626ee6f..bdb0a60 100644
--- a/test/CodeGen/ARM/ret_f64_arg_reg_split.ll
+++ b/test/CodeGen/ARM/ret_f64_arg_reg_split.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mcpu=arm8 -mattr=+vfp2
+; RUN: llc < %s -march=arm -mcpu=arm8 -mattr=+vfp2
define double @test_double_arg_reg_split(i32 %a1, double %a2) {
ret double %a2
diff --git a/test/CodeGen/ARM/ret_f64_arg_split.ll b/test/CodeGen/ARM/ret_f64_arg_split.ll
index b03b604..4f841a3 100644
--- a/test/CodeGen/ARM/ret_f64_arg_split.ll
+++ b/test/CodeGen/ARM/ret_f64_arg_split.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+vfp2
+; RUN: llc < %s -march=arm -mattr=+vfp2
define double @test_double_arg_split(i64 %a1, i32 %a2, double %a3) {
ret double %a3
diff --git a/test/CodeGen/ARM/ret_f64_arg_stack.ll b/test/CodeGen/ARM/ret_f64_arg_stack.ll
index ba3ec7f..2144317 100644
--- a/test/CodeGen/ARM/ret_f64_arg_stack.ll
+++ b/test/CodeGen/ARM/ret_f64_arg_stack.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+vfp2
+; RUN: llc < %s -march=arm -mattr=+vfp2
define double @test_double_arg_stack(i64 %a1, i32 %a2, i32 %a3, double %a4) {
ret double %a4
diff --git a/test/CodeGen/ARM/ret_i128_arg2.ll b/test/CodeGen/ARM/ret_i128_arg2.ll
index 0fe98e6..908c34f 100644
--- a/test/CodeGen/ARM/ret_i128_arg2.ll
+++ b/test/CodeGen/ARM/ret_i128_arg2.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+vfp2
+; RUN: llc < %s -march=arm -mattr=+vfp2
define i128 @test_i128(i128 %a1, i128 %a2, i128 %a3) {
ret i128 %a3
diff --git a/test/CodeGen/ARM/ret_i64_arg2.ll b/test/CodeGen/ARM/ret_i64_arg2.ll
index b015a96..b1a1024 100644
--- a/test/CodeGen/ARM/ret_i64_arg2.ll
+++ b/test/CodeGen/ARM/ret_i64_arg2.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+vfp2
+; RUN: llc < %s -march=arm -mattr=+vfp2
define i64 @test_i64(i64 %a1, i64 %a2) {
ret i64 %a2
diff --git a/test/CodeGen/ARM/ret_i64_arg3.ll b/test/CodeGen/ARM/ret_i64_arg3.ll
index 5dfecca..ffc1d2f 100644
--- a/test/CodeGen/ARM/ret_i64_arg3.ll
+++ b/test/CodeGen/ARM/ret_i64_arg3.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+vfp2
+; RUN: llc < %s -march=arm -mattr=+vfp2
define i64 @test_i64_arg3(i64 %a1, i64 %a2, i64 %a3) {
ret i64 %a3
diff --git a/test/CodeGen/ARM/ret_i64_arg_split.ll b/test/CodeGen/ARM/ret_i64_arg_split.ll
index 5bd5cb2..956bce5 100644
--- a/test/CodeGen/ARM/ret_i64_arg_split.ll
+++ b/test/CodeGen/ARM/ret_i64_arg_split.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+vfp2
+; RUN: llc < %s -march=arm -mattr=+vfp2
define i64 @test_i64_arg_split(i64 %a1, i32 %a2, i64 %a3) {
ret i64 %a3
diff --git a/test/CodeGen/ARM/ret_void.ll b/test/CodeGen/ARM/ret_void.ll
index 68db8c4..2b7ae05 100644
--- a/test/CodeGen/ARM/ret_void.ll
+++ b/test/CodeGen/ARM/ret_void.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm
+; RUN: llc < %s -march=arm
define void @test() {
ret void
diff --git a/test/CodeGen/ARM/rev.ll b/test/CodeGen/ARM/rev.ll
index 68f6264..1c12268 100644
--- a/test/CodeGen/ARM/rev.ll
+++ b/test/CodeGen/ARM/rev.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+v6 | grep rev16
-; RUN: llvm-as < %s | llc -march=arm -mattr=+v6 | grep revsh
+; RUN: llc < %s -march=arm -mattr=+v6 | grep rev16
+; RUN: llc < %s -march=arm -mattr=+v6 | grep revsh
define i32 @test1(i32 %X) {
%tmp1 = lshr i32 %X, 8 ; <i32> [#uses=3]
diff --git a/test/CodeGen/ARM/sbfx.ll b/test/CodeGen/ARM/sbfx.ll
new file mode 100644
index 0000000..923f52a
--- /dev/null
+++ b/test/CodeGen/ARM/sbfx.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s -march=arm -mattr=+v6t2 | FileCheck %s
+
+define i32 @f1(i32 %a) {
+entry:
+; CHECK: f1:
+; CHECK: sbfx r0, r0, #0, #20
+ %tmp = shl i32 %a, 12
+ %tmp2 = ashr i32 %tmp, 12
+ ret i32 %tmp2
+}
+
+define i32 @f2(i32 %a) {
+entry:
+; CHECK: f2:
+; CHECK: ubfx r0, r0, #0, #20
+ %tmp = shl i32 %a, 12
+ %tmp2 = lshr i32 %tmp, 12
+ ret i32 %tmp2
+}
+
+define i32 @f3(i32 %a) {
+entry:
+; CHECK: f3:
+; CHECK: sbfx r0, r0, #5, #3
+ %tmp = shl i32 %a, 24
+ %tmp2 = ashr i32 %tmp, 29
+ ret i32 %tmp2
+}
+
+define i32 @f4(i32 %a) {
+entry:
+; CHECK: f4:
+; CHECK: ubfx r0, r0, #5, #3
+ %tmp = shl i32 %a, 24
+ %tmp2 = lshr i32 %tmp, 29
+ ret i32 %tmp2
+}
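
The expected sbfx/ubfx operands follow from reading each shift pair as a bitfield extract: a shl by s followed by an ashr (or lshr) by r extracts width = 32 - r bits starting at lsb = r - s, sign- or zero-extended. A short C sketch of that mapping, for illustration:

#include <assert.h>

/* Map the shl/shr amounts in f1..f4 to the sbfx/ubfx lsb and width operands. */
static void check(unsigned shl_amt, unsigned shr_amt, unsigned lsb, unsigned width) {
  assert(shr_amt - shl_amt == lsb);
  assert(32 - shr_amt == width);
}

int main(void) {
  check(12, 12, 0, 20); /* f1, f2: sbfx/ubfx r0, r0, #0, #20 */
  check(24, 29, 5, 3);  /* f3, f4: sbfx/ubfx r0, r0, #5, #3 */
  return 0;
}
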
diff --git a/test/CodeGen/ARM/section.ll b/test/CodeGen/ARM/section.ll
index aa65845..7a566d4 100644
--- a/test/CodeGen/ARM/section.ll
+++ b/test/CodeGen/ARM/section.ll
@@ -1,6 +1,6 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-linux | \
+; RUN: llc < %s -mtriple=arm-linux | \
; RUN: grep {__DTOR_END__:}
-; RUN: llvm-as < %s | llc -mtriple=arm-linux | \
+; RUN: llc < %s -mtriple=arm-linux | \
; RUN: grep {\\.section.\\.dtors,"aw",.progbits}
@__DTOR_END__ = internal global [1 x i32] zeroinitializer, section ".dtors" ; <[1 x i32]*> [#uses=0]
diff --git a/test/CodeGen/ARM/select.ll b/test/CodeGen/ARM/select.ll
index 5148a5b..85c8b5b 100644
--- a/test/CodeGen/ARM/select.ll
+++ b/test/CodeGen/ARM/select.ll
@@ -1,13 +1,9 @@
-; RUN: llvm-as < %s | llc -march=arm | grep moveq | count 1
-; RUN: llvm-as < %s | llc -march=arm | grep movgt | count 1
-; RUN: llvm-as < %s | llc -march=arm | grep movlt | count 3
-; RUN: llvm-as < %s | llc -march=arm | grep movle | count 1
-; RUN: llvm-as < %s | llc -march=arm | grep movls | count 1
-; RUN: llvm-as < %s | llc -march=arm | grep movhi | count 1
-; RUN: llvm-as < %s | llc -march=arm -mattr=+vfp2 | \
-; RUN: grep fcpydmi | count 1
+; RUN: llc < %s -march=arm | FileCheck %s
+; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s --check-prefix=CHECK-VFP
define i32 @f1(i32 %a.s) {
+;CHECK: f1:
+;CHECK: moveq
entry:
%tmp = icmp eq i32 %a.s, 4
%tmp1.s = select i1 %tmp, i32 2, i32 3
@@ -15,6 +11,8 @@ entry:
}
define i32 @f2(i32 %a.s) {
+;CHECK: f2:
+;CHECK: movgt
entry:
%tmp = icmp sgt i32 %a.s, 4
%tmp1.s = select i1 %tmp, i32 2, i32 3
@@ -22,6 +20,8 @@ entry:
}
define i32 @f3(i32 %a.s, i32 %b.s) {
+;CHECK: f3:
+;CHECK: movlt
entry:
%tmp = icmp slt i32 %a.s, %b.s
%tmp1.s = select i1 %tmp, i32 2, i32 3
@@ -29,6 +29,8 @@ entry:
}
define i32 @f4(i32 %a.s, i32 %b.s) {
+;CHECK: f4:
+;CHECK: movle
entry:
%tmp = icmp sle i32 %a.s, %b.s
%tmp1.s = select i1 %tmp, i32 2, i32 3
@@ -36,6 +38,8 @@ entry:
}
define i32 @f5(i32 %a.u, i32 %b.u) {
+;CHECK: f5:
+;CHECK: movls
entry:
%tmp = icmp ule i32 %a.u, %b.u
%tmp1.s = select i1 %tmp, i32 2, i32 3
@@ -43,6 +47,8 @@ entry:
}
define i32 @f6(i32 %a.u, i32 %b.u) {
+;CHECK: f6:
+;CHECK: movhi
entry:
%tmp = icmp ugt i32 %a.u, %b.u
%tmp1.s = select i1 %tmp, i32 2, i32 3
@@ -50,6 +56,11 @@ entry:
}
define double @f7(double %a, double %b) {
+;CHECK: f7:
+;CHECK: movlt
+;CHECK: movlt
+;CHECK-VFP: f7:
+;CHECK-VFP: fcpydmi
%tmp = fcmp olt double %a, 1.234e+00
%tmp1 = select i1 %tmp, double -1.000e+00, double %b
ret double %tmp1
diff --git a/test/CodeGen/ARM/select_xform.ll b/test/CodeGen/ARM/select_xform.ll
index 6855e32..7fd91ce 100644
--- a/test/CodeGen/ARM/select_xform.ll
+++ b/test/CodeGen/ARM/select_xform.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm | grep mov | count 2
+; RUN: llc < %s -march=arm | grep mov | count 2
define i32 @t1(i32 %a, i32 %b, i32 %c) nounwind {
%tmp1 = icmp sgt i32 %c, 10
diff --git a/test/CodeGen/ARM/shifter_operand.ll b/test/CodeGen/ARM/shifter_operand.ll
index cae1c44..2bbe9fd 100644
--- a/test/CodeGen/ARM/shifter_operand.ll
+++ b/test/CodeGen/ARM/shifter_operand.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llc -march=arm | grep add | grep lsl
-; RUN: llvm-as < %s | llc -march=arm | grep bic | grep asr
+; RUN: llc < %s -march=arm | grep add | grep lsl
+; RUN: llc < %s -march=arm | grep bic | grep asr
define i32 @test1(i32 %X, i32 %Y, i8 %sh) {
diff --git a/test/CodeGen/ARM/smul.ll b/test/CodeGen/ARM/smul.ll
index 7a4e488..b7ab2e7 100644
--- a/test/CodeGen/ARM/smul.ll
+++ b/test/CodeGen/ARM/smul.ll
@@ -1,10 +1,10 @@
-; RUN: llvm-as < %s | llc -march=arm
-; RUN: llvm-as < %s | llc -march=arm -mattr=+v5TE
-; RUN: llvm-as < %s | llc -march=arm -mattr=+v5TE | \
+; RUN: llc < %s -march=arm
+; RUN: llc < %s -march=arm -mattr=+v5TE
+; RUN: llc < %s -march=arm -mattr=+v5TE | \
; RUN: grep smulbt | count 1
-; RUN: llvm-as < %s | llc -march=arm -mattr=+v5TE | \
+; RUN: llc < %s -march=arm -mattr=+v5TE | \
; RUN: grep smultt | count 1
-; RUN: llvm-as < %s | llc -march=arm -mattr=+v5TE | \
+; RUN: llc < %s -march=arm -mattr=+v5TE | \
; RUN: grep smlabt | count 1
@x = weak global i16 0 ; <i16*> [#uses=1]
diff --git a/test/CodeGen/ARM/spill-q.ll b/test/CodeGen/ARM/spill-q.ll
new file mode 100644
index 0000000..f4b27a7
--- /dev/null
+++ b/test/CodeGen/ARM/spill-q.ll
@@ -0,0 +1,57 @@
+; RUN: llc < %s -mtriple=armv7-elf -mattr=+neon | FileCheck %s
+; PR4789
+
+%bar = type { float, float, float }
+%baz = type { i32, [16 x %bar], [16 x float], [16 x i32], i8 }
+%foo = type { <4 x float> }
+%quux = type { i32 (...)**, %baz*, i32 }
+%quuz = type { %quux, i32, %bar, [128 x i8], [16 x %foo], %foo, %foo, %foo }
+
+declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*) nounwind readonly
+
+define arm_apcscc void @aaa(%quuz* %this, i8* %block) {
+; CHECK: aaa:
+; CHECK: vstmia sp
+; CHECK: vldmia sp
+entry:
+ %0 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef) nounwind ; <<4 x float>> [#uses=1]
+ store float 6.300000e+01, float* undef, align 4
+ %1 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef) nounwind ; <<4 x float>> [#uses=1]
+ store float 0.000000e+00, float* undef, align 4
+ %2 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef) nounwind ; <<4 x float>> [#uses=1]
+ %val173 = load <4 x float>* undef ; <<4 x float>> [#uses=1]
+ br label %bb4
+
+bb4: ; preds = %bb193, %entry
+ %besterror.0.2264 = phi <4 x float> [ undef, %entry ], [ %besterror.0.0, %bb193 ] ; <<4 x float>> [#uses=2]
+ %part0.0.0261 = phi <4 x float> [ zeroinitializer, %entry ], [ %23, %bb193 ] ; <<4 x float>> [#uses=2]
+ %3 = fmul <4 x float> zeroinitializer, %0 ; <<4 x float>> [#uses=2]
+ %4 = fadd <4 x float> %3, %part0.0.0261 ; <<4 x float>> [#uses=1]
+ %5 = shufflevector <4 x float> %3, <4 x float> undef, <2 x i32> <i32 2, i32 3> ; <<2 x float>> [#uses=1]
+ %6 = shufflevector <2 x float> %5, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x float>> [#uses=1]
+ %7 = fmul <4 x float> %1, undef ; <<4 x float>> [#uses=1]
+ %8 = fadd <4 x float> %7, <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> ; <<4 x float>> [#uses=1]
+ %9 = fptosi <4 x float> %8 to <4 x i32> ; <<4 x i32>> [#uses=1]
+ %10 = sitofp <4 x i32> %9 to <4 x float> ; <<4 x float>> [#uses=1]
+ %11 = fmul <4 x float> %10, %2 ; <<4 x float>> [#uses=1]
+ %12 = fmul <4 x float> undef, %6 ; <<4 x float>> [#uses=1]
+ %13 = fmul <4 x float> %11, %4 ; <<4 x float>> [#uses=1]
+ %14 = fsub <4 x float> %12, %13 ; <<4 x float>> [#uses=1]
+ %15 = fsub <4 x float> %14, undef ; <<4 x float>> [#uses=1]
+ %16 = fmul <4 x float> %15, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00> ; <<4 x float>> [#uses=1]
+ %17 = fadd <4 x float> %16, undef ; <<4 x float>> [#uses=1]
+ %18 = fmul <4 x float> %17, %val173 ; <<4 x float>> [#uses=1]
+ %19 = shufflevector <4 x float> %18, <4 x float> undef, <2 x i32> <i32 2, i32 3> ; <<2 x float>> [#uses=1]
+ %20 = shufflevector <2 x float> %19, <2 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
+ %21 = fadd <4 x float> zeroinitializer, %20 ; <<4 x float>> [#uses=2]
+ %22 = fcmp ogt <4 x float> %besterror.0.2264, %21 ; <<4 x i1>> [#uses=0]
+ br i1 undef, label %bb193, label %bb186
+
+bb186: ; preds = %bb4
+ br label %bb193
+
+bb193: ; preds = %bb186, %bb4
+ %besterror.0.0 = phi <4 x float> [ %21, %bb186 ], [ %besterror.0.2264, %bb4 ] ; <<4 x float>> [#uses=1]
+ %23 = fadd <4 x float> %part0.0.0261, zeroinitializer ; <<4 x float>> [#uses=1]
+ br label %bb4
+}
diff --git a/test/CodeGen/ARM/stack-frame.ll b/test/CodeGen/ARM/stack-frame.ll
index c3dd65a..1dd57dd 100644
--- a/test/CodeGen/ARM/stack-frame.ll
+++ b/test/CodeGen/ARM/stack-frame.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llc -march=arm
-; RUN: llvm-as < %s | llc -march=arm | grep add | count 1
+; RUN: llc < %s -march=arm
+; RUN: llc < %s -march=arm | grep add | count 1
define void @f1() {
%c = alloca i8, align 1
diff --git a/test/CodeGen/ARM/stm.ll b/test/CodeGen/ARM/stm.ll
index ed5e4c5..22a7ecb 100644
--- a/test/CodeGen/ARM/stm.ll
+++ b/test/CodeGen/ARM/stm.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin -mattr=+v6,+vfp2 | grep stm | count 2
+; RUN: llc < %s -mtriple=arm-apple-darwin -mattr=+v6,+vfp2 | grep stm | count 2
@"\01LC" = internal constant [32 x i8] c"Boolean Not: %d %d %d %d %d %d\0A\00", section "__TEXT,__cstring,cstring_literals" ; <[32 x i8]*> [#uses=1]
@"\01LC1" = internal constant [26 x i8] c"Bitwise Not: %d %d %d %d\0A\00", section "__TEXT,__cstring,cstring_literals" ; <[26 x i8]*> [#uses=1]
diff --git a/test/CodeGen/ARM/str_post.ll b/test/CodeGen/ARM/str_post.ll
index ba81380..801b9ce 100644
--- a/test/CodeGen/ARM/str_post.ll
+++ b/test/CodeGen/ARM/str_post.ll
@@ -1,6 +1,6 @@
-; RUN: llvm-as < %s | llc -march=arm | \
+; RUN: llc < %s -march=arm | \
; RUN: grep {strh .*\\\[.*\], #-4} | count 1
-; RUN: llvm-as < %s | llc -march=arm | \
+; RUN: llc < %s -march=arm | \
; RUN: grep {str .*\\\[.*\],} | count 1
define i16 @test1(i32* %X, i16* %A) {
diff --git a/test/CodeGen/ARM/str_pre-2.ll b/test/CodeGen/ARM/str_pre-2.ll
index e9f1945..f8d3df2 100644
--- a/test/CodeGen/ARM/str_pre-2.ll
+++ b/test/CodeGen/ARM/str_pre-2.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-linux-gnu | grep {str.*\\!}
-; RUN: llvm-as < %s | llc -mtriple=arm-linux-gnu | grep {ldr.*\\\[.*\], #+4}
+; RUN: llc < %s -mtriple=arm-linux-gnu | grep {str.*\\!}
+; RUN: llc < %s -mtriple=arm-linux-gnu | grep {ldr.*\\\[.*\], #+4}
@b = external global i64*
diff --git a/test/CodeGen/ARM/str_pre.ll b/test/CodeGen/ARM/str_pre.ll
index c02663f..e56e3f2 100644
--- a/test/CodeGen/ARM/str_pre.ll
+++ b/test/CodeGen/ARM/str_pre.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm | \
+; RUN: llc < %s -march=arm | \
; RUN: grep {str.*\\!} | count 2
define void @test1(i32* %X, i32* %A, i32** %dest) {
diff --git a/test/CodeGen/ARM/str_trunc.ll b/test/CodeGen/ARM/str_trunc.ll
index 77c66ec..2f1166b6 100644
--- a/test/CodeGen/ARM/str_trunc.ll
+++ b/test/CodeGen/ARM/str_trunc.ll
@@ -1,6 +1,6 @@
-; RUN: llvm-as < %s | llc -march=arm | \
+; RUN: llc < %s -march=arm | \
; RUN: grep strb | count 1
-; RUN: llvm-as < %s | llc -march=arm | \
+; RUN: llc < %s -march=arm | \
; RUN: grep strh | count 1
define void @test1(i32 %v, i16* %ptr) {
diff --git a/test/CodeGen/ARM/sxt_rot.ll b/test/CodeGen/ARM/sxt_rot.ll
index e9f302c..4752f17 100644
--- a/test/CodeGen/ARM/sxt_rot.ll
+++ b/test/CodeGen/ARM/sxt_rot.ll
@@ -1,8 +1,8 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+v6 | \
+; RUN: llc < %s -march=arm -mattr=+v6 | \
; RUN: grep sxtb | count 2
-; RUN: llvm-as < %s | llc -march=arm -mattr=+v6 | \
+; RUN: llc < %s -march=arm -mattr=+v6 | \
; RUN: grep sxtb | grep ror | count 1
-; RUN: llvm-as < %s | llc -march=arm -mattr=+v6 | \
+; RUN: llc < %s -march=arm -mattr=+v6 | \
; RUN: grep sxtab | count 1
define i32 @test0(i8 %A) {
diff --git a/test/CodeGen/ARM/t2-imm.ll b/test/CodeGen/ARM/t2-imm.ll
new file mode 100644
index 0000000..848a4df
--- /dev/null
+++ b/test/CodeGen/ARM/t2-imm.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -march=arm -mattr=+thumb2 | FileCheck %s
+
+define i32 @f6(i32 %a) {
+; CHECK: f6:
+; CHECK: movw r0, #:lower16:65537123
+; CHECK: movt r0, #:upper16:65537123
+ %tmp = add i32 0, 65537123
+ ret i32 %tmp
+}
diff --git a/test/CodeGen/ARM/thread_pointer.ll b/test/CodeGen/ARM/thread_pointer.ll
index 6476b48..3143387 100644
--- a/test/CodeGen/ARM/thread_pointer.ll
+++ b/test/CodeGen/ARM/thread_pointer.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mtriple=arm-linux-gnueabi | \
+; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi | \
; RUN: grep {__aeabi_read_tp}
define i8* @test() {
diff --git a/test/CodeGen/ARM/tls1.ll b/test/CodeGen/ARM/tls1.ll
index 6866a42..1087094 100644
--- a/test/CodeGen/ARM/tls1.ll
+++ b/test/CodeGen/ARM/tls1.ll
@@ -1,8 +1,8 @@
-; RUN: llvm-as < %s | llc -march=arm -mtriple=arm-linux-gnueabi | \
+; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi | \
; RUN: grep {i(tpoff)}
-; RUN: llvm-as < %s | llc -march=arm -mtriple=arm-linux-gnueabi | \
+; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi | \
; RUN: grep {__aeabi_read_tp}
-; RUN: llvm-as < %s | llc -march=arm -mtriple=arm-linux-gnueabi \
+; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi \
; RUN: -relocation-model=pic | grep {__tls_get_addr}
diff --git a/test/CodeGen/ARM/tls2.ll b/test/CodeGen/ARM/tls2.ll
index 90e3bcf..32847208 100644
--- a/test/CodeGen/ARM/tls2.ll
+++ b/test/CodeGen/ARM/tls2.ll
@@ -1,8 +1,8 @@
-; RUN: llvm-as < %s | llc -march=arm -mtriple=arm-linux-gnueabi | \
+; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi | \
; RUN: grep {i(gottpoff)}
-; RUN: llvm-as < %s | llc -march=arm -mtriple=arm-linux-gnueabi | \
+; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi | \
; RUN: grep {ldr r., \[pc, r.\]}
-; RUN: llvm-as < %s | llc -march=arm -mtriple=arm-linux-gnueabi \
+; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi \
; RUN: -relocation-model=pic | grep {__tls_get_addr}
@i = external thread_local global i32 ; <i32*> [#uses=2]
diff --git a/test/CodeGen/ARM/tls3.ll b/test/CodeGen/ARM/tls3.ll
index df2913b..df7a4ca 100644
--- a/test/CodeGen/ARM/tls3.ll
+++ b/test/CodeGen/ARM/tls3.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mtriple=arm-linux-gnueabi | \
+; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi | \
; RUN: grep {tbss}
%struct.anon = type { i32, i32 }
diff --git a/test/CodeGen/ARM/trunc_ldr.ll b/test/CodeGen/ARM/trunc_ldr.ll
index 6111ec9..3033c2b 100644
--- a/test/CodeGen/ARM/trunc_ldr.ll
+++ b/test/CodeGen/ARM/trunc_ldr.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llc -march=arm | grep ldrb.*7 | count 1
-; RUN: llvm-as < %s | llc -march=arm | grep ldrsb.*7 | count 1
+; RUN: llc < %s -march=arm | grep ldrb.*7 | count 1
+; RUN: llc < %s -march=arm | grep ldrsb.*7 | count 1
%struct.A = type { i8, i8, i8, i8, i16, i8, i8, %struct.B** }
%struct.B = type { float, float, i32, i32, i32, [0 x i8] }
diff --git a/test/CodeGen/ARM/truncstore-dag-combine.ll b/test/CodeGen/ARM/truncstore-dag-combine.ll
index 0e85fb6..2da08b6 100644
--- a/test/CodeGen/ARM/truncstore-dag-combine.ll
+++ b/test/CodeGen/ARM/truncstore-dag-combine.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llc -march=arm | not grep orr
-; RUN: llvm-as < %s | llc -march=arm | not grep mov
+; RUN: llc < %s -march=arm | not grep orr
+; RUN: llc < %s -march=arm | not grep mov
define void @bar(i8* %P, i16* %Q) {
entry:
diff --git a/test/CodeGen/ARM/tst_teq.ll b/test/CodeGen/ARM/tst_teq.ll
index bdeee3f..c83111e 100644
--- a/test/CodeGen/ARM/tst_teq.ll
+++ b/test/CodeGen/ARM/tst_teq.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llc -march=arm | grep tst
-; RUN: llvm-as < %s | llc -march=arm | grep teq
+; RUN: llc < %s -march=arm | grep tst
+; RUN: llc < %s -march=arm | grep teq
define i32 @f(i32 %a) {
entry:
diff --git a/test/CodeGen/ARM/uint64tof64.ll b/test/CodeGen/ARM/uint64tof64.ll
index 055c3c3..32eb225 100644
--- a/test/CodeGen/ARM/uint64tof64.ll
+++ b/test/CodeGen/ARM/uint64tof64.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin -mattr=+vfp2
+; RUN: llc < %s -mtriple=arm-apple-darwin -mattr=+vfp2
%struct.FILE = type { i8*, i32, i32, i16, i16, %struct.__sbuf, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.__sbuf, %struct.__sFILEX*, i32, [3 x i8], [1 x i8], %struct.__sbuf, i32, i64 }
%struct.__sFILEX = type opaque
diff --git a/test/CodeGen/ARM/unaligned_load_store.ll b/test/CodeGen/ARM/unaligned_load_store.ll
index dad1897..fcaa2b3 100644
--- a/test/CodeGen/ARM/unaligned_load_store.ll
+++ b/test/CodeGen/ARM/unaligned_load_store.ll
@@ -1,16 +1,31 @@
-; RUN: llvm-as < %s | \
-; RUN: llc -march=arm -o %t -f
-; RUN: grep ldrb %t | count 4
-; RUN: grep strb %t | count 4
+; RUN: llc < %s -march=arm | FileCheck %s -check-prefix=GENERIC
+; RUN: llc < %s -mtriple=armv6-apple-darwin | FileCheck %s -check-prefix=DARWIN_V6
+; RUN: llc < %s -march=arm -mattr=+v7a | FileCheck %s -check-prefix=V7
+; rdar://7113725
- %struct.p = type <{ i8, i32 }>
-@t = global %struct.p <{ i8 1, i32 10 }> ; <%struct.p*> [#uses=1]
-@u = weak global %struct.p zeroinitializer ; <%struct.p*> [#uses=1]
-
-define i32 @main() {
+define arm_apcscc void @t(i8* nocapture %a, i8* nocapture %b) nounwind {
entry:
- %tmp3 = load i32* getelementptr (%struct.p* @t, i32 0, i32 1), align 1 ; <i32> [#uses=2]
- store i32 %tmp3, i32* getelementptr (%struct.p* @u, i32 0, i32 1), align 1
- ret i32 %tmp3
+; GENERIC: t:
+; GENERIC: ldrb r2
+; GENERIC: ldrb r3
+; GENERIC: ldrb r12
+; GENERIC: ldrb r1
+; GENERIC: strb r1
+; GENERIC: strb r12
+; GENERIC: strb r3
+; GENERIC: strb r2
+
+; DARWIN_V6: t:
+; DARWIN_V6: ldr r1
+; DARWIN_V6: str r1
+
+; V7: t:
+; V7: ldr r1
+; V7: str r1
+ %__src1.i = bitcast i8* %b to i32* ; <i32*> [#uses=1]
+ %__dest2.i = bitcast i8* %a to i32* ; <i32*> [#uses=1]
+ %tmp.i = load i32* %__src1.i, align 1 ; <i32> [#uses=1]
+ store i32 %tmp.i, i32* %__dest2.i, align 1
+ ret void
}
diff --git a/test/CodeGen/ARM/unord.ll b/test/CodeGen/ARM/unord.ll
index 149afc4..bd28034 100644
--- a/test/CodeGen/ARM/unord.ll
+++ b/test/CodeGen/ARM/unord.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llc -march=arm | grep movne | count 1
-; RUN: llvm-as < %s | llc -march=arm | grep moveq | count 1
+; RUN: llc < %s -march=arm | grep movne | count 1
+; RUN: llc < %s -march=arm | grep moveq | count 1
define i32 @f1(float %X, float %Y) {
%tmp = fcmp uno float %X, %Y
diff --git a/test/CodeGen/ARM/uxt_rot.ll b/test/CodeGen/ARM/uxt_rot.ll
index 09c74eb..6307795 100644
--- a/test/CodeGen/ARM/uxt_rot.ll
+++ b/test/CodeGen/ARM/uxt_rot.ll
@@ -1,6 +1,6 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+v6 | grep uxtb | count 1
-; RUN: llvm-as < %s | llc -march=arm -mattr=+v6 | grep uxtab | count 1
-; RUN: llvm-as < %s | llc -march=arm -mattr=+v6 | grep uxth | count 1
+; RUN: llc < %s -march=arm -mattr=+v6 | grep uxtb | count 1
+; RUN: llc < %s -march=arm -mattr=+v6 | grep uxtab | count 1
+; RUN: llc < %s -march=arm -mattr=+v6 | grep uxth | count 1
define i8 @test1(i32 %A.u) zeroext {
%B.u = trunc i32 %A.u to i8
diff --git a/test/CodeGen/ARM/uxtb.ll b/test/CodeGen/ARM/uxtb.ll
index 73e918b..9d6e4bd 100644
--- a/test/CodeGen/ARM/uxtb.ll
+++ b/test/CodeGen/ARM/uxtb.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -mtriple=armv6-apple-darwin | \
+; RUN: llc < %s -mtriple=armv6-apple-darwin | \
; RUN: grep uxt | count 10
define i32 @test1(i32 %x) {
diff --git a/test/CodeGen/ARM/vaba.ll b/test/CodeGen/ARM/vaba.ll
index 98ee1e1..e2dca46 100644
--- a/test/CodeGen/ARM/vaba.ll
+++ b/test/CodeGen/ARM/vaba.ll
@@ -1,12 +1,8 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep {vaba\\.s8} %t | count 2
-; RUN: grep {vaba\\.s16} %t | count 2
-; RUN: grep {vaba\\.s32} %t | count 2
-; RUN: grep {vaba\\.u8} %t | count 2
-; RUN: grep {vaba\\.u16} %t | count 2
-; RUN: grep {vaba\\.u32} %t | count 2
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vabas8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+;CHECK: vabas8:
+;CHECK: vaba.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
@@ -15,6 +11,8 @@ define <8 x i8> @vabas8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
}
define <4 x i16> @vabas16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+;CHECK: vabas16:
+;CHECK: vaba.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
@@ -23,6 +21,8 @@ define <4 x i16> @vabas16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind
}
define <2 x i32> @vabas32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+;CHECK: vabas32:
+;CHECK: vaba.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
@@ -31,6 +31,8 @@ define <2 x i32> @vabas32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind
}
define <8 x i8> @vabau8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+;CHECK: vabau8:
+;CHECK: vaba.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
@@ -39,6 +41,8 @@ define <8 x i8> @vabau8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
}
define <4 x i16> @vabau16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+;CHECK: vabau16:
+;CHECK: vaba.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
@@ -47,6 +51,8 @@ define <4 x i16> @vabau16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind
}
define <2 x i32> @vabau32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+;CHECK: vabau32:
+;CHECK: vaba.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
@@ -55,6 +61,8 @@ define <2 x i32> @vabau32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind
}
define <16 x i8> @vabaQs8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
+;CHECK: vabaQs8:
+;CHECK: vaba.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = load <16 x i8>* %C
@@ -63,6 +71,8 @@ define <16 x i8> @vabaQs8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind
}
define <8 x i16> @vabaQs16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
+;CHECK: vabaQs16:
+;CHECK: vaba.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = load <8 x i16>* %C
@@ -71,6 +81,8 @@ define <8 x i16> @vabaQs16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind
}
define <4 x i32> @vabaQs32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
+;CHECK: vabaQs32:
+;CHECK: vaba.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = load <4 x i32>* %C
@@ -79,6 +91,8 @@ define <4 x i32> @vabaQs32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind
}
define <16 x i8> @vabaQu8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
+;CHECK: vabaQu8:
+;CHECK: vaba.u8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = load <16 x i8>* %C
@@ -87,6 +101,8 @@ define <16 x i8> @vabaQu8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind
}
define <8 x i16> @vabaQu16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
+;CHECK: vabaQu16:
+;CHECK: vaba.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = load <8 x i16>* %C
@@ -95,6 +111,8 @@ define <8 x i16> @vabaQu16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind
}
define <4 x i32> @vabaQu32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
+;CHECK: vabaQu32:
+;CHECK: vaba.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = load <4 x i32>* %C
@@ -117,3 +135,71 @@ declare <4 x i32> @llvm.arm.neon.vabas.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) no
declare <16 x i8> @llvm.arm.neon.vabau.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vabau.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabau.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i16> @vabals8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+;CHECK: vabals8:
+;CHECK: vabal.s8
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = load <8 x i8>* %C
+ %tmp4 = call <8 x i16> @llvm.arm.neon.vabals.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @vabals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+;CHECK: vabals16:
+;CHECK: vabal.s16
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i16>* %C
+ %tmp4 = call <4 x i32> @llvm.arm.neon.vabals.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @vabals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+;CHECK: vabals32:
+;CHECK: vabal.s32
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i32>* %C
+ %tmp4 = call <2 x i64> @llvm.arm.neon.vabals.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i16> @vabalu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+;CHECK: vabalu8:
+;CHECK: vabal.u8
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = load <8 x i8>* %C
+ %tmp4 = call <8 x i16> @llvm.arm.neon.vabalu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @vabalu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+;CHECK: vabalu16:
+;CHECK: vabal.u16
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i16>* %C
+ %tmp4 = call <4 x i32> @llvm.arm.neon.vabalu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @vabalu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+;CHECK: vabalu32:
+;CHECK: vabal.u32
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i32>* %C
+ %tmp4 = call <2 x i64> @llvm.arm.neon.vabalu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
+ ret <2 x i64> %tmp4
+}
+
+declare <8 x i16> @llvm.arm.neon.vabals.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vabals.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vabals.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vabalu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vabalu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vabalu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vabd.ll b/test/CodeGen/ARM/vabd.ll
index 0fe5ddb..2b45393 100644
--- a/test/CodeGen/ARM/vabd.ll
+++ b/test/CodeGen/ARM/vabd.ll
@@ -1,13 +1,8 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep {vabd\\.s8} %t | count 2
-; RUN: grep {vabd\\.s16} %t | count 2
-; RUN: grep {vabd\\.s32} %t | count 2
-; RUN: grep {vabd\\.u8} %t | count 2
-; RUN: grep {vabd\\.u16} %t | count 2
-; RUN: grep {vabd\\.u32} %t | count 2
-; RUN: grep {vabd\\.f32} %t | count 2
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vabds8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vabds8:
+;CHECK: vabd.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
@@ -15,6 +10,8 @@ define <8 x i8> @vabds8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
}
define <4 x i16> @vabds16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vabds16:
+;CHECK: vabd.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
@@ -22,6 +19,8 @@ define <4 x i16> @vabds16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
}
define <2 x i32> @vabds32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vabds32:
+;CHECK: vabd.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
@@ -29,6 +28,8 @@ define <2 x i32> @vabds32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
}
define <8 x i8> @vabdu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vabdu8:
+;CHECK: vabd.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
@@ -36,6 +37,8 @@ define <8 x i8> @vabdu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
}
define <4 x i16> @vabdu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vabdu16:
+;CHECK: vabd.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
@@ -43,6 +46,8 @@ define <4 x i16> @vabdu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
}
define <2 x i32> @vabdu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vabdu32:
+;CHECK: vabd.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
@@ -50,13 +55,17 @@ define <2 x i32> @vabdu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
}
define <2 x float> @vabdf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK: vabdf32:
+;CHECK: vabd.f32
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
- %tmp3 = call <2 x float> @llvm.arm.neon.vabdf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ %tmp3 = call <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
define <16 x i8> @vabdQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vabdQs8:
+;CHECK: vabd.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
@@ -64,6 +73,8 @@ define <16 x i8> @vabdQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
}
define <8 x i16> @vabdQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vabdQs16:
+;CHECK: vabd.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
@@ -71,6 +82,8 @@ define <8 x i16> @vabdQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
}
define <4 x i32> @vabdQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vabdQs32:
+;CHECK: vabd.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
@@ -78,6 +91,8 @@ define <4 x i32> @vabdQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
}
define <16 x i8> @vabdQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vabdQu8:
+;CHECK: vabd.u8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
@@ -85,6 +100,8 @@ define <16 x i8> @vabdQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
}
define <8 x i16> @vabdQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vabdQu16:
+;CHECK: vabd.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
@@ -92,6 +109,8 @@ define <8 x i16> @vabdQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
}
define <4 x i32> @vabdQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vabdQu32:
+;CHECK: vabd.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
@@ -99,9 +118,11 @@ define <4 x i32> @vabdQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
}
define <4 x float> @vabdQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK: vabdQf32:
+;CHECK: vabd.f32
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
- %tmp3 = call <4 x float> @llvm.arm.neon.vabdf.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ %tmp3 = call <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x float> %tmp3
}
@@ -113,7 +134,7 @@ declare <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8>, <8 x i8>) nounwind readnon
declare <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <2 x float> @llvm.arm.neon.vabdf.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
@@ -123,4 +144,66 @@ declare <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8>, <16 x i8>) nounwind read
declare <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <4 x float> @llvm.arm.neon.vabdf.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float>, <4 x float>) nounwind readnone
+
+define <8 x i16> @vabdls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vabdls8:
+;CHECK: vabdl.s8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vabdls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vabdls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vabdls16:
+;CHECK: vabdl.s16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vabdls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vabdls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vabdls32:
+;CHECK: vabdl.s32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vabdls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i16> @vabdlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vabdlu8:
+;CHECK: vabdl.u8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vabdlu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vabdlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vabdlu16:
+;CHECK: vabdl.u16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vabdlu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vabdlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vabdlu32:
+;CHECK: vabdl.u32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vabdlu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i16> @llvm.arm.neon.vabdls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vabdls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vabdls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vabdlu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vabdlu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vabdlu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vabs.ll b/test/CodeGen/ARM/vabs.ll
index 629baa7..18ba61f 100644
--- a/test/CodeGen/ARM/vabs.ll
+++ b/test/CodeGen/ARM/vabs.ll
@@ -1,64 +1,131 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep {vabs\\.s8} %t | count 2
-; RUN: grep {vabs\\.s16} %t | count 2
-; RUN: grep {vabs\\.s32} %t | count 2
-; RUN: grep {vabs\\.f32} %t | count 2
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vabss8(<8 x i8>* %A) nounwind {
+;CHECK: vabss8:
+;CHECK: vabs.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vabss16(<4 x i16>* %A) nounwind {
+;CHECK: vabss16:
+;CHECK: vabs.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vabss32(<2 x i32>* %A) nounwind {
+;CHECK: vabss32:
+;CHECK: vabs.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp2
}
define <2 x float> @vabsf32(<2 x float>* %A) nounwind {
+;CHECK: vabsf32:
+;CHECK: vabs.f32
%tmp1 = load <2 x float>* %A
- %tmp2 = call <2 x float> @llvm.arm.neon.vabsf.v2f32(<2 x float> %tmp1)
+ %tmp2 = call <2 x float> @llvm.arm.neon.vabs.v2f32(<2 x float> %tmp1)
ret <2 x float> %tmp2
}
define <16 x i8> @vabsQs8(<16 x i8>* %A) nounwind {
+;CHECK: vabsQs8:
+;CHECK: vabs.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8> %tmp1)
ret <16 x i8> %tmp2
}
define <8 x i16> @vabsQs16(<8 x i16>* %A) nounwind {
+;CHECK: vabsQs16:
+;CHECK: vabs.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vabsQs32(<4 x i32>* %A) nounwind {
+;CHECK: vabsQs32:
+;CHECK: vabs.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp2
}
define <4 x float> @vabsQf32(<4 x float>* %A) nounwind {
+;CHECK: vabsQf32:
+;CHECK: vabs.f32
%tmp1 = load <4 x float>* %A
- %tmp2 = call <4 x float> @llvm.arm.neon.vabsf.v4f32(<4 x float> %tmp1)
+ %tmp2 = call <4 x float> @llvm.arm.neon.vabs.v4f32(<4 x float> %tmp1)
ret <4 x float> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32>) nounwind readnone
-declare <2 x float> @llvm.arm.neon.vabsf.v2f32(<2 x float>) nounwind readnone
+declare <2 x float> @llvm.arm.neon.vabs.v2f32(<2 x float>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32>) nounwind readnone
-declare <4 x float> @llvm.arm.neon.vabsf.v4f32(<4 x float>) nounwind readnone
+declare <4 x float> @llvm.arm.neon.vabs.v4f32(<4 x float>) nounwind readnone
+define <8 x i8> @vqabss8(<8 x i8>* %A) nounwind {
+;CHECK: vqabss8:
+;CHECK: vqabs.s8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8> %tmp1)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vqabss16(<4 x i16>* %A) nounwind {
+;CHECK: vqabss16:
+;CHECK: vqabs.s16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16> %tmp1)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vqabss32(<2 x i32>* %A) nounwind {
+;CHECK: vqabss32:
+;CHECK: vqabs.s32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32> %tmp1)
+ ret <2 x i32> %tmp2
+}
+
+define <16 x i8> @vqabsQs8(<16 x i8>* %A) nounwind {
+;CHECK: vqabsQs8:
+;CHECK: vqabs.s8
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = call <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8> %tmp1)
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @vqabsQs16(<8 x i16>* %A) nounwind {
+;CHECK: vqabsQs16:
+;CHECK: vqabs.s16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16> %tmp1)
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vqabsQs32(<4 x i32>* %A) nounwind {
+;CHECK: vqabsQs32:
+;CHECK: vqabs.s32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32> %tmp1)
+ ret <4 x i32> %tmp2
+}
+
+declare <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vadd.ll b/test/CodeGen/ARM/vadd.ll
index b2b0e23..9fa5307 100644
--- a/test/CodeGen/ARM/vadd.ll
+++ b/test/CodeGen/ARM/vadd.ll
@@ -1,11 +1,8 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep {vadd\\.i8} %t | count 2
-; RUN: grep {vadd\\.i16} %t | count 2
-; RUN: grep {vadd\\.i32} %t | count 2
-; RUN: grep {vadd\\.i64} %t | count 2
-; RUN: grep {vadd\\.f32} %t | count 2
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vaddi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vaddi8:
+;CHECK: vadd.i8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = add <8 x i8> %tmp1, %tmp2
@@ -13,6 +10,8 @@ define <8 x i8> @vaddi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
}
define <4 x i16> @vaddi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vaddi16:
+;CHECK: vadd.i16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = add <4 x i16> %tmp1, %tmp2
@@ -20,6 +19,8 @@ define <4 x i16> @vaddi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
}
define <2 x i32> @vaddi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vaddi32:
+;CHECK: vadd.i32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = add <2 x i32> %tmp1, %tmp2
@@ -27,6 +28,8 @@ define <2 x i32> @vaddi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
}
define <1 x i64> @vaddi64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK: vaddi64:
+;CHECK: vadd.i64
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = add <1 x i64> %tmp1, %tmp2
@@ -34,6 +37,8 @@ define <1 x i64> @vaddi64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
}
define <2 x float> @vaddf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK: vaddf32:
+;CHECK: vadd.f32
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = add <2 x float> %tmp1, %tmp2
@@ -41,6 +46,8 @@ define <2 x float> @vaddf32(<2 x float>* %A, <2 x float>* %B) nounwind {
}
define <16 x i8> @vaddQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vaddQi8:
+;CHECK: vadd.i8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = add <16 x i8> %tmp1, %tmp2
@@ -48,6 +55,8 @@ define <16 x i8> @vaddQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
}
define <8 x i16> @vaddQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vaddQi16:
+;CHECK: vadd.i16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = add <8 x i16> %tmp1, %tmp2
@@ -55,6 +64,8 @@ define <8 x i16> @vaddQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
}
define <4 x i32> @vaddQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vaddQi32:
+;CHECK: vadd.i32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = add <4 x i32> %tmp1, %tmp2
@@ -62,6 +73,8 @@ define <4 x i32> @vaddQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
}
define <2 x i64> @vaddQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK: vaddQi64:
+;CHECK: vadd.i64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = add <2 x i64> %tmp1, %tmp2
@@ -69,8 +82,196 @@ define <2 x i64> @vaddQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
}
define <4 x float> @vaddQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK: vaddQf32:
+;CHECK: vadd.f32
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = add <4 x float> %tmp1, %tmp2
ret <4 x float> %tmp3
}
+
+define <8 x i8> @vaddhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vaddhni16:
+;CHECK: vaddhn.i16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vaddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vaddhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vaddhni32:
+;CHECK: vaddhn.i32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vaddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vaddhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK: vaddhni64:
+;CHECK: vaddhn.i64
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vaddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm.neon.vaddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vaddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vaddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i8> @vraddhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vraddhni16:
+;CHECK: vraddhn.i16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vraddhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vraddhni32:
+;CHECK: vraddhn.i32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vraddhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK: vraddhni64:
+;CHECK: vraddhn.i64
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i16> @vaddls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vaddls8:
+;CHECK: vaddl.s8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vaddls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vaddls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vaddls16:
+;CHECK: vaddl.s16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vaddls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vaddls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vaddls32:
+;CHECK: vaddl.s32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vaddls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i16> @vaddlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vaddlu8:
+;CHECK: vaddl.u8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vaddlu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vaddlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vaddlu16:
+;CHECK: vaddl.u16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vaddlu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vaddlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vaddlu32:
+;CHECK: vaddl.u32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vaddlu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i16> @llvm.arm.neon.vaddls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vaddls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vaddls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vaddlu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vaddlu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vaddlu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+
+define <8 x i16> @vaddws8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vaddws8:
+;CHECK: vaddw.s8
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vaddws.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vaddws16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vaddws16:
+;CHECK: vaddw.s16
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vaddws.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vaddws32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vaddws32:
+;CHECK: vaddw.s32
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vaddws.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i16> @vaddwu8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vaddwu8:
+;CHECK: vaddw.u8
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vaddwu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vaddwu16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vaddwu16:
+;CHECK: vaddw.u16
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vaddwu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vaddwu32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vaddwu32:
+;CHECK: vaddw.u32
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vaddwu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i16> @llvm.arm.neon.vaddws.v8i16(<8 x i16>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vaddws.v4i32(<4 x i32>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vaddws.v2i64(<2 x i64>, <2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vaddwu.v8i16(<8 x i16>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vaddwu.v4i32(<4 x i32>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vaddwu.v2i64(<2 x i64>, <2 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vargs.ll b/test/CodeGen/ARM/vargs.ll
index 4bf79c0..5f3536c 100644
--- a/test/CodeGen/ARM/vargs.ll
+++ b/test/CodeGen/ARM/vargs.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm
+; RUN: llc < %s -march=arm
@str = internal constant [43 x i8] c"Hello World %d %d %d %d %d %d %d %d %d %d\0A\00" ; <[43 x i8]*> [#uses=1]
define i32 @main() {
diff --git a/test/CodeGen/ARM/vargs_align.ll b/test/CodeGen/ARM/vargs_align.ll
index 1f2f05b..e4ef9e3 100644
--- a/test/CodeGen/ARM/vargs_align.ll
+++ b/test/CodeGen/ARM/vargs_align.ll
@@ -1,7 +1,5 @@
-; RUN: llvm-as < %s | llc -march=arm -mtriple=arm-linux-gnueabi | \
-; RUN: grep {add sp, sp, #16} | count 1
-; RUN: llvm-as < %s | llc -march=arm -mtriple=arm-linux-gnu | \
-; RUN: grep {add sp, sp, #12} | count 2
+; RUN: llc < %s -march=arm -mtriple=arm-linux-gnueabi | FileCheck %s -check-prefix=EABI
+; RUN: llc < %s -march=arm -mtriple=arm-linux-gnu | FileCheck %s -check-prefix=OABI
define i32 @f(i32 %a, ...) {
entry:
@@ -18,4 +16,8 @@ entry:
return: ; preds = %entry
%retval2 = load i32* %retval ; <i32> [#uses=1]
ret i32 %retval2
+; EABI: add sp, sp, #12
+; EABI: add sp, sp, #16
+; OABI: add sp, sp, #12
+; OABI: add sp, sp, #12
}
diff --git a/test/CodeGen/ARM/vbits.ll b/test/CodeGen/ARM/vbits.ll
new file mode 100644
index 0000000..e1d23a1
--- /dev/null
+++ b/test/CodeGen/ARM/vbits.ll
@@ -0,0 +1,507 @@
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+
+define <8 x i8> @v_andi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: v_andi8:
+;CHECK: vand
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = and <8 x i8> %tmp1, %tmp2
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @v_andi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: v_andi16:
+;CHECK: vand
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = and <4 x i16> %tmp1, %tmp2
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @v_andi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: v_andi32:
+;CHECK: vand
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = and <2 x i32> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @v_andi64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK: v_andi64:
+;CHECK: vand
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = and <1 x i64> %tmp1, %tmp2
+ ret <1 x i64> %tmp3
+}
+
+define <16 x i8> @v_andQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: v_andQi8:
+;CHECK: vand
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = and <16 x i8> %tmp1, %tmp2
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @v_andQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: v_andQi16:
+;CHECK: vand
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = and <8 x i16> %tmp1, %tmp2
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @v_andQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: v_andQi32:
+;CHECK: vand
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = and <4 x i32> %tmp1, %tmp2
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @v_andQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK: v_andQi64:
+;CHECK: vand
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = and <2 x i64> %tmp1, %tmp2
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i8> @v_bici8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: v_bici8:
+;CHECK: vbic
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = xor <8 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
+ %tmp4 = and <8 x i8> %tmp1, %tmp3
+ ret <8 x i8> %tmp4
+}
+
+define <4 x i16> @v_bici16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: v_bici16:
+;CHECK: vbic
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = xor <4 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1 >
+ %tmp4 = and <4 x i16> %tmp1, %tmp3
+ ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @v_bici32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: v_bici32:
+;CHECK: vbic
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = xor <2 x i32> %tmp2, < i32 -1, i32 -1 >
+ %tmp4 = and <2 x i32> %tmp1, %tmp3
+ ret <2 x i32> %tmp4
+}
+
+define <1 x i64> @v_bici64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK: v_bici64:
+;CHECK: vbic
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = xor <1 x i64> %tmp2, < i64 -1 >
+ %tmp4 = and <1 x i64> %tmp1, %tmp3
+ ret <1 x i64> %tmp4
+}
+
+define <16 x i8> @v_bicQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: v_bicQi8:
+;CHECK: vbic
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = xor <16 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
+ %tmp4 = and <16 x i8> %tmp1, %tmp3
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @v_bicQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: v_bicQi16:
+;CHECK: vbic
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = xor <8 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
+ %tmp4 = and <8 x i16> %tmp1, %tmp3
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @v_bicQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: v_bicQi32:
+;CHECK: vbic
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = xor <4 x i32> %tmp2, < i32 -1, i32 -1, i32 -1, i32 -1 >
+ %tmp4 = and <4 x i32> %tmp1, %tmp3
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @v_bicQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK: v_bicQi64:
+;CHECK: vbic
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = xor <2 x i64> %tmp2, < i64 -1, i64 -1 >
+ %tmp4 = and <2 x i64> %tmp1, %tmp3
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @v_eori8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: v_eori8:
+;CHECK: veor
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = xor <8 x i8> %tmp1, %tmp2
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @v_eori16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: v_eori16:
+;CHECK: veor
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = xor <4 x i16> %tmp1, %tmp2
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @v_eori32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: v_eori32:
+;CHECK: veor
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = xor <2 x i32> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @v_eori64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK: v_eori64:
+;CHECK: veor
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = xor <1 x i64> %tmp1, %tmp2
+ ret <1 x i64> %tmp3
+}
+
+define <16 x i8> @v_eorQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: v_eorQi8:
+;CHECK: veor
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = xor <16 x i8> %tmp1, %tmp2
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @v_eorQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: v_eorQi16:
+;CHECK: veor
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = xor <8 x i16> %tmp1, %tmp2
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @v_eorQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: v_eorQi32:
+;CHECK: veor
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = xor <4 x i32> %tmp1, %tmp2
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @v_eorQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK: v_eorQi64:
+;CHECK: veor
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = xor <2 x i64> %tmp1, %tmp2
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i8> @v_mvni8(<8 x i8>* %A) nounwind {
+;CHECK: v_mvni8:
+;CHECK: vmvn
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = xor <8 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @v_mvni16(<4 x i16>* %A) nounwind {
+;CHECK: v_mvni16:
+;CHECK: vmvn
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = xor <4 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1 >
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @v_mvni32(<2 x i32>* %A) nounwind {
+;CHECK: v_mvni32:
+;CHECK: vmvn
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = xor <2 x i32> %tmp1, < i32 -1, i32 -1 >
+ ret <2 x i32> %tmp2
+}
+
+define <1 x i64> @v_mvni64(<1 x i64>* %A) nounwind {
+;CHECK: v_mvni64:
+;CHECK: vmvn
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = xor <1 x i64> %tmp1, < i64 -1 >
+ ret <1 x i64> %tmp2
+}
+
+define <16 x i8> @v_mvnQi8(<16 x i8>* %A) nounwind {
+;CHECK: v_mvnQi8:
+;CHECK: vmvn
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = xor <16 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @v_mvnQi16(<8 x i16>* %A) nounwind {
+;CHECK: v_mvnQi16:
+;CHECK: vmvn
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = xor <8 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @v_mvnQi32(<4 x i32>* %A) nounwind {
+;CHECK: v_mvnQi32:
+;CHECK: vmvn
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = xor <4 x i32> %tmp1, < i32 -1, i32 -1, i32 -1, i32 -1 >
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @v_mvnQi64(<2 x i64>* %A) nounwind {
+;CHECK: v_mvnQi64:
+;CHECK: vmvn
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = xor <2 x i64> %tmp1, < i64 -1, i64 -1 >
+ ret <2 x i64> %tmp2
+}
+
+define <8 x i8> @v_orri8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: v_orri8:
+;CHECK: vorr
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = or <8 x i8> %tmp1, %tmp2
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @v_orri16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: v_orri16:
+;CHECK: vorr
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = or <4 x i16> %tmp1, %tmp2
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @v_orri32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: v_orri32:
+;CHECK: vorr
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = or <2 x i32> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @v_orri64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK: v_orri64:
+;CHECK: vorr
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = or <1 x i64> %tmp1, %tmp2
+ ret <1 x i64> %tmp3
+}
+
+define <16 x i8> @v_orrQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: v_orrQi8:
+;CHECK: vorr
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = or <16 x i8> %tmp1, %tmp2
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @v_orrQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: v_orrQi16:
+;CHECK: vorr
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = or <8 x i16> %tmp1, %tmp2
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @v_orrQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: v_orrQi32:
+;CHECK: vorr
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = or <4 x i32> %tmp1, %tmp2
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @v_orrQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK: v_orrQi64:
+;CHECK: vorr
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = or <2 x i64> %tmp1, %tmp2
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i8> @v_orni8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: v_orni8:
+;CHECK: vorn
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = xor <8 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
+ %tmp4 = or <8 x i8> %tmp1, %tmp3
+ ret <8 x i8> %tmp4
+}
+
+define <4 x i16> @v_orni16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: v_orni16:
+;CHECK: vorn
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = xor <4 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1 >
+ %tmp4 = or <4 x i16> %tmp1, %tmp3
+ ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @v_orni32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: v_orni32:
+;CHECK: vorn
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = xor <2 x i32> %tmp2, < i32 -1, i32 -1 >
+ %tmp4 = or <2 x i32> %tmp1, %tmp3
+ ret <2 x i32> %tmp4
+}
+
+define <1 x i64> @v_orni64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK: v_orni64:
+;CHECK: vorn
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = xor <1 x i64> %tmp2, < i64 -1 >
+ %tmp4 = or <1 x i64> %tmp1, %tmp3
+ ret <1 x i64> %tmp4
+}
+
+define <16 x i8> @v_ornQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: v_ornQi8:
+;CHECK: vorn
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = xor <16 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
+ %tmp4 = or <16 x i8> %tmp1, %tmp3
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @v_ornQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: v_ornQi16:
+;CHECK: vorn
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = xor <8 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
+ %tmp4 = or <8 x i16> %tmp1, %tmp3
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @v_ornQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: v_ornQi32:
+;CHECK: vorn
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = xor <4 x i32> %tmp2, < i32 -1, i32 -1, i32 -1, i32 -1 >
+ %tmp4 = or <4 x i32> %tmp1, %tmp3
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @v_ornQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK: v_ornQi64:
+;CHECK: vorn
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = xor <2 x i64> %tmp2, < i64 -1, i64 -1 >
+ %tmp4 = or <2 x i64> %tmp1, %tmp3
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @vtsti8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vtsti8:
+;CHECK: vtst.i8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = and <8 x i8> %tmp1, %tmp2
+ %tmp4 = icmp ne <8 x i8> %tmp3, zeroinitializer
+ %tmp5 = sext <8 x i1> %tmp4 to <8 x i8>
+ ret <8 x i8> %tmp5
+}
+
+define <4 x i16> @vtsti16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vtsti16:
+;CHECK: vtst.i16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = and <4 x i16> %tmp1, %tmp2
+ %tmp4 = icmp ne <4 x i16> %tmp3, zeroinitializer
+ %tmp5 = sext <4 x i1> %tmp4 to <4 x i16>
+ ret <4 x i16> %tmp5
+}
+
+define <2 x i32> @vtsti32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vtsti32:
+;CHECK: vtst.i32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = and <2 x i32> %tmp1, %tmp2
+ %tmp4 = icmp ne <2 x i32> %tmp3, zeroinitializer
+ %tmp5 = sext <2 x i1> %tmp4 to <2 x i32>
+ ret <2 x i32> %tmp5
+}
+
+define <16 x i8> @vtstQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vtstQi8:
+;CHECK: vtst.i8
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = and <16 x i8> %tmp1, %tmp2
+ %tmp4 = icmp ne <16 x i8> %tmp3, zeroinitializer
+ %tmp5 = sext <16 x i1> %tmp4 to <16 x i8>
+ ret <16 x i8> %tmp5
+}
+
+define <8 x i16> @vtstQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vtstQi16:
+;CHECK: vtst.i16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = and <8 x i16> %tmp1, %tmp2
+ %tmp4 = icmp ne <8 x i16> %tmp3, zeroinitializer
+ %tmp5 = sext <8 x i1> %tmp4 to <8 x i16>
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @vtstQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vtstQi32:
+;CHECK: vtst.i32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = and <4 x i32> %tmp1, %tmp2
+ %tmp4 = icmp ne <4 x i32> %tmp3, zeroinitializer
+ %tmp5 = sext <4 x i1> %tmp4 to <4 x i32>
+ ret <4 x i32> %tmp5
+}
diff --git a/test/CodeGen/ARM/vbsl.ll b/test/CodeGen/ARM/vbsl.ll
index 37ddf4d..9f3bb4e 100644
--- a/test/CodeGen/ARM/vbsl.ll
+++ b/test/CodeGen/ARM/vbsl.ll
@@ -1,8 +1,8 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep vbsl %t | count 8
-; Note: function names do not include "vbsl" to allow simple grep for opcodes
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @v_bsli8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+;CHECK: v_bsli8:
+;CHECK: vbsl
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
@@ -14,6 +14,8 @@ define <8 x i8> @v_bsli8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
}
define <4 x i16> @v_bsli16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+;CHECK: v_bsli16:
+;CHECK: vbsl
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
@@ -25,6 +27,8 @@ define <4 x i16> @v_bsli16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind
}
define <2 x i32> @v_bsli32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+;CHECK: v_bsli32:
+;CHECK: vbsl
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
@@ -36,6 +40,8 @@ define <2 x i32> @v_bsli32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind
}
define <1 x i64> @v_bsli64(<1 x i64>* %A, <1 x i64>* %B, <1 x i64>* %C) nounwind {
+;CHECK: v_bsli64:
+;CHECK: vbsl
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = load <1 x i64>* %C
@@ -47,6 +53,8 @@ define <1 x i64> @v_bsli64(<1 x i64>* %A, <1 x i64>* %B, <1 x i64>* %C) nounwind
}
define <16 x i8> @v_bslQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
+;CHECK: v_bslQi8:
+;CHECK: vbsl
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = load <16 x i8>* %C
@@ -58,6 +66,8 @@ define <16 x i8> @v_bslQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind
}
define <8 x i16> @v_bslQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
+;CHECK: v_bslQi16:
+;CHECK: vbsl
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = load <8 x i16>* %C
@@ -69,6 +79,8 @@ define <8 x i16> @v_bslQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwin
}
define <4 x i32> @v_bslQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
+;CHECK: v_bslQi32:
+;CHECK: vbsl
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = load <4 x i32>* %C
@@ -80,6 +92,8 @@ define <4 x i32> @v_bslQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwin
}
define <2 x i64> @v_bslQi64(<2 x i64>* %A, <2 x i64>* %B, <2 x i64>* %C) nounwind {
+;CHECK: v_bslQi64:
+;CHECK: vbsl
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = load <2 x i64>* %C
diff --git a/test/CodeGen/ARM/vceq.ll b/test/CodeGen/ARM/vceq.ll
index 77f1890..e478751 100644
--- a/test/CodeGen/ARM/vceq.ll
+++ b/test/CodeGen/ARM/vceq.ll
@@ -1,61 +1,81 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep {vceq\\.i8} %t | count 2
-; RUN: grep {vceq\\.i16} %t | count 2
-; RUN: grep {vceq\\.i32} %t | count 2
-; RUN: grep {vceq\\.f32} %t | count 2
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vceqi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vceqi8:
+;CHECK: vceq.i8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = vicmp eq <8 x i8> %tmp1, %tmp2
- ret <8 x i8> %tmp3
+ %tmp3 = icmp eq <8 x i8> %tmp1, %tmp2
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
}
define <4 x i16> @vceqi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vceqi16:
+;CHECK: vceq.i16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = vicmp eq <4 x i16> %tmp1, %tmp2
- ret <4 x i16> %tmp3
+ %tmp3 = icmp eq <4 x i16> %tmp1, %tmp2
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
}
define <2 x i32> @vceqi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vceqi32:
+;CHECK: vceq.i32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = vicmp eq <2 x i32> %tmp1, %tmp2
- ret <2 x i32> %tmp3
+ %tmp3 = icmp eq <2 x i32> %tmp1, %tmp2
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
}
define <2 x i32> @vceqf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK: vceqf32:
+;CHECK: vceq.f32
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
- %tmp3 = vfcmp oeq <2 x float> %tmp1, %tmp2
- ret <2 x i32> %tmp3
+ %tmp3 = fcmp oeq <2 x float> %tmp1, %tmp2
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
}
define <16 x i8> @vceqQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vceqQi8:
+;CHECK: vceq.i8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = vicmp eq <16 x i8> %tmp1, %tmp2
- ret <16 x i8> %tmp3
+ %tmp3 = icmp eq <16 x i8> %tmp1, %tmp2
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
}
define <8 x i16> @vceqQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vceqQi16:
+;CHECK: vceq.i16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = vicmp eq <8 x i16> %tmp1, %tmp2
- ret <8 x i16> %tmp3
+ %tmp3 = icmp eq <8 x i16> %tmp1, %tmp2
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
}
define <4 x i32> @vceqQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vceqQi32:
+;CHECK: vceq.i32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = vicmp eq <4 x i32> %tmp1, %tmp2
- ret <4 x i32> %tmp3
+ %tmp3 = icmp eq <4 x i32> %tmp1, %tmp2
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
}
define <4 x i32> @vceqQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK: vceqQf32:
+;CHECK: vceq.f32
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
- %tmp3 = vfcmp oeq <4 x float> %tmp1, %tmp2
- ret <4 x i32> %tmp3
+ %tmp3 = fcmp oeq <4 x float> %tmp1, %tmp2
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
}
diff --git a/test/CodeGen/ARM/vcge.ll b/test/CodeGen/ARM/vcge.ll
index 14c623e..2c16111 100644
--- a/test/CodeGen/ARM/vcge.ll
+++ b/test/CodeGen/ARM/vcge.ll
@@ -1,106 +1,162 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep {vcge\\.s8} %t | count 2
-; RUN: grep {vcge\\.s16} %t | count 2
-; RUN: grep {vcge\\.s32} %t | count 2
-; RUN: grep {vcge\\.u8} %t | count 2
-; RUN: grep {vcge\\.u16} %t | count 2
-; RUN: grep {vcge\\.u32} %t | count 2
-; RUN: grep {vcge\\.f32} %t | count 2
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vcges8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vcges8:
+;CHECK: vcge.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = vicmp sge <8 x i8> %tmp1, %tmp2
- ret <8 x i8> %tmp3
+ %tmp3 = icmp sge <8 x i8> %tmp1, %tmp2
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
}
define <4 x i16> @vcges16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vcges16:
+;CHECK: vcge.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = vicmp sge <4 x i16> %tmp1, %tmp2
- ret <4 x i16> %tmp3
+ %tmp3 = icmp sge <4 x i16> %tmp1, %tmp2
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
}
define <2 x i32> @vcges32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vcges32:
+;CHECK: vcge.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = vicmp sge <2 x i32> %tmp1, %tmp2
- ret <2 x i32> %tmp3
+ %tmp3 = icmp sge <2 x i32> %tmp1, %tmp2
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
}
define <8 x i8> @vcgeu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vcgeu8:
+;CHECK: vcge.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = vicmp uge <8 x i8> %tmp1, %tmp2
- ret <8 x i8> %tmp3
+ %tmp3 = icmp uge <8 x i8> %tmp1, %tmp2
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
}
define <4 x i16> @vcgeu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vcgeu16:
+;CHECK: vcge.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = vicmp uge <4 x i16> %tmp1, %tmp2
- ret <4 x i16> %tmp3
+ %tmp3 = icmp uge <4 x i16> %tmp1, %tmp2
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
}
define <2 x i32> @vcgeu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vcgeu32:
+;CHECK: vcge.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = vicmp uge <2 x i32> %tmp1, %tmp2
- ret <2 x i32> %tmp3
+ %tmp3 = icmp uge <2 x i32> %tmp1, %tmp2
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
}
define <2 x i32> @vcgef32(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK: vcgef32:
+;CHECK: vcge.f32
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
- %tmp3 = vfcmp oge <2 x float> %tmp1, %tmp2
- ret <2 x i32> %tmp3
+ %tmp3 = fcmp oge <2 x float> %tmp1, %tmp2
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
}
define <16 x i8> @vcgeQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vcgeQs8:
+;CHECK: vcge.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = vicmp sge <16 x i8> %tmp1, %tmp2
- ret <16 x i8> %tmp3
+ %tmp3 = icmp sge <16 x i8> %tmp1, %tmp2
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
}
define <8 x i16> @vcgeQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vcgeQs16:
+;CHECK: vcge.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = vicmp sge <8 x i16> %tmp1, %tmp2
- ret <8 x i16> %tmp3
+ %tmp3 = icmp sge <8 x i16> %tmp1, %tmp2
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
}
define <4 x i32> @vcgeQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vcgeQs32:
+;CHECK: vcge.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = vicmp sge <4 x i32> %tmp1, %tmp2
- ret <4 x i32> %tmp3
+ %tmp3 = icmp sge <4 x i32> %tmp1, %tmp2
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
}
define <16 x i8> @vcgeQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vcgeQu8:
+;CHECK: vcge.u8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = vicmp uge <16 x i8> %tmp1, %tmp2
- ret <16 x i8> %tmp3
+ %tmp3 = icmp uge <16 x i8> %tmp1, %tmp2
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
}
define <8 x i16> @vcgeQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vcgeQu16:
+;CHECK: vcge.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = vicmp uge <8 x i16> %tmp1, %tmp2
- ret <8 x i16> %tmp3
+ %tmp3 = icmp uge <8 x i16> %tmp1, %tmp2
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
}
define <4 x i32> @vcgeQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vcgeQu32:
+;CHECK: vcge.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = vicmp uge <4 x i32> %tmp1, %tmp2
- ret <4 x i32> %tmp3
+ %tmp3 = icmp uge <4 x i32> %tmp1, %tmp2
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
}
define <4 x i32> @vcgeQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK: vcgeQf32:
+;CHECK: vcge.f32
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
- %tmp3 = vfcmp oge <4 x float> %tmp1, %tmp2
+ %tmp3 = fcmp oge <4 x float> %tmp1, %tmp2
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i32> @vacgef32(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK: vacgef32:
+;CHECK: vacge.f32
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vacged(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @vacgeQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK: vacgeQf32:
+;CHECK: vacge.f32
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vacgeq(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x i32> %tmp3
}
+
+declare <2 x i32> @llvm.arm.neon.vacged(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vacgeq(<4 x float>, <4 x float>) nounwind readnone
diff --git a/test/CodeGen/ARM/vcgt.ll b/test/CodeGen/ARM/vcgt.ll
index 3f7e550..6b11ba5 100644
--- a/test/CodeGen/ARM/vcgt.ll
+++ b/test/CodeGen/ARM/vcgt.ll
@@ -1,106 +1,162 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep {vcgt\\.s8} %t | count 2
-; RUN: grep {vcgt\\.s16} %t | count 2
-; RUN: grep {vcgt\\.s32} %t | count 2
-; RUN: grep {vcgt\\.u8} %t | count 2
-; RUN: grep {vcgt\\.u16} %t | count 2
-; RUN: grep {vcgt\\.u32} %t | count 2
-; RUN: grep {vcgt\\.f32} %t | count 2
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vcgts8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vcgts8:
+;CHECK: vcgt.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = vicmp sgt <8 x i8> %tmp1, %tmp2
- ret <8 x i8> %tmp3
+ %tmp3 = icmp sgt <8 x i8> %tmp1, %tmp2
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
}
define <4 x i16> @vcgts16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vcgts16:
+;CHECK: vcgt.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = vicmp sgt <4 x i16> %tmp1, %tmp2
- ret <4 x i16> %tmp3
+ %tmp3 = icmp sgt <4 x i16> %tmp1, %tmp2
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
}
define <2 x i32> @vcgts32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vcgts32:
+;CHECK: vcgt.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = vicmp sgt <2 x i32> %tmp1, %tmp2
- ret <2 x i32> %tmp3
+ %tmp3 = icmp sgt <2 x i32> %tmp1, %tmp2
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
}
define <8 x i8> @vcgtu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vcgtu8:
+;CHECK: vcgt.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = vicmp ugt <8 x i8> %tmp1, %tmp2
- ret <8 x i8> %tmp3
+ %tmp3 = icmp ugt <8 x i8> %tmp1, %tmp2
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
}
define <4 x i16> @vcgtu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vcgtu16:
+;CHECK: vcgt.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = vicmp ugt <4 x i16> %tmp1, %tmp2
- ret <4 x i16> %tmp3
+ %tmp3 = icmp ugt <4 x i16> %tmp1, %tmp2
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
}
define <2 x i32> @vcgtu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vcgtu32:
+;CHECK: vcgt.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = vicmp ugt <2 x i32> %tmp1, %tmp2
- ret <2 x i32> %tmp3
+ %tmp3 = icmp ugt <2 x i32> %tmp1, %tmp2
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
}
define <2 x i32> @vcgtf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK: vcgtf32:
+;CHECK: vcgt.f32
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
- %tmp3 = vfcmp ogt <2 x float> %tmp1, %tmp2
- ret <2 x i32> %tmp3
+ %tmp3 = fcmp ogt <2 x float> %tmp1, %tmp2
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
}
define <16 x i8> @vcgtQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vcgtQs8:
+;CHECK: vcgt.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = vicmp sgt <16 x i8> %tmp1, %tmp2
- ret <16 x i8> %tmp3
+ %tmp3 = icmp sgt <16 x i8> %tmp1, %tmp2
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
}
define <8 x i16> @vcgtQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vcgtQs16:
+;CHECK: vcgt.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = vicmp sgt <8 x i16> %tmp1, %tmp2
- ret <8 x i16> %tmp3
+ %tmp3 = icmp sgt <8 x i16> %tmp1, %tmp2
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
}
define <4 x i32> @vcgtQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vcgtQs32:
+;CHECK: vcgt.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = vicmp sgt <4 x i32> %tmp1, %tmp2
- ret <4 x i32> %tmp3
+ %tmp3 = icmp sgt <4 x i32> %tmp1, %tmp2
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
}
define <16 x i8> @vcgtQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vcgtQu8:
+;CHECK: vcgt.u8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = vicmp ugt <16 x i8> %tmp1, %tmp2
- ret <16 x i8> %tmp3
+ %tmp3 = icmp ugt <16 x i8> %tmp1, %tmp2
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
}
define <8 x i16> @vcgtQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vcgtQu16:
+;CHECK: vcgt.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = vicmp ugt <8 x i16> %tmp1, %tmp2
- ret <8 x i16> %tmp3
+ %tmp3 = icmp ugt <8 x i16> %tmp1, %tmp2
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
}
define <4 x i32> @vcgtQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vcgtQu32:
+;CHECK: vcgt.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = vicmp ugt <4 x i32> %tmp1, %tmp2
- ret <4 x i32> %tmp3
+ %tmp3 = icmp ugt <4 x i32> %tmp1, %tmp2
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
}
define <4 x i32> @vcgtQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK: vcgtQf32:
+;CHECK: vcgt.f32
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
- %tmp3 = vfcmp ogt <4 x float> %tmp1, %tmp2
+ %tmp3 = fcmp ogt <4 x float> %tmp1, %tmp2
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i32> @vacgtf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK: vacgtf32:
+;CHECK: vacgt.f32
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vacgtd(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @vacgtQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK: vacgtQf32:
+;CHECK: vacgt.f32
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vacgtq(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x i32> %tmp3
}
+
+declare <2 x i32> @llvm.arm.neon.vacgtd(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vacgtq(<4 x float>, <4 x float>) nounwind readnone
diff --git a/test/CodeGen/ARM/vcnt.ll b/test/CodeGen/ARM/vcnt.ll
index 9817168..450f90d 100644
--- a/test/CodeGen/ARM/vcnt.ll
+++ b/test/CodeGen/ARM/vcnt.ll
@@ -1,13 +1,16 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep {vcnt\\.8} %t | count 2
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vcnt8(<8 x i8>* %A) nounwind {
+;CHECK: vcnt8:
+;CHECK: vcnt.8
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vcnt.v8i8(<8 x i8> %tmp1)
ret <8 x i8> %tmp2
}
define <16 x i8> @vcntQ8(<16 x i8>* %A) nounwind {
+;CHECK: vcntQ8:
+;CHECK: vcnt.8
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vcnt.v16i8(<16 x i8> %tmp1)
ret <16 x i8> %tmp2
@@ -15,3 +18,115 @@ define <16 x i8> @vcntQ8(<16 x i8>* %A) nounwind {
declare <8 x i8> @llvm.arm.neon.vcnt.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vcnt.v16i8(<16 x i8>) nounwind readnone
+
+define <8 x i8> @vclz8(<8 x i8>* %A) nounwind {
+;CHECK: vclz8:
+;CHECK: vclz.i8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vclz.v8i8(<8 x i8> %tmp1)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vclz16(<4 x i16>* %A) nounwind {
+;CHECK: vclz16:
+;CHECK: vclz.i16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vclz.v4i16(<4 x i16> %tmp1)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vclz32(<2 x i32>* %A) nounwind {
+;CHECK: vclz32:
+;CHECK: vclz.i32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vclz.v2i32(<2 x i32> %tmp1)
+ ret <2 x i32> %tmp2
+}
+
+define <16 x i8> @vclzQ8(<16 x i8>* %A) nounwind {
+;CHECK: vclzQ8:
+;CHECK: vclz.i8
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = call <16 x i8> @llvm.arm.neon.vclz.v16i8(<16 x i8> %tmp1)
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @vclzQ16(<8 x i16>* %A) nounwind {
+;CHECK: vclzQ16:
+;CHECK: vclz.i16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i16> @llvm.arm.neon.vclz.v8i16(<8 x i16> %tmp1)
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vclzQ32(<4 x i32>* %A) nounwind {
+;CHECK: vclzQ32:
+;CHECK: vclz.i32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vclz.v4i32(<4 x i32> %tmp1)
+ ret <4 x i32> %tmp2
+}
+
+declare <8 x i8> @llvm.arm.neon.vclz.v8i8(<8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vclz.v4i16(<4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vclz.v2i32(<2 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vclz.v16i8(<16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vclz.v8i16(<8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vclz.v4i32(<4 x i32>) nounwind readnone
+
+define <8 x i8> @vclss8(<8 x i8>* %A) nounwind {
+;CHECK: vclss8:
+;CHECK: vcls.s8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %tmp1)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vclss16(<4 x i16>* %A) nounwind {
+;CHECK: vclss16:
+;CHECK: vcls.s16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %tmp1)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vclss32(<2 x i32>* %A) nounwind {
+;CHECK: vclss32:
+;CHECK: vcls.s32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %tmp1)
+ ret <2 x i32> %tmp2
+}
+
+define <16 x i8> @vclsQs8(<16 x i8>* %A) nounwind {
+;CHECK: vclsQs8:
+;CHECK: vcls.s8
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %tmp1)
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @vclsQs16(<8 x i16>* %A) nounwind {
+;CHECK: vclsQs16:
+;CHECK: vcls.s16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %tmp1)
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vclsQs32(<4 x i32>* %A) nounwind {
+;CHECK: vclsQs32:
+;CHECK: vcls.s32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %tmp1)
+ ret <4 x i32> %tmp2
+}
+
+declare <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vcombine.ll b/test/CodeGen/ARM/vcombine.ll
new file mode 100644
index 0000000..e673305
--- /dev/null
+++ b/test/CodeGen/ARM/vcombine.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -march=arm -mattr=+neon
+
+define <16 x i8> @vcombine8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vcombine16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vcombine32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %tmp3
+}
+
+define <4 x float> @vcombinefloat(<2 x float>* %A, <2 x float>* %B) nounwind {
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = shufflevector <2 x float> %tmp1, <2 x float> %tmp2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x float> %tmp3
+}
+
+define <2 x i64> @vcombine64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = shufflevector <1 x i64> %tmp1, <1 x i64> %tmp2, <2 x i32> <i32 0, i32 1>
+ ret <2 x i64> %tmp3
+}
diff --git a/test/CodeGen/ARM/vcvt.ll b/test/CodeGen/ARM/vcvt.ll
index 1cb42bf..f4cc536 100644
--- a/test/CodeGen/ARM/vcvt.ll
+++ b/test/CodeGen/ARM/vcvt.ll
@@ -1,53 +1,140 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep {vcvt\\.s32\\.f32} %t | count 2
-; RUN: grep {vcvt\\.u32\\.f32} %t | count 2
-; RUN: grep {vcvt\\.f32\\.s32} %t | count 2
-; RUN: grep {vcvt\\.f32\\.u32} %t | count 2
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <2 x i32> @vcvt_f32tos32(<2 x float>* %A) nounwind {
+;CHECK: vcvt_f32tos32:
+;CHECK: vcvt.s32.f32
%tmp1 = load <2 x float>* %A
%tmp2 = fptosi <2 x float> %tmp1 to <2 x i32>
ret <2 x i32> %tmp2
}
define <2 x i32> @vcvt_f32tou32(<2 x float>* %A) nounwind {
+;CHECK: vcvt_f32tou32:
+;CHECK: vcvt.u32.f32
%tmp1 = load <2 x float>* %A
%tmp2 = fptoui <2 x float> %tmp1 to <2 x i32>
ret <2 x i32> %tmp2
}
define <2 x float> @vcvt_s32tof32(<2 x i32>* %A) nounwind {
+;CHECK: vcvt_s32tof32:
+;CHECK: vcvt.f32.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = sitofp <2 x i32> %tmp1 to <2 x float>
ret <2 x float> %tmp2
}
define <2 x float> @vcvt_u32tof32(<2 x i32>* %A) nounwind {
+;CHECK: vcvt_u32tof32:
+;CHECK: vcvt.f32.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = uitofp <2 x i32> %tmp1 to <2 x float>
ret <2 x float> %tmp2
}
define <4 x i32> @vcvtQ_f32tos32(<4 x float>* %A) nounwind {
+;CHECK: vcvtQ_f32tos32:
+;CHECK: vcvt.s32.f32
%tmp1 = load <4 x float>* %A
%tmp2 = fptosi <4 x float> %tmp1 to <4 x i32>
ret <4 x i32> %tmp2
}
define <4 x i32> @vcvtQ_f32tou32(<4 x float>* %A) nounwind {
+;CHECK: vcvtQ_f32tou32:
+;CHECK: vcvt.u32.f32
%tmp1 = load <4 x float>* %A
%tmp2 = fptoui <4 x float> %tmp1 to <4 x i32>
ret <4 x i32> %tmp2
}
define <4 x float> @vcvtQ_s32tof32(<4 x i32>* %A) nounwind {
+;CHECK: vcvtQ_s32tof32:
+;CHECK: vcvt.f32.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = sitofp <4 x i32> %tmp1 to <4 x float>
ret <4 x float> %tmp2
}
define <4 x float> @vcvtQ_u32tof32(<4 x i32>* %A) nounwind {
+;CHECK: vcvtQ_u32tof32:
+;CHECK: vcvt.f32.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = uitofp <4 x i32> %tmp1 to <4 x float>
ret <4 x float> %tmp2
}
+
+define <2 x i32> @vcvt_n_f32tos32(<2 x float>* %A) nounwind {
+;CHECK: vcvt_n_f32tos32:
+;CHECK: vcvt.s32.f32
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> %tmp1, i32 1)
+ ret <2 x i32> %tmp2
+}
+
+define <2 x i32> @vcvt_n_f32tou32(<2 x float>* %A) nounwind {
+;CHECK: vcvt_n_f32tou32:
+;CHECK: vcvt.u32.f32
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> %tmp1, i32 1)
+ ret <2 x i32> %tmp2
+}
+
+define <2 x float> @vcvt_n_s32tof32(<2 x i32>* %A) nounwind {
+;CHECK: vcvt_n_s32tof32:
+;CHECK: vcvt.f32.s32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %tmp1, i32 1)
+ ret <2 x float> %tmp2
+}
+
+define <2 x float> @vcvt_n_u32tof32(<2 x i32>* %A) nounwind {
+;CHECK: vcvt_n_u32tof32:
+;CHECK: vcvt.f32.u32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %tmp1, i32 1)
+ ret <2 x float> %tmp2
+}
+
+declare <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float>, i32) nounwind readnone
+declare <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
+declare <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
+
+define <4 x i32> @vcvtQ_n_f32tos32(<4 x float>* %A) nounwind {
+;CHECK: vcvtQ_n_f32tos32:
+;CHECK: vcvt.s32.f32
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> %tmp1, i32 1)
+ ret <4 x i32> %tmp2
+}
+
+define <4 x i32> @vcvtQ_n_f32tou32(<4 x float>* %A) nounwind {
+;CHECK: vcvtQ_n_f32tou32:
+;CHECK: vcvt.u32.f32
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> %tmp1, i32 1)
+ ret <4 x i32> %tmp2
+}
+
+define <4 x float> @vcvtQ_n_s32tof32(<4 x i32>* %A) nounwind {
+;CHECK: vcvtQ_n_s32tof32:
+;CHECK: vcvt.f32.s32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %tmp1, i32 1)
+ ret <4 x float> %tmp2
+}
+
+define <4 x float> @vcvtQ_n_u32tof32(<4 x i32>* %A) nounwind {
+;CHECK: vcvtQ_n_u32tof32:
+;CHECK: vcvt.f32.u32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %tmp1, i32 1)
+ ret <4 x float> %tmp2
+}
+
+declare <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32) nounwind readnone
+declare <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
+declare <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
+
diff --git a/test/CodeGen/ARM/vdup.ll b/test/CodeGen/ARM/vdup.ll
index 1c0887a..c9a68ca 100644
--- a/test/CodeGen/ARM/vdup.ll
+++ b/test/CodeGen/ARM/vdup.ll
@@ -1,9 +1,8 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep vdup.8 %t | count 4
-; RUN: grep vdup.16 %t | count 4
-; RUN: grep vdup.32 %t | count 8
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @v_dup8(i8 %A) nounwind {
+;CHECK: v_dup8:
+;CHECK: vdup.8
%tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
%tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1
%tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2
@@ -16,6 +15,8 @@ define <8 x i8> @v_dup8(i8 %A) nounwind {
}
define <4 x i16> @v_dup16(i16 %A) nounwind {
+;CHECK: v_dup16:
+;CHECK: vdup.16
%tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
%tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1
%tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2
@@ -24,18 +25,24 @@ define <4 x i16> @v_dup16(i16 %A) nounwind {
}
define <2 x i32> @v_dup32(i32 %A) nounwind {
+;CHECK: v_dup32:
+;CHECK: vdup.32
%tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
%tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1
ret <2 x i32> %tmp2
}
define <2 x float> @v_dupfloat(float %A) nounwind {
+;CHECK: v_dupfloat:
+;CHECK: vdup.32
%tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0
%tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1
ret <2 x float> %tmp2
}
define <16 x i8> @v_dupQ8(i8 %A) nounwind {
+;CHECK: v_dupQ8:
+;CHECK: vdup.8
%tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
%tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1
%tmp3 = insertelement <16 x i8> %tmp2, i8 %A, i32 2
@@ -56,6 +63,8 @@ define <16 x i8> @v_dupQ8(i8 %A) nounwind {
}
define <8 x i16> @v_dupQ16(i16 %A) nounwind {
+;CHECK: v_dupQ16:
+;CHECK: vdup.16
%tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0
%tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1
%tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2
@@ -68,6 +77,8 @@ define <8 x i16> @v_dupQ16(i16 %A) nounwind {
}
define <4 x i32> @v_dupQ32(i32 %A) nounwind {
+;CHECK: v_dupQ32:
+;CHECK: vdup.32
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
%tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1
%tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2
@@ -76,6 +87,8 @@ define <4 x i32> @v_dupQ32(i32 %A) nounwind {
}
define <4 x float> @v_dupQfloat(float %A) nounwind {
+;CHECK: v_dupQfloat:
+;CHECK: vdup.32
%tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0
%tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1
%tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2
@@ -86,49 +99,171 @@ define <4 x float> @v_dupQfloat(float %A) nounwind {
; Check to make sure it works with shuffles, too.
define <8 x i8> @v_shuffledup8(i8 %A) nounwind {
+;CHECK: v_shuffledup8:
+;CHECK: vdup.8
%tmp1 = insertelement <8 x i8> undef, i8 %A, i32 0
%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
ret <8 x i8> %tmp2
}
define <4 x i16> @v_shuffledup16(i16 %A) nounwind {
+;CHECK: v_shuffledup16:
+;CHECK: vdup.16
%tmp1 = insertelement <4 x i16> undef, i16 %A, i32 0
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
ret <4 x i16> %tmp2
}
define <2 x i32> @v_shuffledup32(i32 %A) nounwind {
+;CHECK: v_shuffledup32:
+;CHECK: vdup.32
%tmp1 = insertelement <2 x i32> undef, i32 %A, i32 0
%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
ret <2 x i32> %tmp2
}
define <2 x float> @v_shuffledupfloat(float %A) nounwind {
+;CHECK: v_shuffledupfloat:
+;CHECK: vdup.32
%tmp1 = insertelement <2 x float> undef, float %A, i32 0
%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
ret <2 x float> %tmp2
}
define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind {
+;CHECK: v_shuffledupQ8:
+;CHECK: vdup.8
%tmp1 = insertelement <16 x i8> undef, i8 %A, i32 0
%tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer
ret <16 x i8> %tmp2
}
define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind {
+;CHECK: v_shuffledupQ16:
+;CHECK: vdup.16
%tmp1 = insertelement <8 x i16> undef, i16 %A, i32 0
%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer
ret <8 x i16> %tmp2
}
define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind {
+;CHECK: v_shuffledupQ32:
+;CHECK: vdup.32
%tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0
%tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
ret <4 x i32> %tmp2
}
define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
+;CHECK: v_shuffledupQfloat:
+;CHECK: vdup.32
%tmp1 = insertelement <4 x float> undef, float %A, i32 0
%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
ret <4 x float> %tmp2
}
+
+define <2 x float> @v_shuffledupfloat2(float* %A) nounwind {
+;CHECK: v_shuffledupfloat2:
+;CHECK: vdup.32
+ %tmp0 = load float* %A
+ %tmp1 = insertelement <2 x float> undef, float %tmp0, i32 0
+ %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
+ ret <2 x float> %tmp2
+}
+
+define <4 x float> @v_shuffledupQfloat2(float* %A) nounwind {
+;CHECK: v_shuffledupQfloat2:
+;CHECK: vdup.32
+ %tmp0 = load float* %A
+ %tmp1 = insertelement <4 x float> undef, float %tmp0, i32 0
+ %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %tmp2
+}
+
+define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind {
+;CHECK: vduplane8:
+;CHECK: vdup.8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind {
+;CHECK: vduplane16:
+;CHECK: vdup.16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind {
+;CHECK: vduplane32:
+;CHECK: vdup.32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 >
+ ret <2 x i32> %tmp2
+}
+
+define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind {
+;CHECK: vduplanefloat:
+;CHECK: vdup.32
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 >
+ ret <2 x float> %tmp2
+}
+
+define <16 x i8> @vduplaneQ8(<8 x i8>* %A) nounwind {
+;CHECK: vduplaneQ8:
+;CHECK: vdup.8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @vduplaneQ16(<4 x i16>* %A) nounwind {
+;CHECK: vduplaneQ16:
+;CHECK: vdup.16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vduplaneQ32(<2 x i32>* %A) nounwind {
+;CHECK: vduplaneQ32:
+;CHECK: vdup.32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
+ ret <4 x i32> %tmp2
+}
+
+define <4 x float> @vduplaneQfloat(<2 x float>* %A) nounwind {
+;CHECK: vduplaneQfloat:
+;CHECK: vdup.32
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
+ ret <4 x float> %tmp2
+}
+
+define arm_apcscc <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone {
+entry:
+ %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
+ ret <2 x i64> %0
+}
+
+define arm_apcscc <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone {
+entry:
+ %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+ ret <2 x i64> %0
+}
+
+define arm_apcscc <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone {
+entry:
+ %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+ ret <2 x double> %0
+}
+
+define arm_apcscc <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone {
+entry:
+ %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ ret <2 x double> %0
+}
diff --git a/test/CodeGen/ARM/vext.ll b/test/CodeGen/ARM/vext.ll
new file mode 100644
index 0000000..20d953b
--- /dev/null
+++ b/test/CodeGen/ARM/vext.ll
@@ -0,0 +1,56 @@
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+
+define arm_apcscc <8 x i8> @test_vextd(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: test_vextd:
+;CHECK: vext
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+ ret <8 x i8> %tmp3
+}
+
+define arm_apcscc <8 x i8> @test_vextRd(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: test_vextRd:
+;CHECK: vext
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4>
+ ret <8 x i8> %tmp3
+}
+
+define arm_apcscc <16 x i8> @test_vextq(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: test_vextq:
+;CHECK: vext
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
+ ret <16 x i8> %tmp3
+}
+
+define arm_apcscc <16 x i8> @test_vextRq(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: test_vextRq:
+;CHECK: vext
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6>
+ ret <16 x i8> %tmp3
+}
+
+define arm_apcscc <4 x i16> @test_vextd16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: test_vextd16:
+;CHECK: vext
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+ ret <4 x i16> %tmp3
+}
+
+define arm_apcscc <4 x i32> @test_vextq32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: test_vextq32:
+;CHECK: vext
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+ ret <4 x i32> %tmp3
+}
+
diff --git a/test/CodeGen/ARM/vfcmp.ll b/test/CodeGen/ARM/vfcmp.ll
index 58c2068..6946d02 100644
--- a/test/CodeGen/ARM/vfcmp.ll
+++ b/test/CodeGen/ARM/vfcmp.ll
@@ -1,96 +1,139 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep {vceq\\.f32} %t | count 1
-; RUN: grep {vcgt\\.f32} %t | count 9
-; RUN: grep {vcge\\.f32} %t | count 5
-; RUN: grep vorr %t | count 4
-; RUN: grep vmvn %t | count 7
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
-; This tests vfcmp operations that do not map directly to NEON instructions.
+; This tests fcmp operations that do not map directly to NEON instructions.
; une is implemented with VCEQ/VMVN
define <2 x i32> @vcunef32(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK: vcunef32:
+;CHECK: vceq.f32
+;CHECK-NEXT: vmvn
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
- %tmp3 = vfcmp une <2 x float> %tmp1, %tmp2
- ret <2 x i32> %tmp3
+ %tmp3 = fcmp une <2 x float> %tmp1, %tmp2
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
}
; olt is implemented with VCGT
define <2 x i32> @vcoltf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK: vcoltf32:
+;CHECK: vcgt.f32
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
- %tmp3 = vfcmp olt <2 x float> %tmp1, %tmp2
- ret <2 x i32> %tmp3
+ %tmp3 = fcmp olt <2 x float> %tmp1, %tmp2
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
}
; ole is implemented with VCGE
define <2 x i32> @vcolef32(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK: vcolef32:
+;CHECK: vcge.f32
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
- %tmp3 = vfcmp ole <2 x float> %tmp1, %tmp2
- ret <2 x i32> %tmp3
+ %tmp3 = fcmp ole <2 x float> %tmp1, %tmp2
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
}
; uge is implemented with VCGT/VMVN
define <2 x i32> @vcugef32(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK: vcugef32:
+;CHECK: vcgt.f32
+;CHECK-NEXT: vmvn
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
- %tmp3 = vfcmp uge <2 x float> %tmp1, %tmp2
- ret <2 x i32> %tmp3
+ %tmp3 = fcmp uge <2 x float> %tmp1, %tmp2
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
}
; ule is implemented with VCGT/VMVN
define <2 x i32> @vculef32(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK: vculef32:
+;CHECK: vcgt.f32
+;CHECK-NEXT: vmvn
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
- %tmp3 = vfcmp ule <2 x float> %tmp1, %tmp2
- ret <2 x i32> %tmp3
+ %tmp3 = fcmp ule <2 x float> %tmp1, %tmp2
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
}
; ugt is implemented with VCGE/VMVN
define <2 x i32> @vcugtf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK: vcugtf32:
+;CHECK: vcge.f32
+;CHECK-NEXT: vmvn
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
- %tmp3 = vfcmp ugt <2 x float> %tmp1, %tmp2
- ret <2 x i32> %tmp3
+ %tmp3 = fcmp ugt <2 x float> %tmp1, %tmp2
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
}
; ult is implemented with VCGE/VMVN
define <2 x i32> @vcultf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK: vcultf32:
+;CHECK: vcge.f32
+;CHECK-NEXT: vmvn
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
- %tmp3 = vfcmp ult <2 x float> %tmp1, %tmp2
- ret <2 x i32> %tmp3
+ %tmp3 = fcmp ult <2 x float> %tmp1, %tmp2
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
}
; ueq is implemented with VCGT/VCGT/VORR/VMVN
define <2 x i32> @vcueqf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK: vcueqf32:
+;CHECK: vcgt.f32
+;CHECK-NEXT: vcgt.f32
+;CHECK-NEXT: vorr
+;CHECK-NEXT: vmvn
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
- %tmp3 = vfcmp ueq <2 x float> %tmp1, %tmp2
- ret <2 x i32> %tmp3
+ %tmp3 = fcmp ueq <2 x float> %tmp1, %tmp2
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
}
; one is implemented with VCGT/VCGT/VORR
define <2 x i32> @vconef32(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK: vconef32:
+;CHECK: vcgt.f32
+;CHECK-NEXT: vcgt.f32
+;CHECK-NEXT: vorr
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
- %tmp3 = vfcmp one <2 x float> %tmp1, %tmp2
- ret <2 x i32> %tmp3
+ %tmp3 = fcmp one <2 x float> %tmp1, %tmp2
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
}
; uno is implemented with VCGT/VCGE/VORR/VMVN
define <2 x i32> @vcunof32(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK: vcunof32:
+;CHECK: vcge.f32
+;CHECK-NEXT: vcgt.f32
+;CHECK-NEXT: vorr
+;CHECK-NEXT: vmvn
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
- %tmp3 = vfcmp uno <2 x float> %tmp1, %tmp2
- ret <2 x i32> %tmp3
+ %tmp3 = fcmp uno <2 x float> %tmp1, %tmp2
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
}
; ord is implemented with VCGT/VCGE/VORR
define <2 x i32> @vcordf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK: vcordf32:
+;CHECK: vcge.f32
+;CHECK-NEXT: vcgt.f32
+;CHECK-NEXT: vorr
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
- %tmp3 = vfcmp ord <2 x float> %tmp1, %tmp2
- ret <2 x i32> %tmp3
+ %tmp3 = fcmp ord <2 x float> %tmp1, %tmp2
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
}
diff --git a/test/CodeGen/ARM/vfp.ll b/test/CodeGen/ARM/vfp.ll
index f58da44..50000e31 100644
--- a/test/CodeGen/ARM/vfp.ll
+++ b/test/CodeGen/ARM/vfp.ll
@@ -1,19 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+vfp2 | \
-; RUN: grep fabs | count 2
-; RUN: llvm-as < %s | llc -march=arm -mattr=+vfp2 | \
-; RUN: grep fmscs | count 1
-; RUN: llvm-as < %s | llc -march=arm -mattr=+vfp2 | \
-; RUN: grep fcvt | count 2
-; RUN: llvm-as < %s | llc -march=arm -mattr=+vfp2 | \
-; RUN: grep fuito | count 2
-; RUN: llvm-as < %s | llc -march=arm -mattr=+vfp2 | \
-; RUN: grep fto.i | count 4
-; RUN: llvm-as < %s | llc -march=arm -mattr=+vfp2 | \
-; RUN: grep bmi | count 1
-; RUN: llvm-as < %s | llc -march=arm -mattr=+vfp2 | \
-; RUN: grep bgt | count 1
-; RUN: llvm-as < %s | llc -march=arm -mattr=+vfp2 | \
-; RUN: grep fcmpezs | count 1
+; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s
define void @test(float* %P, double* %D) {
%A = load float* %P ; <float> [#uses=1]
@@ -28,16 +13,20 @@ declare float @fabsf(float)
declare double @fabs(double)
define void @test_abs(float* %P, double* %D) {
+;CHECK: test_abs:
%a = load float* %P ; <float> [#uses=1]
+;CHECK: fabss
%b = call float @fabsf( float %a ) ; <float> [#uses=1]
store float %b, float* %P
%A = load double* %D ; <double> [#uses=1]
+;CHECK: fabsd
%B = call double @fabs( double %A ) ; <double> [#uses=1]
store double %B, double* %D
ret void
}
define void @test_add(float* %P, double* %D) {
+;CHECK: test_add:
%a = load float* %P ; <float> [#uses=2]
%b = fadd float %a, %a ; <float> [#uses=1]
store float %b, float* %P
@@ -48,9 +37,12 @@ define void @test_add(float* %P, double* %D) {
}
define void @test_ext_round(float* %P, double* %D) {
+;CHECK: test_ext_round:
%a = load float* %P ; <float> [#uses=1]
+;CHECK: fcvtds
%b = fpext float %a to double ; <double> [#uses=1]
%A = load double* %D ; <double> [#uses=1]
+;CHECK: fcvtsd
%B = fptrunc double %A to float ; <float> [#uses=1]
store double %b, double* %D
store float %B, float* %P
@@ -58,9 +50,11 @@ define void @test_ext_round(float* %P, double* %D) {
}
define void @test_fma(float* %P1, float* %P2, float* %P3) {
+;CHECK: test_fma:
%a1 = load float* %P1 ; <float> [#uses=1]
%a2 = load float* %P2 ; <float> [#uses=1]
%a3 = load float* %P3 ; <float> [#uses=1]
+;CHECK: fmscs
%X = fmul float %a1, %a2 ; <float> [#uses=1]
%Y = fsub float %X, %a3 ; <float> [#uses=1]
store float %Y, float* %P1
@@ -68,42 +62,55 @@ define void @test_fma(float* %P1, float* %P2, float* %P3) {
}
define i32 @test_ftoi(float* %P1) {
+;CHECK: test_ftoi:
%a1 = load float* %P1 ; <float> [#uses=1]
+;CHECK: ftosizs
%b1 = fptosi float %a1 to i32 ; <i32> [#uses=1]
ret i32 %b1
}
define i32 @test_ftou(float* %P1) {
+;CHECK: test_ftou:
%a1 = load float* %P1 ; <float> [#uses=1]
+;CHECK: ftouizs
%b1 = fptoui float %a1 to i32 ; <i32> [#uses=1]
ret i32 %b1
}
define i32 @test_dtoi(double* %P1) {
+;CHECK: test_dtoi:
%a1 = load double* %P1 ; <double> [#uses=1]
+;CHECK: ftosizd
%b1 = fptosi double %a1 to i32 ; <i32> [#uses=1]
ret i32 %b1
}
define i32 @test_dtou(double* %P1) {
+;CHECK: test_dtou:
%a1 = load double* %P1 ; <double> [#uses=1]
+;CHECK: ftouizd
%b1 = fptoui double %a1 to i32 ; <i32> [#uses=1]
ret i32 %b1
}
define void @test_utod(double* %P1, i32 %X) {
+;CHECK: test_utod:
+;CHECK: fuitod
%b1 = uitofp i32 %X to double ; <double> [#uses=1]
store double %b1, double* %P1
ret void
}
define void @test_utod2(double* %P1, i8 %X) {
+;CHECK: test_utod2:
+;CHECK: fuitod
%b1 = uitofp i8 %X to double ; <double> [#uses=1]
store double %b1, double* %P1
ret void
}
define void @test_cmp(float* %glob, i32 %X) {
+;CHECK: test_cmp:
entry:
%tmp = load float* %glob ; <float> [#uses=2]
%tmp3 = getelementptr float* %glob, i32 2 ; <float*> [#uses=1]
@@ -111,6 +118,8 @@ entry:
%tmp.upgrd.1 = fcmp oeq float %tmp, %tmp4 ; <i1> [#uses=1]
%tmp5 = fcmp uno float %tmp, %tmp4 ; <i1> [#uses=1]
%tmp6 = or i1 %tmp.upgrd.1, %tmp5 ; <i1> [#uses=1]
+;CHECK: bmi
+;CHECK-NEXT: bgt
br i1 %tmp6, label %cond_true, label %cond_false
cond_true: ; preds = %entry
@@ -129,8 +138,10 @@ declare i32 @bar(...)
declare i32 @baz(...)
define void @test_cmpfp0(float* %glob, i32 %X) {
+;CHECK: test_cmpfp0:
entry:
%tmp = load float* %glob ; <float> [#uses=1]
+;CHECK: fcmpezs
%tmp.upgrd.3 = fcmp ogt float %tmp, 0.000000e+00 ; <i1> [#uses=1]
br i1 %tmp.upgrd.3, label %cond_true, label %cond_false
diff --git a/test/CodeGen/ARM/vget_lane.ll b/test/CodeGen/ARM/vget_lane.ll
index a361ba2..f0df798 100644
--- a/test/CodeGen/ARM/vget_lane.ll
+++ b/test/CodeGen/ARM/vget_lane.ll
@@ -1,11 +1,10 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep {vmov\\.s8} %t | count 2
-; RUN: grep {vmov\\.s16} %t | count 2
-; RUN: grep {vmov\\.u8} %t | count 2
-; RUN: grep {vmov\\.u16} %t | count 2
-; RUN: grep {vmov\\.32} %t | count 2
+; RUN: llc < %s -mattr=+neon | FileCheck %s
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
+target triple = "thumbv7-elf"
define i32 @vget_lanes8(<8 x i8>* %A) nounwind {
+;CHECK: vget_lanes8:
+;CHECK: vmov.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = extractelement <8 x i8> %tmp1, i32 1
%tmp3 = sext i8 %tmp2 to i32
@@ -13,6 +12,8 @@ define i32 @vget_lanes8(<8 x i8>* %A) nounwind {
}
define i32 @vget_lanes16(<4 x i16>* %A) nounwind {
+;CHECK: vget_lanes16:
+;CHECK: vmov.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = extractelement <4 x i16> %tmp1, i32 1
%tmp3 = sext i16 %tmp2 to i32
@@ -20,6 +21,8 @@ define i32 @vget_lanes16(<4 x i16>* %A) nounwind {
}
define i32 @vget_laneu8(<8 x i8>* %A) nounwind {
+;CHECK: vget_laneu8:
+;CHECK: vmov.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = extractelement <8 x i8> %tmp1, i32 1
%tmp3 = zext i8 %tmp2 to i32
@@ -27,6 +30,8 @@ define i32 @vget_laneu8(<8 x i8>* %A) nounwind {
}
define i32 @vget_laneu16(<4 x i16>* %A) nounwind {
+;CHECK: vget_laneu16:
+;CHECK: vmov.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = extractelement <4 x i16> %tmp1, i32 1
%tmp3 = zext i16 %tmp2 to i32
@@ -35,6 +40,8 @@ define i32 @vget_laneu16(<4 x i16>* %A) nounwind {
; Do a vector add to keep the extraction from being done directly from memory.
define i32 @vget_lanei32(<2 x i32>* %A) nounwind {
+;CHECK: vget_lanei32:
+;CHECK: vmov.32
%tmp1 = load <2 x i32>* %A
%tmp2 = add <2 x i32> %tmp1, %tmp1
%tmp3 = extractelement <2 x i32> %tmp2, i32 1
@@ -42,6 +49,8 @@ define i32 @vget_lanei32(<2 x i32>* %A) nounwind {
}
define i32 @vgetQ_lanes8(<16 x i8>* %A) nounwind {
+;CHECK: vgetQ_lanes8:
+;CHECK: vmov.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = extractelement <16 x i8> %tmp1, i32 1
%tmp3 = sext i8 %tmp2 to i32
@@ -49,6 +58,8 @@ define i32 @vgetQ_lanes8(<16 x i8>* %A) nounwind {
}
define i32 @vgetQ_lanes16(<8 x i16>* %A) nounwind {
+;CHECK: vgetQ_lanes16:
+;CHECK: vmov.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = extractelement <8 x i16> %tmp1, i32 1
%tmp3 = sext i16 %tmp2 to i32
@@ -56,6 +67,8 @@ define i32 @vgetQ_lanes16(<8 x i16>* %A) nounwind {
}
define i32 @vgetQ_laneu8(<16 x i8>* %A) nounwind {
+;CHECK: vgetQ_laneu8:
+;CHECK: vmov.u8
%tmp1 = load <16 x i8>* %A
%tmp2 = extractelement <16 x i8> %tmp1, i32 1
%tmp3 = zext i8 %tmp2 to i32
@@ -63,6 +76,8 @@ define i32 @vgetQ_laneu8(<16 x i8>* %A) nounwind {
}
define i32 @vgetQ_laneu16(<8 x i16>* %A) nounwind {
+;CHECK: vgetQ_laneu16:
+;CHECK: vmov.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = extractelement <8 x i16> %tmp1, i32 1
%tmp3 = zext i16 %tmp2 to i32
@@ -71,8 +86,127 @@ define i32 @vgetQ_laneu16(<8 x i16>* %A) nounwind {
; Do a vector add to keep the extraction from being done directly from memory.
define i32 @vgetQ_lanei32(<4 x i32>* %A) nounwind {
+;CHECK: vgetQ_lanei32:
+;CHECK: vmov.32
%tmp1 = load <4 x i32>* %A
%tmp2 = add <4 x i32> %tmp1, %tmp1
%tmp3 = extractelement <4 x i32> %tmp2, i32 1
ret i32 %tmp3
}
+
+define arm_aapcs_vfpcc void @test_vget_laneu16() nounwind {
+entry:
+; CHECK: vmov.u16 r0, d0[1]
+ %arg0_uint16x4_t = alloca <4 x i16> ; <<4 x i16>*> [#uses=1]
+ %out_uint16_t = alloca i16 ; <i16*> [#uses=1]
+ %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
+ %0 = load <4 x i16>* %arg0_uint16x4_t, align 8 ; <<4 x i16>> [#uses=1]
+ %1 = extractelement <4 x i16> %0, i32 1 ; <i16> [#uses=1]
+ store i16 %1, i16* %out_uint16_t, align 2
+ br label %return
+
+return: ; preds = %entry
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vget_laneu8() nounwind {
+entry:
+; CHECK: vmov.u8 r0, d0[1]
+ %arg0_uint8x8_t = alloca <8 x i8> ; <<8 x i8>*> [#uses=1]
+ %out_uint8_t = alloca i8 ; <i8*> [#uses=1]
+ %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
+ %0 = load <8 x i8>* %arg0_uint8x8_t, align 8 ; <<8 x i8>> [#uses=1]
+ %1 = extractelement <8 x i8> %0, i32 1 ; <i8> [#uses=1]
+ store i8 %1, i8* %out_uint8_t, align 1
+ br label %return
+
+return: ; preds = %entry
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vgetQ_laneu16() nounwind {
+entry:
+; CHECK: vmov.u16 r0, d0[1]
+ %arg0_uint16x8_t = alloca <8 x i16> ; <<8 x i16>*> [#uses=1]
+ %out_uint16_t = alloca i16 ; <i16*> [#uses=1]
+ %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
+ %0 = load <8 x i16>* %arg0_uint16x8_t, align 16 ; <<8 x i16>> [#uses=1]
+ %1 = extractelement <8 x i16> %0, i32 1 ; <i16> [#uses=1]
+ store i16 %1, i16* %out_uint16_t, align 2
+ br label %return
+
+return: ; preds = %entry
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vgetQ_laneu8() nounwind {
+entry:
+; CHECK: vmov.u8 r0, d0[1]
+ %arg0_uint8x16_t = alloca <16 x i8> ; <<16 x i8>*> [#uses=1]
+ %out_uint8_t = alloca i8 ; <i8*> [#uses=1]
+ %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
+ %0 = load <16 x i8>* %arg0_uint8x16_t, align 16 ; <<16 x i8>> [#uses=1]
+ %1 = extractelement <16 x i8> %0, i32 1 ; <i8> [#uses=1]
+ store i8 %1, i8* %out_uint8_t, align 1
+ br label %return
+
+return: ; preds = %entry
+ ret void
+}
+
+define <8 x i8> @vset_lane8(<8 x i8>* %A, i8 %B) nounwind {
+;CHECK: vset_lane8:
+;CHECK: vmov.8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = insertelement <8 x i8> %tmp1, i8 %B, i32 1
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vset_lane16(<4 x i16>* %A, i16 %B) nounwind {
+;CHECK: vset_lane16:
+;CHECK: vmov.16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = insertelement <4 x i16> %tmp1, i16 %B, i32 1
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vset_lane32(<2 x i32>* %A, i32 %B) nounwind {
+;CHECK: vset_lane32:
+;CHECK: vmov.32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = insertelement <2 x i32> %tmp1, i32 %B, i32 1
+ ret <2 x i32> %tmp2
+}
+
+define <16 x i8> @vsetQ_lane8(<16 x i8>* %A, i8 %B) nounwind {
+;CHECK: vsetQ_lane8:
+;CHECK: vmov.8
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = insertelement <16 x i8> %tmp1, i8 %B, i32 1
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @vsetQ_lane16(<8 x i16>* %A, i16 %B) nounwind {
+;CHECK: vsetQ_lane16:
+;CHECK: vmov.16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = insertelement <8 x i16> %tmp1, i16 %B, i32 1
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vsetQ_lane32(<4 x i32>* %A, i32 %B) nounwind {
+;CHECK: vsetQ_lane32:
+;CHECK: vmov.32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = insertelement <4 x i32> %tmp1, i32 %B, i32 1
+ ret <4 x i32> %tmp2
+}
+
+define arm_aapcs_vfpcc <2 x float> @test_vset_lanef32(float %arg0_float32_t, <2 x float> %arg1_float32x2_t) nounwind {
+;CHECK: test_vset_lanef32:
+;CHECK: fcpys
+;CHECK: fcpys
+entry:
+ %0 = insertelement <2 x float> %arg1_float32x2_t, float %arg0_float32_t, i32 1 ; <<2 x float>> [#uses=1]
+ ret <2 x float> %0
+}
diff --git a/test/CodeGen/ARM/vhadd.ll b/test/CodeGen/ARM/vhadd.ll
index 5e7503d..379e062 100644
--- a/test/CodeGen/ARM/vhadd.ll
+++ b/test/CodeGen/ARM/vhadd.ll
@@ -1,12 +1,8 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep {vhadd\\.s8} %t | count 2
-; RUN: grep {vhadd\\.s16} %t | count 2
-; RUN: grep {vhadd\\.s32} %t | count 2
-; RUN: grep {vhadd\\.u8} %t | count 2
-; RUN: grep {vhadd\\.u16} %t | count 2
-; RUN: grep {vhadd\\.u32} %t | count 2
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vhadds8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vhadds8:
+;CHECK: vhadd.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
@@ -14,6 +10,8 @@ define <8 x i8> @vhadds8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
}
define <4 x i16> @vhadds16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vhadds16:
+;CHECK: vhadd.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
@@ -21,6 +19,8 @@ define <4 x i16> @vhadds16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
}
define <2 x i32> @vhadds32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vhadds32:
+;CHECK: vhadd.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
@@ -28,6 +28,8 @@ define <2 x i32> @vhadds32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
}
define <8 x i8> @vhaddu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vhaddu8:
+;CHECK: vhadd.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
@@ -35,6 +37,8 @@ define <8 x i8> @vhaddu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
}
define <4 x i16> @vhaddu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vhaddu16:
+;CHECK: vhadd.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
@@ -42,6 +46,8 @@ define <4 x i16> @vhaddu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
}
define <2 x i32> @vhaddu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vhaddu32:
+;CHECK: vhadd.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
@@ -49,6 +55,8 @@ define <2 x i32> @vhaddu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
}
define <16 x i8> @vhaddQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vhaddQs8:
+;CHECK: vhadd.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
@@ -56,6 +64,8 @@ define <16 x i8> @vhaddQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
}
define <8 x i16> @vhaddQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vhaddQs16:
+;CHECK: vhadd.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
@@ -63,6 +73,8 @@ define <8 x i16> @vhaddQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
}
define <4 x i32> @vhaddQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vhaddQs32:
+;CHECK: vhadd.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
@@ -70,6 +82,8 @@ define <4 x i32> @vhaddQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
}
define <16 x i8> @vhaddQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vhaddQu8:
+;CHECK: vhadd.u8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
@@ -77,6 +91,8 @@ define <16 x i8> @vhaddQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
}
define <8 x i16> @vhaddQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vhaddQu16:
+;CHECK: vhadd.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
@@ -84,6 +100,8 @@ define <8 x i16> @vhaddQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
}
define <4 x i32> @vhaddQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vhaddQu32:
+;CHECK: vhadd.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
@@ -105,3 +123,127 @@ declare <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32>, <4 x i32>) nounwind rea
declare <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i8> @vrhadds8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vrhadds8:
+;CHECK: vrhadd.s8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vrhadds16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vrhadds16:
+;CHECK: vrhadd.s16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vrhadds32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vrhadds32:
+;CHECK: vrhadd.s32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <8 x i8> @vrhaddu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vrhaddu8:
+;CHECK: vrhadd.u8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vrhaddu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vrhaddu16:
+;CHECK: vrhadd.u16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vrhaddu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vrhaddu32:
+;CHECK: vrhadd.u32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @vrhaddQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vrhaddQs8:
+;CHECK: vrhadd.s8
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vrhaddQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vrhaddQs16:
+;CHECK: vrhadd.s16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vrhaddQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vrhaddQs32:
+;CHECK: vrhadd.s32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <16 x i8> @vrhaddQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vrhaddQu8:
+;CHECK: vrhadd.u8
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vrhaddQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vrhaddQu16:
+;CHECK: vrhadd.u16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vrhaddQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vrhaddQu32:
+;CHECK: vrhadd.u32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vhsub.ll b/test/CodeGen/ARM/vhsub.ll
index 32a66e5..0f0d027 100644
--- a/test/CodeGen/ARM/vhsub.ll
+++ b/test/CodeGen/ARM/vhsub.ll
@@ -1,12 +1,8 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep {vhsub\\.s8} %t | count 2
-; RUN: grep {vhsub\\.s16} %t | count 2
-; RUN: grep {vhsub\\.s32} %t | count 2
-; RUN: grep {vhsub\\.u8} %t | count 2
-; RUN: grep {vhsub\\.u16} %t | count 2
-; RUN: grep {vhsub\\.u32} %t | count 2
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vhsubs8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vhsubs8:
+;CHECK: vhsub.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
@@ -14,6 +10,8 @@ define <8 x i8> @vhsubs8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
}
define <4 x i16> @vhsubs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vhsubs16:
+;CHECK: vhsub.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
@@ -21,6 +19,8 @@ define <4 x i16> @vhsubs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
}
define <2 x i32> @vhsubs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vhsubs32:
+;CHECK: vhsub.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
@@ -28,6 +28,8 @@ define <2 x i32> @vhsubs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
}
define <8 x i8> @vhsubu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vhsubu8:
+;CHECK: vhsub.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
@@ -35,6 +37,8 @@ define <8 x i8> @vhsubu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
}
define <4 x i16> @vhsubu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vhsubu16:
+;CHECK: vhsub.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
@@ -42,6 +46,8 @@ define <4 x i16> @vhsubu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
}
define <2 x i32> @vhsubu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vhsubu32:
+;CHECK: vhsub.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
@@ -49,6 +55,8 @@ define <2 x i32> @vhsubu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
}
define <16 x i8> @vhsubQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vhsubQs8:
+;CHECK: vhsub.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
@@ -56,6 +64,8 @@ define <16 x i8> @vhsubQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
}
define <8 x i16> @vhsubQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vhsubQs16:
+;CHECK: vhsub.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
@@ -63,6 +73,8 @@ define <8 x i16> @vhsubQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
}
define <4 x i32> @vhsubQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vhsubQs32:
+;CHECK: vhsub.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
@@ -70,6 +82,8 @@ define <4 x i32> @vhsubQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
}
define <16 x i8> @vhsubQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vhsubQu8:
+;CHECK: vhsub.u8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
@@ -77,6 +91,8 @@ define <16 x i8> @vhsubQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
}
define <8 x i16> @vhsubQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vhsubQu16:
+;CHECK: vhsub.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
@@ -84,6 +100,8 @@ define <8 x i16> @vhsubQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
}
define <4 x i32> @vhsubQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vhsubQu32:
+;CHECK: vhsub.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
diff --git a/test/CodeGen/ARM/vicmp.ll b/test/CodeGen/ARM/vicmp.ll
index 86858f9..2d8cb89 100644
--- a/test/CodeGen/ARM/vicmp.ll
+++ b/test/CodeGen/ARM/vicmp.ll
@@ -1,85 +1,113 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep {vceq\\.i8} %t | count 2
-; RUN: grep {vceq\\.i16} %t | count 2
-; RUN: grep {vceq\\.i32} %t | count 2
-; RUN: grep vmvn %t | count 6
-; RUN: grep {vcgt\\.s8} %t | count 1
-; RUN: grep {vcge\\.s16} %t | count 1
-; RUN: grep {vcgt\\.u16} %t | count 1
-; RUN: grep {vcge\\.u32} %t | count 1
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
-; This tests vicmp operations that do not map directly to NEON instructions.
+; This tests icmp operations that do not map directly to NEON instructions.
; Not-equal (ne) operations are implemented by VCEQ/VMVN. Less-than (lt/ult)
; and less-than-or-equal (le/ule) are implemented by swapping the arguments
; to VCGT and VCGE. Test all the operand types for not-equal but only sample
; the other operations.
define <8 x i8> @vcnei8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vcnei8:
+;CHECK: vceq.i8
+;CHECK-NEXT: vmvn
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = vicmp ne <8 x i8> %tmp1, %tmp2
- ret <8 x i8> %tmp3
+ %tmp3 = icmp ne <8 x i8> %tmp1, %tmp2
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
}
define <4 x i16> @vcnei16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vcnei16:
+;CHECK: vceq.i16
+;CHECK-NEXT: vmvn
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = vicmp ne <4 x i16> %tmp1, %tmp2
- ret <4 x i16> %tmp3
+ %tmp3 = icmp ne <4 x i16> %tmp1, %tmp2
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
}
define <2 x i32> @vcnei32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vcnei32:
+;CHECK: vceq.i32
+;CHECK-NEXT: vmvn
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = vicmp ne <2 x i32> %tmp1, %tmp2
- ret <2 x i32> %tmp3
+ %tmp3 = icmp ne <2 x i32> %tmp1, %tmp2
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
}
define <16 x i8> @vcneQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vcneQi8:
+;CHECK: vceq.i8
+;CHECK-NEXT: vmvn
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = vicmp ne <16 x i8> %tmp1, %tmp2
- ret <16 x i8> %tmp3
+ %tmp3 = icmp ne <16 x i8> %tmp1, %tmp2
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
}
define <8 x i16> @vcneQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vcneQi16:
+;CHECK: vceq.i16
+;CHECK-NEXT: vmvn
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
- %tmp3 = vicmp ne <8 x i16> %tmp1, %tmp2
- ret <8 x i16> %tmp3
+ %tmp3 = icmp ne <8 x i16> %tmp1, %tmp2
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
}
define <4 x i32> @vcneQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vcneQi32:
+;CHECK: vceq.i32
+;CHECK-NEXT: vmvn
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = vicmp ne <4 x i32> %tmp1, %tmp2
- ret <4 x i32> %tmp3
+ %tmp3 = icmp ne <4 x i32> %tmp1, %tmp2
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
}
define <16 x i8> @vcltQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vcltQs8:
+;CHECK: vcgt.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
- %tmp3 = vicmp slt <16 x i8> %tmp1, %tmp2
- ret <16 x i8> %tmp3
+ %tmp3 = icmp slt <16 x i8> %tmp1, %tmp2
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
}
define <4 x i16> @vcles16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vcles16:
+;CHECK: vcge.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = vicmp sle <4 x i16> %tmp1, %tmp2
- ret <4 x i16> %tmp3
+ %tmp3 = icmp sle <4 x i16> %tmp1, %tmp2
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
}
define <4 x i16> @vcltu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vcltu16:
+;CHECK: vcgt.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = vicmp ult <4 x i16> %tmp1, %tmp2
- ret <4 x i16> %tmp3
+ %tmp3 = icmp ult <4 x i16> %tmp1, %tmp2
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
}
define <4 x i32> @vcleQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vcleQu32:
+;CHECK: vcge.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
- %tmp3 = vicmp ule <4 x i32> %tmp1, %tmp2
- ret <4 x i32> %tmp3
+ %tmp3 = icmp ule <4 x i32> %tmp1, %tmp2
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
}
diff --git a/test/CodeGen/ARM/vld1.ll b/test/CodeGen/ARM/vld1.ll
new file mode 100644
index 0000000..f5383aa
--- /dev/null
+++ b/test/CodeGen/ARM/vld1.ll
@@ -0,0 +1,83 @@
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+
+define <8 x i8> @vld1i8(i8* %A) nounwind {
+;CHECK: vld1i8:
+;CHECK: vld1.8
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %A)
+ ret <8 x i8> %tmp1
+}
+
+define <4 x i16> @vld1i16(i16* %A) nounwind {
+;CHECK: vld1i16:
+;CHECK: vld1.16
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vld1.v4i16(i16* %A)
+ ret <4 x i16> %tmp1
+}
+
+define <2 x i32> @vld1i32(i32* %A) nounwind {
+;CHECK: vld1i32:
+;CHECK: vld1.32
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vld1.v2i32(i32* %A)
+ ret <2 x i32> %tmp1
+}
+
+define <2 x float> @vld1f(float* %A) nounwind {
+;CHECK: vld1f:
+;CHECK: vld1.32
+ %tmp1 = call <2 x float> @llvm.arm.neon.vld1.v2f32(float* %A)
+ ret <2 x float> %tmp1
+}
+
+define <1 x i64> @vld1i64(i64* %A) nounwind {
+;CHECK: vld1i64:
+;CHECK: vld1.64
+ %tmp1 = call <1 x i64> @llvm.arm.neon.vld1.v1i64(i64* %A)
+ ret <1 x i64> %tmp1
+}
+
+define <16 x i8> @vld1Qi8(i8* %A) nounwind {
+;CHECK: vld1Qi8:
+;CHECK: vld1.8
+ %tmp1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %A)
+ ret <16 x i8> %tmp1
+}
+
+define <8 x i16> @vld1Qi16(i16* %A) nounwind {
+;CHECK: vld1Qi16:
+;CHECK: vld1.16
+ %tmp1 = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i16* %A)
+ ret <8 x i16> %tmp1
+}
+
+define <4 x i32> @vld1Qi32(i32* %A) nounwind {
+;CHECK: vld1Qi32:
+;CHECK: vld1.32
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vld1.v4i32(i32* %A)
+ ret <4 x i32> %tmp1
+}
+
+define <4 x float> @vld1Qf(float* %A) nounwind {
+;CHECK: vld1Qf:
+;CHECK: vld1.32
+ %tmp1 = call <4 x float> @llvm.arm.neon.vld1.v4f32(float* %A)
+ ret <4 x float> %tmp1
+}
+
+define <2 x i64> @vld1Qi64(i64* %A) nounwind {
+;CHECK: vld1Qi64:
+;CHECK: vld1.64
+ %tmp1 = call <2 x i64> @llvm.arm.neon.vld1.v2i64(i64* %A)
+ ret <2 x i64> %tmp1
+}
+
+declare <8 x i8> @llvm.arm.neon.vld1.v8i8(i8*) nounwind readonly
+declare <4 x i16> @llvm.arm.neon.vld1.v4i16(i8*) nounwind readonly
+declare <2 x i32> @llvm.arm.neon.vld1.v2i32(i8*) nounwind readonly
+declare <2 x float> @llvm.arm.neon.vld1.v2f32(i8*) nounwind readonly
+declare <1 x i64> @llvm.arm.neon.vld1.v1i64(i8*) nounwind readonly
+
+declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8*) nounwind readonly
+declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*) nounwind readonly
+declare <4 x i32> @llvm.arm.neon.vld1.v4i32(i8*) nounwind readonly
+declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*) nounwind readonly
+declare <2 x i64> @llvm.arm.neon.vld1.v2i64(i8*) nounwind readonly
diff --git a/test/CodeGen/ARM/vld2.ll b/test/CodeGen/ARM/vld2.ll
new file mode 100644
index 0000000..23f7d2c
--- /dev/null
+++ b/test/CodeGen/ARM/vld2.ll
@@ -0,0 +1,113 @@
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+
+%struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
+%struct.__neon_int16x4x2_t = type { <4 x i16>, <4 x i16> }
+%struct.__neon_int32x2x2_t = type { <2 x i32>, <2 x i32> }
+%struct.__neon_float32x2x2_t = type { <2 x float>, <2 x float> }
+%struct.__neon_int64x1x2_t = type { <1 x i64>, <1 x i64> }
+
+%struct.__neon_int8x16x2_t = type { <16 x i8>, <16 x i8> }
+%struct.__neon_int16x8x2_t = type { <8 x i16>, <8 x i16> }
+%struct.__neon_int32x4x2_t = type { <4 x i32>, <4 x i32> }
+%struct.__neon_float32x4x2_t = type { <4 x float>, <4 x float> }
+
+define <8 x i8> @vld2i8(i8* %A) nounwind {
+;CHECK: vld2i8:
+;CHECK: vld2.8
+ %tmp1 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2.v8i8(i8* %A)
+ %tmp2 = extractvalue %struct.__neon_int8x8x2_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp1, 1
+ %tmp4 = add <8 x i8> %tmp2, %tmp3
+ ret <8 x i8> %tmp4
+}
+
+define <4 x i16> @vld2i16(i16* %A) nounwind {
+;CHECK: vld2i16:
+;CHECK: vld2.16
+ %tmp1 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2.v4i16(i16* %A)
+ %tmp2 = extractvalue %struct.__neon_int16x4x2_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int16x4x2_t %tmp1, 1
+ %tmp4 = add <4 x i16> %tmp2, %tmp3
+ ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @vld2i32(i32* %A) nounwind {
+;CHECK: vld2i32:
+;CHECK: vld2.32
+ %tmp1 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2.v2i32(i32* %A)
+ %tmp2 = extractvalue %struct.__neon_int32x2x2_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp1, 1
+ %tmp4 = add <2 x i32> %tmp2, %tmp3
+ ret <2 x i32> %tmp4
+}
+
+define <2 x float> @vld2f(float* %A) nounwind {
+;CHECK: vld2f:
+;CHECK: vld2.32
+ %tmp1 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2.v2f32(float* %A)
+ %tmp2 = extractvalue %struct.__neon_float32x2x2_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp1, 1
+ %tmp4 = add <2 x float> %tmp2, %tmp3
+ ret <2 x float> %tmp4
+}
+
+define <1 x i64> @vld2i64(i64* %A) nounwind {
+;CHECK: vld2i64:
+;CHECK: vld1.64
+ %tmp1 = call %struct.__neon_int64x1x2_t @llvm.arm.neon.vld2.v1i64(i64* %A)
+ %tmp2 = extractvalue %struct.__neon_int64x1x2_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int64x1x2_t %tmp1, 1
+ %tmp4 = add <1 x i64> %tmp2, %tmp3
+ ret <1 x i64> %tmp4
+}
+
+define <16 x i8> @vld2Qi8(i8* %A) nounwind {
+;CHECK: vld2Qi8:
+;CHECK: vld2.8
+ %tmp1 = call %struct.__neon_int8x16x2_t @llvm.arm.neon.vld2.v16i8(i8* %A)
+ %tmp2 = extractvalue %struct.__neon_int8x16x2_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int8x16x2_t %tmp1, 1
+ %tmp4 = add <16 x i8> %tmp2, %tmp3
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @vld2Qi16(i16* %A) nounwind {
+;CHECK: vld2Qi16:
+;CHECK: vld2.16
+ %tmp1 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2.v8i16(i16* %A)
+ %tmp2 = extractvalue %struct.__neon_int16x8x2_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp1, 1
+ %tmp4 = add <8 x i16> %tmp2, %tmp3
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @vld2Qi32(i32* %A) nounwind {
+;CHECK: vld2Qi32:
+;CHECK: vld2.32
+ %tmp1 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i32* %A)
+ %tmp2 = extractvalue %struct.__neon_int32x4x2_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int32x4x2_t %tmp1, 1
+ %tmp4 = add <4 x i32> %tmp2, %tmp3
+ ret <4 x i32> %tmp4
+}
+
+define <4 x float> @vld2Qf(float* %A) nounwind {
+;CHECK: vld2Qf:
+;CHECK: vld2.32
+ %tmp1 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2.v4f32(float* %A)
+ %tmp2 = extractvalue %struct.__neon_float32x4x2_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_float32x4x2_t %tmp1, 1
+ %tmp4 = add <4 x float> %tmp2, %tmp3
+ ret <4 x float> %tmp4
+}
+
+declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2.v8i8(i8*) nounwind readonly
+declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2.v4i16(i8*) nounwind readonly
+declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2.v2i32(i8*) nounwind readonly
+declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2.v2f32(i8*) nounwind readonly
+declare %struct.__neon_int64x1x2_t @llvm.arm.neon.vld2.v1i64(i8*) nounwind readonly
+
+declare %struct.__neon_int8x16x2_t @llvm.arm.neon.vld2.v16i8(i8*) nounwind readonly
+declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2.v8i16(i8*) nounwind readonly
+declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8*) nounwind readonly
+declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2.v4f32(i8*) nounwind readonly
diff --git a/test/CodeGen/ARM/vld3.ll b/test/CodeGen/ARM/vld3.ll
new file mode 100644
index 0000000..207dc6a
--- /dev/null
+++ b/test/CodeGen/ARM/vld3.ll
@@ -0,0 +1,117 @@
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+
+%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
+%struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
+%struct.__neon_int32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> }
+%struct.__neon_float32x2x3_t = type { <2 x float>, <2 x float>, <2 x float> }
+%struct.__neon_int64x1x3_t = type { <1 x i64>, <1 x i64>, <1 x i64> }
+
+%struct.__neon_int8x16x3_t = type { <16 x i8>, <16 x i8>, <16 x i8> }
+%struct.__neon_int16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> }
+%struct.__neon_int32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> }
+%struct.__neon_float32x4x3_t = type { <4 x float>, <4 x float>, <4 x float> }
+
+define <8 x i8> @vld3i8(i8* %A) nounwind {
+;CHECK: vld3i8:
+;CHECK: vld3.8
+ %tmp1 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A)
+ %tmp2 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 2
+ %tmp4 = add <8 x i8> %tmp2, %tmp3
+ ret <8 x i8> %tmp4
+}
+
+define <4 x i16> @vld3i16(i16* %A) nounwind {
+;CHECK: vld3i16:
+;CHECK: vld3.16
+ %tmp1 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3.v4i16(i16* %A)
+ %tmp2 = extractvalue %struct.__neon_int16x4x3_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp1, 2
+ %tmp4 = add <4 x i16> %tmp2, %tmp3
+ ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @vld3i32(i32* %A) nounwind {
+;CHECK: vld3i32:
+;CHECK: vld3.32
+ %tmp1 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3.v2i32(i32* %A)
+ %tmp2 = extractvalue %struct.__neon_int32x2x3_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int32x2x3_t %tmp1, 2
+ %tmp4 = add <2 x i32> %tmp2, %tmp3
+ ret <2 x i32> %tmp4
+}
+
+define <2 x float> @vld3f(float* %A) nounwind {
+;CHECK: vld3f:
+;CHECK: vld3.32
+ %tmp1 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3.v2f32(float* %A)
+ %tmp2 = extractvalue %struct.__neon_float32x2x3_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_float32x2x3_t %tmp1, 2
+ %tmp4 = add <2 x float> %tmp2, %tmp3
+ ret <2 x float> %tmp4
+}
+
+define <1 x i64> @vld3i64(i64* %A) nounwind {
+;CHECK: vld3i64:
+;CHECK: vld1.64
+ %tmp1 = call %struct.__neon_int64x1x3_t @llvm.arm.neon.vld3.v1i64(i64* %A)
+ %tmp2 = extractvalue %struct.__neon_int64x1x3_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int64x1x3_t %tmp1, 2
+ %tmp4 = add <1 x i64> %tmp2, %tmp3
+ ret <1 x i64> %tmp4
+}
+
+define <16 x i8> @vld3Qi8(i8* %A) nounwind {
+;CHECK: vld3Qi8:
+;CHECK: vld3.8
+;CHECK: vld3.8
+ %tmp1 = call %struct.__neon_int8x16x3_t @llvm.arm.neon.vld3.v16i8(i8* %A)
+ %tmp2 = extractvalue %struct.__neon_int8x16x3_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int8x16x3_t %tmp1, 2
+ %tmp4 = add <16 x i8> %tmp2, %tmp3
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @vld3Qi16(i16* %A) nounwind {
+;CHECK: vld3Qi16:
+;CHECK: vld3.16
+;CHECK: vld3.16
+ %tmp1 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3.v8i16(i16* %A)
+ %tmp2 = extractvalue %struct.__neon_int16x8x3_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp1, 2
+ %tmp4 = add <8 x i16> %tmp2, %tmp3
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @vld3Qi32(i32* %A) nounwind {
+;CHECK: vld3Qi32:
+;CHECK: vld3.32
+;CHECK: vld3.32
+ %tmp1 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3.v4i32(i32* %A)
+ %tmp2 = extractvalue %struct.__neon_int32x4x3_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp1, 2
+ %tmp4 = add <4 x i32> %tmp2, %tmp3
+ ret <4 x i32> %tmp4
+}
+
+define <4 x float> @vld3Qf(float* %A) nounwind {
+;CHECK: vld3Qf:
+;CHECK: vld3.32
+;CHECK: vld3.32
+ %tmp1 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3.v4f32(float* %A)
+ %tmp2 = extractvalue %struct.__neon_float32x4x3_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_float32x4x3_t %tmp1, 2
+ %tmp4 = add <4 x float> %tmp2, %tmp3
+ ret <4 x float> %tmp4
+}
+
+declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8*) nounwind readonly
+declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3.v4i16(i8*) nounwind readonly
+declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3.v2i32(i8*) nounwind readonly
+declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3.v2f32(i8*) nounwind readonly
+declare %struct.__neon_int64x1x3_t @llvm.arm.neon.vld3.v1i64(i8*) nounwind readonly
+
+declare %struct.__neon_int8x16x3_t @llvm.arm.neon.vld3.v16i8(i8*) nounwind readonly
+declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3.v8i16(i8*) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3.v4i32(i8*) nounwind readonly
+declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3.v4f32(i8*) nounwind readonly
diff --git a/test/CodeGen/ARM/vld4.ll b/test/CodeGen/ARM/vld4.ll
new file mode 100644
index 0000000..0624f29
--- /dev/null
+++ b/test/CodeGen/ARM/vld4.ll
@@ -0,0 +1,117 @@
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+
+%struct.__neon_int8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }
+%struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
+%struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
+%struct.__neon_float32x2x4_t = type { <2 x float>, <2 x float>, <2 x float>, <2 x float> }
+%struct.__neon_int64x1x4_t = type { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }
+
+%struct.__neon_int8x16x4_t = type { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }
+%struct.__neon_int16x8x4_t = type { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }
+%struct.__neon_int32x4x4_t = type { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }
+%struct.__neon_float32x4x4_t = type { <4 x float>, <4 x float>, <4 x float>, <4 x float> }
+
+define <8 x i8> @vld4i8(i8* %A) nounwind {
+;CHECK: vld4i8:
+;CHECK: vld4.8
+ %tmp1 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8(i8* %A)
+ %tmp2 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 2
+ %tmp4 = add <8 x i8> %tmp2, %tmp3
+ ret <8 x i8> %tmp4
+}
+
+define <4 x i16> @vld4i16(i16* %A) nounwind {
+;CHECK: vld4i16:
+;CHECK: vld4.16
+ %tmp1 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4.v4i16(i16* %A)
+ %tmp2 = extractvalue %struct.__neon_int16x4x4_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp1, 2
+ %tmp4 = add <4 x i16> %tmp2, %tmp3
+ ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @vld4i32(i32* %A) nounwind {
+;CHECK: vld4i32:
+;CHECK: vld4.32
+ %tmp1 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32(i32* %A)
+ %tmp2 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 2
+ %tmp4 = add <2 x i32> %tmp2, %tmp3
+ ret <2 x i32> %tmp4
+}
+
+define <2 x float> @vld4f(float* %A) nounwind {
+;CHECK: vld4f:
+;CHECK: vld4.32
+ %tmp1 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4.v2f32(float* %A)
+ %tmp2 = extractvalue %struct.__neon_float32x2x4_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_float32x2x4_t %tmp1, 2
+ %tmp4 = add <2 x float> %tmp2, %tmp3
+ ret <2 x float> %tmp4
+}
+
+define <1 x i64> @vld4i64(i64* %A) nounwind {
+;CHECK: vld4i64:
+;CHECK: vld1.64
+ %tmp1 = call %struct.__neon_int64x1x4_t @llvm.arm.neon.vld4.v1i64(i64* %A)
+ %tmp2 = extractvalue %struct.__neon_int64x1x4_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int64x1x4_t %tmp1, 2
+ %tmp4 = add <1 x i64> %tmp2, %tmp3
+ ret <1 x i64> %tmp4
+}
+
+define <16 x i8> @vld4Qi8(i8* %A) nounwind {
+;CHECK: vld4Qi8:
+;CHECK: vld4.8
+;CHECK: vld4.8
+ %tmp1 = call %struct.__neon_int8x16x4_t @llvm.arm.neon.vld4.v16i8(i8* %A)
+ %tmp2 = extractvalue %struct.__neon_int8x16x4_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int8x16x4_t %tmp1, 2
+ %tmp4 = add <16 x i8> %tmp2, %tmp3
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @vld4Qi16(i16* %A) nounwind {
+;CHECK: vld4Qi16:
+;CHECK: vld4.16
+;CHECK: vld4.16
+ %tmp1 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4.v8i16(i16* %A)
+ %tmp2 = extractvalue %struct.__neon_int16x8x4_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp1, 2
+ %tmp4 = add <8 x i16> %tmp2, %tmp3
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @vld4Qi32(i32* %A) nounwind {
+;CHECK: vld4Qi32:
+;CHECK: vld4.32
+;CHECK: vld4.32
+ %tmp1 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4.v4i32(i32* %A)
+ %tmp2 = extractvalue %struct.__neon_int32x4x4_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int32x4x4_t %tmp1, 2
+ %tmp4 = add <4 x i32> %tmp2, %tmp3
+ ret <4 x i32> %tmp4
+}
+
+define <4 x float> @vld4Qf(float* %A) nounwind {
+;CHECK: vld4Qf:
+;CHECK: vld4.32
+;CHECK: vld4.32
+ %tmp1 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4.v4f32(float* %A)
+ %tmp2 = extractvalue %struct.__neon_float32x4x4_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_float32x4x4_t %tmp1, 2
+ %tmp4 = add <4 x float> %tmp2, %tmp3
+ ret <4 x float> %tmp4
+}
+
+declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8(i8*) nounwind readonly
+declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4.v4i16(i8*) nounwind readonly
+declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32(i8*) nounwind readonly
+declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4.v2f32(i8*) nounwind readonly
+declare %struct.__neon_int64x1x4_t @llvm.arm.neon.vld4.v1i64(i8*) nounwind readonly
+
+declare %struct.__neon_int8x16x4_t @llvm.arm.neon.vld4.v16i8(i8*) nounwind readonly
+declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4.v8i16(i8*) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4.v4i32(i8*) nounwind readonly
+declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4.v4f32(i8*) nounwind readonly
diff --git a/test/CodeGen/ARM/vldlane.ll b/test/CodeGen/ARM/vldlane.ll
new file mode 100644
index 0000000..53881a3f
--- /dev/null
+++ b/test/CodeGen/ARM/vldlane.ll
@@ -0,0 +1,328 @@
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+
+%struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
+%struct.__neon_int16x4x2_t = type { <4 x i16>, <4 x i16> }
+%struct.__neon_int32x2x2_t = type { <2 x i32>, <2 x i32> }
+%struct.__neon_float32x2x2_t = type { <2 x float>, <2 x float> }
+
+%struct.__neon_int16x8x2_t = type { <8 x i16>, <8 x i16> }
+%struct.__neon_int32x4x2_t = type { <4 x i32>, <4 x i32> }
+%struct.__neon_float32x4x2_t = type { <4 x float>, <4 x float> }
+
+define <8 x i8> @vld2lanei8(i8* %A, <8 x i8>* %B) nounwind {
+;CHECK: vld2lanei8:
+;CHECK: vld2.8
+ %tmp1 = load <8 x i8>* %B
+ %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1)
+ %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1
+ %tmp5 = add <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <4 x i16> @vld2lanei16(i16* %A, <4 x i16>* %B) nounwind {
+;CHECK: vld2lanei16:
+;CHECK: vld2.16
+ %tmp1 = load <4 x i16>* %B
+ %tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1)
+ %tmp3 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 1
+ %tmp5 = add <4 x i16> %tmp3, %tmp4
+ ret <4 x i16> %tmp5
+}
+
+define <2 x i32> @vld2lanei32(i32* %A, <2 x i32>* %B) nounwind {
+;CHECK: vld2lanei32:
+;CHECK: vld2.32
+ %tmp1 = load <2 x i32>* %B
+ %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1)
+ %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
+ %tmp5 = add <2 x i32> %tmp3, %tmp4
+ ret <2 x i32> %tmp5
+}
+
+define <2 x float> @vld2lanef(float* %A, <2 x float>* %B) nounwind {
+;CHECK: vld2lanef:
+;CHECK: vld2.32
+ %tmp1 = load <2 x float>* %B
+ %tmp2 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, i32 1)
+ %tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 1
+ %tmp5 = add <2 x float> %tmp3, %tmp4
+ ret <2 x float> %tmp5
+}
+
+define <8 x i16> @vld2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
+;CHECK: vld2laneQi16:
+;CHECK: vld2.16
+ %tmp1 = load <8 x i16>* %B
+ %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i16* %A, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1)
+ %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 1
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @vld2laneQi32(i32* %A, <4 x i32>* %B) nounwind {
+;CHECK: vld2laneQi32:
+;CHECK: vld2.32
+ %tmp1 = load <4 x i32>* %B
+ %tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i32* %A, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2)
+ %tmp3 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 1
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <4 x float> @vld2laneQf(float* %A, <4 x float>* %B) nounwind {
+;CHECK: vld2laneQf:
+;CHECK: vld2.32
+ %tmp1 = load <4 x float>* %B
+ %tmp2 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(float* %A, <4 x float> %tmp1, <4 x float> %tmp1, i32 1)
+ %tmp3 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 1
+ %tmp5 = add <4 x float> %tmp3, %tmp4
+ ret <4 x float> %tmp5
+}
+
+declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind readonly
+declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32) nounwind readonly
+declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32) nounwind readonly
+declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32) nounwind readonly
+
+declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32) nounwind readonly
+declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32) nounwind readonly
+declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind readonly
+
+%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
+%struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
+%struct.__neon_int32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> }
+%struct.__neon_float32x2x3_t = type { <2 x float>, <2 x float>, <2 x float> }
+
+%struct.__neon_int16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> }
+%struct.__neon_int32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> }
+%struct.__neon_float32x4x3_t = type { <4 x float>, <4 x float>, <4 x float> }
+
+define <8 x i8> @vld3lanei8(i8* %A, <8 x i8>* %B) nounwind {
+;CHECK: vld3lanei8:
+;CHECK: vld3.8
+ %tmp1 = load <8 x i8>* %B
+ %tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1)
+ %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 1
+ %tmp5 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 2
+ %tmp6 = add <8 x i8> %tmp3, %tmp4
+ %tmp7 = add <8 x i8> %tmp5, %tmp6
+ ret <8 x i8> %tmp7
+}
+
+define <4 x i16> @vld3lanei16(i16* %A, <4 x i16>* %B) nounwind {
+;CHECK: vld3lanei16:
+;CHECK: vld3.16
+ %tmp1 = load <4 x i16>* %B
+ %tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1)
+ %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 1
+ %tmp5 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 2
+ %tmp6 = add <4 x i16> %tmp3, %tmp4
+ %tmp7 = add <4 x i16> %tmp5, %tmp6
+ ret <4 x i16> %tmp7
+}
+
+define <2 x i32> @vld3lanei32(i32* %A, <2 x i32>* %B) nounwind {
+;CHECK: vld3lanei32:
+;CHECK: vld3.32
+ %tmp1 = load <2 x i32>* %B
+ %tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1)
+ %tmp3 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 1
+ %tmp5 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 2
+ %tmp6 = add <2 x i32> %tmp3, %tmp4
+ %tmp7 = add <2 x i32> %tmp5, %tmp6
+ ret <2 x i32> %tmp7
+}
+
+define <2 x float> @vld3lanef(float* %A, <2 x float>* %B) nounwind {
+;CHECK: vld3lanef:
+;CHECK: vld3.32
+ %tmp1 = load <2 x float>* %B
+ %tmp2 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1)
+ %tmp3 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 1
+ %tmp5 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 2
+ %tmp6 = add <2 x float> %tmp3, %tmp4
+ %tmp7 = add <2 x float> %tmp5, %tmp6
+ ret <2 x float> %tmp7
+}
+
+define <8 x i16> @vld3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
+;CHECK: vld3laneQi16:
+;CHECK: vld3.16
+ %tmp1 = load <8 x i16>* %B
+ %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i16* %A, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1)
+ %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
+ %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
+ %tmp6 = add <8 x i16> %tmp3, %tmp4
+ %tmp7 = add <8 x i16> %tmp5, %tmp6
+ ret <8 x i16> %tmp7
+}
+
+define <4 x i32> @vld3laneQi32(i32* %A, <4 x i32>* %B) nounwind {
+;CHECK: vld3laneQi32:
+;CHECK: vld3.32
+ %tmp1 = load <4 x i32>* %B
+ %tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i32* %A, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 3)
+ %tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 1
+ %tmp5 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 2
+ %tmp6 = add <4 x i32> %tmp3, %tmp4
+ %tmp7 = add <4 x i32> %tmp5, %tmp6
+ ret <4 x i32> %tmp7
+}
+
+define <4 x float> @vld3laneQf(float* %A, <4 x float>* %B) nounwind {
+;CHECK: vld3laneQf:
+;CHECK: vld3.32
+ %tmp1 = load <4 x float>* %B
+ %tmp2 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(float* %A, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1)
+ %tmp3 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 1
+ %tmp5 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 2
+ %tmp6 = add <4 x float> %tmp3, %tmp4
+ %tmp7 = add <4 x float> %tmp5, %tmp6
+ ret <4 x float> %tmp7
+}
+
+declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind readonly
+declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32) nounwind readonly
+declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind readonly
+declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32) nounwind readonly
+
+declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32) nounwind readonly
+declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32) nounwind readonly
+
+%struct.__neon_int8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }
+%struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
+%struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
+%struct.__neon_float32x2x4_t = type { <2 x float>, <2 x float>, <2 x float>, <2 x float> }
+
+%struct.__neon_int16x8x4_t = type { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }
+%struct.__neon_int32x4x4_t = type { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }
+%struct.__neon_float32x4x4_t = type { <4 x float>, <4 x float>, <4 x float>, <4 x float> }
+
+define <8 x i8> @vld4lanei8(i8* %A, <8 x i8>* %B) nounwind {
+;CHECK: vld4lanei8:
+;CHECK: vld4.8
+ %tmp1 = load <8 x i8>* %B
+ %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1)
+ %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
+ %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
+ %tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
+ %tmp7 = add <8 x i8> %tmp3, %tmp4
+ %tmp8 = add <8 x i8> %tmp5, %tmp6
+ %tmp9 = add <8 x i8> %tmp7, %tmp8
+ ret <8 x i8> %tmp9
+}
+
+define <4 x i16> @vld4lanei16(i16* %A, <4 x i16>* %B) nounwind {
+;CHECK: vld4lanei16:
+;CHECK: vld4.16
+ %tmp1 = load <4 x i16>* %B
+ %tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1)
+ %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 1
+ %tmp5 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 2
+ %tmp6 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 3
+ %tmp7 = add <4 x i16> %tmp3, %tmp4
+ %tmp8 = add <4 x i16> %tmp5, %tmp6
+ %tmp9 = add <4 x i16> %tmp7, %tmp8
+ ret <4 x i16> %tmp9
+}
+
+define <2 x i32> @vld4lanei32(i32* %A, <2 x i32>* %B) nounwind {
+;CHECK: vld4lanei32:
+;CHECK: vld4.32
+ %tmp1 = load <2 x i32>* %B
+ %tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1)
+ %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 1
+ %tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 2
+ %tmp6 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 3
+ %tmp7 = add <2 x i32> %tmp3, %tmp4
+ %tmp8 = add <2 x i32> %tmp5, %tmp6
+ %tmp9 = add <2 x i32> %tmp7, %tmp8
+ ret <2 x i32> %tmp9
+}
+
+define <2 x float> @vld4lanef(float* %A, <2 x float>* %B) nounwind {
+;CHECK: vld4lanef:
+;CHECK: vld4.32
+ %tmp1 = load <2 x float>* %B
+ %tmp2 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1)
+ %tmp3 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 1
+ %tmp5 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 2
+ %tmp6 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 3
+ %tmp7 = add <2 x float> %tmp3, %tmp4
+ %tmp8 = add <2 x float> %tmp5, %tmp6
+ %tmp9 = add <2 x float> %tmp7, %tmp8
+ ret <2 x float> %tmp9
+}
+
+define <8 x i16> @vld4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
+;CHECK: vld4laneQi16:
+;CHECK: vld4.16
+ %tmp1 = load <8 x i16>* %B
+ %tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i16* %A, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1)
+ %tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 1
+ %tmp5 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 2
+ %tmp6 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 3
+ %tmp7 = add <8 x i16> %tmp3, %tmp4
+ %tmp8 = add <8 x i16> %tmp5, %tmp6
+ %tmp9 = add <8 x i16> %tmp7, %tmp8
+ ret <8 x i16> %tmp9
+}
+
+define <4 x i32> @vld4laneQi32(i32* %A, <4 x i32>* %B) nounwind {
+;CHECK: vld4laneQi32:
+;CHECK: vld4.32
+ %tmp1 = load <4 x i32>* %B
+ %tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i32* %A, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 1)
+ %tmp3 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 1
+ %tmp5 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 2
+ %tmp6 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 3
+ %tmp7 = add <4 x i32> %tmp3, %tmp4
+ %tmp8 = add <4 x i32> %tmp5, %tmp6
+ %tmp9 = add <4 x i32> %tmp7, %tmp8
+ ret <4 x i32> %tmp9
+}
+
+define <4 x float> @vld4laneQf(float* %A, <4 x float>* %B) nounwind {
+;CHECK: vld4laneQf:
+;CHECK: vld4.32
+ %tmp1 = load <4 x float>* %B
+ %tmp2 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(float* %A, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1)
+ %tmp3 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 1
+ %tmp5 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 2
+ %tmp6 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 3
+ %tmp7 = add <4 x float> %tmp3, %tmp4
+ %tmp8 = add <4 x float> %tmp5, %tmp6
+ %tmp9 = add <4 x float> %tmp7, %tmp8
+ ret <4 x float> %tmp9
+}
+
+declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind readonly
+declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32) nounwind readonly
+declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind readonly
+declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32) nounwind readonly
+
+declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32) nounwind readonly
+declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32) nounwind readonly
diff --git a/test/CodeGen/ARM/vminmax.ll b/test/CodeGen/ARM/vminmax.ll
new file mode 100644
index 0000000..e3527c1
--- /dev/null
+++ b/test/CodeGen/ARM/vminmax.ll
@@ -0,0 +1,293 @@
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+
+define <8 x i8> @vmins8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vmins8:
+;CHECK: vmin.s8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vmins16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vmins16:
+;CHECK: vmin.s16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vmins32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vmins32:
+;CHECK: vmin.s32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <8 x i8> @vminu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vminu8:
+;CHECK: vmin.u8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vminu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vminu16:
+;CHECK: vmin.u16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vminu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vminu32:
+;CHECK: vmin.u32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <2 x float> @vminf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK: vminf32:
+;CHECK: vmin.f32
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+define <16 x i8> @vminQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vminQs8:
+;CHECK: vmin.s8
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vminQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vminQs16:
+;CHECK: vmin.s16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vminQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vminQs32:
+;CHECK: vmin.s32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <16 x i8> @vminQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vminQu8:
+;CHECK: vmin.u8
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vminQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vminQu16:
+;CHECK: vmin.u16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vminQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vminQu32:
+;CHECK: vmin.u32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <4 x float> @vminQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK: vminQf32:
+;CHECK: vmin.f32
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x float> %tmp3
+}
+
+declare <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float>, <2 x float>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone
+
+define <8 x i8> @vmaxs8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vmaxs8:
+;CHECK: vmax.s8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vmaxs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vmaxs16:
+;CHECK: vmax.s16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vmaxs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vmaxs32:
+;CHECK: vmax.s32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <8 x i8> @vmaxu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vmaxu8:
+;CHECK: vmax.u8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vmaxu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vmaxu16:
+;CHECK: vmax.u16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vmaxu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vmaxu32:
+;CHECK: vmax.u32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <2 x float> @vmaxf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK: vmaxf32:
+;CHECK: vmax.f32
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+define <16 x i8> @vmaxQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vmaxQs8:
+;CHECK: vmax.s8
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vmaxQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vmaxQs16:
+;CHECK: vmax.s16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vmaxQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vmaxQs32:
+;CHECK: vmax.s32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <16 x i8> @vmaxQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vmaxQu8:
+;CHECK: vmax.u8
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vmaxQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vmaxQu16:
+;CHECK: vmax.u16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vmaxQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vmaxQu32:
+;CHECK: vmax.u32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <4 x float> @vmaxQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK: vmaxQf32:
+;CHECK: vmax.f32
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x float> %tmp3
+}
+
+declare <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone
diff --git a/test/CodeGen/ARM/vmla.ll b/test/CodeGen/ARM/vmla.ll
index ed77e11..8405218 100644
--- a/test/CodeGen/ARM/vmla.ll
+++ b/test/CodeGen/ARM/vmla.ll
@@ -1,10 +1,8 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep {vmla\\.i8} %t | count 2
-; RUN: grep {vmla\\.i16} %t | count 2
-; RUN: grep {vmla\\.i32} %t | count 2
-; RUN: grep {vmla\\.f32} %t | count 2
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vmlai8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8> * %C) nounwind {
+;CHECK: vmlai8:
+;CHECK: vmla.i8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
@@ -14,6 +12,8 @@ define <8 x i8> @vmlai8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8> * %C) nounwind {
}
define <4 x i16> @vmlai16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+;CHECK: vmlai16:
+;CHECK: vmla.i16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
@@ -23,6 +23,8 @@ define <4 x i16> @vmlai16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind
}
define <2 x i32> @vmlai32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+;CHECK: vmlai32:
+;CHECK: vmla.i32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
@@ -32,6 +34,8 @@ define <2 x i32> @vmlai32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind
}
define <2 x float> @vmlaf32(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
+;CHECK: vmlaf32:
+;CHECK: vmla.f32
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = load <2 x float>* %C
@@ -41,6 +45,8 @@ define <2 x float> @vmlaf32(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) n
}
define <16 x i8> @vmlaQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8> * %C) nounwind {
+;CHECK: vmlaQi8:
+;CHECK: vmla.i8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = load <16 x i8>* %C
@@ -50,6 +56,8 @@ define <16 x i8> @vmlaQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8> * %C) nounwind
}
define <8 x i16> @vmlaQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
+;CHECK: vmlaQi16:
+;CHECK: vmla.i16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = load <8 x i16>* %C
@@ -59,6 +67,8 @@ define <8 x i16> @vmlaQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind
}
define <4 x i32> @vmlaQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
+;CHECK: vmlaQi32:
+;CHECK: vmla.i32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = load <4 x i32>* %C
@@ -68,6 +78,8 @@ define <4 x i32> @vmlaQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind
}
define <4 x float> @vmlaQf32(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
+;CHECK: vmlaQf32:
+;CHECK: vmla.f32
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = load <4 x float>* %C
@@ -75,3 +87,107 @@ define <4 x float> @vmlaQf32(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C)
%tmp5 = add <4 x float> %tmp1, %tmp4
ret <4 x float> %tmp5
}
+
+define <8 x i16> @vmlals8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+;CHECK: vmlals8:
+;CHECK: vmlal.s8
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = load <8 x i8>* %C
+ %tmp4 = call <8 x i16> @llvm.arm.neon.vmlals.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @vmlals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+;CHECK: vmlals16:
+;CHECK: vmlal.s16
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i16>* %C
+ %tmp4 = call <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @vmlals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+;CHECK: vmlals32:
+;CHECK: vmlal.s32
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i32>* %C
+ %tmp4 = call <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i16> @vmlalu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+;CHECK: vmlalu8:
+;CHECK: vmlal.u8
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = load <8 x i8>* %C
+ %tmp4 = call <8 x i16> @llvm.arm.neon.vmlalu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @vmlalu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+;CHECK: vmlalu16:
+;CHECK: vmlal.u16
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i16>* %C
+ %tmp4 = call <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @vmlalu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+;CHECK: vmlalu32:
+;CHECK: vmlal.u32
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i32>* %C
+ %tmp4 = call <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
+ ret <2 x i64> %tmp4
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vmlal_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
+entry:
+; CHECK: test_vmlal_lanes16
+; CHECK: vmlal.s16 q0, d2, d3[1]
+ %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
+ %1 = tail call <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %1
+}
+
+define arm_aapcs_vfpcc <2 x i64> @test_vmlal_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
+entry:
+; CHECK: test_vmlal_lanes32
+; CHECK: vmlal.s32 q0, d2, d3[1]
+ %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
+ %1 = tail call <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %1
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vmlal_laneu16(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %arg2_uint16x4_t) nounwind readnone {
+entry:
+; CHECK: test_vmlal_laneu16
+; CHECK: vmlal.u16 q0, d2, d3[1]
+ %0 = shufflevector <4 x i16> %arg2_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
+ %1 = tail call <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %1
+}
+
+define arm_aapcs_vfpcc <2 x i64> @test_vmlal_laneu32(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %arg2_uint32x2_t) nounwind readnone {
+entry:
+; CHECK: test_vmlal_laneu32
+; CHECK: vmlal.u32 q0, d2, d3[1]
+ %0 = shufflevector <2 x i32> %arg2_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
+ %1 = tail call <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %1
+}
+
+declare <8 x i16> @llvm.arm.neon.vmlals.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vmlalu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vmls.ll b/test/CodeGen/ARM/vmls.ll
index d519b7e..c89552e 100644
--- a/test/CodeGen/ARM/vmls.ll
+++ b/test/CodeGen/ARM/vmls.ll
@@ -1,10 +1,8 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep {vmls\\.i8} %t | count 2
-; RUN: grep {vmls\\.i16} %t | count 2
-; RUN: grep {vmls\\.i32} %t | count 2
-; RUN: grep {vmls\\.f32} %t | count 2
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vmlsi8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8> * %C) nounwind {
+;CHECK: vmlsi8:
+;CHECK: vmls.i8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
@@ -14,6 +12,8 @@ define <8 x i8> @vmlsi8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8> * %C) nounwind {
}
define <4 x i16> @vmlsi16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+;CHECK: vmlsi16:
+;CHECK: vmls.i16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
@@ -23,6 +23,8 @@ define <4 x i16> @vmlsi16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind
}
define <2 x i32> @vmlsi32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+;CHECK: vmlsi32:
+;CHECK: vmls.i32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
@@ -32,6 +34,8 @@ define <2 x i32> @vmlsi32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind
}
define <2 x float> @vmlsf32(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
+;CHECK: vmlsf32:
+;CHECK: vmls.f32
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = load <2 x float>* %C
@@ -41,6 +45,8 @@ define <2 x float> @vmlsf32(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) n
}
define <16 x i8> @vmlsQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8> * %C) nounwind {
+;CHECK: vmlsQi8:
+;CHECK: vmls.i8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = load <16 x i8>* %C
@@ -50,6 +56,8 @@ define <16 x i8> @vmlsQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8> * %C) nounwind
}
define <8 x i16> @vmlsQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
+;CHECK: vmlsQi16:
+;CHECK: vmls.i16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = load <8 x i16>* %C
@@ -59,6 +67,8 @@ define <8 x i16> @vmlsQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind
}
define <4 x i32> @vmlsQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
+;CHECK: vmlsQi32:
+;CHECK: vmls.i32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = load <4 x i32>* %C
@@ -68,6 +78,8 @@ define <4 x i32> @vmlsQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind
}
define <4 x float> @vmlsQf32(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
+;CHECK: vmlsQf32:
+;CHECK: vmls.f32
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = load <4 x float>* %C
@@ -75,3 +87,107 @@ define <4 x float> @vmlsQf32(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C)
%tmp5 = sub <4 x float> %tmp1, %tmp4
ret <4 x float> %tmp5
}
+
+define <8 x i16> @vmlsls8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+;CHECK: vmlsls8:
+;CHECK: vmlsl.s8
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = load <8 x i8>* %C
+ %tmp4 = call <8 x i16> @llvm.arm.neon.vmlsls.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @vmlsls16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+;CHECK: vmlsls16:
+;CHECK: vmlsl.s16
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i16>* %C
+ %tmp4 = call <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @vmlsls32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+;CHECK: vmlsls32:
+;CHECK: vmlsl.s32
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i32>* %C
+ %tmp4 = call <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i16> @vmlslu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+;CHECK: vmlslu8:
+;CHECK: vmlsl.u8
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = load <8 x i8>* %C
+ %tmp4 = call <8 x i16> @llvm.arm.neon.vmlslu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @vmlslu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+;CHECK: vmlslu16:
+;CHECK: vmlsl.u16
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i16>* %C
+ %tmp4 = call <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @vmlslu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+;CHECK: vmlslu32:
+;CHECK: vmlsl.u32
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i32>* %C
+ %tmp4 = call <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
+ ret <2 x i64> %tmp4
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vmlsl_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
+entry:
+; CHECK: test_vmlsl_lanes16
+; CHECK: vmlsl.s16 q0, d2, d3[1]
+ %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
+ %1 = tail call <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %1
+}
+
+define arm_aapcs_vfpcc <2 x i64> @test_vmlsl_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
+entry:
+; CHECK: test_vmlsl_lanes32
+; CHECK: vmlsl.s32 q0, d2, d3[1]
+ %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
+ %1 = tail call <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %1
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vmlsl_laneu16(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %arg2_uint16x4_t) nounwind readnone {
+entry:
+; CHECK: test_vmlsl_laneu16
+; CHECK: vmlsl.u16 q0, d2, d3[1]
+ %0 = shufflevector <4 x i16> %arg2_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
+ %1 = tail call <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %1
+}
+
+define arm_aapcs_vfpcc <2 x i64> @test_vmlsl_laneu32(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %arg2_uint32x2_t) nounwind readnone {
+entry:
+; CHECK: test_vmlsl_laneu32
+; CHECK: vmlsl.u32 q0, d2, d3[1]
+ %0 = shufflevector <2 x i32> %arg2_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
+ %1 = tail call <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %1
+}
+
+declare <8 x i16> @llvm.arm.neon.vmlsls.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vmlslu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vmov.ll b/test/CodeGen/ARM/vmov.ll
index af9c8e2..ed69f97 100644
--- a/test/CodeGen/ARM/vmov.ll
+++ b/test/CodeGen/ARM/vmov.ll
@@ -1,101 +1,303 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep vmov.i8 %t | count 2
-; RUN: grep vmov.i16 %t | count 4
-; RUN: grep vmov.i32 %t | count 12
-; RUN: grep vmov.i64 %t | count 2
-; Note: function names do not include "vmov" to allow simple grep for opcodes
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @v_movi8() nounwind {
+;CHECK: v_movi8:
+;CHECK: vmov.i8
ret <8 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
}
define <4 x i16> @v_movi16a() nounwind {
+;CHECK: v_movi16a:
+;CHECK: vmov.i16
ret <4 x i16> < i16 16, i16 16, i16 16, i16 16 >
}
; 0x1000 = 4096
define <4 x i16> @v_movi16b() nounwind {
+;CHECK: v_movi16b:
+;CHECK: vmov.i16
ret <4 x i16> < i16 4096, i16 4096, i16 4096, i16 4096 >
}
define <2 x i32> @v_movi32a() nounwind {
+;CHECK: v_movi32a:
+;CHECK: vmov.i32
ret <2 x i32> < i32 32, i32 32 >
}
; 0x2000 = 8192
define <2 x i32> @v_movi32b() nounwind {
+;CHECK: v_movi32b:
+;CHECK: vmov.i32
ret <2 x i32> < i32 8192, i32 8192 >
}
; 0x200000 = 2097152
define <2 x i32> @v_movi32c() nounwind {
+;CHECK: v_movi32c:
+;CHECK: vmov.i32
ret <2 x i32> < i32 2097152, i32 2097152 >
}
; 0x20000000 = 536870912
define <2 x i32> @v_movi32d() nounwind {
+;CHECK: v_movi32d:
+;CHECK: vmov.i32
ret <2 x i32> < i32 536870912, i32 536870912 >
}
; 0x20ff = 8447
define <2 x i32> @v_movi32e() nounwind {
+;CHECK: v_movi32e:
+;CHECK: vmov.i32
ret <2 x i32> < i32 8447, i32 8447 >
}
; 0x20ffff = 2162687
define <2 x i32> @v_movi32f() nounwind {
+;CHECK: v_movi32f:
+;CHECK: vmov.i32
ret <2 x i32> < i32 2162687, i32 2162687 >
}
; 0xff0000ff0000ffff = 18374687574888349695
define <1 x i64> @v_movi64() nounwind {
+;CHECK: v_movi64:
+;CHECK: vmov.i64
ret <1 x i64> < i64 18374687574888349695 >
}
define <16 x i8> @v_movQi8() nounwind {
+;CHECK: v_movQi8:
+;CHECK: vmov.i8
ret <16 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
}
define <8 x i16> @v_movQi16a() nounwind {
+;CHECK: v_movQi16a:
+;CHECK: vmov.i16
ret <8 x i16> < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
}
; 0x1000 = 4096
define <8 x i16> @v_movQi16b() nounwind {
+;CHECK: v_movQi16b:
+;CHECK: vmov.i16
ret <8 x i16> < i16 4096, i16 4096, i16 4096, i16 4096, i16 4096, i16 4096, i16 4096, i16 4096 >
}
define <4 x i32> @v_movQi32a() nounwind {
+;CHECK: v_movQi32a:
+;CHECK: vmov.i32
ret <4 x i32> < i32 32, i32 32, i32 32, i32 32 >
}
; 0x2000 = 8192
define <4 x i32> @v_movQi32b() nounwind {
+;CHECK: v_movQi32b:
+;CHECK: vmov.i32
ret <4 x i32> < i32 8192, i32 8192, i32 8192, i32 8192 >
}
; 0x200000 = 2097152
define <4 x i32> @v_movQi32c() nounwind {
+;CHECK: v_movQi32c:
+;CHECK: vmov.i32
ret <4 x i32> < i32 2097152, i32 2097152, i32 2097152, i32 2097152 >
}
; 0x20000000 = 536870912
define <4 x i32> @v_movQi32d() nounwind {
+;CHECK: v_movQi32d:
+;CHECK: vmov.i32
ret <4 x i32> < i32 536870912, i32 536870912, i32 536870912, i32 536870912 >
}
; 0x20ff = 8447
define <4 x i32> @v_movQi32e() nounwind {
+;CHECK: v_movQi32e:
+;CHECK: vmov.i32
ret <4 x i32> < i32 8447, i32 8447, i32 8447, i32 8447 >
}
; 0x20ffff = 2162687
define <4 x i32> @v_movQi32f() nounwind {
+;CHECK: v_movQi32f:
+;CHECK: vmov.i32
ret <4 x i32> < i32 2162687, i32 2162687, i32 2162687, i32 2162687 >
}
; 0xff0000ff0000ffff = 18374687574888349695
define <2 x i64> @v_movQi64() nounwind {
+;CHECK: v_movQi64:
+;CHECK: vmov.i64
ret <2 x i64> < i64 18374687574888349695, i64 18374687574888349695 >
}
+define <8 x i16> @vmovls8(<8 x i8>* %A) nounwind {
+;CHECK: vmovls8:
+;CHECK: vmovl.s8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <8 x i16> @llvm.arm.neon.vmovls.v8i16(<8 x i8> %tmp1)
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vmovls16(<4 x i16>* %A) nounwind {
+;CHECK: vmovls16:
+;CHECK: vmovl.s16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vmovls.v4i32(<4 x i16> %tmp1)
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @vmovls32(<2 x i32>* %A) nounwind {
+;CHECK: vmovls32:
+;CHECK: vmovl.s32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x i64> @llvm.arm.neon.vmovls.v2i64(<2 x i32> %tmp1)
+ ret <2 x i64> %tmp2
+}
+
+define <8 x i16> @vmovlu8(<8 x i8>* %A) nounwind {
+;CHECK: vmovlu8:
+;CHECK: vmovl.u8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <8 x i16> @llvm.arm.neon.vmovlu.v8i16(<8 x i8> %tmp1)
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vmovlu16(<4 x i16>* %A) nounwind {
+;CHECK: vmovlu16:
+;CHECK: vmovl.u16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vmovlu.v4i32(<4 x i16> %tmp1)
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @vmovlu32(<2 x i32>* %A) nounwind {
+;CHECK: vmovlu32:
+;CHECK: vmovl.u32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x i64> @llvm.arm.neon.vmovlu.v2i64(<2 x i32> %tmp1)
+ ret <2 x i64> %tmp2
+}
+
+declare <8 x i16> @llvm.arm.neon.vmovls.v8i16(<8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vmovls.v4i32(<4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vmovls.v2i64(<2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vmovlu.v8i16(<8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vmovlu.v4i32(<4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vmovlu.v2i64(<2 x i32>) nounwind readnone
+
+define <8 x i8> @vmovni16(<8 x i16>* %A) nounwind {
+;CHECK: vmovni16:
+;CHECK: vmovn.i16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vmovn.v8i8(<8 x i16> %tmp1)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vmovni32(<4 x i32>* %A) nounwind {
+;CHECK: vmovni32:
+;CHECK: vmovn.i32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vmovn.v4i16(<4 x i32> %tmp1)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vmovni64(<2 x i64>* %A) nounwind {
+;CHECK: vmovni64:
+;CHECK: vmovn.i64
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vmovn.v2i32(<2 x i64> %tmp1)
+ ret <2 x i32> %tmp2
+}
+
+declare <8 x i8> @llvm.arm.neon.vmovn.v8i8(<8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vmovn.v4i16(<4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vmovn.v2i32(<2 x i64>) nounwind readnone
+
+define <8 x i8> @vqmovns16(<8 x i16>* %A) nounwind {
+;CHECK: vqmovns16:
+;CHECK: vqmovn.s16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> %tmp1)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vqmovns32(<4 x i32>* %A) nounwind {
+;CHECK: vqmovns32:
+;CHECK: vqmovn.s32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> %tmp1)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vqmovns64(<2 x i64>* %A) nounwind {
+;CHECK: vqmovns64:
+;CHECK: vqmovn.s64
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> %tmp1)
+ ret <2 x i32> %tmp2
+}
+
+define <8 x i8> @vqmovnu16(<8 x i16>* %A) nounwind {
+;CHECK: vqmovnu16:
+;CHECK: vqmovn.u16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> %tmp1)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vqmovnu32(<4 x i32>* %A) nounwind {
+;CHECK: vqmovnu32:
+;CHECK: vqmovn.u32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> %tmp1)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vqmovnu64(<2 x i64>* %A) nounwind {
+;CHECK: vqmovnu64:
+;CHECK: vqmovn.u64
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> %tmp1)
+ ret <2 x i32> %tmp2
+}
+
+define <8 x i8> @vqmovuns16(<8 x i16>* %A) nounwind {
+;CHECK: vqmovuns16:
+;CHECK: vqmovun.s16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> %tmp1)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vqmovuns32(<4 x i32>* %A) nounwind {
+;CHECK: vqmovuns32:
+;CHECK: vqmovun.s32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> %tmp1)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vqmovuns64(<2 x i64>* %A) nounwind {
+;CHECK: vqmovuns64:
+;CHECK: vqmovun.s64
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> %tmp1)
+ ret <2 x i32> %tmp2
+}
+
+declare <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM/vmul.ll b/test/CodeGen/ARM/vmul.ll
index eb9ae7b..325da5d 100644
--- a/test/CodeGen/ARM/vmul.ll
+++ b/test/CodeGen/ARM/vmul.ll
@@ -1,11 +1,8 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep {vmul\\.i8} %t | count 2
-; RUN: grep {vmul\\.i16} %t | count 2
-; RUN: grep {vmul\\.i32} %t | count 2
-; RUN: grep {vmul\\.f32} %t | count 2
-; RUN: grep {vmul\\.p8} %t | count 2
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vmuli8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vmuli8:
+;CHECK: vmul.i8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = mul <8 x i8> %tmp1, %tmp2
@@ -13,6 +10,8 @@ define <8 x i8> @vmuli8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
}
define <4 x i16> @vmuli16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vmuli16:
+;CHECK: vmul.i16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = mul <4 x i16> %tmp1, %tmp2
@@ -20,6 +19,8 @@ define <4 x i16> @vmuli16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
}
define <2 x i32> @vmuli32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vmuli32:
+;CHECK: vmul.i32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = mul <2 x i32> %tmp1, %tmp2
@@ -27,6 +28,8 @@ define <2 x i32> @vmuli32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
}
define <2 x float> @vmulf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK: vmulf32:
+;CHECK: vmul.f32
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = mul <2 x float> %tmp1, %tmp2
@@ -34,6 +37,8 @@ define <2 x float> @vmulf32(<2 x float>* %A, <2 x float>* %B) nounwind {
}
define <8 x i8> @vmulp8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vmulp8:
+;CHECK: vmul.p8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
@@ -41,6 +46,8 @@ define <8 x i8> @vmulp8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
}
define <16 x i8> @vmulQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vmulQi8:
+;CHECK: vmul.i8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = mul <16 x i8> %tmp1, %tmp2
@@ -48,6 +55,8 @@ define <16 x i8> @vmulQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
}
define <8 x i16> @vmulQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vmulQi16:
+;CHECK: vmul.i16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = mul <8 x i16> %tmp1, %tmp2
@@ -55,6 +64,8 @@ define <8 x i16> @vmulQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
}
define <4 x i32> @vmulQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vmulQi32:
+;CHECK: vmul.i32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = mul <4 x i32> %tmp1, %tmp2
@@ -62,6 +73,8 @@ define <4 x i32> @vmulQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
}
define <4 x float> @vmulQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK: vmulQf32:
+;CHECK: vmul.f32
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = mul <4 x float> %tmp1, %tmp2
@@ -69,6 +82,8 @@ define <4 x float> @vmulQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
}
define <16 x i8> @vmulQp8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vmulQp8:
+;CHECK: vmul.p8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
@@ -77,3 +92,166 @@ define <16 x i8> @vmulQp8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
declare <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+
+define arm_aapcs_vfpcc <2 x float> @test_vmul_lanef32(<2 x float> %arg0_float32x2_t, <2 x float> %arg1_float32x2_t) nounwind readnone {
+entry:
+; CHECK: test_vmul_lanef32:
+; CHECK: vmul.f32 d0, d0, d1[0]
+ %0 = shufflevector <2 x float> %arg1_float32x2_t, <2 x float> undef, <2 x i32> zeroinitializer ; <<2 x float>> [#uses=1]
+ %1 = fmul <2 x float> %0, %arg0_float32x2_t ; <<2 x float>> [#uses=1]
+ ret <2 x float> %1
+}
+
+define arm_aapcs_vfpcc <4 x i16> @test_vmul_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
+entry:
+; CHECK: test_vmul_lanes16:
+; CHECK: vmul.i16 d0, d0, d1[1]
+ %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
+ %1 = mul <4 x i16> %0, %arg0_int16x4_t ; <<4 x i16>> [#uses=1]
+ ret <4 x i16> %1
+}
+
+define arm_aapcs_vfpcc <2 x i32> @test_vmul_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
+entry:
+; CHECK: test_vmul_lanes32:
+; CHECK: vmul.i32 d0, d0, d1[1]
+ %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
+ %1 = mul <2 x i32> %0, %arg0_int32x2_t ; <<2 x i32>> [#uses=1]
+ ret <2 x i32> %1
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vmulQ_lanef32(<4 x float> %arg0_float32x4_t, <2 x float> %arg1_float32x2_t) nounwind readnone {
+entry:
+; CHECK: test_vmulQ_lanef32:
+; CHECK: vmul.f32 q0, q0, d2[1]
+ %0 = shufflevector <2 x float> %arg1_float32x2_t, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x float>> [#uses=1]
+ %1 = fmul <4 x float> %0, %arg0_float32x4_t ; <<4 x float>> [#uses=1]
+ ret <4 x float> %1
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vmulQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
+entry:
+; CHECK: test_vmulQ_lanes16:
+; CHECK: vmul.i16 q0, q0, d2[1]
+ %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %1 = mul <8 x i16> %0, %arg0_int16x8_t ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %1
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vmulQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
+entry:
+; CHECK: test_vmulQ_lanes32:
+; CHECK: vmul.i32 q0, q0, d2[1]
+ %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1]
+ %1 = mul <4 x i32> %0, %arg0_int32x4_t ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %1
+}
+
+define <8 x i16> @vmulls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vmulls8:
+;CHECK: vmull.s8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vmulls16:
+;CHECK: vmull.s16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vmulls32:
+;CHECK: vmull.s32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i16> @vmullu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vmullu8:
+;CHECK: vmull.u8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vmullu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vmullu16:
+;CHECK: vmull.u16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vmullu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vmullu32:
+;CHECK: vmull.u32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i16> @vmullp8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vmullp8:
+;CHECK: vmull.p8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vmull_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
+entry:
+; CHECK: test_vmull_lanes16
+; CHECK: vmull.s16 q0, d0, d1[1]
+ %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
+ %1 = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %1
+}
+
+define arm_aapcs_vfpcc <2 x i64> @test_vmull_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
+entry:
+; CHECK: test_vmull_lanes32
+; CHECK: vmull.s32 q0, d0, d1[1]
+ %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
+ %1 = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %1
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vmull_laneu16(<4 x i16> %arg0_uint16x4_t, <4 x i16> %arg1_uint16x4_t) nounwind readnone {
+entry:
+; CHECK: test_vmull_laneu16
+; CHECK: vmull.u16 q0, d0, d1[1]
+ %0 = shufflevector <4 x i16> %arg1_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
+ %1 = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %arg0_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %1
+}
+
+define arm_aapcs_vfpcc <2 x i64> @test_vmull_laneu32(<2 x i32> %arg0_uint32x2_t, <2 x i32> %arg1_uint32x2_t) nounwind readnone {
+entry:
+; CHECK: test_vmull_laneu32
+; CHECK: vmull.u32 q0, d0, d1[1]
+ %0 = shufflevector <2 x i32> %arg1_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
+ %1 = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %arg0_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %1
+}
+
+declare <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
diff --git a/test/CodeGen/ARM/vneg.ll b/test/CodeGen/ARM/vneg.ll
index 9fa527f..7764e87 100644
--- a/test/CodeGen/ARM/vneg.ll
+++ b/test/CodeGen/ARM/vneg.ll
@@ -1,53 +1,121 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep {vneg\\.s8} %t | count 2
-; RUN: grep {vneg\\.s16} %t | count 2
-; RUN: grep {vneg\\.s32} %t | count 2
-; RUN: grep {vneg\\.f32} %t | count 2
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vnegs8(<8 x i8>* %A) nounwind {
+;CHECK: vnegs8:
+;CHECK: vneg.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = sub <8 x i8> zeroinitializer, %tmp1
ret <8 x i8> %tmp2
}
define <4 x i16> @vnegs16(<4 x i16>* %A) nounwind {
+;CHECK: vnegs16:
+;CHECK: vneg.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = sub <4 x i16> zeroinitializer, %tmp1
ret <4 x i16> %tmp2
}
define <2 x i32> @vnegs32(<2 x i32>* %A) nounwind {
+;CHECK: vnegs32:
+;CHECK: vneg.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = sub <2 x i32> zeroinitializer, %tmp1
ret <2 x i32> %tmp2
}
define <2 x float> @vnegf32(<2 x float>* %A) nounwind {
+;CHECK: vnegf32:
+;CHECK: vneg.f32
%tmp1 = load <2 x float>* %A
%tmp2 = sub <2 x float> < float -0.000000e+00, float -0.000000e+00 >, %tmp1
ret <2 x float> %tmp2
}
define <16 x i8> @vnegQs8(<16 x i8>* %A) nounwind {
+;CHECK: vnegQs8:
+;CHECK: vneg.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = sub <16 x i8> zeroinitializer, %tmp1
ret <16 x i8> %tmp2
}
define <8 x i16> @vnegQs16(<8 x i16>* %A) nounwind {
+;CHECK: vnegQs16:
+;CHECK: vneg.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = sub <8 x i16> zeroinitializer, %tmp1
ret <8 x i16> %tmp2
}
define <4 x i32> @vnegQs32(<4 x i32>* %A) nounwind {
+;CHECK: vnegQs32:
+;CHECK: vneg.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = sub <4 x i32> zeroinitializer, %tmp1
ret <4 x i32> %tmp2
}
define <4 x float> @vnegQf32(<4 x float>* %A) nounwind {
+;CHECK: vnegQf32:
+;CHECK: vneg.f32
%tmp1 = load <4 x float>* %A
%tmp2 = sub <4 x float> < float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00 >, %tmp1
ret <4 x float> %tmp2
}
+
+define <8 x i8> @vqnegs8(<8 x i8>* %A) nounwind {
+;CHECK: vqnegs8:
+;CHECK: vqneg.s8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8> %tmp1)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vqnegs16(<4 x i16>* %A) nounwind {
+;CHECK: vqnegs16:
+;CHECK: vqneg.s16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16> %tmp1)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vqnegs32(<2 x i32>* %A) nounwind {
+;CHECK: vqnegs32:
+;CHECK: vqneg.s32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32> %tmp1)
+ ret <2 x i32> %tmp2
+}
+
+define <16 x i8> @vqnegQs8(<16 x i8>* %A) nounwind {
+;CHECK: vqnegQs8:
+;CHECK: vqneg.s8
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = call <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8> %tmp1)
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @vqnegQs16(<8 x i16>* %A) nounwind {
+;CHECK: vqnegQs16:
+;CHECK: vqneg.s16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16> %tmp1)
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vqnegQs32(<4 x i32>* %A) nounwind {
+;CHECK: vqnegQs32:
+;CHECK: vqneg.s32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32> %tmp1)
+ ret <4 x i32> %tmp2
+}
+
+declare <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vpadal.ll b/test/CodeGen/ARM/vpadal.ll
index c41c532..7296e93 100644
--- a/test/CodeGen/ARM/vpadal.ll
+++ b/test/CodeGen/ARM/vpadal.ll
@@ -1,12 +1,8 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep {vpadal\\.s8} %t | count 2
-; RUN: grep {vpadal\\.s16} %t | count 2
-; RUN: grep {vpadal\\.s32} %t | count 2
-; RUN: grep {vpadal\\.u8} %t | count 2
-; RUN: grep {vpadal\\.u16} %t | count 2
-; RUN: grep {vpadal\\.u32} %t | count 2
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <4 x i16> @vpadals8(<4 x i16>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vpadals8:
+;CHECK: vpadal.s8
%tmp1 = load <4 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16> %tmp1, <8 x i8> %tmp2)
@@ -14,6 +10,8 @@ define <4 x i16> @vpadals8(<4 x i16>* %A, <8 x i8>* %B) nounwind {
}
define <2 x i32> @vpadals16(<2 x i32>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vpadals16:
+;CHECK: vpadal.s16
%tmp1 = load <2 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32> %tmp1, <4 x i16> %tmp2)
@@ -21,6 +19,8 @@ define <2 x i32> @vpadals16(<2 x i32>* %A, <4 x i16>* %B) nounwind {
}
define <1 x i64> @vpadals32(<1 x i64>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vpadals32:
+;CHECK: vpadal.s32
%tmp1 = load <1 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64> %tmp1, <2 x i32> %tmp2)
@@ -28,6 +28,8 @@ define <1 x i64> @vpadals32(<1 x i64>* %A, <2 x i32>* %B) nounwind {
}
define <4 x i16> @vpadalu8(<4 x i16>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vpadalu8:
+;CHECK: vpadal.u8
%tmp1 = load <4 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16> %tmp1, <8 x i8> %tmp2)
@@ -35,6 +37,8 @@ define <4 x i16> @vpadalu8(<4 x i16>* %A, <8 x i8>* %B) nounwind {
}
define <2 x i32> @vpadalu16(<2 x i32>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vpadalu16:
+;CHECK: vpadal.u16
%tmp1 = load <2 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32> %tmp1, <4 x i16> %tmp2)
@@ -42,6 +46,8 @@ define <2 x i32> @vpadalu16(<2 x i32>* %A, <4 x i16>* %B) nounwind {
}
define <1 x i64> @vpadalu32(<1 x i64>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vpadalu32:
+;CHECK: vpadal.u32
%tmp1 = load <1 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64> %tmp1, <2 x i32> %tmp2)
@@ -49,6 +55,8 @@ define <1 x i64> @vpadalu32(<1 x i64>* %A, <2 x i32>* %B) nounwind {
}
define <8 x i16> @vpadalQs8(<8 x i16>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vpadalQs8:
+;CHECK: vpadal.s8
%tmp1 = load <8 x i16>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16> %tmp1, <16 x i8> %tmp2)
@@ -56,6 +64,8 @@ define <8 x i16> @vpadalQs8(<8 x i16>* %A, <16 x i8>* %B) nounwind {
}
define <4 x i32> @vpadalQs16(<4 x i32>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vpadalQs16:
+;CHECK: vpadal.s16
%tmp1 = load <4 x i32>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32> %tmp1, <8 x i16> %tmp2)
@@ -63,6 +73,8 @@ define <4 x i32> @vpadalQs16(<4 x i32>* %A, <8 x i16>* %B) nounwind {
}
define <2 x i64> @vpadalQs32(<2 x i64>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vpadalQs32:
+;CHECK: vpadal.s32
%tmp1 = load <2 x i64>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64> %tmp1, <4 x i32> %tmp2)
@@ -70,6 +82,8 @@ define <2 x i64> @vpadalQs32(<2 x i64>* %A, <4 x i32>* %B) nounwind {
}
define <8 x i16> @vpadalQu8(<8 x i16>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vpadalQu8:
+;CHECK: vpadal.u8
%tmp1 = load <8 x i16>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16> %tmp1, <16 x i8> %tmp2)
@@ -77,6 +91,8 @@ define <8 x i16> @vpadalQu8(<8 x i16>* %A, <16 x i8>* %B) nounwind {
}
define <4 x i32> @vpadalQu16(<4 x i32>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vpadalQu16:
+;CHECK: vpadal.u16
%tmp1 = load <4 x i32>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32> %tmp1, <8 x i16> %tmp2)
@@ -84,6 +100,8 @@ define <4 x i32> @vpadalQu16(<4 x i32>* %A, <8 x i16>* %B) nounwind {
}
define <2 x i64> @vpadalQu32(<2 x i64>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vpadalQu32:
+;CHECK: vpadal.u32
%tmp1 = load <2 x i64>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64> %tmp1, <4 x i32> %tmp2)
diff --git a/test/CodeGen/ARM/vpadd.ll b/test/CodeGen/ARM/vpadd.ll
index baff492..2125573 100644
--- a/test/CodeGen/ARM/vpadd.ll
+++ b/test/CodeGen/ARM/vpadd.ll
@@ -1,39 +1,155 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep {vpadd\\.i8} %t | count 1
-; RUN: grep {vpadd\\.i16} %t | count 1
-; RUN: grep {vpadd\\.i32} %t | count 1
-; RUN: grep {vpadd\\.f32} %t | count 1
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vpaddi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vpaddi8:
+;CHECK: vpadd.i8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
- %tmp3 = call <8 x i8> @llvm.arm.neon.vpaddi.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vpaddi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vpaddi16:
+;CHECK: vpadd.i16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
- %tmp3 = call <4 x i16> @llvm.arm.neon.vpaddi.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vpaddi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vpaddi32:
+;CHECK: vpadd.i32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
- %tmp3 = call <2 x i32> @llvm.arm.neon.vpaddi.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <2 x float> @vpaddf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK: vpaddf32:
+;CHECK: vpadd.f32
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
- %tmp3 = call <2 x float> @llvm.arm.neon.vpaddf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ %tmp3 = call <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
-declare <8 x i8> @llvm.arm.neon.vpaddi.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm.neon.vpaddi.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm.neon.vpaddi.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <2 x float> @llvm.arm.neon.vpaddf.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone
+
+define <4 x i16> @vpaddls8(<8 x i8>* %A) nounwind {
+;CHECK: vpaddls8:
+;CHECK: vpaddl.s8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %tmp1)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vpaddls16(<4 x i16>* %A) nounwind {
+;CHECK: vpaddls16:
+;CHECK: vpaddl.s16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> %tmp1)
+ ret <2 x i32> %tmp2
+}
+
+define <1 x i64> @vpaddls32(<2 x i32>* %A) nounwind {
+;CHECK: vpaddls32:
+;CHECK: vpaddl.s32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> %tmp1)
+ ret <1 x i64> %tmp2
+}
+
+define <4 x i16> @vpaddlu8(<8 x i8>* %A) nounwind {
+;CHECK: vpaddlu8:
+;CHECK: vpaddl.u8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> %tmp1)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vpaddlu16(<4 x i16>* %A) nounwind {
+;CHECK: vpaddlu16:
+;CHECK: vpaddl.u16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %tmp1)
+ ret <2 x i32> %tmp2
+}
+
+define <1 x i64> @vpaddlu32(<2 x i32>* %A) nounwind {
+;CHECK: vpaddlu32:
+;CHECK: vpaddl.u32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> %tmp1)
+ ret <1 x i64> %tmp2
+}
+
+define <8 x i16> @vpaddlQs8(<16 x i8>* %A) nounwind {
+;CHECK: vpaddlQs8:
+;CHECK: vpaddl.s8
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> %tmp1)
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vpaddlQs16(<8 x i16>* %A) nounwind {
+;CHECK: vpaddlQs16:
+;CHECK: vpaddl.s16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> %tmp1)
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @vpaddlQs32(<4 x i32>* %A) nounwind {
+;CHECK: vpaddlQs32:
+;CHECK: vpaddl.s32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> %tmp1)
+ ret <2 x i64> %tmp2
+}
+
+define <8 x i16> @vpaddlQu8(<16 x i8>* %A) nounwind {
+;CHECK: vpaddlQu8:
+;CHECK: vpaddl.u8
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %tmp1)
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vpaddlQu16(<8 x i16>* %A) nounwind {
+;CHECK: vpaddlQu16:
+;CHECK: vpaddl.u16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %tmp1)
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @vpaddlQu32(<4 x i32>* %A) nounwind {
+;CHECK: vpaddlQu32:
+;CHECK: vpaddl.u32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %tmp1)
+ ret <2 x i64> %tmp2
+}
+
+declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32>) nounwind readnone
+
+declare <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vpminmax.ll b/test/CodeGen/ARM/vpminmax.ll
new file mode 100644
index 0000000..b75bcc9
--- /dev/null
+++ b/test/CodeGen/ARM/vpminmax.ll
@@ -0,0 +1,147 @@
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+
+define <8 x i8> @vpmins8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vpmins8:
+;CHECK: vpmin.s8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vpmins16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vpmins16:
+;CHECK: vpmin.s16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vpmins32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vpmins32:
+;CHECK: vpmin.s32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <8 x i8> @vpminu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vpminu8:
+;CHECK: vpmin.u8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vpminu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vpminu16:
+;CHECK: vpmin.u16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vpminu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vpminu32:
+;CHECK: vpmin.u32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <2 x float> @vpminf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK: vpminf32:
+;CHECK: vpmin.f32
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+declare <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone
+
+define <8 x i8> @vpmaxs8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vpmaxs8:
+;CHECK: vpmax.s8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vpmaxs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vpmaxs16:
+;CHECK: vpmax.s16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vpmaxs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vpmaxs32:
+;CHECK: vpmax.s32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <8 x i8> @vpmaxu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vpmaxu8:
+;CHECK: vpmax.u8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vpmaxu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vpmaxu16:
+;CHECK: vpmax.u16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vpmaxu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vpmaxu32:
+;CHECK: vpmax.u32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <2 x float> @vpmaxf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK: vpmaxf32:
+;CHECK: vpmax.f32
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+declare <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone
diff --git a/test/CodeGen/ARM/vqadd.ll b/test/CodeGen/ARM/vqadd.ll
index c9e2359..a1669b6 100644
--- a/test/CodeGen/ARM/vqadd.ll
+++ b/test/CodeGen/ARM/vqadd.ll
@@ -1,14 +1,8 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep {vqadd\\.s8} %t | count 2
-; RUN: grep {vqadd\\.s16} %t | count 2
-; RUN: grep {vqadd\\.s32} %t | count 2
-; RUN: grep {vqadd\\.s64} %t | count 2
-; RUN: grep {vqadd\\.u8} %t | count 2
-; RUN: grep {vqadd\\.u16} %t | count 2
-; RUN: grep {vqadd\\.u32} %t | count 2
-; RUN: grep {vqadd\\.u64} %t | count 2
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vqadds8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vqadds8:
+;CHECK: vqadd.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
@@ -16,6 +10,8 @@ define <8 x i8> @vqadds8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
}
define <4 x i16> @vqadds16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vqadds16:
+;CHECK: vqadd.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
@@ -23,6 +19,8 @@ define <4 x i16> @vqadds16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
}
define <2 x i32> @vqadds32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vqadds32:
+;CHECK: vqadd.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
@@ -30,6 +28,8 @@ define <2 x i32> @vqadds32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
}
define <1 x i64> @vqadds64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK: vqadds64:
+;CHECK: vqadd.s64
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
@@ -37,6 +37,8 @@ define <1 x i64> @vqadds64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
}
define <8 x i8> @vqaddu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vqaddu8:
+;CHECK: vqadd.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
@@ -44,6 +46,8 @@ define <8 x i8> @vqaddu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
}
define <4 x i16> @vqaddu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vqaddu16:
+;CHECK: vqadd.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
@@ -51,6 +55,8 @@ define <4 x i16> @vqaddu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
}
define <2 x i32> @vqaddu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vqaddu32:
+;CHECK: vqadd.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
@@ -58,6 +64,8 @@ define <2 x i32> @vqaddu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
}
define <1 x i64> @vqaddu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK: vqaddu64:
+;CHECK: vqadd.u64
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
@@ -65,6 +73,8 @@ define <1 x i64> @vqaddu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
}
define <16 x i8> @vqaddQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vqaddQs8:
+;CHECK: vqadd.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
@@ -72,6 +82,8 @@ define <16 x i8> @vqaddQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
}
define <8 x i16> @vqaddQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vqaddQs16:
+;CHECK: vqadd.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
@@ -79,6 +91,8 @@ define <8 x i16> @vqaddQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
}
define <4 x i32> @vqaddQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vqaddQs32:
+;CHECK: vqadd.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
@@ -86,6 +100,8 @@ define <4 x i32> @vqaddQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
}
define <2 x i64> @vqaddQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK: vqaddQs64:
+;CHECK: vqadd.s64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
@@ -93,6 +109,8 @@ define <2 x i64> @vqaddQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
}
define <16 x i8> @vqaddQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vqaddQu8:
+;CHECK: vqadd.u8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
@@ -100,6 +118,8 @@ define <16 x i8> @vqaddQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
}
define <8 x i16> @vqaddQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vqaddQu16:
+;CHECK: vqadd.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
@@ -107,6 +127,8 @@ define <8 x i16> @vqaddQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
}
define <4 x i32> @vqaddQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vqaddQu32:
+;CHECK: vqadd.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
@@ -114,6 +136,8 @@ define <4 x i32> @vqaddQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
}
define <2 x i64> @vqaddQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK: vqaddQu64:
+;CHECK: vqadd.u64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
diff --git a/test/CodeGen/ARM/vqdmul.ll b/test/CodeGen/ARM/vqdmul.ll
new file mode 100644
index 0000000..8dcc7f7
--- /dev/null
+++ b/test/CodeGen/ARM/vqdmul.ll
@@ -0,0 +1,281 @@
+; RUN: llc -mattr=+neon < %s | FileCheck %s
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
+target triple = "thumbv7-elf"
+
+define <4 x i16> @vqdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vqdmulhs16:
+;CHECK: vqdmulh.s16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vqdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vqdmulhs32:
+;CHECK: vqdmulh.s32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <8 x i16> @vqdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vqdmulhQs16:
+;CHECK: vqdmulh.s16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vqdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vqdmulhQs32:
+;CHECK: vqdmulh.s32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqdmulhQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
+entry:
+; CHECK: test_vqdmulhQ_lanes16
+; CHECK: vqdmulh.s16 q0, q0, d2[1]
+ %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; <<8 x i16>> [#uses=1]
+ %1 = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %arg0_int16x8_t, <8 x i16> %0) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %1
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vqdmulhQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
+entry:
+; CHECK: test_vqdmulhQ_lanes32
+; CHECK: vqdmulh.s32 q0, q0, d2[1]
+ %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1]
+ %1 = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %1
+}
+
+define arm_aapcs_vfpcc <4 x i16> @test_vqdmulh_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
+entry:
+; CHECK: test_vqdmulh_lanes16
+; CHECK: vqdmulh.s16 d0, d0, d1[1]
+ %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
+ %1 = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i16>> [#uses=1]
+ ret <4 x i16> %1
+}
+
+define arm_aapcs_vfpcc <2 x i32> @test_vqdmulh_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
+entry:
+; CHECK: test_vqdmulh_lanes32
+; CHECK: vqdmulh.s32 d0, d0, d1[1]
+ %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
+ %1 = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i32>> [#uses=1]
+ ret <2 x i32> %1
+}
+
+declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i16> @vqrdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vqrdmulhs16:
+;CHECK: vqrdmulh.s16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vqrdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vqrdmulhs32:
+;CHECK: vqrdmulh.s32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <8 x i16> @vqrdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vqrdmulhQs16:
+;CHECK: vqrdmulh.s16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vqrdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vqrdmulhQs32:
+;CHECK: vqrdmulh.s32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vqRdmulhQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
+entry:
+; CHECK: test_vqRdmulhQ_lanes16
+; CHECK: vqrdmulh.s16 q0, q0, d2[1]
+ %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; <<8 x i16>> [#uses=1]
+ %1 = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %arg0_int16x8_t, <8 x i16> %0) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %1
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vqRdmulhQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
+entry:
+; CHECK: test_vqRdmulhQ_lanes32
+; CHECK: vqrdmulh.s32 q0, q0, d2[1]
+ %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1]
+ %1 = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %1
+}
+
+define arm_aapcs_vfpcc <4 x i16> @test_vqRdmulh_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
+entry:
+; CHECK: test_vqRdmulh_lanes16
+; CHECK: vqrdmulh.s16 d0, d0, d1[1]
+ %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
+ %1 = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i16>> [#uses=1]
+ ret <4 x i16> %1
+}
+
+define arm_aapcs_vfpcc <2 x i32> @test_vqRdmulh_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
+entry:
+; CHECK: test_vqRdmulh_lanes32
+; CHECK: vqrdmulh.s32 d0, d0, d1[1]
+ %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
+ %1 = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i32>> [#uses=1]
+ ret <2 x i32> %1
+}
+
+declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @vqdmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vqdmulls16:
+;CHECK: vqdmull.s16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vqdmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vqdmulls32:
+;CHECK: vqdmull.s32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vqdmull_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
+entry:
+; CHECK: test_vqdmull_lanes16
+; CHECK: vqdmull.s16 q0, d0, d1[1]
+ %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
+ %1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %1
+}
+
+define arm_aapcs_vfpcc <2 x i64> @test_vqdmull_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
+entry:
+; CHECK: test_vqdmull_lanes32
+; CHECK: vqdmull.s32 q0, d0, d1[1]
+ %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
+ %1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %1
+}
+
+declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+
+define <4 x i32> @vqdmlals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+;CHECK: vqdmlals16:
+;CHECK: vqdmlal.s16
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i16>* %C
+ %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @vqdmlals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+;CHECK: vqdmlals32:
+;CHECK: vqdmlal.s32
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i32>* %C
+ %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
+ ret <2 x i64> %tmp4
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vqdmlal_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
+entry:
+; CHECK: test_vqdmlal_lanes16
+; CHECK: vqdmlal.s16 q0, d2, d3[1]
+ %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
+ %1 = tail call <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %1
+}
+
+define arm_aapcs_vfpcc <2 x i64> @test_vqdmlal_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
+entry:
+; CHECK: test_vqdmlal_lanes32
+; CHECK: vqdmlal.s32 q0, d2, d3[1]
+ %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
+ %1 = tail call <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %1
+}
+
+declare <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
+
+define <4 x i32> @vqdmlsls16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+;CHECK: vqdmlsls16:
+;CHECK: vqdmlsl.s16
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i16>* %C
+ %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @vqdmlsls32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+;CHECK: vqdmlsls32:
+;CHECK: vqdmlsl.s32
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i32>* %C
+ %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
+ ret <2 x i64> %tmp4
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vqdmlsl_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
+entry:
+; CHECK: test_vqdmlsl_lanes16
+; CHECK: vqdmlsl.s16 q0, d2, d3[1]
+ %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
+ %1 = tail call <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %1
+}
+
+define arm_aapcs_vfpcc <2 x i64> @test_vqdmlsl_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
+entry:
+; CHECK: test_vqdmlsl_lanes32
+; CHECK: vqdmlsl.s32 q0, d2, d3[1]
+ %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
+ %1 = tail call <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %1
+}
+
+declare <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vqshl.ll b/test/CodeGen/ARM/vqshl.ll
index 60b04bd..e4d29a3 100644
--- a/test/CodeGen/ARM/vqshl.ll
+++ b/test/CodeGen/ARM/vqshl.ll
@@ -1,26 +1,8 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep {vqshl\\.s8} %t | count 4
-; RUN: grep {vqshl\\.s16} %t | count 4
-; RUN: grep {vqshl\\.s32} %t | count 4
-; RUN: grep {vqshl\\.s64} %t | count 4
-; RUN: grep {vqshl\\.u8} %t | count 4
-; RUN: grep {vqshl\\.u16} %t | count 4
-; RUN: grep {vqshl\\.u32} %t | count 4
-; RUN: grep {vqshl\\.u64} %t | count 4
-; RUN: grep {vqshl\\.s8.*#7} %t | count 2
-; RUN: grep {vqshl\\.s16.*#15} %t | count 2
-; RUN: grep {vqshl\\.s32.*#31} %t | count 2
-; RUN: grep {vqshl\\.s64.*#63} %t | count 2
-; RUN: grep {vqshl\\.u8.*#7} %t | count 2
-; RUN: grep {vqshl\\.u16.*#15} %t | count 2
-; RUN: grep {vqshl\\.u32.*#31} %t | count 2
-; RUN: grep {vqshl\\.u64.*#63} %t | count 2
-; RUN: grep {vqshlu\\.s8} %t | count 2
-; RUN: grep {vqshlu\\.s16} %t | count 2
-; RUN: grep {vqshlu\\.s32} %t | count 2
-; RUN: grep {vqshlu\\.s64} %t | count 2
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vqshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vqshls8:
+;CHECK: vqshl.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
@@ -28,6 +10,8 @@ define <8 x i8> @vqshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
}
define <4 x i16> @vqshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vqshls16:
+;CHECK: vqshl.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
@@ -35,6 +19,8 @@ define <4 x i16> @vqshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
}
define <2 x i32> @vqshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vqshls32:
+;CHECK: vqshl.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
@@ -42,6 +28,8 @@ define <2 x i32> @vqshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
}
define <1 x i64> @vqshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK: vqshls64:
+;CHECK: vqshl.s64
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
@@ -49,6 +37,8 @@ define <1 x i64> @vqshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
}
define <8 x i8> @vqshlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vqshlu8:
+;CHECK: vqshl.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
@@ -56,6 +46,8 @@ define <8 x i8> @vqshlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
}
define <4 x i16> @vqshlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vqshlu16:
+;CHECK: vqshl.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
@@ -63,6 +55,8 @@ define <4 x i16> @vqshlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
}
define <2 x i32> @vqshlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vqshlu32:
+;CHECK: vqshl.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
@@ -70,6 +64,8 @@ define <2 x i32> @vqshlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
}
define <1 x i64> @vqshlu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK: vqshlu64:
+;CHECK: vqshl.u64
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
@@ -77,6 +73,8 @@ define <1 x i64> @vqshlu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
}
define <16 x i8> @vqshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vqshlQs8:
+;CHECK: vqshl.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
@@ -84,6 +82,8 @@ define <16 x i8> @vqshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
}
define <8 x i16> @vqshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vqshlQs16:
+;CHECK: vqshl.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
@@ -91,6 +91,8 @@ define <8 x i16> @vqshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
}
define <4 x i32> @vqshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vqshlQs32:
+;CHECK: vqshl.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
@@ -98,6 +100,8 @@ define <4 x i32> @vqshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
}
define <2 x i64> @vqshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK: vqshlQs64:
+;CHECK: vqshl.s64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
@@ -105,6 +109,8 @@ define <2 x i64> @vqshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
}
define <16 x i8> @vqshlQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vqshlQu8:
+;CHECK: vqshl.u8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
@@ -112,6 +118,8 @@ define <16 x i8> @vqshlQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
}
define <8 x i16> @vqshlQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vqshlQu16:
+;CHECK: vqshl.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
@@ -119,6 +127,8 @@ define <8 x i16> @vqshlQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
}
define <4 x i32> @vqshlQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vqshlQu32:
+;CHECK: vqshl.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
@@ -126,6 +136,8 @@ define <4 x i32> @vqshlQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
}
define <2 x i64> @vqshlQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK: vqshlQu64:
+;CHECK: vqshl.u64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
@@ -133,144 +145,192 @@ define <2 x i64> @vqshlQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
}
define <8 x i8> @vqshls_n8(<8 x i8>* %A) nounwind {
+;CHECK: vqshls_n8:
+;CHECK: vqshl.s8{{.*#7}}
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqshls_n16(<4 x i16>* %A) nounwind {
+;CHECK: vqshls_n16:
+;CHECK: vqshl.s16{{.*#15}}
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqshls_n32(<2 x i32>* %A) nounwind {
+;CHECK: vqshls_n32:
+;CHECK: vqshl.s32{{.*#31}}
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 31, i32 31 >)
ret <2 x i32> %tmp2
}
define <1 x i64> @vqshls_n64(<1 x i64>* %A) nounwind {
+;CHECK: vqshls_n64:
+;CHECK: vqshl.s64{{.*#63}}
%tmp1 = load <1 x i64>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 63 >)
ret <1 x i64> %tmp2
}
define <8 x i8> @vqshlu_n8(<8 x i8>* %A) nounwind {
+;CHECK: vqshlu_n8:
+;CHECK: vqshl.u8{{.*#7}}
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqshlu_n16(<4 x i16>* %A) nounwind {
+;CHECK: vqshlu_n16:
+;CHECK: vqshl.u16{{.*#15}}
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqshlu_n32(<2 x i32>* %A) nounwind {
+;CHECK: vqshlu_n32:
+;CHECK: vqshl.u32{{.*#31}}
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 31, i32 31 >)
ret <2 x i32> %tmp2
}
define <1 x i64> @vqshlu_n64(<1 x i64>* %A) nounwind {
+;CHECK: vqshlu_n64:
+;CHECK: vqshl.u64{{.*#63}}
%tmp1 = load <1 x i64>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 63 >)
ret <1 x i64> %tmp2
}
define <8 x i8> @vqshlsu_n8(<8 x i8>* %A) nounwind {
+;CHECK: vqshlsu_n8:
+;CHECK: vqshlu.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqshiftsu.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqshlsu_n16(<4 x i16>* %A) nounwind {
+;CHECK: vqshlsu_n16:
+;CHECK: vqshlu.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqshiftsu.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqshlsu_n32(<2 x i32>* %A) nounwind {
+;CHECK: vqshlsu_n32:
+;CHECK: vqshlu.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqshiftsu.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 31, i32 31 >)
ret <2 x i32> %tmp2
}
define <1 x i64> @vqshlsu_n64(<1 x i64>* %A) nounwind {
+;CHECK: vqshlsu_n64:
+;CHECK: vqshlu.s64
%tmp1 = load <1 x i64>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vqshiftsu.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 63 >)
ret <1 x i64> %tmp2
}
define <16 x i8> @vqshlQs_n8(<16 x i8>* %A) nounwind {
+;CHECK: vqshlQs_n8:
+;CHECK: vqshl.s8{{.*#7}}
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <16 x i8> %tmp2
}
define <8 x i16> @vqshlQs_n16(<8 x i16>* %A) nounwind {
+;CHECK: vqshlQs_n16:
+;CHECK: vqshl.s16{{.*#15}}
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vqshlQs_n32(<4 x i32>* %A) nounwind {
+;CHECK: vqshlQs_n32:
+;CHECK: vqshl.s32{{.*#31}}
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 31, i32 31, i32 31, i32 31 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vqshlQs_n64(<2 x i64>* %A) nounwind {
+;CHECK: vqshlQs_n64:
+;CHECK: vqshl.s64{{.*#63}}
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 63, i64 63 >)
ret <2 x i64> %tmp2
}
define <16 x i8> @vqshlQu_n8(<16 x i8>* %A) nounwind {
+;CHECK: vqshlQu_n8:
+;CHECK: vqshl.u8{{.*#7}}
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <16 x i8> %tmp2
}
define <8 x i16> @vqshlQu_n16(<8 x i16>* %A) nounwind {
+;CHECK: vqshlQu_n16:
+;CHECK: vqshl.u16{{.*#15}}
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vqshlQu_n32(<4 x i32>* %A) nounwind {
+;CHECK: vqshlQu_n32:
+;CHECK: vqshl.u32{{.*#31}}
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 31, i32 31, i32 31, i32 31 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vqshlQu_n64(<2 x i64>* %A) nounwind {
+;CHECK: vqshlQu_n64:
+;CHECK: vqshl.u64{{.*#63}}
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 63, i64 63 >)
ret <2 x i64> %tmp2
}
define <16 x i8> @vqshlQsu_n8(<16 x i8>* %A) nounwind {
+;CHECK: vqshlQsu_n8:
+;CHECK: vqshlu.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vqshiftsu.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <16 x i8> %tmp2
}
define <8 x i16> @vqshlQsu_n16(<8 x i16>* %A) nounwind {
+;CHECK: vqshlQsu_n16:
+;CHECK: vqshlu.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vqshiftsu.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vqshlQsu_n32(<4 x i32>* %A) nounwind {
+;CHECK: vqshlQsu_n32:
+;CHECK: vqshlu.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vqshiftsu.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 31, i32 31, i32 31, i32 31 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vqshlQsu_n64(<2 x i64>* %A) nounwind {
+;CHECK: vqshlQsu_n64:
+;CHECK: vqshlu.s64
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vqshiftsu.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 63, i64 63 >)
ret <2 x i64> %tmp2
@@ -305,3 +365,167 @@ declare <16 x i8> @llvm.arm.neon.vqshiftsu.v16i8(<16 x i8>, <16 x i8>) nounwind
declare <8 x i16> @llvm.arm.neon.vqshiftsu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vqshiftsu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqshiftsu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i8> @vqrshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vqrshls8:
+;CHECK: vqrshl.s8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vqrshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vqrshls16:
+;CHECK: vqrshl.s16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vqrshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vqrshls32:
+;CHECK: vqrshl.s32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @vqrshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK: vqrshls64:
+;CHECK: vqrshl.s64
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
+ ret <1 x i64> %tmp3
+}
+
+define <8 x i8> @vqrshlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vqrshlu8:
+;CHECK: vqrshl.u8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vqrshlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vqrshlu16:
+;CHECK: vqrshl.u16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vqrshlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vqrshlu32:
+;CHECK: vqrshl.u32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @vqrshlu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK: vqrshlu64:
+;CHECK: vqrshl.u64
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
+ ret <1 x i64> %tmp3
+}
+
+define <16 x i8> @vqrshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vqrshlQs8:
+;CHECK: vqrshl.s8
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vqrshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vqrshlQs16:
+;CHECK: vqrshl.s16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vqrshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vqrshlQs32:
+;CHECK: vqrshl.s32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vqrshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK: vqrshlQs64:
+;CHECK: vqrshl.s64
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <16 x i8> @vqrshlQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vqrshlQu8:
+;CHECK: vqrshl.u8
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vqrshlQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vqrshlQu16:
+;CHECK: vqrshl.u16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vqrshlQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vqrshlQu32:
+;CHECK: vqrshl.u32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vqrshlQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK: vqrshlQu64:
+;CHECK: vqrshl.u64
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM/vqshrn.ll b/test/CodeGen/ARM/vqshrn.ll
index 6bd607a..5da7943 100644
--- a/test/CodeGen/ARM/vqshrn.ll
+++ b/test/CodeGen/ARM/vqshrn.ll
@@ -1,63 +1,72 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep {vqshrn\\.s16} %t | count 1
-; RUN: grep {vqshrn\\.s32} %t | count 1
-; RUN: grep {vqshrn\\.s64} %t | count 1
-; RUN: grep {vqshrn\\.u16} %t | count 1
-; RUN: grep {vqshrn\\.u32} %t | count 1
-; RUN: grep {vqshrn\\.u64} %t | count 1
-; RUN: grep {vqshrun\\.s16} %t | count 1
-; RUN: grep {vqshrun\\.s32} %t | count 1
-; RUN: grep {vqshrun\\.s64} %t | count 1
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vqshrns8(<8 x i16>* %A) nounwind {
+;CHECK: vqshrns8:
+;CHECK: vqshrn.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqshiftns.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqshrns16(<4 x i32>* %A) nounwind {
+;CHECK: vqshrns16:
+;CHECK: vqshrn.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqshiftns.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqshrns32(<2 x i64>* %A) nounwind {
+;CHECK: vqshrns32:
+;CHECK: vqshrn.s64
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqshiftns.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
ret <2 x i32> %tmp2
}
define <8 x i8> @vqshrnu8(<8 x i16>* %A) nounwind {
+;CHECK: vqshrnu8:
+;CHECK: vqshrn.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqshiftnu.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqshrnu16(<4 x i32>* %A) nounwind {
+;CHECK: vqshrnu16:
+;CHECK: vqshrn.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqshrnu32(<2 x i64>* %A) nounwind {
+;CHECK: vqshrnu32:
+;CHECK: vqshrn.u64
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
ret <2 x i32> %tmp2
}
define <8 x i8> @vqshruns8(<8 x i16>* %A) nounwind {
+;CHECK: vqshruns8:
+;CHECK: vqshrun.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vqshruns16(<4 x i32>* %A) nounwind {
+;CHECK: vqshruns16:
+;CHECK: vqshrun.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vqshruns32(<2 x i64>* %A) nounwind {
+;CHECK: vqshruns32:
+;CHECK: vqshrun.s64
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
ret <2 x i32> %tmp2
@@ -74,3 +83,87 @@ declare <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64>, <2 x i64>) nounwind
declare <8 x i8> @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i8> @vqrshrns8(<8 x i16>* %A) nounwind {
+;CHECK: vqrshrns8:
+;CHECK: vqrshrn.s16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vqrshrns16(<4 x i32>* %A) nounwind {
+;CHECK: vqrshrns16:
+;CHECK: vqrshrn.s32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vqrshrns32(<2 x i64>* %A) nounwind {
+;CHECK: vqrshrns32:
+;CHECK: vqrshrn.s64
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
+ ret <2 x i32> %tmp2
+}
+
+define <8 x i8> @vqrshrnu8(<8 x i16>* %A) nounwind {
+;CHECK: vqrshrnu8:
+;CHECK: vqrshrn.u16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vqrshrnu16(<4 x i32>* %A) nounwind {
+;CHECK: vqrshrnu16:
+;CHECK: vqrshrn.u32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vqrshrnu32(<2 x i64>* %A) nounwind {
+;CHECK: vqrshrnu32:
+;CHECK: vqrshrn.u64
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
+ ret <2 x i32> %tmp2
+}
+
+define <8 x i8> @vqrshruns8(<8 x i16>* %A) nounwind {
+;CHECK: vqrshruns8:
+;CHECK: vqrshrun.s16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vqrshruns16(<4 x i32>* %A) nounwind {
+;CHECK: vqrshruns16:
+;CHECK: vqrshrun.s32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vqrshruns32(<2 x i64>* %A) nounwind {
+;CHECK: vqrshruns32:
+;CHECK: vqrshrun.s64
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
+ ret <2 x i32> %tmp2
+}
+
+declare <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM/vqsub.ll b/test/CodeGen/ARM/vqsub.ll
index 07052f7..4231fca 100644
--- a/test/CodeGen/ARM/vqsub.ll
+++ b/test/CodeGen/ARM/vqsub.ll
@@ -1,14 +1,8 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep {vqsub\\.s8} %t | count 2
-; RUN: grep {vqsub\\.s16} %t | count 2
-; RUN: grep {vqsub\\.s32} %t | count 2
-; RUN: grep {vqsub\\.s64} %t | count 2
-; RUN: grep {vqsub\\.u8} %t | count 2
-; RUN: grep {vqsub\\.u16} %t | count 2
-; RUN: grep {vqsub\\.u32} %t | count 2
-; RUN: grep {vqsub\\.u64} %t | count 2
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vqsubs8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vqsubs8:
+;CHECK: vqsub.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
@@ -16,6 +10,8 @@ define <8 x i8> @vqsubs8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
}
define <4 x i16> @vqsubs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vqsubs16:
+;CHECK: vqsub.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
@@ -23,6 +19,8 @@ define <4 x i16> @vqsubs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
}
define <2 x i32> @vqsubs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vqsubs32:
+;CHECK: vqsub.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
@@ -30,6 +28,8 @@ define <2 x i32> @vqsubs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
}
define <1 x i64> @vqsubs64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK: vqsubs64:
+;CHECK: vqsub.s64
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
@@ -37,6 +37,8 @@ define <1 x i64> @vqsubs64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
}
define <8 x i8> @vqsubu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vqsubu8:
+;CHECK: vqsub.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
@@ -44,6 +46,8 @@ define <8 x i8> @vqsubu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
}
define <4 x i16> @vqsubu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vqsubu16:
+;CHECK: vqsub.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
@@ -51,6 +55,8 @@ define <4 x i16> @vqsubu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
}
define <2 x i32> @vqsubu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vqsubu32:
+;CHECK: vqsub.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
@@ -58,6 +64,8 @@ define <2 x i32> @vqsubu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
}
define <1 x i64> @vqsubu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK: vqsubu64:
+;CHECK: vqsub.u64
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
@@ -65,6 +73,8 @@ define <1 x i64> @vqsubu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
}
define <16 x i8> @vqsubQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vqsubQs8:
+;CHECK: vqsub.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
@@ -72,6 +82,8 @@ define <16 x i8> @vqsubQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
}
define <8 x i16> @vqsubQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vqsubQs16:
+;CHECK: vqsub.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
@@ -79,6 +91,8 @@ define <8 x i16> @vqsubQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
}
define <4 x i32> @vqsubQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vqsubQs32:
+;CHECK: vqsub.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
@@ -86,6 +100,8 @@ define <4 x i32> @vqsubQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
}
define <2 x i64> @vqsubQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK: vqsubQs64:
+;CHECK: vqsub.s64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
@@ -93,6 +109,8 @@ define <2 x i64> @vqsubQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
}
define <16 x i8> @vqsubQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vqsubQu8:
+;CHECK: vqsub.u8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
@@ -100,6 +118,8 @@ define <16 x i8> @vqsubQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
}
define <8 x i16> @vqsubQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vqsubQu16:
+;CHECK: vqsub.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
@@ -107,6 +127,8 @@ define <8 x i16> @vqsubQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
}
define <4 x i32> @vqsubQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vqsubQu32:
+;CHECK: vqsub.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
@@ -114,6 +136,8 @@ define <4 x i32> @vqsubQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
}
define <2 x i64> @vqsubQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK: vqsubQu64:
+;CHECK: vqsub.u64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
diff --git a/test/CodeGen/ARM/vrec.ll b/test/CodeGen/ARM/vrec.ll
new file mode 100644
index 0000000..99989e9
--- /dev/null
+++ b/test/CodeGen/ARM/vrec.ll
@@ -0,0 +1,119 @@
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+
+define <2 x i32> @vrecpei32(<2 x i32>* %A) nounwind {
+;CHECK: vrecpei32:
+;CHECK: vrecpe.u32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32> %tmp1)
+ ret <2 x i32> %tmp2
+}
+
+define <4 x i32> @vrecpeQi32(<4 x i32>* %A) nounwind {
+;CHECK: vrecpeQi32:
+;CHECK: vrecpe.u32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32> %tmp1)
+ ret <4 x i32> %tmp2
+}
+
+define <2 x float> @vrecpef32(<2 x float>* %A) nounwind {
+;CHECK: vrecpef32:
+;CHECK: vrecpe.f32
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %tmp1)
+ ret <2 x float> %tmp2
+}
+
+define <4 x float> @vrecpeQf32(<4 x float>* %A) nounwind {
+;CHECK: vrecpeQf32:
+;CHECK: vrecpe.f32
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %tmp1)
+ ret <4 x float> %tmp2
+}
+
+declare <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32>) nounwind readnone
+
+declare <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone
+
+define <2 x float> @vrecpsf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK: vrecpsf32:
+;CHECK: vrecps.f32
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @vrecpsQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK: vrecpsQf32:
+;CHECK: vrecps.f32
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x float> %tmp3
+}
+
+declare <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone
+
+define <2 x i32> @vrsqrtei32(<2 x i32>* %A) nounwind {
+;CHECK: vrsqrtei32:
+;CHECK: vrsqrte.u32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32> %tmp1)
+ ret <2 x i32> %tmp2
+}
+
+define <4 x i32> @vrsqrteQi32(<4 x i32>* %A) nounwind {
+;CHECK: vrsqrteQi32:
+;CHECK: vrsqrte.u32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32> %tmp1)
+ ret <4 x i32> %tmp2
+}
+
+define <2 x float> @vrsqrtef32(<2 x float>* %A) nounwind {
+;CHECK: vrsqrtef32:
+;CHECK: vrsqrte.f32
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %tmp1)
+ ret <2 x float> %tmp2
+}
+
+define <4 x float> @vrsqrteQf32(<4 x float>* %A) nounwind {
+;CHECK: vrsqrteQf32:
+;CHECK: vrsqrte.f32
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %tmp1)
+ ret <4 x float> %tmp2
+}
+
+declare <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32>) nounwind readnone
+
+declare <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone
+
+define <2 x float> @vrsqrtsf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK: vrsqrtsf32:
+;CHECK: vrsqrts.f32
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @vrsqrtsQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK: vrsqrtsQf32:
+;CHECK: vrsqrts.f32
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x float> %tmp3
+}
+
+declare <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone
diff --git a/test/CodeGen/ARM/vrev.ll b/test/CodeGen/ARM/vrev.ll
new file mode 100644
index 0000000..f0a04a4
--- /dev/null
+++ b/test/CodeGen/ARM/vrev.ll
@@ -0,0 +1,113 @@
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+
+define arm_apcscc <8 x i8> @test_vrev64D8(<8 x i8>* %A) nounwind {
+;CHECK: test_vrev64D8:
+;CHECK: vrev64.8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+ ret <8 x i8> %tmp2
+}
+
+define arm_apcscc <4 x i16> @test_vrev64D16(<4 x i16>* %A) nounwind {
+;CHECK: test_vrev64D16:
+;CHECK: vrev64.16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ ret <4 x i16> %tmp2
+}
+
+define arm_apcscc <2 x i32> @test_vrev64D32(<2 x i32>* %A) nounwind {
+;CHECK: test_vrev64D32:
+;CHECK: vrev64.32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+ ret <2 x i32> %tmp2
+}
+
+define arm_apcscc <2 x float> @test_vrev64Df(<2 x float>* %A) nounwind {
+;CHECK: test_vrev64Df:
+;CHECK: vrev64.32
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> <i32 1, i32 0>
+ ret <2 x float> %tmp2
+}
+
+define arm_apcscc <16 x i8> @test_vrev64Q8(<16 x i8>* %A) nounwind {
+;CHECK: test_vrev64Q8:
+;CHECK: vrev64.8
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+ ret <16 x i8> %tmp2
+}
+
+define arm_apcscc <8 x i16> @test_vrev64Q16(<8 x i16>* %A) nounwind {
+;CHECK: test_vrev64Q16:
+;CHECK: vrev64.16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+ ret <8 x i16> %tmp2
+}
+
+define arm_apcscc <4 x i32> @test_vrev64Q32(<4 x i32>* %A) nounwind {
+;CHECK: test_vrev64Q32:
+;CHECK: vrev64.32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ ret <4 x i32> %tmp2
+}
+
+define arm_apcscc <4 x float> @test_vrev64Qf(<4 x float>* %A) nounwind {
+;CHECK: test_vrev64Qf:
+;CHECK: vrev64.32
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ ret <4 x float> %tmp2
+}
+
+define arm_apcscc <8 x i8> @test_vrev32D8(<8 x i8>* %A) nounwind {
+;CHECK: test_vrev32D8:
+;CHECK: vrev32.8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+ ret <8 x i8> %tmp2
+}
+
+define arm_apcscc <4 x i16> @test_vrev32D16(<4 x i16>* %A) nounwind {
+;CHECK: test_vrev32D16:
+;CHECK: vrev32.16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ ret <4 x i16> %tmp2
+}
+
+define arm_apcscc <16 x i8> @test_vrev32Q8(<16 x i8>* %A) nounwind {
+;CHECK: test_vrev32Q8:
+;CHECK: vrev32.8
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+ ret <16 x i8> %tmp2
+}
+
+define arm_apcscc <8 x i16> @test_vrev32Q16(<8 x i16>* %A) nounwind {
+;CHECK: test_vrev32Q16:
+;CHECK: vrev32.16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+ ret <8 x i16> %tmp2
+}
+
+define arm_apcscc <8 x i8> @test_vrev16D8(<8 x i8>* %A) nounwind {
+;CHECK: test_vrev16D8:
+;CHECK: vrev16.8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+ ret <8 x i8> %tmp2
+}
+
+define arm_apcscc <16 x i8> @test_vrev16Q8(<16 x i8>* %A) nounwind {
+;CHECK: test_vrev16Q8:
+;CHECK: vrev16.8
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+ ret <16 x i8> %tmp2
+}
diff --git a/test/CodeGen/ARM/vshift.ll b/test/CodeGen/ARM/vshift.ll
index 8c5c4aa..f3cbec7 100644
--- a/test/CodeGen/ARM/vshift.ll
+++ b/test/CodeGen/ARM/vshift.ll
@@ -1,30 +1,8 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep {vshl\\.s8} %t | count 2
-; RUN: grep {vshl\\.s16} %t | count 2
-; RUN: grep {vshl\\.s32} %t | count 2
-; RUN: grep {vshl\\.s64} %t | count 2
-; RUN: grep {vshl\\.u8} %t | count 4
-; RUN: grep {vshl\\.u16} %t | count 4
-; RUN: grep {vshl\\.u32} %t | count 4
-; RUN: grep {vshl\\.u64} %t | count 4
-; RUN: grep {vshl\\.i8} %t | count 2
-; RUN: grep {vshl\\.i16} %t | count 2
-; RUN: grep {vshl\\.i32} %t | count 2
-; RUN: grep {vshl\\.i64} %t | count 2
-; RUN: grep {vshr\\.u8} %t | count 2
-; RUN: grep {vshr\\.u16} %t | count 2
-; RUN: grep {vshr\\.u32} %t | count 2
-; RUN: grep {vshr\\.u64} %t | count 2
-; RUN: grep {vshr\\.s8} %t | count 2
-; RUN: grep {vshr\\.s16} %t | count 2
-; RUN: grep {vshr\\.s32} %t | count 2
-; RUN: grep {vshr\\.s64} %t | count 2
-; RUN: grep {vneg\\.s8} %t | count 4
-; RUN: grep {vneg\\.s16} %t | count 4
-; RUN: grep {vneg\\.s32} %t | count 4
-; RUN: grep {vsub\\.i64} %t | count 4
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vshls8:
+;CHECK: vshl.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = shl <8 x i8> %tmp1, %tmp2
@@ -32,6 +10,8 @@ define <8 x i8> @vshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
}
define <4 x i16> @vshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vshls16:
+;CHECK: vshl.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = shl <4 x i16> %tmp1, %tmp2
@@ -39,6 +19,8 @@ define <4 x i16> @vshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
}
define <2 x i32> @vshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vshls32:
+;CHECK: vshl.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = shl <2 x i32> %tmp1, %tmp2
@@ -46,6 +28,8 @@ define <2 x i32> @vshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
}
define <1 x i64> @vshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK: vshls64:
+;CHECK: vshl.u64
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = shl <1 x i64> %tmp1, %tmp2
@@ -53,30 +37,40 @@ define <1 x i64> @vshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
}
define <8 x i8> @vshli8(<8 x i8>* %A) nounwind {
+;CHECK: vshli8:
+;CHECK: vshl.i8
%tmp1 = load <8 x i8>* %A
%tmp2 = shl <8 x i8> %tmp1, < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >
ret <8 x i8> %tmp2
}
define <4 x i16> @vshli16(<4 x i16>* %A) nounwind {
+;CHECK: vshli16:
+;CHECK: vshl.i16
%tmp1 = load <4 x i16>* %A
%tmp2 = shl <4 x i16> %tmp1, < i16 15, i16 15, i16 15, i16 15 >
ret <4 x i16> %tmp2
}
define <2 x i32> @vshli32(<2 x i32>* %A) nounwind {
+;CHECK: vshli32:
+;CHECK: vshl.i32
%tmp1 = load <2 x i32>* %A
%tmp2 = shl <2 x i32> %tmp1, < i32 31, i32 31 >
ret <2 x i32> %tmp2
}
define <1 x i64> @vshli64(<1 x i64>* %A) nounwind {
+;CHECK: vshli64:
+;CHECK: vshl.i64
%tmp1 = load <1 x i64>* %A
%tmp2 = shl <1 x i64> %tmp1, < i64 63 >
ret <1 x i64> %tmp2
}
define <16 x i8> @vshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vshlQs8:
+;CHECK: vshl.u8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = shl <16 x i8> %tmp1, %tmp2
@@ -84,6 +78,8 @@ define <16 x i8> @vshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
}
define <8 x i16> @vshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vshlQs16:
+;CHECK: vshl.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = shl <8 x i16> %tmp1, %tmp2
@@ -91,6 +87,8 @@ define <8 x i16> @vshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
}
define <4 x i32> @vshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vshlQs32:
+;CHECK: vshl.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = shl <4 x i32> %tmp1, %tmp2
@@ -98,6 +96,8 @@ define <4 x i32> @vshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
}
define <2 x i64> @vshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK: vshlQs64:
+;CHECK: vshl.u64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = shl <2 x i64> %tmp1, %tmp2
@@ -105,30 +105,41 @@ define <2 x i64> @vshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
}
define <16 x i8> @vshlQi8(<16 x i8>* %A) nounwind {
+;CHECK: vshlQi8:
+;CHECK: vshl.i8
%tmp1 = load <16 x i8>* %A
%tmp2 = shl <16 x i8> %tmp1, < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >
ret <16 x i8> %tmp2
}
define <8 x i16> @vshlQi16(<8 x i16>* %A) nounwind {
+;CHECK: vshlQi16:
+;CHECK: vshl.i16
%tmp1 = load <8 x i16>* %A
%tmp2 = shl <8 x i16> %tmp1, < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >
ret <8 x i16> %tmp2
}
define <4 x i32> @vshlQi32(<4 x i32>* %A) nounwind {
+;CHECK: vshlQi32:
+;CHECK: vshl.i32
%tmp1 = load <4 x i32>* %A
%tmp2 = shl <4 x i32> %tmp1, < i32 31, i32 31, i32 31, i32 31 >
ret <4 x i32> %tmp2
}
define <2 x i64> @vshlQi64(<2 x i64>* %A) nounwind {
+;CHECK: vshlQi64:
+;CHECK: vshl.i64
%tmp1 = load <2 x i64>* %A
%tmp2 = shl <2 x i64> %tmp1, < i64 63, i64 63 >
ret <2 x i64> %tmp2
}
define <8 x i8> @vlshru8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vlshru8:
+;CHECK: vneg.s8
+;CHECK: vshl.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = lshr <8 x i8> %tmp1, %tmp2
@@ -136,6 +147,9 @@ define <8 x i8> @vlshru8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
}
define <4 x i16> @vlshru16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vlshru16:
+;CHECK: vneg.s16
+;CHECK: vshl.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = lshr <4 x i16> %tmp1, %tmp2
@@ -143,6 +157,9 @@ define <4 x i16> @vlshru16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
}
define <2 x i32> @vlshru32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vlshru32:
+;CHECK: vneg.s32
+;CHECK: vshl.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = lshr <2 x i32> %tmp1, %tmp2
@@ -150,6 +167,9 @@ define <2 x i32> @vlshru32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
}
define <1 x i64> @vlshru64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK: vlshru64:
+;CHECK: vsub.i64
+;CHECK: vshl.u64
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = lshr <1 x i64> %tmp1, %tmp2
@@ -157,30 +177,41 @@ define <1 x i64> @vlshru64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
}
define <8 x i8> @vlshri8(<8 x i8>* %A) nounwind {
+;CHECK: vlshri8:
+;CHECK: vshr.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = lshr <8 x i8> %tmp1, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
ret <8 x i8> %tmp2
}
define <4 x i16> @vlshri16(<4 x i16>* %A) nounwind {
+;CHECK: vlshri16:
+;CHECK: vshr.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = lshr <4 x i16> %tmp1, < i16 16, i16 16, i16 16, i16 16 >
ret <4 x i16> %tmp2
}
define <2 x i32> @vlshri32(<2 x i32>* %A) nounwind {
+;CHECK: vlshri32:
+;CHECK: vshr.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = lshr <2 x i32> %tmp1, < i32 32, i32 32 >
ret <2 x i32> %tmp2
}
define <1 x i64> @vlshri64(<1 x i64>* %A) nounwind {
+;CHECK: vlshri64:
+;CHECK: vshr.u64
%tmp1 = load <1 x i64>* %A
%tmp2 = lshr <1 x i64> %tmp1, < i64 64 >
ret <1 x i64> %tmp2
}
define <16 x i8> @vlshrQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vlshrQu8:
+;CHECK: vneg.s8
+;CHECK: vshl.u8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = lshr <16 x i8> %tmp1, %tmp2
@@ -188,6 +219,9 @@ define <16 x i8> @vlshrQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
}
define <8 x i16> @vlshrQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vlshrQu16:
+;CHECK: vneg.s16
+;CHECK: vshl.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = lshr <8 x i16> %tmp1, %tmp2
@@ -195,6 +229,9 @@ define <8 x i16> @vlshrQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
}
define <4 x i32> @vlshrQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vlshrQu32:
+;CHECK: vneg.s32
+;CHECK: vshl.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = lshr <4 x i32> %tmp1, %tmp2
@@ -202,6 +239,9 @@ define <4 x i32> @vlshrQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
}
define <2 x i64> @vlshrQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK: vlshrQu64:
+;CHECK: vsub.i64
+;CHECK: vshl.u64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = lshr <2 x i64> %tmp1, %tmp2
@@ -209,30 +249,48 @@ define <2 x i64> @vlshrQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
}
define <16 x i8> @vlshrQi8(<16 x i8>* %A) nounwind {
+;CHECK: vlshrQi8:
+;CHECK: vshr.u8
%tmp1 = load <16 x i8>* %A
%tmp2 = lshr <16 x i8> %tmp1, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
ret <16 x i8> %tmp2
}
define <8 x i16> @vlshrQi16(<8 x i16>* %A) nounwind {
+;CHECK: vlshrQi16:
+;CHECK: vshr.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = lshr <8 x i16> %tmp1, < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
ret <8 x i16> %tmp2
}
define <4 x i32> @vlshrQi32(<4 x i32>* %A) nounwind {
+;CHECK: vlshrQi32:
+;CHECK: vshr.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = lshr <4 x i32> %tmp1, < i32 32, i32 32, i32 32, i32 32 >
ret <4 x i32> %tmp2
}
define <2 x i64> @vlshrQi64(<2 x i64>* %A) nounwind {
+;CHECK: vlshrQi64:
+;CHECK: vshr.u64
%tmp1 = load <2 x i64>* %A
%tmp2 = lshr <2 x i64> %tmp1, < i64 64, i64 64 >
ret <2 x i64> %tmp2
}
+; Example that requires splitting and expanding a vector shift.
+define <2 x i64> @update(<2 x i64> %val) nounwind readnone {
+entry:
+ %shr = lshr <2 x i64> %val, < i64 2, i64 2 > ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %shr
+}
+
define <8 x i8> @vashrs8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vashrs8:
+;CHECK: vneg.s8
+;CHECK: vshl.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = ashr <8 x i8> %tmp1, %tmp2
@@ -240,6 +298,9 @@ define <8 x i8> @vashrs8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
}
define <4 x i16> @vashrs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vashrs16:
+;CHECK: vneg.s16
+;CHECK: vshl.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = ashr <4 x i16> %tmp1, %tmp2
@@ -247,6 +308,9 @@ define <4 x i16> @vashrs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
}
define <2 x i32> @vashrs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vashrs32:
+;CHECK: vneg.s32
+;CHECK: vshl.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = ashr <2 x i32> %tmp1, %tmp2
@@ -254,6 +318,9 @@ define <2 x i32> @vashrs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
}
define <1 x i64> @vashrs64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK: vashrs64:
+;CHECK: vsub.i64
+;CHECK: vshl.s64
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = ashr <1 x i64> %tmp1, %tmp2
@@ -261,30 +328,41 @@ define <1 x i64> @vashrs64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
}
define <8 x i8> @vashri8(<8 x i8>* %A) nounwind {
+;CHECK: vashri8:
+;CHECK: vshr.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = ashr <8 x i8> %tmp1, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
ret <8 x i8> %tmp2
}
define <4 x i16> @vashri16(<4 x i16>* %A) nounwind {
+;CHECK: vashri16:
+;CHECK: vshr.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = ashr <4 x i16> %tmp1, < i16 16, i16 16, i16 16, i16 16 >
ret <4 x i16> %tmp2
}
define <2 x i32> @vashri32(<2 x i32>* %A) nounwind {
+;CHECK: vashri32:
+;CHECK: vshr.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = ashr <2 x i32> %tmp1, < i32 32, i32 32 >
ret <2 x i32> %tmp2
}
define <1 x i64> @vashri64(<1 x i64>* %A) nounwind {
+;CHECK: vashri64:
+;CHECK: vshr.s64
%tmp1 = load <1 x i64>* %A
%tmp2 = ashr <1 x i64> %tmp1, < i64 64 >
ret <1 x i64> %tmp2
}
define <16 x i8> @vashrQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vashrQs8:
+;CHECK: vneg.s8
+;CHECK: vshl.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = ashr <16 x i8> %tmp1, %tmp2
@@ -292,6 +370,9 @@ define <16 x i8> @vashrQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
}
define <8 x i16> @vashrQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vashrQs16:
+;CHECK: vneg.s16
+;CHECK: vshl.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = ashr <8 x i16> %tmp1, %tmp2
@@ -299,6 +380,9 @@ define <8 x i16> @vashrQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
}
define <4 x i32> @vashrQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vashrQs32:
+;CHECK: vneg.s32
+;CHECK: vshl.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = ashr <4 x i32> %tmp1, %tmp2
@@ -306,6 +390,9 @@ define <4 x i32> @vashrQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
}
define <2 x i64> @vashrQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK: vashrQs64:
+;CHECK: vsub.i64
+;CHECK: vshl.s64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = ashr <2 x i64> %tmp1, %tmp2
@@ -313,24 +400,32 @@ define <2 x i64> @vashrQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
}
define <16 x i8> @vashrQi8(<16 x i8>* %A) nounwind {
+;CHECK: vashrQi8:
+;CHECK: vshr.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = ashr <16 x i8> %tmp1, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
ret <16 x i8> %tmp2
}
define <8 x i16> @vashrQi16(<8 x i16>* %A) nounwind {
+;CHECK: vashrQi16:
+;CHECK: vshr.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = ashr <8 x i16> %tmp1, < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
ret <8 x i16> %tmp2
}
define <4 x i32> @vashrQi32(<4 x i32>* %A) nounwind {
+;CHECK: vashrQi32:
+;CHECK: vshr.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = ashr <4 x i32> %tmp1, < i32 32, i32 32, i32 32, i32 32 >
ret <4 x i32> %tmp2
}
define <2 x i64> @vashrQi64(<2 x i64>* %A) nounwind {
+;CHECK: vashrQi64:
+;CHECK: vshr.s64
%tmp1 = load <2 x i64>* %A
%tmp2 = ashr <2 x i64> %tmp1, < i64 64, i64 64 >
ret <2 x i64> %tmp2
diff --git a/test/CodeGen/ARM/vshiftins.ll b/test/CodeGen/ARM/vshiftins.ll
index cb7cbb8..3a4f857 100644
--- a/test/CodeGen/ARM/vshiftins.ll
+++ b/test/CodeGen/ARM/vshiftins.ll
@@ -1,14 +1,8 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep {vsli\\.8} %t | count 2
-; RUN: grep {vsli\\.16} %t | count 2
-; RUN: grep {vsli\\.32} %t | count 2
-; RUN: grep {vsli\\.64} %t | count 2
-; RUN: grep {vsri\\.8} %t | count 2
-; RUN: grep {vsri\\.16} %t | count 2
-; RUN: grep {vsri\\.32} %t | count 2
-; RUN: grep {vsri\\.64} %t | count 2
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vsli8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vsli8:
+;CHECK: vsli.8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
@@ -16,6 +10,8 @@ define <8 x i8> @vsli8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
}
define <4 x i16> @vsli16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vsli16:
+;CHECK: vsli.16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >)
@@ -23,6 +19,8 @@ define <4 x i16> @vsli16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
}
define <2 x i32> @vsli32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vsli32:
+;CHECK: vsli.32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> < i32 31, i32 31 >)
@@ -30,6 +28,8 @@ define <2 x i32> @vsli32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
}
define <1 x i64> @vsli64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK: vsli64:
+;CHECK: vsli.64
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2, <1 x i64> < i64 63 >)
@@ -37,6 +37,8 @@ define <1 x i64> @vsli64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
}
define <16 x i8> @vsliQ8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vsliQ8:
+;CHECK: vsli.8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
@@ -44,6 +46,8 @@ define <16 x i8> @vsliQ8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
}
define <8 x i16> @vsliQ16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vsliQ16:
+;CHECK: vsli.16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i16> < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >)
@@ -51,6 +55,8 @@ define <8 x i16> @vsliQ16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
}
define <4 x i32> @vsliQ32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vsliQ32:
+;CHECK: vsli.32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> < i32 31, i32 31, i32 31, i32 31 >)
@@ -58,6 +64,8 @@ define <4 x i32> @vsliQ32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
}
define <2 x i64> @vsliQ64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK: vsliQ64:
+;CHECK: vsli.64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2, <2 x i64> < i64 63, i64 63 >)
@@ -65,6 +73,8 @@ define <2 x i64> @vsliQ64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
}
define <8 x i8> @vsri8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vsri8:
+;CHECK: vsri.8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
@@ -72,6 +82,8 @@ define <8 x i8> @vsri8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
}
define <4 x i16> @vsri16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vsri16:
+;CHECK: vsri.16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
@@ -79,6 +91,8 @@ define <4 x i16> @vsri16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
}
define <2 x i32> @vsri32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vsri32:
+;CHECK: vsri.32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> < i32 -32, i32 -32 >)
@@ -86,6 +100,8 @@ define <2 x i32> @vsri32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
}
define <1 x i64> @vsri64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK: vsri64:
+;CHECK: vsri.64
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2, <1 x i64> < i64 -64 >)
@@ -93,6 +109,8 @@ define <1 x i64> @vsri64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
}
define <16 x i8> @vsriQ8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vsriQ8:
+;CHECK: vsri.8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
@@ -100,6 +118,8 @@ define <16 x i8> @vsriQ8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
}
define <8 x i16> @vsriQ16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vsriQ16:
+;CHECK: vsri.16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
@@ -107,6 +127,8 @@ define <8 x i16> @vsriQ16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
}
define <4 x i32> @vsriQ32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vsriQ32:
+;CHECK: vsri.32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
@@ -114,6 +136,8 @@ define <4 x i32> @vsriQ32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
}
define <2 x i64> @vsriQ64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK: vsriQ64:
+;CHECK: vsri.64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2, <2 x i64> < i64 -64, i64 -64 >)
diff --git a/test/CodeGen/ARM/vshl.ll b/test/CodeGen/ARM/vshl.ll
index 993126e..818e71b 100644
--- a/test/CodeGen/ARM/vshl.ll
+++ b/test/CodeGen/ARM/vshl.ll
@@ -1,26 +1,8 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep {vshl\\.s8} %t | count 2
-; RUN: grep {vshl\\.s16} %t | count 2
-; RUN: grep {vshl\\.s32} %t | count 2
-; RUN: grep {vshl\\.s64} %t | count 2
-; RUN: grep {vshl\\.u8} %t | count 2
-; RUN: grep {vshl\\.u16} %t | count 2
-; RUN: grep {vshl\\.u32} %t | count 2
-; RUN: grep {vshl\\.u64} %t | count 2
-; RUN: grep {vshl\\.i8} %t | count 2
-; RUN: grep {vshl\\.i16} %t | count 2
-; RUN: grep {vshl\\.i32} %t | count 2
-; RUN: grep {vshl\\.i64} %t | count 2
-; RUN: grep {vshr\\.s8} %t | count 2
-; RUN: grep {vshr\\.s16} %t | count 2
-; RUN: grep {vshr\\.s32} %t | count 2
-; RUN: grep {vshr\\.s64} %t | count 2
-; RUN: grep {vshr\\.u8} %t | count 2
-; RUN: grep {vshr\\.u16} %t | count 2
-; RUN: grep {vshr\\.u32} %t | count 2
-; RUN: grep {vshr\\.u64} %t | count 2
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vshls8:
+;CHECK: vshl.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
@@ -28,6 +10,8 @@ define <8 x i8> @vshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
}
define <4 x i16> @vshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vshls16:
+;CHECK: vshl.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
@@ -35,6 +19,8 @@ define <4 x i16> @vshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
}
define <2 x i32> @vshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vshls32:
+;CHECK: vshl.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
@@ -42,6 +28,8 @@ define <2 x i32> @vshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
}
define <1 x i64> @vshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK: vshls64:
+;CHECK: vshl.s64
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
@@ -49,6 +37,8 @@ define <1 x i64> @vshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
}
define <8 x i8> @vshlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vshlu8:
+;CHECK: vshl.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
@@ -56,6 +46,8 @@ define <8 x i8> @vshlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
}
define <4 x i16> @vshlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vshlu16:
+;CHECK: vshl.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
@@ -63,6 +55,8 @@ define <4 x i16> @vshlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
}
define <2 x i32> @vshlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vshlu32:
+;CHECK: vshl.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
@@ -70,6 +64,8 @@ define <2 x i32> @vshlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
}
define <1 x i64> @vshlu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK: vshlu64:
+;CHECK: vshl.u64
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
@@ -77,6 +73,8 @@ define <1 x i64> @vshlu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
}
define <16 x i8> @vshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vshlQs8:
+;CHECK: vshl.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
@@ -84,6 +82,8 @@ define <16 x i8> @vshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
}
define <8 x i16> @vshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vshlQs16:
+;CHECK: vshl.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
@@ -91,6 +91,8 @@ define <8 x i16> @vshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
}
define <4 x i32> @vshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vshlQs32:
+;CHECK: vshl.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
@@ -98,6 +100,8 @@ define <4 x i32> @vshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
}
define <2 x i64> @vshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK: vshlQs64:
+;CHECK: vshl.s64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
@@ -105,6 +109,8 @@ define <2 x i64> @vshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
}
define <16 x i8> @vshlQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vshlQu8:
+;CHECK: vshl.u8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
@@ -112,6 +118,8 @@ define <16 x i8> @vshlQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
}
define <8 x i16> @vshlQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vshlQu16:
+;CHECK: vshl.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
@@ -119,6 +127,8 @@ define <8 x i16> @vshlQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
}
define <4 x i32> @vshlQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vshlQu32:
+;CHECK: vshl.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
@@ -126,6 +136,8 @@ define <4 x i32> @vshlQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
}
define <2 x i64> @vshlQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK: vshlQu64:
+;CHECK: vshl.u64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
@@ -136,48 +148,64 @@ define <2 x i64> @vshlQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
; Test a mix of signed and unsigned intrinsics.
define <8 x i8> @vshli8(<8 x i8>* %A) nounwind {
+;CHECK: vshli8:
+;CHECK: vshl.i8
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vshli16(<4 x i16>* %A) nounwind {
+;CHECK: vshli16:
+;CHECK: vshl.i16
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vshli32(<2 x i32>* %A) nounwind {
+;CHECK: vshli32:
+;CHECK: vshl.i32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 31, i32 31 >)
ret <2 x i32> %tmp2
}
define <1 x i64> @vshli64(<1 x i64>* %A) nounwind {
+;CHECK: vshli64:
+;CHECK: vshl.i64
%tmp1 = load <1 x i64>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 63 >)
ret <1 x i64> %tmp2
}
define <16 x i8> @vshlQi8(<16 x i8>* %A) nounwind {
+;CHECK: vshlQi8:
+;CHECK: vshl.i8
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <16 x i8> %tmp2
}
define <8 x i16> @vshlQi16(<8 x i16>* %A) nounwind {
+;CHECK: vshlQi16:
+;CHECK: vshl.i16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vshlQi32(<4 x i32>* %A) nounwind {
+;CHECK: vshlQi32:
+;CHECK: vshl.i32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 31, i32 31, i32 31, i32 31 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vshlQi64(<2 x i64>* %A) nounwind {
+;CHECK: vshlQi64:
+;CHECK: vshl.i64
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 63, i64 63 >)
ret <2 x i64> %tmp2
@@ -186,96 +214,128 @@ define <2 x i64> @vshlQi64(<2 x i64>* %A) nounwind {
; Right shift by immediate:
define <8 x i8> @vshrs8(<8 x i8>* %A) nounwind {
+;CHECK: vshrs8:
+;CHECK: vshr.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vshrs16(<4 x i16>* %A) nounwind {
+;CHECK: vshrs16:
+;CHECK: vshr.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vshrs32(<2 x i32>* %A) nounwind {
+;CHECK: vshrs32:
+;CHECK: vshr.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 -32, i32 -32 >)
ret <2 x i32> %tmp2
}
define <1 x i64> @vshrs64(<1 x i64>* %A) nounwind {
+;CHECK: vshrs64:
+;CHECK: vshr.s64
%tmp1 = load <1 x i64>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 -64 >)
ret <1 x i64> %tmp2
}
define <8 x i8> @vshru8(<8 x i8>* %A) nounwind {
+;CHECK: vshru8:
+;CHECK: vshr.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vshru16(<4 x i16>* %A) nounwind {
+;CHECK: vshru16:
+;CHECK: vshr.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vshru32(<2 x i32>* %A) nounwind {
+;CHECK: vshru32:
+;CHECK: vshr.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 -32, i32 -32 >)
ret <2 x i32> %tmp2
}
define <1 x i64> @vshru64(<1 x i64>* %A) nounwind {
+;CHECK: vshru64:
+;CHECK: vshr.u64
%tmp1 = load <1 x i64>* %A
%tmp2 = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 -64 >)
ret <1 x i64> %tmp2
}
define <16 x i8> @vshrQs8(<16 x i8>* %A) nounwind {
+;CHECK: vshrQs8:
+;CHECK: vshr.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
ret <16 x i8> %tmp2
}
define <8 x i16> @vshrQs16(<8 x i16>* %A) nounwind {
+;CHECK: vshrQs16:
+;CHECK: vshr.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vshrQs32(<4 x i32>* %A) nounwind {
+;CHECK: vshrQs32:
+;CHECK: vshr.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vshrQs64(<2 x i64>* %A) nounwind {
+;CHECK: vshrQs64:
+;CHECK: vshr.s64
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 -64, i64 -64 >)
ret <2 x i64> %tmp2
}
define <16 x i8> @vshrQu8(<16 x i8>* %A) nounwind {
+;CHECK: vshrQu8:
+;CHECK: vshr.u8
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
ret <16 x i8> %tmp2
}
define <8 x i16> @vshrQu16(<8 x i16>* %A) nounwind {
+;CHECK: vshrQu16:
+;CHECK: vshr.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vshrQu32(<4 x i32>* %A) nounwind {
+;CHECK: vshrQu32:
+;CHECK: vshr.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vshrQu64(<2 x i64>* %A) nounwind {
+;CHECK: vshrQu64:
+;CHECK: vshr.u64
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 -64, i64 -64 >)
ret <2 x i64> %tmp2
@@ -300,3 +360,295 @@ declare <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind re
declare <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i8> @vrshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vrshls8:
+;CHECK: vrshl.s8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vrshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vrshls16:
+;CHECK: vrshl.s16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vrshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vrshls32:
+;CHECK: vrshl.s32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @vrshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK: vrshls64:
+;CHECK: vrshl.s64
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
+ ret <1 x i64> %tmp3
+}
+
+define <8 x i8> @vrshlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vrshlu8:
+;CHECK: vrshl.u8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vrshlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vrshlu16:
+;CHECK: vrshl.u16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vrshlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vrshlu32:
+;CHECK: vrshl.u32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @vrshlu64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK: vrshlu64:
+;CHECK: vrshl.u64
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2)
+ ret <1 x i64> %tmp3
+}
+
+define <16 x i8> @vrshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vrshlQs8:
+;CHECK: vrshl.s8
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vrshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vrshlQs16:
+;CHECK: vrshl.s16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vrshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vrshlQs32:
+;CHECK: vrshl.s32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vrshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK: vrshlQs64:
+;CHECK: vrshl.s64
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <16 x i8> @vrshlQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vrshlQu8:
+;CHECK: vrshl.u8
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @vrshlQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vrshlQu16:
+;CHECK: vrshl.u16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vrshlQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vrshlQu32:
+;CHECK: vrshl.u32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vrshlQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK: vrshlQu64:
+;CHECK: vrshl.u64
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i8> @vrshrs8(<8 x i8>* %A) nounwind {
+;CHECK: vrshrs8:
+;CHECK: vrshr.s8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vrshrs16(<4 x i16>* %A) nounwind {
+;CHECK: vrshrs16:
+;CHECK: vrshr.s16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vrshrs32(<2 x i32>* %A) nounwind {
+;CHECK: vrshrs32:
+;CHECK: vrshr.s32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 -32, i32 -32 >)
+ ret <2 x i32> %tmp2
+}
+
+define <1 x i64> @vrshrs64(<1 x i64>* %A) nounwind {
+;CHECK: vrshrs64:
+;CHECK: vrshr.s64
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 -64 >)
+ ret <1 x i64> %tmp2
+}
+
+define <8 x i8> @vrshru8(<8 x i8>* %A) nounwind {
+;CHECK: vrshru8:
+;CHECK: vrshr.u8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vrshru16(<4 x i16>* %A) nounwind {
+;CHECK: vrshru16:
+;CHECK: vrshr.u16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vrshru32(<2 x i32>* %A) nounwind {
+;CHECK: vrshru32:
+;CHECK: vrshr.u32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 -32, i32 -32 >)
+ ret <2 x i32> %tmp2
+}
+
+define <1 x i64> @vrshru64(<1 x i64>* %A) nounwind {
+;CHECK: vrshru64:
+;CHECK: vrshr.u64
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 -64 >)
+ ret <1 x i64> %tmp2
+}
+
+define <16 x i8> @vrshrQs8(<16 x i8>* %A) nounwind {
+;CHECK: vrshrQs8:
+;CHECK: vrshr.s8
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @vrshrQs16(<8 x i16>* %A) nounwind {
+;CHECK: vrshrQs16:
+;CHECK: vrshr.s16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vrshrQs32(<4 x i32>* %A) nounwind {
+;CHECK: vrshrQs32:
+;CHECK: vrshr.s32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @vrshrQs64(<2 x i64>* %A) nounwind {
+;CHECK: vrshrQs64:
+;CHECK: vrshr.s64
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 -64, i64 -64 >)
+ ret <2 x i64> %tmp2
+}
+
+define <16 x i8> @vrshrQu8(<16 x i8>* %A) nounwind {
+;CHECK: vrshrQu8:
+;CHECK: vrshr.u8
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @vrshrQu16(<8 x i16>* %A) nounwind {
+;CHECK: vrshrQu16:
+;CHECK: vrshr.u16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vrshrQu32(<4 x i32>* %A) nounwind {
+;CHECK: vrshrQu32:
+;CHECK: vrshr.u32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @vrshrQu64(<2 x i64>* %A) nounwind {
+;CHECK: vrshrQu64:
+;CHECK: vrshr.u64
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 -64, i64 -64 >)
+ ret <2 x i64> %tmp2
+}
+
+declare <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
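
[Editorial note, not part of the diff above] The vrshl/vrshr tests added to vshl.ll exercise NEON's rounding shifts. As a minimal scalar sketch of the 8-bit case (assuming the standard ARM definition of a rounding right shift, and using hypothetical helper names that are not part of LLVM or of this test suite), a rounding right shift by n adds a rounding constant of 1 << (n-1) before shifting:

    #include <stdint.h>

    /* Illustrative scalar model of a NEON rounding right shift by n (1 <= n <= 8):
       a rounding constant of 1 << (n-1) is added before the shift. The helper
       names are hypothetical; they only mirror what vrshr.u8 / vrshr.s8 check. */
    static inline uint8_t rshr_round_u8(uint8_t x, unsigned n) {
        return (uint8_t)(((uint32_t)x + (1u << (n - 1))) >> n);
    }

    static inline int8_t rshr_round_s8(int8_t x, unsigned n) {
        /* assumes arithmetic right shift of the widened signed value,
           as on typical compilers */
        return (int8_t)(((int32_t)x + (1 << (n - 1))) >> n);
    }

Note that the IR encodes "shift right by n" by passing a splat of -n to the vrshifts/vrshiftu intrinsics, which is why the constant vectors in the tests above are negative.
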
diff --git a/test/CodeGen/ARM/vshll.ll b/test/CodeGen/ARM/vshll.ll
index f81c09a..8e85b98 100644
--- a/test/CodeGen/ARM/vshll.ll
+++ b/test/CodeGen/ARM/vshll.ll
@@ -1,45 +1,48 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep {vshll\\.s8} %t | count 1
-; RUN: grep {vshll\\.s16} %t | count 1
-; RUN: grep {vshll\\.s32} %t | count 1
-; RUN: grep {vshll\\.u8} %t | count 1
-; RUN: grep {vshll\\.u16} %t | count 1
-; RUN: grep {vshll\\.u32} %t | count 1
-; RUN: grep {vshll\\.i8} %t | count 1
-; RUN: grep {vshll\\.i16} %t | count 1
-; RUN: grep {vshll\\.i32} %t | count 1
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i16> @vshlls8(<8 x i8>* %A) nounwind {
+;CHECK: vshlls8:
+;CHECK: vshll.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vshiftls.v8i16(<8 x i8> %tmp1, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vshlls16(<4 x i16>* %A) nounwind {
+;CHECK: vshlls16:
+;CHECK: vshll.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vshiftls.v4i32(<4 x i16> %tmp1, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vshlls32(<2 x i32>* %A) nounwind {
+;CHECK: vshlls32:
+;CHECK: vshll.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vshiftls.v2i64(<2 x i32> %tmp1, <2 x i32> < i32 31, i32 31 >)
ret <2 x i64> %tmp2
}
define <8 x i16> @vshllu8(<8 x i8>* %A) nounwind {
+;CHECK: vshllu8:
+;CHECK: vshll.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vshiftlu.v8i16(<8 x i8> %tmp1, <8 x i8> < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vshllu16(<4 x i16>* %A) nounwind {
+;CHECK: vshllu16:
+;CHECK: vshll.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vshiftlu.v4i32(<4 x i16> %tmp1, <4 x i16> < i16 15, i16 15, i16 15, i16 15 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vshllu32(<2 x i32>* %A) nounwind {
+;CHECK: vshllu32:
+;CHECK: vshll.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vshiftlu.v2i64(<2 x i32> %tmp1, <2 x i32> < i32 31, i32 31 >)
ret <2 x i64> %tmp2
@@ -48,18 +51,24 @@ define <2 x i64> @vshllu32(<2 x i32>* %A) nounwind {
; The following tests use the maximum shift count, so the signedness is
; irrelevant. Test both signed and unsigned versions.
define <8 x i16> @vshlli8(<8 x i8>* %A) nounwind {
+;CHECK: vshlli8:
+;CHECK: vshll.i8
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vshiftls.v8i16(<8 x i8> %tmp1, <8 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >)
ret <8 x i16> %tmp2
}
define <4 x i32> @vshlli16(<4 x i16>* %A) nounwind {
+;CHECK: vshlli16:
+;CHECK: vshll.i16
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vshiftlu.v4i32(<4 x i16> %tmp1, <4 x i16> < i16 16, i16 16, i16 16, i16 16 >)
ret <4 x i32> %tmp2
}
define <2 x i64> @vshlli32(<2 x i32>* %A) nounwind {
+;CHECK: vshlli32:
+;CHECK: vshll.i32
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vshiftls.v2i64(<2 x i32> %tmp1, <2 x i32> < i32 32, i32 32 >)
ret <2 x i64> %tmp2
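
[Editorial note, not part of the diff above] The vshll.i8/i16/i32 checks correspond to the maximum-shift-count case called out in the comment: once a lane is widened to twice its width and shifted left by the full source width, the extension bits are shifted out, so sign- and zero-extension give the same result. A small self-contained C check of that claim for the 8-bit case (illustrative only, not part of the test suite):

    #include <assert.h>
    #include <stdint.h>

    /* Widen an 8-bit lane to 16 bits (zero- or sign-extended) and shift left by 8.
       The shift is performed on an unsigned value to keep it well defined in C. */
    static uint16_t widen_shl_u(uint8_t x) { return (uint16_t)((uint16_t)x << 8); }
    static uint16_t widen_shl_s(int8_t x)  { return (uint16_t)((uint16_t)(int16_t)x << 8); }

    int main(void) {
        for (int v = 0; v < 256; ++v)   /* identical for every 8-bit value */
            assert(widen_shl_u((uint8_t)v) == widen_shl_s((int8_t)(uint8_t)v));
        return 0;
    }
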
diff --git a/test/CodeGen/ARM/vshrn.ll b/test/CodeGen/ARM/vshrn.ll
index bc640cb..e2544f4 100644
--- a/test/CodeGen/ARM/vshrn.ll
+++ b/test/CodeGen/ARM/vshrn.ll
@@ -1,21 +1,24 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep {vshrn\\.i16} %t | count 1
-; RUN: grep {vshrn\\.i32} %t | count 1
-; RUN: grep {vshrn\\.i64} %t | count 1
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vshrns8(<8 x i16>* %A) nounwind {
+;CHECK: vshrns8:
+;CHECK: vshrn.i16
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vshiftn.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
ret <8 x i8> %tmp2
}
define <4 x i16> @vshrns16(<4 x i32>* %A) nounwind {
+;CHECK: vshrns16:
+;CHECK: vshrn.i32
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
ret <4 x i16> %tmp2
}
define <2 x i32> @vshrns32(<2 x i64>* %A) nounwind {
+;CHECK: vshrns32:
+;CHECK: vshrn.i64
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vshiftn.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
ret <2 x i32> %tmp2
@@ -24,3 +27,31 @@ define <2 x i32> @vshrns32(<2 x i64>* %A) nounwind {
declare <8 x i8> @llvm.arm.neon.vshiftn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vshiftn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i8> @vrshrns8(<8 x i16>* %A) nounwind {
+;CHECK: vrshrns8:
+;CHECK: vrshrn.i16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vrshrns16(<4 x i32>* %A) nounwind {
+;CHECK: vrshrns16:
+;CHECK: vrshrn.i32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vrshrns32(<2 x i64>* %A) nounwind {
+;CHECK: vrshrns32:
+;CHECK: vrshrn.i64
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
+ ret <2 x i32> %tmp2
+}
+
+declare <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM/vsra.ll b/test/CodeGen/ARM/vsra.ll
index e2829dc..acb672d 100644
--- a/test/CodeGen/ARM/vsra.ll
+++ b/test/CodeGen/ARM/vsra.ll
@@ -1,22 +1,8 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep {vsra\\.s8} %t | count 2
-; RUN: grep {vsra\\.s16} %t | count 2
-; RUN: grep {vsra\\.s32} %t | count 2
-; RUN: grep {vsra\\.s64} %t | count 2
-; RUN: grep {vsra\\.u8} %t | count 2
-; RUN: grep {vsra\\.u16} %t | count 2
-; RUN: grep {vsra\\.u32} %t | count 2
-; RUN: grep {vsra\\.u64} %t | count 2
-; RUN: grep {vrsra\\.s8} %t | count 2
-; RUN: grep {vrsra\\.s16} %t | count 2
-; RUN: grep {vrsra\\.s32} %t | count 2
-; RUN: grep {vrsra\\.s64} %t | count 2
-; RUN: grep {vrsra\\.u8} %t | count 2
-; RUN: grep {vrsra\\.u16} %t | count 2
-; RUN: grep {vrsra\\.u32} %t | count 2
-; RUN: grep {vrsra\\.u64} %t | count 2
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vsras8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vsras8:
+;CHECK: vsra.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = ashr <8 x i8> %tmp2, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
@@ -25,6 +11,8 @@ define <8 x i8> @vsras8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
}
define <4 x i16> @vsras16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vsras16:
+;CHECK: vsra.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = ashr <4 x i16> %tmp2, < i16 16, i16 16, i16 16, i16 16 >
@@ -33,6 +21,8 @@ define <4 x i16> @vsras16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
}
define <2 x i32> @vsras32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vsras32:
+;CHECK: vsra.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = ashr <2 x i32> %tmp2, < i32 32, i32 32 >
@@ -41,6 +31,8 @@ define <2 x i32> @vsras32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
}
define <1 x i64> @vsras64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK: vsras64:
+;CHECK: vsra.s64
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = ashr <1 x i64> %tmp2, < i64 64 >
@@ -49,6 +41,8 @@ define <1 x i64> @vsras64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
}
define <16 x i8> @vsraQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vsraQs8:
+;CHECK: vsra.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = ashr <16 x i8> %tmp2, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
@@ -57,6 +51,8 @@ define <16 x i8> @vsraQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
}
define <8 x i16> @vsraQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vsraQs16:
+;CHECK: vsra.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = ashr <8 x i16> %tmp2, < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
@@ -65,6 +61,8 @@ define <8 x i16> @vsraQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
}
define <4 x i32> @vsraQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vsraQs32:
+;CHECK: vsra.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = ashr <4 x i32> %tmp2, < i32 32, i32 32, i32 32, i32 32 >
@@ -73,6 +71,8 @@ define <4 x i32> @vsraQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
}
define <2 x i64> @vsraQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK: vsraQs64:
+;CHECK: vsra.s64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = ashr <2 x i64> %tmp2, < i64 64, i64 64 >
@@ -81,6 +81,8 @@ define <2 x i64> @vsraQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
}
define <8 x i8> @vsrau8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vsrau8:
+;CHECK: vsra.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = lshr <8 x i8> %tmp2, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
@@ -89,6 +91,8 @@ define <8 x i8> @vsrau8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
}
define <4 x i16> @vsrau16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vsrau16:
+;CHECK: vsra.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = lshr <4 x i16> %tmp2, < i16 16, i16 16, i16 16, i16 16 >
@@ -97,6 +101,8 @@ define <4 x i16> @vsrau16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
}
define <2 x i32> @vsrau32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vsrau32:
+;CHECK: vsra.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = lshr <2 x i32> %tmp2, < i32 32, i32 32 >
@@ -105,6 +111,8 @@ define <2 x i32> @vsrau32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
}
define <1 x i64> @vsrau64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK: vsrau64:
+;CHECK: vsra.u64
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = lshr <1 x i64> %tmp2, < i64 64 >
@@ -113,6 +121,8 @@ define <1 x i64> @vsrau64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
}
define <16 x i8> @vsraQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vsraQu8:
+;CHECK: vsra.u8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = lshr <16 x i8> %tmp2, < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
@@ -121,6 +131,8 @@ define <16 x i8> @vsraQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
}
define <8 x i16> @vsraQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vsraQu16:
+;CHECK: vsra.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = lshr <8 x i16> %tmp2, < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
@@ -129,6 +141,8 @@ define <8 x i16> @vsraQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
}
define <4 x i32> @vsraQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vsraQu32:
+;CHECK: vsra.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = lshr <4 x i32> %tmp2, < i32 32, i32 32, i32 32, i32 32 >
@@ -137,6 +151,8 @@ define <4 x i32> @vsraQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
}
define <2 x i64> @vsraQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK: vsraQu64:
+;CHECK: vsra.u64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = lshr <2 x i64> %tmp2, < i64 64, i64 64 >
@@ -145,6 +161,8 @@ define <2 x i64> @vsraQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
}
define <8 x i8> @vrsras8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vrsras8:
+;CHECK: vrsra.s8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %tmp2, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
@@ -153,6 +171,8 @@ define <8 x i8> @vrsras8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
}
define <4 x i16> @vrsras16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vrsras16:
+;CHECK: vrsra.s16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %tmp2, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
@@ -161,6 +181,8 @@ define <4 x i16> @vrsras16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
}
define <2 x i32> @vrsras32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vrsras32:
+;CHECK: vrsra.s32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %tmp2, <2 x i32> < i32 -32, i32 -32 >)
@@ -169,6 +191,8 @@ define <2 x i32> @vrsras32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
}
define <1 x i64> @vrsras64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK: vrsras64:
+;CHECK: vrsra.s64
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %tmp2, <1 x i64> < i64 -64 >)
@@ -177,6 +201,8 @@ define <1 x i64> @vrsras64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
}
define <8 x i8> @vrsrau8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vrsrau8:
+;CHECK: vrsra.u8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %tmp2, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
@@ -185,6 +211,8 @@ define <8 x i8> @vrsrau8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
}
define <4 x i16> @vrsrau16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vrsrau16:
+;CHECK: vrsra.u16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %tmp2, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >)
@@ -193,6 +221,8 @@ define <4 x i16> @vrsrau16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
}
define <2 x i32> @vrsrau32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vrsrau32:
+;CHECK: vrsra.u32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %tmp2, <2 x i32> < i32 -32, i32 -32 >)
@@ -201,6 +231,8 @@ define <2 x i32> @vrsrau32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
}
define <1 x i64> @vrsrau64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK: vrsrau64:
+;CHECK: vrsra.u64
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %tmp2, <1 x i64> < i64 -64 >)
@@ -209,6 +241,8 @@ define <1 x i64> @vrsrau64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
}
define <16 x i8> @vrsraQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vrsraQs8:
+;CHECK: vrsra.s8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %tmp2, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
@@ -217,6 +251,8 @@ define <16 x i8> @vrsraQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
}
define <8 x i16> @vrsraQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vrsraQs16:
+;CHECK: vrsra.s16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %tmp2, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
@@ -225,6 +261,8 @@ define <8 x i16> @vrsraQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
}
define <4 x i32> @vrsraQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vrsraQs32:
+;CHECK: vrsra.s32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %tmp2, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
@@ -233,6 +271,8 @@ define <4 x i32> @vrsraQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
}
define <2 x i64> @vrsraQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK: vrsraQs64:
+;CHECK: vrsra.s64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %tmp2, <2 x i64> < i64 -64, i64 -64 >)
@@ -241,6 +281,8 @@ define <2 x i64> @vrsraQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
}
define <16 x i8> @vrsraQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vrsraQu8:
+;CHECK: vrsra.u8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %tmp2, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >)
@@ -249,6 +291,8 @@ define <16 x i8> @vrsraQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
}
define <8 x i16> @vrsraQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vrsraQu16:
+;CHECK: vrsra.u16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %tmp2, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >)
@@ -257,6 +301,8 @@ define <8 x i16> @vrsraQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
}
define <4 x i32> @vrsraQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vrsraQu32:
+;CHECK: vrsra.u32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %tmp2, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >)
@@ -265,6 +311,8 @@ define <4 x i32> @vrsraQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
}
define <2 x i64> @vrsraQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK: vrsraQu64:
+;CHECK: vrsra.u64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %tmp2, <2 x i64> < i64 -64, i64 -64 >)
diff --git a/test/CodeGen/ARM/vst1.ll b/test/CodeGen/ARM/vst1.ll
new file mode 100644
index 0000000..602b124
--- /dev/null
+++ b/test/CodeGen/ARM/vst1.ll
@@ -0,0 +1,93 @@
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+
+define void @vst1i8(i8* %A, <8 x i8>* %B) nounwind {
+;CHECK: vst1i8:
+;CHECK: vst1.8
+ %tmp1 = load <8 x i8>* %B
+ call void @llvm.arm.neon.vst1.v8i8(i8* %A, <8 x i8> %tmp1)
+ ret void
+}
+
+define void @vst1i16(i16* %A, <4 x i16>* %B) nounwind {
+;CHECK: vst1i16:
+;CHECK: vst1.16
+ %tmp1 = load <4 x i16>* %B
+ call void @llvm.arm.neon.vst1.v4i16(i16* %A, <4 x i16> %tmp1)
+ ret void
+}
+
+define void @vst1i32(i32* %A, <2 x i32>* %B) nounwind {
+;CHECK: vst1i32:
+;CHECK: vst1.32
+ %tmp1 = load <2 x i32>* %B
+ call void @llvm.arm.neon.vst1.v2i32(i32* %A, <2 x i32> %tmp1)
+ ret void
+}
+
+define void @vst1f(float* %A, <2 x float>* %B) nounwind {
+;CHECK: vst1f:
+;CHECK: vst1.32
+ %tmp1 = load <2 x float>* %B
+ call void @llvm.arm.neon.vst1.v2f32(float* %A, <2 x float> %tmp1)
+ ret void
+}
+
+define void @vst1i64(i64* %A, <1 x i64>* %B) nounwind {
+;CHECK: vst1i64:
+;CHECK: vst1.64
+ %tmp1 = load <1 x i64>* %B
+ call void @llvm.arm.neon.vst1.v1i64(i64* %A, <1 x i64> %tmp1)
+ ret void
+}
+
+define void @vst1Qi8(i8* %A, <16 x i8>* %B) nounwind {
+;CHECK: vst1Qi8:
+;CHECK: vst1.8
+ %tmp1 = load <16 x i8>* %B
+ call void @llvm.arm.neon.vst1.v16i8(i8* %A, <16 x i8> %tmp1)
+ ret void
+}
+
+define void @vst1Qi16(i16* %A, <8 x i16>* %B) nounwind {
+;CHECK: vst1Qi16:
+;CHECK: vst1.16
+ %tmp1 = load <8 x i16>* %B
+ call void @llvm.arm.neon.vst1.v8i16(i16* %A, <8 x i16> %tmp1)
+ ret void
+}
+
+define void @vst1Qi32(i32* %A, <4 x i32>* %B) nounwind {
+;CHECK: vst1Qi32:
+;CHECK: vst1.32
+ %tmp1 = load <4 x i32>* %B
+ call void @llvm.arm.neon.vst1.v4i32(i32* %A, <4 x i32> %tmp1)
+ ret void
+}
+
+define void @vst1Qf(float* %A, <4 x float>* %B) nounwind {
+;CHECK: vst1Qf:
+;CHECK: vst1.32
+ %tmp1 = load <4 x float>* %B
+ call void @llvm.arm.neon.vst1.v4f32(float* %A, <4 x float> %tmp1)
+ ret void
+}
+
+define void @vst1Qi64(i64* %A, <2 x i64>* %B) nounwind {
+;CHECK: vst1Qi64:
+;CHECK: vst1.64
+ %tmp1 = load <2 x i64>* %B
+ call void @llvm.arm.neon.vst1.v2i64(i64* %A, <2 x i64> %tmp1)
+ ret void
+}
+
+declare void @llvm.arm.neon.vst1.v8i8(i8*, <8 x i8>) nounwind
+declare void @llvm.arm.neon.vst1.v4i16(i8*, <4 x i16>) nounwind
+declare void @llvm.arm.neon.vst1.v2i32(i8*, <2 x i32>) nounwind
+declare void @llvm.arm.neon.vst1.v2f32(i8*, <2 x float>) nounwind
+declare void @llvm.arm.neon.vst1.v1i64(i8*, <1 x i64>) nounwind
+
+declare void @llvm.arm.neon.vst1.v16i8(i8*, <16 x i8>) nounwind
+declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>) nounwind
+declare void @llvm.arm.neon.vst1.v4i32(i8*, <4 x i32>) nounwind
+declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>) nounwind
+declare void @llvm.arm.neon.vst1.v2i64(i8*, <2 x i64>) nounwind
diff --git a/test/CodeGen/ARM/vst2.ll b/test/CodeGen/ARM/vst2.ll
new file mode 100644
index 0000000..17d6bee
--- /dev/null
+++ b/test/CodeGen/ARM/vst2.ll
@@ -0,0 +1,84 @@
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+
+define void @vst2i8(i8* %A, <8 x i8>* %B) nounwind {
+;CHECK: vst2i8:
+;CHECK: vst2.8
+ %tmp1 = load <8 x i8>* %B
+ call void @llvm.arm.neon.vst2.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1)
+ ret void
+}
+
+define void @vst2i16(i16* %A, <4 x i16>* %B) nounwind {
+;CHECK: vst2i16:
+;CHECK: vst2.16
+ %tmp1 = load <4 x i16>* %B
+ call void @llvm.arm.neon.vst2.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1)
+ ret void
+}
+
+define void @vst2i32(i32* %A, <2 x i32>* %B) nounwind {
+;CHECK: vst2i32:
+;CHECK: vst2.32
+ %tmp1 = load <2 x i32>* %B
+ call void @llvm.arm.neon.vst2.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1)
+ ret void
+}
+
+define void @vst2f(float* %A, <2 x float>* %B) nounwind {
+;CHECK: vst2f:
+;CHECK: vst2.32
+ %tmp1 = load <2 x float>* %B
+ call void @llvm.arm.neon.vst2.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1)
+ ret void
+}
+
+define void @vst2i64(i64* %A, <1 x i64>* %B) nounwind {
+;CHECK: vst2i64:
+;CHECK: vst1.64
+ %tmp1 = load <1 x i64>* %B
+ call void @llvm.arm.neon.vst2.v1i64(i64* %A, <1 x i64> %tmp1, <1 x i64> %tmp1)
+ ret void
+}
+
+define void @vst2Qi8(i8* %A, <16 x i8>* %B) nounwind {
+;CHECK: vst2Qi8:
+;CHECK: vst2.8
+ %tmp1 = load <16 x i8>* %B
+ call void @llvm.arm.neon.vst2.v16i8(i8* %A, <16 x i8> %tmp1, <16 x i8> %tmp1)
+ ret void
+}
+
+define void @vst2Qi16(i16* %A, <8 x i16>* %B) nounwind {
+;CHECK: vst2Qi16:
+;CHECK: vst2.16
+ %tmp1 = load <8 x i16>* %B
+ call void @llvm.arm.neon.vst2.v8i16(i16* %A, <8 x i16> %tmp1, <8 x i16> %tmp1)
+ ret void
+}
+
+define void @vst2Qi32(i32* %A, <4 x i32>* %B) nounwind {
+;CHECK: vst2Qi32:
+;CHECK: vst2.32
+ %tmp1 = load <4 x i32>* %B
+ call void @llvm.arm.neon.vst2.v4i32(i32* %A, <4 x i32> %tmp1, <4 x i32> %tmp1)
+ ret void
+}
+
+define void @vst2Qf(float* %A, <4 x float>* %B) nounwind {
+;CHECK: vst2Qf:
+;CHECK: vst2.32
+ %tmp1 = load <4 x float>* %B
+ call void @llvm.arm.neon.vst2.v4f32(float* %A, <4 x float> %tmp1, <4 x float> %tmp1)
+ ret void
+}
+
+declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>) nounwind
+declare void @llvm.arm.neon.vst2.v4i16(i8*, <4 x i16>, <4 x i16>) nounwind
+declare void @llvm.arm.neon.vst2.v2i32(i8*, <2 x i32>, <2 x i32>) nounwind
+declare void @llvm.arm.neon.vst2.v2f32(i8*, <2 x float>, <2 x float>) nounwind
+declare void @llvm.arm.neon.vst2.v1i64(i8*, <1 x i64>, <1 x i64>) nounwind
+
+declare void @llvm.arm.neon.vst2.v16i8(i8*, <16 x i8>, <16 x i8>) nounwind
+declare void @llvm.arm.neon.vst2.v8i16(i8*, <8 x i16>, <8 x i16>) nounwind
+declare void @llvm.arm.neon.vst2.v4i32(i8*, <4 x i32>, <4 x i32>) nounwind
+declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>) nounwind
diff --git a/test/CodeGen/ARM/vst3.ll b/test/CodeGen/ARM/vst3.ll
new file mode 100644
index 0000000..a831a0c
--- /dev/null
+++ b/test/CodeGen/ARM/vst3.ll
@@ -0,0 +1,88 @@
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+
+define void @vst3i8(i8* %A, <8 x i8>* %B) nounwind {
+;CHECK: vst3i8:
+;CHECK: vst3.8
+ %tmp1 = load <8 x i8>* %B
+ call void @llvm.arm.neon.vst3.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1)
+ ret void
+}
+
+define void @vst3i16(i16* %A, <4 x i16>* %B) nounwind {
+;CHECK: vst3i16:
+;CHECK: vst3.16
+ %tmp1 = load <4 x i16>* %B
+ call void @llvm.arm.neon.vst3.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1)
+ ret void
+}
+
+define void @vst3i32(i32* %A, <2 x i32>* %B) nounwind {
+;CHECK: vst3i32:
+;CHECK: vst3.32
+ %tmp1 = load <2 x i32>* %B
+ call void @llvm.arm.neon.vst3.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1)
+ ret void
+}
+
+define void @vst3f(float* %A, <2 x float>* %B) nounwind {
+;CHECK: vst3f:
+;CHECK: vst3.32
+ %tmp1 = load <2 x float>* %B
+ call void @llvm.arm.neon.vst3.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1)
+ ret void
+}
+
+define void @vst3i64(i64* %A, <1 x i64>* %B) nounwind {
+;CHECK: vst3i64:
+;CHECK: vst1.64
+ %tmp1 = load <1 x i64>* %B
+ call void @llvm.arm.neon.vst3.v1i64(i64* %A, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1)
+ ret void
+}
+
+define void @vst3Qi8(i8* %A, <16 x i8>* %B) nounwind {
+;CHECK: vst3Qi8:
+;CHECK: vst3.8
+;CHECK: vst3.8
+ %tmp1 = load <16 x i8>* %B
+ call void @llvm.arm.neon.vst3.v16i8(i8* %A, <16 x i8> %tmp1, <16 x i8> %tmp1, <16 x i8> %tmp1)
+ ret void
+}
+
+define void @vst3Qi16(i16* %A, <8 x i16>* %B) nounwind {
+;CHECK: vst3Qi16:
+;CHECK: vst3.16
+;CHECK: vst3.16
+ %tmp1 = load <8 x i16>* %B
+ call void @llvm.arm.neon.vst3.v8i16(i16* %A, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1)
+ ret void
+}
+
+define void @vst3Qi32(i32* %A, <4 x i32>* %B) nounwind {
+;CHECK: vst3Qi32:
+;CHECK: vst3.32
+;CHECK: vst3.32
+ %tmp1 = load <4 x i32>* %B
+ call void @llvm.arm.neon.vst3.v4i32(i32* %A, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1)
+ ret void
+}
+
+define void @vst3Qf(float* %A, <4 x float>* %B) nounwind {
+;CHECK: vst3Qf:
+;CHECK: vst3.32
+;CHECK: vst3.32
+ %tmp1 = load <4 x float>* %B
+ call void @llvm.arm.neon.vst3.v4f32(float* %A, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1)
+ ret void
+}
+
+declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>) nounwind
+declare void @llvm.arm.neon.vst3.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>) nounwind
+declare void @llvm.arm.neon.vst3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>) nounwind
+declare void @llvm.arm.neon.vst3.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>) nounwind
+declare void @llvm.arm.neon.vst3.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>) nounwind
+
+declare void @llvm.arm.neon.vst3.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>) nounwind
+declare void @llvm.arm.neon.vst3.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>) nounwind
+declare void @llvm.arm.neon.vst3.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>) nounwind
+declare void @llvm.arm.neon.vst3.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>) nounwind
diff --git a/test/CodeGen/ARM/vst4.ll b/test/CodeGen/ARM/vst4.ll
new file mode 100644
index 0000000..d92c017
--- /dev/null
+++ b/test/CodeGen/ARM/vst4.ll
@@ -0,0 +1,88 @@
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+
+define void @vst4i8(i8* %A, <8 x i8>* %B) nounwind {
+;CHECK: vst4i8:
+;CHECK: vst4.8
+ %tmp1 = load <8 x i8>* %B
+ call void @llvm.arm.neon.vst4.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1)
+ ret void
+}
+
+define void @vst4i16(i16* %A, <4 x i16>* %B) nounwind {
+;CHECK: vst4i16:
+;CHECK: vst4.16
+ %tmp1 = load <4 x i16>* %B
+ call void @llvm.arm.neon.vst4.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1)
+ ret void
+}
+
+define void @vst4i32(i32* %A, <2 x i32>* %B) nounwind {
+;CHECK: vst4i32:
+;CHECK: vst4.32
+ %tmp1 = load <2 x i32>* %B
+ call void @llvm.arm.neon.vst4.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1)
+ ret void
+}
+
+define void @vst4f(float* %A, <2 x float>* %B) nounwind {
+;CHECK: vst4f:
+;CHECK: vst4.32
+ %tmp1 = load <2 x float>* %B
+ call void @llvm.arm.neon.vst4.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1)
+ ret void
+}
+
+define void @vst4i64(i64* %A, <1 x i64>* %B) nounwind {
+;CHECK: vst4i64:
+;CHECK: vst1.64
+ %tmp1 = load <1 x i64>* %B
+ call void @llvm.arm.neon.vst4.v1i64(i64* %A, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1)
+ ret void
+}
+
+define void @vst4Qi8(i8* %A, <16 x i8>* %B) nounwind {
+;CHECK: vst4Qi8:
+;CHECK: vst4.8
+;CHECK: vst4.8
+ %tmp1 = load <16 x i8>* %B
+ call void @llvm.arm.neon.vst4.v16i8(i8* %A, <16 x i8> %tmp1, <16 x i8> %tmp1, <16 x i8> %tmp1, <16 x i8> %tmp1)
+ ret void
+}
+
+define void @vst4Qi16(i16* %A, <8 x i16>* %B) nounwind {
+;CHECK: vst4Qi16:
+;CHECK: vst4.16
+;CHECK: vst4.16
+ %tmp1 = load <8 x i16>* %B
+ call void @llvm.arm.neon.vst4.v8i16(i16* %A, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1)
+ ret void
+}
+
+define void @vst4Qi32(i32* %A, <4 x i32>* %B) nounwind {
+;CHECK: vst4Qi32:
+;CHECK: vst4.32
+;CHECK: vst4.32
+ %tmp1 = load <4 x i32>* %B
+ call void @llvm.arm.neon.vst4.v4i32(i32* %A, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1)
+ ret void
+}
+
+define void @vst4Qf(float* %A, <4 x float>* %B) nounwind {
+;CHECK: vst4Qf:
+;CHECK: vst4.32
+;CHECK: vst4.32
+ %tmp1 = load <4 x float>* %B
+ call void @llvm.arm.neon.vst4.v4f32(float* %A, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1)
+ ret void
+}
+
+declare void @llvm.arm.neon.vst4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind
+declare void @llvm.arm.neon.vst4.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>) nounwind
+declare void @llvm.arm.neon.vst4.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>) nounwind
+declare void @llvm.arm.neon.vst4.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>) nounwind
+declare void @llvm.arm.neon.vst4.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>) nounwind
+
+declare void @llvm.arm.neon.vst4.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind
+declare void @llvm.arm.neon.vst4.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>) nounwind
+declare void @llvm.arm.neon.vst4.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>) nounwind
+declare void @llvm.arm.neon.vst4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>) nounwind
diff --git a/test/CodeGen/ARM/vstlane.ll b/test/CodeGen/ARM/vstlane.ll
new file mode 100644
index 0000000..3bfb14f
--- /dev/null
+++ b/test/CodeGen/ARM/vstlane.ll
@@ -0,0 +1,197 @@
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+
+define void @vst2lanei8(i8* %A, <8 x i8>* %B) nounwind {
+;CHECK: vst2lanei8:
+;CHECK: vst2.8
+ %tmp1 = load <8 x i8>* %B
+ call void @llvm.arm.neon.vst2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1)
+ ret void
+}
+
+define void @vst2lanei16(i16* %A, <4 x i16>* %B) nounwind {
+;CHECK: vst2lanei16:
+;CHECK: vst2.16
+ %tmp1 = load <4 x i16>* %B
+ call void @llvm.arm.neon.vst2lane.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1)
+ ret void
+}
+
+define void @vst2lanei32(i32* %A, <2 x i32>* %B) nounwind {
+;CHECK: vst2lanei32:
+;CHECK: vst2.32
+ %tmp1 = load <2 x i32>* %B
+ call void @llvm.arm.neon.vst2lane.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1)
+ ret void
+}
+
+define void @vst2lanef(float* %A, <2 x float>* %B) nounwind {
+;CHECK: vst2lanef:
+;CHECK: vst2.32
+ %tmp1 = load <2 x float>* %B
+ call void @llvm.arm.neon.vst2lane.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, i32 1)
+ ret void
+}
+
+define void @vst2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
+;CHECK: vst2laneQi16:
+;CHECK: vst2.16
+ %tmp1 = load <8 x i16>* %B
+ call void @llvm.arm.neon.vst2lane.v8i16(i16* %A, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1)
+ ret void
+}
+
+define void @vst2laneQi32(i32* %A, <4 x i32>* %B) nounwind {
+;CHECK: vst2laneQi32:
+;CHECK: vst2.32
+ %tmp1 = load <4 x i32>* %B
+ call void @llvm.arm.neon.vst2lane.v4i32(i32* %A, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2)
+ ret void
+}
+
+define void @vst2laneQf(float* %A, <4 x float>* %B) nounwind {
+;CHECK: vst2laneQf:
+;CHECK: vst2.32
+ %tmp1 = load <4 x float>* %B
+ call void @llvm.arm.neon.vst2lane.v4f32(float* %A, <4 x float> %tmp1, <4 x float> %tmp1, i32 3)
+ ret void
+}
+
+declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind
+declare void @llvm.arm.neon.vst2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32) nounwind
+declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32) nounwind
+declare void @llvm.arm.neon.vst2lane.v2f32(i8*, <2 x float>, <2 x float>, i32) nounwind
+
+declare void @llvm.arm.neon.vst2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32) nounwind
+declare void @llvm.arm.neon.vst2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32) nounwind
+declare void @llvm.arm.neon.vst2lane.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind
+
+define void @vst3lanei8(i8* %A, <8 x i8>* %B) nounwind {
+;CHECK: vst3lanei8:
+;CHECK: vst3.8
+ %tmp1 = load <8 x i8>* %B
+ call void @llvm.arm.neon.vst3lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1)
+ ret void
+}
+
+define void @vst3lanei16(i16* %A, <4 x i16>* %B) nounwind {
+;CHECK: vst3lanei16:
+;CHECK: vst3.16
+ %tmp1 = load <4 x i16>* %B
+ call void @llvm.arm.neon.vst3lane.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1)
+ ret void
+}
+
+define void @vst3lanei32(i32* %A, <2 x i32>* %B) nounwind {
+;CHECK: vst3lanei32:
+;CHECK: vst3.32
+ %tmp1 = load <2 x i32>* %B
+ call void @llvm.arm.neon.vst3lane.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1)
+ ret void
+}
+
+define void @vst3lanef(float* %A, <2 x float>* %B) nounwind {
+;CHECK: vst3lanef:
+;CHECK: vst3.32
+ %tmp1 = load <2 x float>* %B
+ call void @llvm.arm.neon.vst3lane.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1)
+ ret void
+}
+
+define void @vst3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
+;CHECK: vst3laneQi16:
+;CHECK: vst3.16
+ %tmp1 = load <8 x i16>* %B
+ call void @llvm.arm.neon.vst3lane.v8i16(i16* %A, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 6)
+ ret void
+}
+
+define void @vst3laneQi32(i32* %A, <4 x i32>* %B) nounwind {
+;CHECK: vst3laneQi32:
+;CHECK: vst3.32
+ %tmp1 = load <4 x i32>* %B
+ call void @llvm.arm.neon.vst3lane.v4i32(i32* %A, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 0)
+ ret void
+}
+
+define void @vst3laneQf(float* %A, <4 x float>* %B) nounwind {
+;CHECK: vst3laneQf:
+;CHECK: vst3.32
+ %tmp1 = load <4 x float>* %B
+ call void @llvm.arm.neon.vst3lane.v4f32(float* %A, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1)
+ ret void
+}
+
+declare void @llvm.arm.neon.vst3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind
+declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32) nounwind
+declare void @llvm.arm.neon.vst3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind
+declare void @llvm.arm.neon.vst3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32) nounwind
+
+declare void @llvm.arm.neon.vst3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32) nounwind
+declare void @llvm.arm.neon.vst3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32) nounwind
+declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32) nounwind
+
+
+define void @vst4lanei8(i8* %A, <8 x i8>* %B) nounwind {
+;CHECK: vst4lanei8:
+;CHECK: vst4.8
+ %tmp1 = load <8 x i8>* %B
+ call void @llvm.arm.neon.vst4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1)
+ ret void
+}
+
+define void @vst4lanei16(i16* %A, <4 x i16>* %B) nounwind {
+;CHECK: vst4lanei16:
+;CHECK: vst4.16
+ %tmp1 = load <4 x i16>* %B
+ call void @llvm.arm.neon.vst4lane.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1)
+ ret void
+}
+
+define void @vst4lanei32(i32* %A, <2 x i32>* %B) nounwind {
+;CHECK: vst4lanei32:
+;CHECK: vst4.32
+ %tmp1 = load <2 x i32>* %B
+ call void @llvm.arm.neon.vst4lane.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1)
+ ret void
+}
+
+define void @vst4lanef(float* %A, <2 x float>* %B) nounwind {
+;CHECK: vst4lanef:
+;CHECK: vst4.32
+ %tmp1 = load <2 x float>* %B
+ call void @llvm.arm.neon.vst4lane.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1)
+ ret void
+}
+
+define void @vst4laneQi16(i16* %A, <8 x i16>* %B) nounwind {
+;CHECK: vst4laneQi16:
+;CHECK: vst4.16
+ %tmp1 = load <8 x i16>* %B
+ call void @llvm.arm.neon.vst4lane.v8i16(i16* %A, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 7)
+ ret void
+}
+
+define void @vst4laneQi32(i32* %A, <4 x i32>* %B) nounwind {
+;CHECK: vst4laneQi32:
+;CHECK: vst4.32
+ %tmp1 = load <4 x i32>* %B
+ call void @llvm.arm.neon.vst4lane.v4i32(i32* %A, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2)
+ ret void
+}
+
+define void @vst4laneQf(float* %A, <4 x float>* %B) nounwind {
+;CHECK: vst4laneQf:
+;CHECK: vst4.32
+ %tmp1 = load <4 x float>* %B
+ call void @llvm.arm.neon.vst4lane.v4f32(float* %A, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1)
+ ret void
+}
+
+declare void @llvm.arm.neon.vst4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind
+declare void @llvm.arm.neon.vst4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32) nounwind
+declare void @llvm.arm.neon.vst4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind
+declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32) nounwind
+
+declare void @llvm.arm.neon.vst4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32) nounwind
+declare void @llvm.arm.neon.vst4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32) nounwind
+declare void @llvm.arm.neon.vst4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32) nounwind
diff --git a/test/CodeGen/ARM/vsub.ll b/test/CodeGen/ARM/vsub.ll
index 85dea41..8f0055f 100644
--- a/test/CodeGen/ARM/vsub.ll
+++ b/test/CodeGen/ARM/vsub.ll
@@ -1,11 +1,8 @@
-; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
-; RUN: grep {vsub\\.i8} %t | count 2
-; RUN: grep {vsub\\.i16} %t | count 2
-; RUN: grep {vsub\\.i32} %t | count 2
-; RUN: grep {vsub\\.i64} %t | count 2
-; RUN: grep {vsub\\.f32} %t | count 2
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
define <8 x i8> @vsubi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vsubi8:
+;CHECK: vsub.i8
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = sub <8 x i8> %tmp1, %tmp2
@@ -13,6 +10,8 @@ define <8 x i8> @vsubi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
}
define <4 x i16> @vsubi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vsubi16:
+;CHECK: vsub.i16
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = sub <4 x i16> %tmp1, %tmp2
@@ -20,6 +19,8 @@ define <4 x i16> @vsubi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
}
define <2 x i32> @vsubi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vsubi32:
+;CHECK: vsub.i32
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = sub <2 x i32> %tmp1, %tmp2
@@ -27,6 +28,8 @@ define <2 x i32> @vsubi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
}
define <1 x i64> @vsubi64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK: vsubi64:
+;CHECK: vsub.i64
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = sub <1 x i64> %tmp1, %tmp2
@@ -34,6 +37,8 @@ define <1 x i64> @vsubi64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
}
define <2 x float> @vsubf32(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK: vsubf32:
+;CHECK: vsub.f32
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = sub <2 x float> %tmp1, %tmp2
@@ -41,6 +46,8 @@ define <2 x float> @vsubf32(<2 x float>* %A, <2 x float>* %B) nounwind {
}
define <16 x i8> @vsubQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vsubQi8:
+;CHECK: vsub.i8
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = sub <16 x i8> %tmp1, %tmp2
@@ -48,6 +55,8 @@ define <16 x i8> @vsubQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
}
define <8 x i16> @vsubQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vsubQi16:
+;CHECK: vsub.i16
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = sub <8 x i16> %tmp1, %tmp2
@@ -55,6 +64,8 @@ define <8 x i16> @vsubQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
}
define <4 x i32> @vsubQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vsubQi32:
+;CHECK: vsub.i32
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = sub <4 x i32> %tmp1, %tmp2
@@ -62,6 +73,8 @@ define <4 x i32> @vsubQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
}
define <2 x i64> @vsubQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK: vsubQi64:
+;CHECK: vsub.i64
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = sub <2 x i64> %tmp1, %tmp2
@@ -69,8 +82,196 @@ define <2 x i64> @vsubQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
}
define <4 x float> @vsubQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK: vsubQf32:
+;CHECK: vsub.f32
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = sub <4 x float> %tmp1, %tmp2
ret <4 x float> %tmp3
}
+
+define <8 x i8> @vsubhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vsubhni16:
+;CHECK: vsubhn.i16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vsubhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vsubhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vsubhni32:
+;CHECK: vsubhn.i32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vsubhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vsubhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK: vsubhni64:
+;CHECK: vsubhn.i64
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vsubhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm.neon.vsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vsubhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vsubhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i8> @vrsubhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vrsubhni16:
+;CHECK: vrsubhn.i16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @vrsubhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vrsubhni32:
+;CHECK: vrsubhn.i32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @vrsubhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK: vrsubhni64:
+;CHECK: vrsubhn.i64
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i16> @vsubls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vsubls8:
+;CHECK: vsubl.s8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vsubls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vsubls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vsubls16:
+;CHECK: vsubl.s16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vsubls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vsubls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vsubls32:
+;CHECK: vsubl.s32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vsubls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i16> @vsublu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vsublu8:
+;CHECK: vsubl.u8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vsublu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vsublu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vsublu16:
+;CHECK: vsubl.u16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vsublu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vsublu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vsublu32:
+;CHECK: vsubl.u32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vsublu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i16> @llvm.arm.neon.vsubls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vsubls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vsubls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vsublu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vsublu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vsublu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+
+define <8 x i16> @vsubws8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vsubws8:
+;CHECK: vsubw.s8
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vsubws.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vsubws16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vsubws16:
+;CHECK: vsubw.s16
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vsubws.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vsubws32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vsubws32:
+;CHECK: vsubw.s32
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vsubws.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i16> @vsubwu8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vsubwu8:
+;CHECK: vsubw.u8
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.arm.neon.vsubwu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @vsubwu16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vsubwu16:
+;CHECK: vsubw.u16
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.arm.neon.vsubwu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @vsubwu32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vsubwu32:
+;CHECK: vsubw.u32
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.arm.neon.vsubwu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i16> @llvm.arm.neon.vsubws.v8i16(<8 x i16>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vsubws.v4i32(<4 x i32>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vsubws.v2i64(<2 x i64>, <2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vsubwu.v8i16(<8 x i16>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vsubwu.v4i32(<4 x i32>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vsubwu.v2i64(<2 x i64>, <2 x i32>) nounwind readnone
diff --git a/test/CodeGen/ARM/vtbl.ll b/test/CodeGen/ARM/vtbl.ll
new file mode 100644
index 0000000..9264987
--- /dev/null
+++ b/test/CodeGen/ARM/vtbl.ll
@@ -0,0 +1,109 @@
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+
+%struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
+%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
+%struct.__neon_int8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }
+
+define <8 x i8> @vtbl1(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vtbl1:
+;CHECK: vtbl.8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <8 x i8> @vtbl2(<8 x i8>* %A, %struct.__neon_int8x8x2_t* %B) nounwind {
+;CHECK: vtbl2:
+;CHECK: vtbl.8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load %struct.__neon_int8x8x2_t* %B
+ %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1
+ %tmp5 = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %tmp1, <8 x i8> %tmp3, <8 x i8> %tmp4)
+ ret <8 x i8> %tmp5
+}
+
+define <8 x i8> @vtbl3(<8 x i8>* %A, %struct.__neon_int8x8x3_t* %B) nounwind {
+;CHECK: vtbl3:
+;CHECK: vtbl.8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load %struct.__neon_int8x8x3_t* %B
+ %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 1
+ %tmp5 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 2
+ %tmp6 = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> %tmp1, <8 x i8> %tmp3, <8 x i8> %tmp4, <8 x i8> %tmp5)
+ ret <8 x i8> %tmp6
+}
+
+define <8 x i8> @vtbl4(<8 x i8>* %A, %struct.__neon_int8x8x4_t* %B) nounwind {
+;CHECK: vtbl4:
+;CHECK: vtbl.8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load %struct.__neon_int8x8x4_t* %B
+ %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
+ %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
+ %tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
+ %tmp7 = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %tmp1, <8 x i8> %tmp3, <8 x i8> %tmp4, <8 x i8> %tmp5, <8 x i8> %tmp6)
+ ret <8 x i8> %tmp7
+}
+
+define <8 x i8> @vtbx1(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+;CHECK: vtbx1:
+;CHECK: vtbx.8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = load <8 x i8>* %C
+ %tmp4 = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
+ ret <8 x i8> %tmp4
+}
+
+define <8 x i8> @vtbx2(<8 x i8>* %A, %struct.__neon_int8x8x2_t* %B, <8 x i8>* %C) nounwind {
+;CHECK: vtbx2:
+;CHECK: vtbx.8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load %struct.__neon_int8x8x2_t* %B
+ %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1
+ %tmp5 = load <8 x i8>* %C
+ %tmp6 = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %tmp1, <8 x i8> %tmp3, <8 x i8> %tmp4, <8 x i8> %tmp5)
+ ret <8 x i8> %tmp6
+}
+
+define <8 x i8> @vtbx3(<8 x i8>* %A, %struct.__neon_int8x8x3_t* %B, <8 x i8>* %C) nounwind {
+;CHECK: vtbx3:
+;CHECK: vtbx.8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load %struct.__neon_int8x8x3_t* %B
+ %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 1
+ %tmp5 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 2
+ %tmp6 = load <8 x i8>* %C
+ %tmp7 = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %tmp1, <8 x i8> %tmp3, <8 x i8> %tmp4, <8 x i8> %tmp5, <8 x i8> %tmp6)
+ ret <8 x i8> %tmp7
+}
+
+define <8 x i8> @vtbx4(<8 x i8>* %A, %struct.__neon_int8x8x4_t* %B, <8 x i8>* %C) nounwind {
+;CHECK: vtbx4:
+;CHECK: vtbx.8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load %struct.__neon_int8x8x4_t* %B
+ %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
+ %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
+ %tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
+ %tmp7 = load <8 x i8>* %C
+ %tmp8 = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %tmp1, <8 x i8> %tmp3, <8 x i8> %tmp4, <8 x i8> %tmp5, <8 x i8> %tmp6, <8 x i8> %tmp7)
+ ret <8 x i8> %tmp8
+}
+
+declare <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8>, <8 x i8>) nounwind readnone
+declare <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
+declare <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
+declare <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
+declare <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
+declare <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
+declare <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
diff --git a/test/CodeGen/ARM/vtrn.ll b/test/CodeGen/ARM/vtrn.ll
new file mode 100644
index 0000000..5122b09
--- /dev/null
+++ b/test/CodeGen/ARM/vtrn.ll
@@ -0,0 +1,97 @@
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+
+define <8 x i8> @vtrni8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vtrni8:
+;CHECK: vtrn.8
+;CHECK-NEXT: vadd.i8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %tmp5 = add <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <4 x i16> @vtrni16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vtrni16:
+;CHECK: vtrn.16
+;CHECK-NEXT: vadd.i16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %tmp5 = add <4 x i16> %tmp3, %tmp4
+ ret <4 x i16> %tmp5
+}
+
+define <2 x i32> @vtrni32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vtrni32:
+;CHECK: vtrn.32
+;CHECK-NEXT: vadd.i32
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> <i32 0, i32 2>
+ %tmp4 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 3>
+ %tmp5 = add <2 x i32> %tmp3, %tmp4
+ ret <2 x i32> %tmp5
+}
+
+define <2 x float> @vtrnf(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK: vtrnf:
+;CHECK: vtrn.32
+;CHECK-NEXT: vadd.f32
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = shufflevector <2 x float> %tmp1, <2 x float> %tmp2, <2 x i32> <i32 0, i32 2>
+ %tmp4 = shufflevector <2 x float> %tmp1, <2 x float> %tmp2, <2 x i32> <i32 1, i32 3>
+ %tmp5 = add <2 x float> %tmp3, %tmp4
+ ret <2 x float> %tmp5
+}
+
+define <16 x i8> @vtrnQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vtrnQi8:
+;CHECK: vtrn.8
+;CHECK-NEXT: vadd.i8
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+ %tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+ %tmp5 = add <16 x i8> %tmp3, %tmp4
+ ret <16 x i8> %tmp5
+}
+
+define <8 x i16> @vtrnQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vtrnQi16:
+;CHECK: vtrn.16
+;CHECK-NEXT: vadd.i16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @vtrnQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vtrnQi32:
+;CHECK: vtrn.32
+;CHECK-NEXT: vadd.i32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <4 x float> @vtrnQf(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK: vtrnQf:
+;CHECK: vtrn.32
+;CHECK-NEXT: vadd.f32
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %tmp4 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %tmp5 = add <4 x float> %tmp3, %tmp4
+ ret <4 x float> %tmp5
+}
diff --git a/test/CodeGen/ARM/vuzp.ll b/test/CodeGen/ARM/vuzp.ll
new file mode 100644
index 0000000..e531718
--- /dev/null
+++ b/test/CodeGen/ARM/vuzp.ll
@@ -0,0 +1,75 @@
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+
+define <8 x i8> @vuzpi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vuzpi8:
+;CHECK: vuzp.8
+;CHECK-NEXT: vadd.i8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %tmp5 = add <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <4 x i16> @vuzpi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vuzpi16:
+;CHECK: vuzp.16
+;CHECK-NEXT: vadd.i16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %tmp5 = add <4 x i16> %tmp3, %tmp4
+ ret <4 x i16> %tmp5
+}
+
+; VUZP.32 is equivalent to VTRN.32 for 64-bit vectors.
+
+define <16 x i8> @vuzpQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vuzpQi8:
+;CHECK: vuzp.8
+;CHECK-NEXT: vadd.i8
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ %tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ %tmp5 = add <16 x i8> %tmp3, %tmp4
+ ret <16 x i8> %tmp5
+}
+
+define <8 x i16> @vuzpQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vuzpQi16:
+;CHECK: vuzp.16
+;CHECK-NEXT: vadd.i16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @vuzpQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vuzpQi32:
+;CHECK: vuzp.32
+;CHECK-NEXT: vadd.i32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <4 x float> @vuzpQf(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK: vuzpQf:
+;CHECK: vuzp.32
+;CHECK-NEXT: vadd.f32
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %tmp4 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %tmp5 = add <4 x float> %tmp3, %tmp4
+ ret <4 x float> %tmp5
+}
diff --git a/test/CodeGen/ARM/vzip.ll b/test/CodeGen/ARM/vzip.ll
new file mode 100644
index 0000000..32f7e0d
--- /dev/null
+++ b/test/CodeGen/ARM/vzip.ll
@@ -0,0 +1,75 @@
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+
+define <8 x i8> @vzipi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vzipi8:
+;CHECK: vzip.8
+;CHECK-NEXT: vadd.i8
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ %tmp5 = add <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <4 x i16> @vzipi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vzipi16:
+;CHECK: vzip.16
+;CHECK-NEXT: vadd.i16
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %tmp5 = add <4 x i16> %tmp3, %tmp4
+ ret <4 x i16> %tmp5
+}
+
+; VZIP.32 is equivalent to VTRN.32 for 64-bit vectors.
+
+define <16 x i8> @vzipQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK: vzipQi8:
+;CHECK: vzip.8
+;CHECK-NEXT: vadd.i8
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+ %tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+ %tmp5 = add <16 x i8> %tmp3, %tmp4
+ ret <16 x i8> %tmp5
+}
+
+define <8 x i16> @vzipQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK: vzipQi16:
+;CHECK: vzip.16
+;CHECK-NEXT: vadd.i16
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @vzipQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK: vzipQi32:
+;CHECK: vzip.32
+;CHECK-NEXT: vadd.i32
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <4 x float> @vzipQf(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK: vzipQf:
+;CHECK: vzip.32
+;CHECK-NEXT: vadd.f32
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %tmp4 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %tmp5 = add <4 x float> %tmp3, %tmp4
+ ret <4 x float> %tmp5
+}
diff --git a/test/CodeGen/ARM/weak.ll b/test/CodeGen/ARM/weak.ll
index dadd1b9..5ac4b8c 100644
--- a/test/CodeGen/ARM/weak.ll
+++ b/test/CodeGen/ARM/weak.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llc -march=arm | grep .weak.*f
-; RUN: llvm-as < %s | llc -march=arm | grep .weak.*h
+; RUN: llc < %s -march=arm | grep .weak.*f
+; RUN: llc < %s -march=arm | grep .weak.*h
define weak i32 @f() {
entry:
diff --git a/test/CodeGen/ARM/weak2.ll b/test/CodeGen/ARM/weak2.ll
index a57a767..cf327bb 100644
--- a/test/CodeGen/ARM/weak2.ll
+++ b/test/CodeGen/ARM/weak2.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=arm | grep .weak
+; RUN: llc < %s -march=arm | grep .weak
define i32 @f(i32 %a) {
entry: