Diffstat (limited to 'contrib/llvm/lib/Target/ARM')
-rw-r--r-- | contrib/llvm/lib/Target/ARM/AsmParser/CMakeLists.txt | 7
-rw-r--r-- | contrib/llvm/lib/Target/ARM/AsmParser/Makefile | 15
-rw-r--r-- | contrib/llvm/lib/Target/ARM/AsmPrinter/CMakeLists.txt | 6
-rw-r--r-- | contrib/llvm/lib/Target/ARM/AsmPrinter/Makefile | 15
-rw-r--r-- | contrib/llvm/lib/Target/ARM/CMakeLists.txt | 50
-rw-r--r-- | contrib/llvm/lib/Target/ARM/Disassembler/Makefile | 16
-rw-r--r-- | contrib/llvm/lib/Target/ARM/Makefile | 25
-rw-r--r-- | contrib/llvm/lib/Target/ARM/README-Thumb.txt | 248
-rw-r--r-- | contrib/llvm/lib/Target/ARM/README-Thumb2.txt | 6
-rw-r--r-- | contrib/llvm/lib/Target/ARM/README.txt | 659
-rw-r--r-- | contrib/llvm/lib/Target/ARM/TargetInfo/CMakeLists.txt | 7
-rw-r--r-- | contrib/llvm/lib/Target/ARM/TargetInfo/Makefile | 15
12 files changed, 0 insertions, 1069 deletions
diff --git a/contrib/llvm/lib/Target/ARM/AsmParser/CMakeLists.txt b/contrib/llvm/lib/Target/ARM/AsmParser/CMakeLists.txt
deleted file mode 100644
index 9ba7c01..0000000
--- a/contrib/llvm/lib/Target/ARM/AsmParser/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
-add_llvm_library(LLVMARMAsmParser
-  ARMAsmLexer.cpp
-  ARMAsmParser.cpp
-  )
-
diff --git a/contrib/llvm/lib/Target/ARM/AsmParser/Makefile b/contrib/llvm/lib/Target/ARM/AsmParser/Makefile
deleted file mode 100644
index 841516f..0000000
--- a/contrib/llvm/lib/Target/ARM/AsmParser/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/ARM/AsmParser/Makefile -------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMARMAsmParser
-
-# Hack: we need to include 'main' ARM target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/contrib/llvm/lib/Target/ARM/AsmPrinter/CMakeLists.txt b/contrib/llvm/lib/Target/ARM/AsmPrinter/CMakeLists.txt
deleted file mode 100644
index 18645c0..0000000
--- a/contrib/llvm/lib/Target/ARM/AsmPrinter/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
-add_llvm_library(LLVMARMAsmPrinter
-  ARMInstPrinter.cpp
-  )
-add_dependencies(LLVMARMAsmPrinter ARMCodeGenTable_gen)
diff --git a/contrib/llvm/lib/Target/ARM/AsmPrinter/Makefile b/contrib/llvm/lib/Target/ARM/AsmPrinter/Makefile
deleted file mode 100644
index 65d372e..0000000
--- a/contrib/llvm/lib/Target/ARM/AsmPrinter/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/ARM/AsmPrinter/Makefile ------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMARMAsmPrinter
-
-# Hack: we need to include 'main' arm target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/contrib/llvm/lib/Target/ARM/CMakeLists.txt b/contrib/llvm/lib/Target/ARM/CMakeLists.txt
deleted file mode 100644
index 6b4dee5..0000000
--- a/contrib/llvm/lib/Target/ARM/CMakeLists.txt
+++ /dev/null
@@ -1,50 +0,0 @@
-set(LLVM_TARGET_DEFINITIONS ARM.td)
-
-tablegen(ARMGenRegisterInfo.h.inc -gen-register-desc-header)
-tablegen(ARMGenRegisterNames.inc -gen-register-enums)
-tablegen(ARMGenRegisterInfo.inc -gen-register-desc)
-tablegen(ARMGenInstrNames.inc -gen-instr-enums)
-tablegen(ARMGenInstrInfo.inc -gen-instr-desc)
-tablegen(ARMGenCodeEmitter.inc -gen-emitter)
-tablegen(ARMGenAsmWriter.inc -gen-asm-writer)
-tablegen(ARMGenAsmMatcher.inc -gen-asm-matcher)
-tablegen(ARMGenDAGISel.inc -gen-dag-isel)
-tablegen(ARMGenFastISel.inc -gen-fast-isel)
-tablegen(ARMGenCallingConv.inc -gen-callingconv)
-tablegen(ARMGenSubtarget.inc -gen-subtarget)
-tablegen(ARMGenEDInfo.inc -gen-enhanced-disassembly-info)
-
-add_llvm_target(ARMCodeGen
-  ARMAsmPrinter.cpp
-  ARMBaseInstrInfo.cpp
-  ARMBaseRegisterInfo.cpp
-  ARMCodeEmitter.cpp
-  ARMConstantIslandPass.cpp
-  ARMConstantPoolValue.cpp
-  ARMExpandPseudoInsts.cpp
-  ARMFastISel.cpp
-  ARMGlobalMerge.cpp
-  ARMISelDAGToDAG.cpp
-  ARMISelLowering.cpp
-  ARMInstrInfo.cpp
-  ARMJITInfo.cpp
-  ARMLoadStoreOptimizer.cpp
-  ARMMCAsmInfo.cpp
-  ARMMCInstLower.cpp
-  ARMRegisterInfo.cpp
-  ARMSelectionDAGInfo.cpp
-  ARMSubtarget.cpp
-  ARMTargetMachine.cpp
-  ARMTargetObjectFile.cpp
-  NEONMoveFix.cpp
-  NEONPreAllocPass.cpp
-  Thumb1InstrInfo.cpp
-  Thumb1RegisterInfo.cpp
-  Thumb2HazardRecognizer.cpp
-  Thumb2ITBlockPass.cpp
-  Thumb2InstrInfo.cpp
-  Thumb2RegisterInfo.cpp
-  Thumb2SizeReduction.cpp
-  )
-
-target_link_libraries (LLVMARMCodeGen LLVMARMAsmPrinter LLVMSelectionDAG)
diff --git a/contrib/llvm/lib/Target/ARM/Disassembler/Makefile b/contrib/llvm/lib/Target/ARM/Disassembler/Makefile
deleted file mode 100644
index 031b6ac..0000000
--- a/contrib/llvm/lib/Target/ARM/Disassembler/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/ARM/Disassembler/Makefile ----------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMARMDisassembler
-
-# Hack: we need to include 'main' arm target directory to grab private headers
-CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/contrib/llvm/lib/Target/ARM/Makefile b/contrib/llvm/lib/Target/ARM/Makefile
deleted file mode 100644
index b3fcfaf6..0000000
--- a/contrib/llvm/lib/Target/ARM/Makefile
+++ /dev/null
@@ -1,25 +0,0 @@
-##===- lib/Target/ARM/Makefile -----------------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMARMCodeGen
-TARGET = ARM
-
-# Make sure that tblgen is run, first thing.
-BUILT_SOURCES = ARMGenRegisterInfo.h.inc ARMGenRegisterNames.inc \
-                ARMGenRegisterInfo.inc ARMGenInstrNames.inc \
-                ARMGenInstrInfo.inc ARMGenAsmWriter.inc ARMGenAsmMatcher.inc \
-                ARMGenDAGISel.inc ARMGenSubtarget.inc \
-                ARMGenCodeEmitter.inc ARMGenCallingConv.inc \
-                ARMGenDecoderTables.inc ARMGenEDInfo.inc \
-                ARMGenFastISel.inc
-
-DIRS = AsmPrinter AsmParser Disassembler TargetInfo
-
-include $(LEVEL)/Makefile.common
diff --git a/contrib/llvm/lib/Target/ARM/README-Thumb.txt b/contrib/llvm/lib/Target/ARM/README-Thumb.txt
deleted file mode 100644
index 6b605bb..0000000
--- a/contrib/llvm/lib/Target/ARM/README-Thumb.txt
+++ /dev/null
@@ -1,248 +0,0 @@
-//===---------------------------------------------------------------------===//
-// Random ideas for the ARM backend (Thumb specific).
-//===---------------------------------------------------------------------===//
-
-* Add support for compiling functions in both ARM and Thumb mode, then taking
-  the smallest.
-
-* Add support for compiling individual basic blocks in thumb mode, when in a
-  larger ARM function. This can be used for presumed cold code, like paths
-  to abort (failure path of asserts), EH handling code, etc.
-
-* Thumb doesn't have normal pre/post increment addressing modes, but you can
-  load/store 32-bit integers with pre/postinc by using load/store multiple
-  instrs with a single register.
-
-* Make better use of high registers r8, r10, r11, r12 (ip). Some variants of
-  add and cmp instructions can use high registers. Also, we can use them as
-  temporaries to spill values into.
-
-* In thumb mode, short, byte, and bool preferred alignments are currently set
-  to 4 to accommodate ISA restriction (i.e. add sp, #imm, imm must be multiple
-  of 4).
-
-//===---------------------------------------------------------------------===//
-
-Potential jumptable improvements:
-
-* If we know function size is less than (1 << 16) * 2 bytes, we can use 16-bit
-  jumptable entries (e.g. (L1 - L2) >> 1). Or even smaller entries if the
-  function is even smaller. This also applies to ARM.
-
-* Thumb jumptable codegen can improve given some help from the assembler. This
-  is what we generate right now:
-
-	.set PCRELV0, (LJTI1_0_0-(LPCRELL0+4))
-LPCRELL0:
-	mov r1, #PCRELV0
-	add r1, pc
-	ldr r0, [r0, r1]
-	mov pc, r0
-	.align 2
-LJTI1_0_0:
-	.long	LBB1_3
-	...
-
-Note there is another pc relative add that we can take advantage of.
-	add r1, pc, #imm_8 * 4
-
-We should be able to generate:
-
-LPCRELL0:
-	add r1, LJTI1_0_0
-	ldr r0, [r0, r1]
-	mov pc, r0
-	.align 2
-LJTI1_0_0:
-	.long	LBB1_3
-
-if the assembler can translate the add to:
-	add r1, pc, #((LJTI1_0_0-(LPCRELL0+4))&0xfffffffc)
-
-Note the assembler also does something similar to constpool load:
-LPCRELL0:
-	ldr r0, LCPI1_0
-=>
-	ldr r0, pc, #((LCPI1_0-(LPCRELL0+4))&0xfffffffc)
-
-
-//===---------------------------------------------------------------------===//
-
-We compile the following:
-
-define i16 @func_entry_2E_ce(i32 %i) {
-	switch i32 %i, label %bb12.exitStub [
-		 i32 0, label %bb4.exitStub
-		 i32 1, label %bb9.exitStub
-		 i32 2, label %bb4.exitStub
-		 i32 3, label %bb4.exitStub
-		 i32 7, label %bb9.exitStub
-		 i32 8, label %bb.exitStub
-		 i32 9, label %bb9.exitStub
-	]
-
-bb12.exitStub:
-	ret i16 0
-
-bb4.exitStub:
-	ret i16 1
-
-bb9.exitStub:
-	ret i16 2
-
-bb.exitStub:
-	ret i16 3
-}
-
-into:
-
-_func_entry_2E_ce:
-	mov r2, #1
-	lsl r2, r0
-	cmp r0, #9
-	bhi LBB1_4	@bb12.exitStub
-LBB1_1:	@newFuncRoot
-	mov r1, #13
-	tst r2, r1
-	bne LBB1_5	@bb4.exitStub
-LBB1_2:	@newFuncRoot
-	ldr r1, LCPI1_0
-	tst r2, r1
-	bne LBB1_6	@bb9.exitStub
-LBB1_3:	@newFuncRoot
-	mov r1, #1
-	lsl r1, r1, #8
-	tst r2, r1
-	bne LBB1_7	@bb.exitStub
-LBB1_4:	@bb12.exitStub
-	mov r0, #0
-	bx lr
-LBB1_5:	@bb4.exitStub
-	mov r0, #1
-	bx lr
-LBB1_6:	@bb9.exitStub
-	mov r0, #2
-	bx lr
-LBB1_7:	@bb.exitStub
-	mov r0, #3
-	bx lr
-LBB1_8:
-	.align 2
-LCPI1_0:
-	.long 642
-
-
-gcc compiles to:
-
-	cmp	r0, #9
-	@ lr needed for prologue
-	bhi	L2
-	ldr	r3, L11
-	mov	r2, #1
-	mov	r1, r2, asl r0
-	ands	r0, r3, r2, asl r0
-	movne	r0, #2
-	bxne	lr
-	tst	r1, #13
-	beq	L9
-L3:
-	mov	r0, r2
-	bx	lr
-L9:
-	tst	r1, #256
-	movne	r0, #3
-	bxne	lr
-L2:
-	mov	r0, #0
-	bx	lr
-L12:
-	.align 2
-L11:
-	.long	642
-
-
-GCC is doing a couple of clever things here:
-  1. It is predicating one of the returns. This isn't a clear win though: in
-     cases where that return isn't taken, it is replacing one condbranch with
-     two 'ne' predicated instructions.
-  2. It is sinking the shift of "1 << i" into the tst, and using ands instead
-     of tst. This will probably require whole function isel.
-  3. GCC emits:
-	tst	r1, #256
-     we emit:
-	mov r1, #1
-	lsl r1, r1, #8
-	tst r2, r1
-
-//===---------------------------------------------------------------------===//
-
-When spilling in thumb mode and the sp offset is too large to fit in the ldr /
-str offset field, we load the offset from a constpool entry and add it to sp:
-
-ldr r2, LCPI
-add r2, sp
-ldr r2, [r2]
-
-These instructions preserve the condition code which is important if the spill
-is between a cmp and a bcc instruction. However, we can use the (potentially)
-cheaper sequence if we know it's ok to clobber the condition register.
-
-add r2, sp, #255 * 4
-add r2, #132
-ldr r2, [r2, #7 * 4]
-
-This is especially bad when dynamic alloca is used. All fixed size stack
-objects are referenced off the frame pointer with negative offsets. See
-oggenc for an example.
-
-//===---------------------------------------------------------------------===//
-
-Poor codegen test/CodeGen/ARM/select.ll f7:
-
-	ldr r5, LCPI1_0
-LPC0:
-	add r5, pc
-	ldr r6, LCPI1_1
-	ldr r2, LCPI1_2
-	mov r3, r6
-	mov lr, pc
-	bx r5
-
-//===---------------------------------------------------------------------===//
-
-Make register allocator / spiller smarter so we can re-materialize "mov r, imm",
-etc. Almost all Thumb instructions clobber condition code.
-
-//===---------------------------------------------------------------------===//
-
-Add ldmia, stmia support.
-
-//===---------------------------------------------------------------------===//
-
-Thumb load / store address mode offsets are scaled. The values kept in the
-instruction operands are pre-scale values. This probably ought to be changed
-to avoid extra work when we convert Thumb2 instructions to Thumb1 instructions.
-
-//===---------------------------------------------------------------------===//
-
-We need to make (some of the) Thumb1 instructions predicable. That will allow
-shrinking of predicated Thumb2 instructions. To allow this, we need to be able
-to toggle the 's' bit since they do not set CPSR when they are inside IT blocks.
-
-//===---------------------------------------------------------------------===//
-
-Make use of hi register variants of cmp: tCMPhir / tCMPZhir.
-
-//===---------------------------------------------------------------------===//
-
-Thumb1 immediate fields sometimes keep pre-scaled values. See
-Thumb1RegisterInfo::eliminateFrameIndex. This is inconsistent with ARM and
-Thumb2.
-
-//===---------------------------------------------------------------------===//
-
-Rather than having tBR_JTr print a ".align 2" and constant island pass pad it,
-add a target specific ALIGN instruction instead. That way, GetInstSizeInBytes
-won't have to over-estimate. It can also be used for loop alignment pass.
diff --git a/contrib/llvm/lib/Target/ARM/README-Thumb2.txt b/contrib/llvm/lib/Target/ARM/README-Thumb2.txt
deleted file mode 100644
index e7c2552..0000000
--- a/contrib/llvm/lib/Target/ARM/README-Thumb2.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-//===---------------------------------------------------------------------===//
-// Random ideas for the ARM backend (Thumb2 specific).
-//===---------------------------------------------------------------------===//
-
-Make sure jumptable destinations are below the jumptable in order to make use
-of tbb / tbh.
diff --git a/contrib/llvm/lib/Target/ARM/README.txt b/contrib/llvm/lib/Target/ARM/README.txt
deleted file mode 100644
index 9fc3fb9..0000000
--- a/contrib/llvm/lib/Target/ARM/README.txt
+++ /dev/null
@@ -1,659 +0,0 @@
-//===---------------------------------------------------------------------===//
-// Random ideas for the ARM backend.
-//===---------------------------------------------------------------------===//
-
-Reimplement 'select' in terms of 'SEL'.
-
-* We would really like to support UXTAB16, but we need to prove that the
-  add doesn't need to overflow between the two 16-bit chunks.
-
-* Implement pre/post increment support. (e.g. PR935)
-* Implement smarter constant generation for binops with large immediates.
-
-A few ARMv6T2 ops should be pattern matched: BFI, SBFX, and UBFX
-
-Interesting optimization for PIC codegen on arm-linux:
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43129
-
-//===---------------------------------------------------------------------===//
-
-Crazy idea: Consider code that uses lots of 8-bit or 16-bit values. By the
-time regalloc happens, these values are now in a 32-bit register, usually with
-the top-bits known to be sign or zero extended. If spilled, we should be able
-to spill these to an 8-bit or 16-bit stack slot, zero or sign extending as
-part of the reload.
-
-Doing this reduces the size of the stack frame (important for thumb etc), and
-also increases the likelihood that we will be able to reload multiple values
-from the stack with a single load.
-
-//===---------------------------------------------------------------------===//
-
-The constant island pass is in good shape. Some cleanups might be desirable,
-but there is unlikely to be much improvement in the generated code.
-
-1. There may be some advantage to trying to be smarter about the initial
-placement, rather than putting everything at the end.
-
-2. There might be some compile-time efficiency to be had by representing
-consecutive islands as a single block rather than multiple blocks.
-
-3. Use a priority queue to sort constant pool users in inverse order of
-   position so we always process the one closest to the end of functions
-   first. This may simplify CreateNewWater.
-
-//===---------------------------------------------------------------------===//
-
-Eliminate copysign custom expansion. We are still generating crappy code with
-default expansion + if-conversion.
-
-//===---------------------------------------------------------------------===//
-
-Eliminate one instruction from:
-
-define i32 @_Z6slow4bii(i32 %x, i32 %y) {
-	%tmp = icmp sgt i32 %x, %y
-	%retval = select i1 %tmp, i32 %x, i32 %y
-	ret i32 %retval
-}
-
-__Z6slow4bii:
-	cmp r0, r1
-	movgt r1, r0
-	mov r0, r1
-	bx lr
-=>
-
-__Z6slow4bii:
-	cmp r0, r1
-	movle r0, r1
-	bx lr
-
-//===---------------------------------------------------------------------===//
-
-Implement long long "X-3" with instructions that fold the immediate in. These
-were disabled due to badness with the ARM carry flag on subtracts.
-
-//===---------------------------------------------------------------------===//
-
-More load / store optimizations:
-1) Better representation for block transfer? This is from Olden/power:
-
-	fldd d0, [r4]
-	fstd d0, [r4, #+32]
-	fldd d0, [r4, #+8]
-	fstd d0, [r4, #+40]
-	fldd d0, [r4, #+16]
-	fstd d0, [r4, #+48]
-	fldd d0, [r4, #+24]
-	fstd d0, [r4, #+56]
-
-If we can spare the registers, it would be better to use fldm and fstm here.
-Need major register allocator enhancement though.
-
-2) Can we recognize the relative position of constantpool entries? i.e. Treat
-
-	ldr r0, LCPI17_3
-	ldr r1, LCPI17_4
-	ldr r2, LCPI17_5
-
-   as
-	ldr r0, LCPI17
-	ldr r1, LCPI17+4
-	ldr r2, LCPI17+8
-
-   Then the ldr's can be combined into a single ldm. See Olden/power.
-
-Note for ARM v4 gcc uses ldmia to load a pair of 32-bit values to represent a
-double 64-bit FP constant:
-
-	adr	r0, L6
-	ldmia	r0, {r0-r1}
-
-	.align 2
-L6:
-	.long	-858993459
-	.long	1074318540
-
-3) struct copies appear to be done field by field
-instead of by words, at least sometimes:
-
-struct foo { int x; short s; char c1; char c2; };
-void cpy(struct foo*a, struct foo*b) { *a = *b; }
-
-llvm code (-O2)
-	ldrb r3, [r1, #+6]
-	ldr r2, [r1]
-	ldrb r12, [r1, #+7]
-	ldrh r1, [r1, #+4]
-	str r2, [r0]
-	strh r1, [r0, #+4]
-	strb r3, [r0, #+6]
-	strb r12, [r0, #+7]
-gcc code (-O2)
-	ldmia	r1, {r1-r2}
-	stmia	r0, {r1-r2}
-
-In this benchmark poor handling of aggregate copies has shown up as
-having a large effect on size, and possibly speed as well (we don't have
-a good way to measure on ARM).
-
-//===---------------------------------------------------------------------===//
-
-* Consider this silly example:
-
-double bar(double x) {
-  double r = foo(3.1);
-  return x+r;
-}
-
-_bar:
-	stmfd sp!, {r4, r5, r7, lr}
-	add r7, sp, #8
-	mov r4, r0
-	mov r5, r1
-	fldd d0, LCPI1_0
-	fmrrd r0, r1, d0
-	bl _foo
-	fmdrr d0, r4, r5
-	fmsr s2, r0
-	fsitod d1, s2
-	faddd d0, d1, d0
-	fmrrd r0, r1, d0
-	ldmfd sp!, {r4, r5, r7, pc}
-
-Ignore the prologue and epilogue stuff for a second. Note
-	mov r4, r0
-	mov r5, r1
-the copies to callee-save registers and the fact they are only being used by
-the fmdrr instruction. It would have been better had the fmdrr been scheduled
-before the call and placed the result in a callee-save DPR register. The two
-mov ops would not have been necessary.
-
-//===---------------------------------------------------------------------===//
-
-Calling convention related stuff:
-
-* gcc's parameter passing implementation is terrible and we suffer as a result:
-
-e.g.
-struct s {
-  double d1;
-  int s1;
-};
-
-void foo(struct s S) {
-  printf("%g, %d\n", S.d1, S.s1);
-}
-
-'S' is passed via registers r0, r1, r2. But gcc stores them to the stack, and
-then reloads them to r1, r2, and r3 before issuing the call (r0 contains the
-address of the format string):
-
-	stmfd	sp!, {r7, lr}
-	add	r7, sp, #0
-	sub	sp, sp, #12
-	stmia	sp, {r0, r1, r2}
-	ldmia	sp, {r1-r2}
-	ldr	r0, L5
-	ldr	r3, [sp, #8]
-L2:
-	add	r0, pc, r0
-	bl	L_printf$stub
-
-Instead of a stmia, ldmia, and a ldr, wouldn't it be better to do three moves?
-
-* Returning an aggregate type is even worse:
-
-e.g.
-struct s foo(void) {
-  struct s S = {1.1, 2};
-  return S;
-}
-
-	mov	ip, r0
-	ldr	r0, L5
-	sub	sp, sp, #12
-L2:
-	add	r0, pc, r0
-	@ lr needed for prologue
-	ldmia	r0, {r0, r1, r2}
-	stmia	sp, {r0, r1, r2}
-	stmia	ip, {r0, r1, r2}
-	mov	r0, ip
-	add	sp, sp, #12
-	bx	lr
-
-r0 (and later ip) is the hidden parameter from the caller to store the value
-in. The first ldmia loads the constants into r0, r1, r2. The last stmia stores
-r0, r1, r2 into the address passed in. However, there is one additional stmia
-that stores r0, r1, and r2 to some stack location. The store is dead.
-
-The llvm-gcc generated code looks like this:
-
-csretcc void %foo(%struct.s* %agg.result) {
-entry:
-	%S = alloca %struct.s, align 4		; <%struct.s*> [#uses=1]
-	%memtmp = alloca %struct.s		; <%struct.s*> [#uses=1]
-	cast %struct.s* %S to sbyte*		; <sbyte*>:0 [#uses=2]
-	call void %llvm.memcpy.i32( sbyte* %0, sbyte* cast ({ double, int }* %C.0.904 to sbyte*), uint 12, uint 4 )
-	cast %struct.s* %agg.result to sbyte*		; <sbyte*>:1 [#uses=2]
-	call void %llvm.memcpy.i32( sbyte* %1, sbyte* %0, uint 12, uint 0 )
-	cast %struct.s* %memtmp to sbyte*		; <sbyte*>:2 [#uses=1]
-	call void %llvm.memcpy.i32( sbyte* %2, sbyte* %1, uint 12, uint 0 )
-	ret void
-}
-
-llc ends up issuing two memcpy's (the first memcpy becomes 3 loads from
-constantpool). Perhaps we should 1) fix llvm-gcc so the memcpy is translated
-into a number of load and stores, or 2) custom lower memcpy (of small size) to
-be ldmia / stmia. I think option 2 is better but the current register
-allocator cannot allocate a chunk of registers at a time.
-
-A feasible temporary solution is to use specific physical registers at the
-lowering time for small (<= 4 words?) transfer size.
-
-* ARM CSRet calling convention requires the hidden argument to be returned by
-the callee.
-
-//===---------------------------------------------------------------------===//
-
-We can definitely do a better job on BB placements to eliminate some branches.
-It's very common to see llvm generated assembly code that looks like this:
-
-LBB3:
-	...
-LBB4:
-...
-	beq LBB3
-	b LBB2
-
-If BB4 is the only predecessor of BB3, then we can emit BB3 after BB4. We can
-then eliminate the beq and turn the unconditional branch to LBB2 into a bne.
-
-See McCat/18-imp/ComputeBoundingBoxes for an example.
-
-//===---------------------------------------------------------------------===//
-
-Pre-/post- indexed load / stores:
-
-1) We should not make the pre/post- indexed load/store transform if the base
-ptr is guaranteed to be live beyond the load/store. This can happen if the
-base ptr is live out of the block we are performing the optimization. e.g.
-
-mov r1, r2
-ldr r3, [r1], #4
-...
-
-vs.
-
-ldr r3, [r2]
-add r1, r2, #4
-...
-
-In most cases, this is just a wasted optimization. However, sometimes it can
-negatively impact the performance because two-address code is more restrictive
-when it comes to scheduling.
-
-Unfortunately, liveout information is currently unavailable during DAG combine
-time.
-
-2) Consider splitting an indexed load / store into a pair of add/sub +
-   load/store to solve #1 (in TwoAddressInstructionPass.cpp).
-
-3) Enhance LSR to generate more opportunities for indexed ops.
-
-4) Once we've added support for multiple result patterns, write indexed loads
-   patterns instead of C++ instruction selection code.
-
-5) Use VLDM / VSTM to emulate indexed FP load / store.
-
-//===---------------------------------------------------------------------===//
-
-Implement support for some more tricky ways to materialize immediates. For
-example, to get 0xffff8000, we can use:
-
-mov r9, #&3f8000
-sub r9, r9, #&400000
-
-//===---------------------------------------------------------------------===//
-
-We sometimes generate multiple add / sub instructions to update sp in prologue
-and epilogue if the inc / dec value is too large to fit in a single immediate
-operand. In some cases, perhaps it might be better to load the value from a
-constantpool instead.
-
-//===---------------------------------------------------------------------===//
-
-GCC generates significantly better code for this function.
-
-int foo(int StackPtr, unsigned char *Line, unsigned char *Stack, int LineLen) {
-    int i = 0;
-
-    if (StackPtr != 0) {
-       while (StackPtr != 0 && i < (((LineLen) < (32768))? (LineLen) : (32768)))
-          Line[i++] = Stack[--StackPtr];
-        if (LineLen > 32768)
-        {
-            while (StackPtr != 0 && i < LineLen)
-            {
-                i++;
-                --StackPtr;
-            }
-        }
-    }
-    return StackPtr;
-}
-
-//===---------------------------------------------------------------------===//
-
-This should compile to the mlas instruction:
-int mlas(int x, int y, int z) { return ((x * y + z) < 0) ? 7 : 13; }
-
-//===---------------------------------------------------------------------===//
-
-At some point, we should triage these to see if they still apply to us:
-
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19598
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=18560
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=27016
-
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11831
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11826
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11825
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11824
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11823
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11820
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=10982
-
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=10242
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9831
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9760
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9759
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9703
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9702
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9663
-
-http://www.inf.u-szeged.hu/gcc-arm/
-http://citeseer.ist.psu.edu/debus04linktime.html
-
-//===---------------------------------------------------------------------===//
-
-gcc generates smaller code for this function at -O2 or -Os:
-
-void foo(signed char* p) {
-  if (*p == 3)
-    bar();
-  else if (*p == 4)
-    baz();
-  else if (*p == 5)
-    quux();
-}
-
-llvm decides it's a good idea to turn the repeated if...else into a
-binary tree, as if it were a switch; the resulting code requires -1
-compare-and-branches when *p<=2 or *p==5, the same number if *p==4
-or *p>6, and +1 if *p==3. So it should be a speed win
-(on balance). However, the revised code is larger, with 4 conditional
-branches instead of 3.
-
-More seriously, there is a byte->word extend before
-each comparison, where there should be only one, and the condition codes
-are not remembered when the same two values are compared twice.
-
-//===---------------------------------------------------------------------===//
-
-More LSR enhancements possible:
-
-1. Teach LSR about pre- and post- indexed ops to allow the iv increment to be
-   merged into a load / store.
-2. Allow iv reuse even when a type conversion is required. For example, i8
-   and i32 load / store addressing modes are identical.
-
-
-//===---------------------------------------------------------------------===//
-
-This:
-
-int foo(int a, int b, int c, int d) {
-  long long acc = (long long)a * (long long)b;
-  acc += (long long)c * (long long)d;
-  return (int)(acc >> 32);
-}
-
-Should compile to use SMLAL (Signed Multiply Accumulate Long) which multiplies
-two signed 32-bit values to produce a 64-bit value, and accumulates this with
-a 64-bit value.
-
-We currently get this with both v4 and v6:
-
-_foo:
-	smull r1, r0, r1, r0
-	smull r3, r2, r3, r2
-	adds r3, r3, r1
-	adc r0, r2, r0
-	bx lr
-
-//===---------------------------------------------------------------------===//
-
-This:
-	#include <algorithm>
-	std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
-	{ return std::make_pair(a + b, a + b < a); }
-	bool no_overflow(unsigned a, unsigned b)
-	{ return !full_add(a, b).second; }
-
-Should compile to:
-
-_Z8full_addjj:
-	adds	r2, r1, r2
-	movcc	r1, #0
-	movcs	r1, #1
-	str	r2, [r0, #0]
-	strb	r1, [r0, #4]
-	mov	pc, lr
-
-_Z11no_overflowjj:
-	cmn	r0, r1
-	movcs	r0, #0
-	movcc	r0, #1
-	mov	pc, lr
-
-not:
-
-__Z8full_addjj:
-	add r3, r2, r1
-	str r3, [r0]
-	mov r2, #1
-	mov r12, #0
-	cmp r3, r1
-	movlo r12, r2
-	str r12, [r0, #+4]
-	bx lr
-__Z11no_overflowjj:
-	add r3, r1, r0
-	mov r2, #1
-	mov r1, #0
-	cmp r3, r0
-	movhs r1, r2
-	mov r0, r1
-	bx lr
-
-//===---------------------------------------------------------------------===//
-
-Some of the NEON intrinsics may be appropriate for more general use, either
-as target-independent intrinsics or perhaps elsewhere in the ARM backend.
-Some of them may also be lowered to target-independent SDNodes, and perhaps
-some new SDNodes could be added.
-
-For example, maximum, minimum, and absolute value operations are well-defined
-and standard operations, both for vector and scalar types.
-
-The current NEON-specific intrinsics for count leading zeros and count one
-bits could perhaps be replaced by the target-independent ctlz and ctpop
-intrinsics. It may also make sense to add a target-independent "ctls"
-intrinsic for "count leading sign bits". Likewise, the backend could use
-the target-independent SDNodes for these operations.
-
-ARMv6 has scalar saturating and halving adds and subtracts. The same
-intrinsics could possibly be used for both NEON's vector implementations of
-those operations and the ARMv6 scalar versions.
-
-//===---------------------------------------------------------------------===//
-
-ARM::MOVCCr is commutable (by flipping the condition). But we need to implement
-ARMInstrInfo::commuteInstruction() to support it.
-
-//===---------------------------------------------------------------------===//
-
-Split out LDR (literal) from normal ARM LDR instruction. Also consider
-splitting LDR into imm12 and so_reg forms. This allows us to clean up some
-code. e.g. ARMLoadStoreOptimizer does not need to look at LDR (literal) and
-LDR (so_reg) while ARMConstantIslandPass only needs to worry about LDR
-(literal).
-
-//===---------------------------------------------------------------------===//
-
-Constant island pass should make use of full range SoImm values for LEApcrel.
-Be careful though as the last attempt caused infinite looping on lencod.
-
-//===---------------------------------------------------------------------===//
-
-Predication issue. This function:
-
-extern unsigned array[ 128 ];
-int foo( int x ) {
-  int y;
-  y = array[ x & 127 ];
-  if ( x & 128 )
-    y = 123456789 & ( y >> 2 );
-  else
-    y = 123456789 & y;
-  return y;
-}
-
-compiles to:
-
-_foo:
-	and r1, r0, #127
-	ldr r2, LCPI1_0
-	ldr r2, [r2]
-	ldr r1, [r2, +r1, lsl #2]
-	mov r2, r1, lsr #2
-	tst r0, #128
-	moveq r2, r1
-	ldr r0, LCPI1_1
-	and r0, r2, r0
-	bx lr
-
-It would be better to do something like this, to fold the shift into the
-conditional move:
-
-	and r1, r0, #127
-	ldr r2, LCPI1_0
-	ldr r2, [r2]
-	ldr r1, [r2, +r1, lsl #2]
-	tst r0, #128
-	movne r1, r1, lsr #2
-	ldr r0, LCPI1_1
-	and r0, r1, r0
-	bx lr
-
-it saves an instruction and a register.
-
-//===---------------------------------------------------------------------===//
-
-It might be profitable to cse MOVi16 if there are lots of 32-bit immediates
-with the same bottom half.
-
-//===---------------------------------------------------------------------===//
-
-Robert Muth started working on an alternate jump table implementation that
-does not put the tables in-line in the text. This is more like the llvm
-default jump table implementation. This might be useful sometime. Several
-revisions of patches are on the mailing list, beginning at:
-http://lists.cs.uiuc.edu/pipermail/llvmdev/2009-June/022763.html
-
-//===---------------------------------------------------------------------===//
-
-Make use of the "rbit" instruction.
-
-//===---------------------------------------------------------------------===//
-
-Take a look at test/CodeGen/Thumb2/machine-licm.ll. ARM should be taught how
-to licm and cse the unnecessary load from cp#1.
-
-//===---------------------------------------------------------------------===//
-
-The CMN instruction sets the flags like an ADD instruction, while CMP sets
-them like a subtract. Therefore to be able to use CMN for comparisons other
-than the Z bit, we'll need additional logic to reverse the conditionals
-associated with the comparison. Perhaps a pseudo-instruction for the
-comparison, with a post-codegen pass to clean up and handle the condition
-codes? See PR5694 for testcase.
-
-//===---------------------------------------------------------------------===//
-
-Given the following on armv5:
-int test1(int A, int B) {
-  return (A&-8388481)|(B&8388480);
-}
-
-We currently generate:
-	ldr	r2, .LCPI0_0
-	and	r0, r0, r2
-	ldr	r2, .LCPI0_1
-	and	r1, r1, r2
-	orr	r0, r1, r0
-	bx	lr
-
-We should be able to replace the second ldr+and with a bic (i.e. reuse the
-constant which was already loaded). Not sure what's necessary to do that.
-
-//===---------------------------------------------------------------------===//
-
-The code generated for bswap on armv4/5 (CPUs without rev) is less than ideal:
-
-int a(int x) { return __builtin_bswap32(x); }
-
-a:
-	mov	r1, #255, 24
-	mov	r2, #255, 16
-	and	r1, r1, r0, lsr #8
-	and	r2, r2, r0, lsl #8
-	orr	r1, r1, r0, lsr #24
-	orr	r0, r2, r0, lsl #24
-	orr	r0, r0, r1
-	bx	lr
-
-Something like the following would be better (fewer instructions/registers):
-	eor	r1, r0, r0, ror #16
-	bic	r1, r1, #0xff0000
-	mov	r1, r1, lsr #8
-	eor	r0, r1, r0, ror #8
-	bx	lr
-
-A custom Thumb version would also be a slight improvement over the generic
-version.
-
-//===---------------------------------------------------------------------===//
-
-Consider the following simple C code:
-
-void foo(unsigned char *a, unsigned char *b, int *c) {
-  if ((*a | *b) == 0) *c = 0;
-}
-
-currently llvm-gcc generates something like this (nice branchless code I'd say):
-
-	ldrb	r0, [r0]
-	ldrb	r1, [r1]
-	orr	r0, r1, r0
-	tst	r0, #255
-	moveq	r0, #0
-	streq	r0, [r2]
-	bx	lr
-
-Note that both "tst" and "moveq" are redundant.
-
-//===---------------------------------------------------------------------===//
-
diff --git a/contrib/llvm/lib/Target/ARM/TargetInfo/CMakeLists.txt b/contrib/llvm/lib/Target/ARM/TargetInfo/CMakeLists.txt
deleted file mode 100644
index 3910bb0..0000000
--- a/contrib/llvm/lib/Target/ARM/TargetInfo/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
-add_llvm_library(LLVMARMInfo
-  ARMTargetInfo.cpp
-  )
-
-add_dependencies(LLVMARMInfo ARMCodeGenTable_gen)
diff --git a/contrib/llvm/lib/Target/ARM/TargetInfo/Makefile b/contrib/llvm/lib/Target/ARM/TargetInfo/Makefile
deleted file mode 100644
index 6292ab1..0000000
--- a/contrib/llvm/lib/Target/ARM/TargetInfo/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/ARM/TargetInfo/Makefile ------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMARMInfo
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common