author    dim <dim@FreeBSD.org>    2010-10-11 17:22:16 +0000
committer dim <dim@FreeBSD.org>    2010-10-11 17:22:16 +0000
commit    1fc65a65fe54635d0e564559ba5a7b8a8a42d4d6 (patch)
tree      de75a464c5dac7eceb2dbbad8b4d4e1479d79e08 /contrib/llvm/lib/Target/ARM
parent    f4f7191cd223adebacee3fad260ed60935be9cb9 (diff)
Remove more unneeded files and directories from contrib/llvm. This still
allows us to build tblgen and clang, and further reduces the footprint in
the tree.

Approved by: rpaulo (mentor)
Diffstat (limited to 'contrib/llvm/lib/Target/ARM')
-rw-r--r--  contrib/llvm/lib/Target/ARM/AsmParser/CMakeLists.txt      7
-rw-r--r--  contrib/llvm/lib/Target/ARM/AsmParser/Makefile            15
-rw-r--r--  contrib/llvm/lib/Target/ARM/AsmPrinter/CMakeLists.txt     6
-rw-r--r--  contrib/llvm/lib/Target/ARM/AsmPrinter/Makefile           15
-rw-r--r--  contrib/llvm/lib/Target/ARM/CMakeLists.txt                50
-rw-r--r--  contrib/llvm/lib/Target/ARM/Disassembler/Makefile         16
-rw-r--r--  contrib/llvm/lib/Target/ARM/Makefile                      25
-rw-r--r--  contrib/llvm/lib/Target/ARM/README-Thumb.txt              248
-rw-r--r--  contrib/llvm/lib/Target/ARM/README-Thumb2.txt             6
-rw-r--r--  contrib/llvm/lib/Target/ARM/README.txt                    659
-rw-r--r--  contrib/llvm/lib/Target/ARM/TargetInfo/CMakeLists.txt     7
-rw-r--r--  contrib/llvm/lib/Target/ARM/TargetInfo/Makefile           15
12 files changed, 0 insertions, 1069 deletions
diff --git a/contrib/llvm/lib/Target/ARM/AsmParser/CMakeLists.txt b/contrib/llvm/lib/Target/ARM/AsmParser/CMakeLists.txt
deleted file mode 100644
index 9ba7c01..0000000
--- a/contrib/llvm/lib/Target/ARM/AsmParser/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
-add_llvm_library(LLVMARMAsmParser
- ARMAsmLexer.cpp
- ARMAsmParser.cpp
- )
-
diff --git a/contrib/llvm/lib/Target/ARM/AsmParser/Makefile b/contrib/llvm/lib/Target/ARM/AsmParser/Makefile
deleted file mode 100644
index 841516f..0000000
--- a/contrib/llvm/lib/Target/ARM/AsmParser/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/ARM/AsmParser/Makefile -------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMARMAsmParser
-
-# Hack: we need to include 'main' ARM target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/contrib/llvm/lib/Target/ARM/AsmPrinter/CMakeLists.txt b/contrib/llvm/lib/Target/ARM/AsmPrinter/CMakeLists.txt
deleted file mode 100644
index 18645c0..0000000
--- a/contrib/llvm/lib/Target/ARM/AsmPrinter/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
-add_llvm_library(LLVMARMAsmPrinter
- ARMInstPrinter.cpp
- )
-add_dependencies(LLVMARMAsmPrinter ARMCodeGenTable_gen)
diff --git a/contrib/llvm/lib/Target/ARM/AsmPrinter/Makefile b/contrib/llvm/lib/Target/ARM/AsmPrinter/Makefile
deleted file mode 100644
index 65d372e..0000000
--- a/contrib/llvm/lib/Target/ARM/AsmPrinter/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/ARM/AsmPrinter/Makefile ------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMARMAsmPrinter
-
-# Hack: we need to include 'main' arm target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/contrib/llvm/lib/Target/ARM/CMakeLists.txt b/contrib/llvm/lib/Target/ARM/CMakeLists.txt
deleted file mode 100644
index 6b4dee5..0000000
--- a/contrib/llvm/lib/Target/ARM/CMakeLists.txt
+++ /dev/null
@@ -1,50 +0,0 @@
-set(LLVM_TARGET_DEFINITIONS ARM.td)
-
-tablegen(ARMGenRegisterInfo.h.inc -gen-register-desc-header)
-tablegen(ARMGenRegisterNames.inc -gen-register-enums)
-tablegen(ARMGenRegisterInfo.inc -gen-register-desc)
-tablegen(ARMGenInstrNames.inc -gen-instr-enums)
-tablegen(ARMGenInstrInfo.inc -gen-instr-desc)
-tablegen(ARMGenCodeEmitter.inc -gen-emitter)
-tablegen(ARMGenAsmWriter.inc -gen-asm-writer)
-tablegen(ARMGenAsmMatcher.inc -gen-asm-matcher)
-tablegen(ARMGenDAGISel.inc -gen-dag-isel)
-tablegen(ARMGenFastISel.inc -gen-fast-isel)
-tablegen(ARMGenCallingConv.inc -gen-callingconv)
-tablegen(ARMGenSubtarget.inc -gen-subtarget)
-tablegen(ARMGenEDInfo.inc -gen-enhanced-disassembly-info)
-
-add_llvm_target(ARMCodeGen
- ARMAsmPrinter.cpp
- ARMBaseInstrInfo.cpp
- ARMBaseRegisterInfo.cpp
- ARMCodeEmitter.cpp
- ARMConstantIslandPass.cpp
- ARMConstantPoolValue.cpp
- ARMExpandPseudoInsts.cpp
- ARMFastISel.cpp
- ARMGlobalMerge.cpp
- ARMISelDAGToDAG.cpp
- ARMISelLowering.cpp
- ARMInstrInfo.cpp
- ARMJITInfo.cpp
- ARMLoadStoreOptimizer.cpp
- ARMMCAsmInfo.cpp
- ARMMCInstLower.cpp
- ARMRegisterInfo.cpp
- ARMSelectionDAGInfo.cpp
- ARMSubtarget.cpp
- ARMTargetMachine.cpp
- ARMTargetObjectFile.cpp
- NEONMoveFix.cpp
- NEONPreAllocPass.cpp
- Thumb1InstrInfo.cpp
- Thumb1RegisterInfo.cpp
- Thumb2HazardRecognizer.cpp
- Thumb2ITBlockPass.cpp
- Thumb2InstrInfo.cpp
- Thumb2RegisterInfo.cpp
- Thumb2SizeReduction.cpp
- )
-
-target_link_libraries (LLVMARMCodeGen LLVMARMAsmPrinter LLVMSelectionDAG)
diff --git a/contrib/llvm/lib/Target/ARM/Disassembler/Makefile b/contrib/llvm/lib/Target/ARM/Disassembler/Makefile
deleted file mode 100644
index 031b6ac..0000000
--- a/contrib/llvm/lib/Target/ARM/Disassembler/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/ARM/Disassembler/Makefile ----------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMARMDisassembler
-
-# Hack: we need to include 'main' arm target directory to grab private headers
-CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/contrib/llvm/lib/Target/ARM/Makefile b/contrib/llvm/lib/Target/ARM/Makefile
deleted file mode 100644
index b3fcfaf6..0000000
--- a/contrib/llvm/lib/Target/ARM/Makefile
+++ /dev/null
@@ -1,25 +0,0 @@
-##===- lib/Target/ARM/Makefile -----------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMARMCodeGen
-TARGET = ARM
-
-# Make sure that tblgen is run, first thing.
-BUILT_SOURCES = ARMGenRegisterInfo.h.inc ARMGenRegisterNames.inc \
- ARMGenRegisterInfo.inc ARMGenInstrNames.inc \
- ARMGenInstrInfo.inc ARMGenAsmWriter.inc ARMGenAsmMatcher.inc \
- ARMGenDAGISel.inc ARMGenSubtarget.inc \
- ARMGenCodeEmitter.inc ARMGenCallingConv.inc \
- ARMGenDecoderTables.inc ARMGenEDInfo.inc \
- ARMGenFastISel.inc
-
-DIRS = AsmPrinter AsmParser Disassembler TargetInfo
-
-include $(LEVEL)/Makefile.common
diff --git a/contrib/llvm/lib/Target/ARM/README-Thumb.txt b/contrib/llvm/lib/Target/ARM/README-Thumb.txt
deleted file mode 100644
index 6b605bb..0000000
--- a/contrib/llvm/lib/Target/ARM/README-Thumb.txt
+++ /dev/null
@@ -1,248 +0,0 @@
-//===---------------------------------------------------------------------===//
-// Random ideas for the ARM backend (Thumb specific).
-//===---------------------------------------------------------------------===//
-
-* Add support for compiling functions in both ARM and Thumb mode, then taking
- the smallest.
-
-* Add support for compiling individual basic blocks in thumb mode, when in a
- larger ARM function. This can be used for presumed cold code, like paths
- to abort (failure path of asserts), EH handling code, etc.
-
-* Thumb doesn't have normal pre/post increment addressing modes, but you can
- load/store 32-bit integers with pre/postinc by using load/store multiple
- instrs with a single register.
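-  For example, ldmia r0!, {r1} acts as a post-incrementing 32-bit load: it
-  loads the word at [r0] into r1 and advances r0 by 4.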
-
-* Make better use of high registers r8, r10, r11, r12 (ip). Some variants of add
- and cmp instructions can use high registers. Also, we can use them as
- temporaries to spill values into.
-
-* In thumb mode, short, byte, and bool preferred alignments are currently set
-  to 4 to accommodate an ISA restriction (i.e. in add sp, #imm, the imm must be
-  a multiple of 4).
-
-//===---------------------------------------------------------------------===//
-
-Potential jumptable improvements:
-
-* If we know the function size is less than (1 << 16) * 2 bytes, we can use 16-bit
- jumptable entries (e.g. (L1 - L2) >> 1). Or even smaller entries if the
- function is even smaller. This also applies to ARM.
-
-* Thumb jumptable codegen can be improved given some help from the assembler.
-  This is what we generate right now:
-
- .set PCRELV0, (LJTI1_0_0-(LPCRELL0+4))
-LPCRELL0:
- mov r1, #PCRELV0
- add r1, pc
- ldr r0, [r0, r1]
- mov pc, r0
- .align 2
-LJTI1_0_0:
- .long LBB1_3
- ...
-
-Note there is another pc relative add that we can take advantage of.
- add r1, pc, #imm_8 * 4
-
-We should be able to generate:
-
-LPCRELL0:
- add r1, LJTI1_0_0
- ldr r0, [r0, r1]
- mov pc, r0
- .align 2
-LJTI1_0_0:
- .long LBB1_3
-
-if the assembler can translate the add to:
- add r1, pc, #((LJTI1_0_0-(LPCRELL0+4))&0xfffffffc)
-
-Note the assembler also does something similar for constpool loads:
-LPCRELL0:
- ldr r0, LCPI1_0
-=>
- ldr r0, pc, #((LCPI1_0-(LPCRELL0+4))&0xfffffffc)
-
-
-//===---------------------------------------------------------------------===//
-
-We compile the following:
-
-define i16 @func_entry_2E_ce(i32 %i) {
- switch i32 %i, label %bb12.exitStub [
- i32 0, label %bb4.exitStub
- i32 1, label %bb9.exitStub
- i32 2, label %bb4.exitStub
- i32 3, label %bb4.exitStub
- i32 7, label %bb9.exitStub
- i32 8, label %bb.exitStub
- i32 9, label %bb9.exitStub
- ]
-
-bb12.exitStub:
- ret i16 0
-
-bb4.exitStub:
- ret i16 1
-
-bb9.exitStub:
- ret i16 2
-
-bb.exitStub:
- ret i16 3
-}
-
-into:
-
-_func_entry_2E_ce:
- mov r2, #1
- lsl r2, r0
- cmp r0, #9
- bhi LBB1_4 @bb12.exitStub
-LBB1_1: @newFuncRoot
- mov r1, #13
- tst r2, r1
- bne LBB1_5 @bb4.exitStub
-LBB1_2: @newFuncRoot
- ldr r1, LCPI1_0
- tst r2, r1
- bne LBB1_6 @bb9.exitStub
-LBB1_3: @newFuncRoot
- mov r1, #1
- lsl r1, r1, #8
- tst r2, r1
- bne LBB1_7 @bb.exitStub
-LBB1_4: @bb12.exitStub
- mov r0, #0
- bx lr
-LBB1_5: @bb4.exitStub
- mov r0, #1
- bx lr
-LBB1_6: @bb9.exitStub
- mov r0, #2
- bx lr
-LBB1_7: @bb.exitStub
- mov r0, #3
- bx lr
-LBB1_8:
- .align 2
-LCPI1_0:
- .long 642
-
-
-gcc compiles to:
-
- cmp r0, #9
- @ lr needed for prologue
- bhi L2
- ldr r3, L11
- mov r2, #1
- mov r1, r2, asl r0
- ands r0, r3, r2, asl r0
- movne r0, #2
- bxne lr
- tst r1, #13
- beq L9
-L3:
- mov r0, r2
- bx lr
-L9:
- tst r1, #256
- movne r0, #3
- bxne lr
-L2:
- mov r0, #0
- bx lr
-L12:
- .align 2
-L11:
- .long 642
-
-
-GCC is doing a couple of clever things here:
- 1. It is predicating one of the returns. This isn't a clear win though: in
- cases where that return isn't taken, it is replacing one condbranch with
- two 'ne' predicated instructions.
- 2. It is sinking the shift of "1 << i" into the tst, and using ands instead of
- tst. This will probably require whole function isel.
- 3. GCC emits:
- tst r1, #256
- we emit:
- mov r1, #1
- lsl r1, r1, #8
- tst r2, r1
-
-
-//===---------------------------------------------------------------------===//
-
-When spilling in thumb mode and the sp offset is too large to fit in the ldr /
-str offset field, we load the offset from a constpool entry and add it to sp:
-
-ldr r2, LCPI
-add r2, sp
-ldr r2, [r2]
-
-These instructions preserve the condition code, which is important if the spill
-is between a cmp and a bcc instruction. However, we can use the (potentially)
-cheaper sequence if we know it's ok to clobber the condition register.
-
-add r2, sp, #255 * 4
-add r2, #132
-ldr r2, [r2, #7 * 4]
-
-This is especially bad when dynamic alloca is used. Then all fixed size stack
-objects are referenced off the frame pointer with negative offsets. See
-oggenc for an example.
-
-
-//===---------------------------------------------------------------------===//
-
-Poor codegen on test/CodeGen/ARM/select.ll f7:
-
- ldr r5, LCPI1_0
-LPC0:
- add r5, pc
- ldr r6, LCPI1_1
- ldr r2, LCPI1_2
- mov r3, r6
- mov lr, pc
- bx r5
-
-//===---------------------------------------------------------------------===//
-
-Make the register allocator / spiller smarter so we can re-materialize
-"mov r, imm", etc. Almost all Thumb instructions clobber the condition code.
-
-//===---------------------------------------------------------------------===//
-
-Add ldmia, stmia support.
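-
-A tiny illustration (hypothetical C, not from any testcase) of where these
-help:
-
-  void copy2(int *d, const int *s) {
-    int a = s[0];          /* two adjacent word loads...  */
-    int b = s[1];
-    d[0] = a;              /* ...and two adjacent stores  */
-    d[1] = b;
-  }
-
-With ldmia / stmia the body can be ldmia r1!, {r2, r3} followed by
-stmia r0!, {r2, r3} instead of four single-word memory ops.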
-
-//===---------------------------------------------------------------------===//
-
-Thumb load / store address mode offsets are scaled. The values kept in the
-instruction operands are pre-scaled values. This probably ought to be changed
-to avoid extra work when we convert Thumb2 instructions to Thumb1 instructions.
-
-//===---------------------------------------------------------------------===//
-
-We need to make (some of the) Thumb1 instructions predicable. That will allow
-shrinking of predicated Thumb2 instructions. To allow this, we need to be able
-to toggle the 's' bit since they do not set CPSR when they are inside IT blocks.
-
-//===---------------------------------------------------------------------===//
-
-Make use of hi register variants of cmp: tCMPhir / tCMPZhir.
-
-//===---------------------------------------------------------------------===//
-
-Thumb1 immediate fields sometimes keep pre-scaled values. See
-Thumb1RegisterInfo::eliminateFrameIndex. This is inconsistent with ARM and
-Thumb2.
-
-//===---------------------------------------------------------------------===//
-
-Rather than having tBR_JTr print a ".align 2" and having the constant island
-pass pad it, add a target specific ALIGN instruction instead. That way,
-GetInstSizeInBytes won't have to over-estimate. It can also be used by a loop
-alignment pass.
diff --git a/contrib/llvm/lib/Target/ARM/README-Thumb2.txt b/contrib/llvm/lib/Target/ARM/README-Thumb2.txt
deleted file mode 100644
index e7c2552..0000000
--- a/contrib/llvm/lib/Target/ARM/README-Thumb2.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-//===---------------------------------------------------------------------===//
-// Random ideas for the ARM backend (Thumb2 specific).
-//===---------------------------------------------------------------------===//
-
-Make sure jumptable destinations are below the jumptable in order to make use
-of tbb / tbh.
diff --git a/contrib/llvm/lib/Target/ARM/README.txt b/contrib/llvm/lib/Target/ARM/README.txt
deleted file mode 100644
index 9fc3fb9..0000000
--- a/contrib/llvm/lib/Target/ARM/README.txt
+++ /dev/null
@@ -1,659 +0,0 @@
-//===---------------------------------------------------------------------===//
-// Random ideas for the ARM backend.
-//===---------------------------------------------------------------------===//
-
-Reimplement 'select' in terms of 'SEL'.
-
-* We would really like to support UXTAB16, but we need to prove that the
-  add doesn't overflow between the two 16-bit chunks.
-
-* Implement pre/post increment support. (e.g. PR935)
-* Implement smarter constant generation for binops with large immediates.
-
-A few ARMv6T2 ops should be pattern matched: BFI, SBFX, and UBFX
-
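-For illustration, a minimal C sketch (hypothetical, not from any testcase) of
-source that should match these instructions:
-
-  /* UBFX: unsigned bitfield extract, 5 bits starting at bit 3. */
-  unsigned ubfx(unsigned x) { return (x >> 3) & 31; }
-
-  /* SBFX: sign-extending bitfield extract, 8 bits starting at bit 4. */
-  int sbfx(int x) { return (x << 20) >> 24; }
-
-  /* BFI: insert the low 4 bits of y at bits 8..11 of x. */
-  unsigned bfi(unsigned x, unsigned y) {
-    return (x & ~(15u << 8)) | ((y & 15u) << 8);
-  }
-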
-Interesting optimization for PIC codegen on arm-linux:
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43129
-
-//===---------------------------------------------------------------------===//
-
-Crazy idea: Consider code that uses lots of 8-bit or 16-bit values. By the
-time regalloc happens, these values are now in a 32-bit register, usually with
-the top-bits known to be sign or zero extended. If spilled, we should be able
-to spill these to an 8-bit or 16-bit stack slot, zero or sign extending as part
-of the reload.
-
-Doing this reduces the size of the stack frame (important for thumb etc), and
-also increases the likelihood that we will be able to reload multiple values
-from the stack with a single load.
-
-//===---------------------------------------------------------------------===//
-
-The constant island pass is in good shape. Some cleanups might be desirable,
-but there is unlikely to be much improvement in the generated code.
-
-1. There may be some advantage to trying to be smarter about the initial
-placement, rather than putting everything at the end.
-
-2. There might be some compile-time efficiency to be had by representing
-consecutive islands as a single block rather than multiple blocks.
-
-3. Use a priority queue to sort constant pool users in inverse order of
-   position so we always process the one closest to the end of the function
-   first. This may simplify CreateNewWater.
-
-//===---------------------------------------------------------------------===//
-
-Eliminate copysign custom expansion. We are still generating crappy code with
-default expansion + if-conversion.
-
-//===---------------------------------------------------------------------===//
-
-Eliminate one instruction from:
-
-define i32 @_Z6slow4bii(i32 %x, i32 %y) {
- %tmp = icmp sgt i32 %x, %y
- %retval = select i1 %tmp, i32 %x, i32 %y
- ret i32 %retval
-}
-
-__Z6slow4bii:
- cmp r0, r1
- movgt r1, r0
- mov r0, r1
- bx lr
-=>
-
-__Z6slow4bii:
- cmp r0, r1
- movle r0, r1
- bx lr
-
-//===---------------------------------------------------------------------===//
-
-Implement long long "X-3" with instructions that fold the immediate in. These
-were disabled due to badness with the ARM carry flag on subtracts.
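-
-A minimal sketch (hypothetical) of the case in question:
-
-  long long sub3(long long x) { return x - 3; }
-
-With immediate folding this should become something like subs r0, r0, #3
-followed by sbc r1, r1, #0, rather than materializing the constant first.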
-
-//===---------------------------------------------------------------------===//
-
-More load / store optimizations:
-1) Better representation for block transfer? This is from Olden/power:
-
- fldd d0, [r4]
- fstd d0, [r4, #+32]
- fldd d0, [r4, #+8]
- fstd d0, [r4, #+40]
- fldd d0, [r4, #+16]
- fstd d0, [r4, #+48]
- fldd d0, [r4, #+24]
- fstd d0, [r4, #+56]
-
-If we can spare the registers, it would be better to use fldm and fstm here.
-Need major register allocator enhancement though.
-
-2) Can we recognize the relative position of constantpool entries? i.e. Treat
-
- ldr r0, LCPI17_3
- ldr r1, LCPI17_4
- ldr r2, LCPI17_5
-
- as
- ldr r0, LCPI17
- ldr r1, LCPI17+4
- ldr r2, LCPI17+8
-
- Then the ldr's can be combined into a single ldm. See Olden/power.
-
-Note that for ARM v4, gcc uses ldmia to load a pair of 32-bit values to represent a
-double 64-bit FP constant:
-
- adr r0, L6
- ldmia r0, {r0-r1}
-
- .align 2
-L6:
- .long -858993459
- .long 1074318540
-
-3) struct copies appear to be done field by field
-instead of by words, at least sometimes:
-
-struct foo { int x; short s; char c1; char c2; };
-void cpy(struct foo*a, struct foo*b) { *a = *b; }
-
-llvm code (-O2)
- ldrb r3, [r1, #+6]
- ldr r2, [r1]
- ldrb r12, [r1, #+7]
- ldrh r1, [r1, #+4]
- str r2, [r0]
- strh r1, [r0, #+4]
- strb r3, [r0, #+6]
- strb r12, [r0, #+7]
-gcc code (-O2)
- ldmia r1, {r1-r2}
- stmia r0, {r1-r2}
-
-In this benchmark poor handling of aggregate copies has shown up as
-having a large effect on size, and possibly speed as well (we don't have
-a good way to measure on ARM).
-
-//===---------------------------------------------------------------------===//
-
-* Consider this silly example:
-
-double bar(double x) {
- double r = foo(3.1);
- return x+r;
-}
-
-_bar:
- stmfd sp!, {r4, r5, r7, lr}
- add r7, sp, #8
- mov r4, r0
- mov r5, r1
- fldd d0, LCPI1_0
- fmrrd r0, r1, d0
- bl _foo
- fmdrr d0, r4, r5
- fmsr s2, r0
- fsitod d1, s2
- faddd d0, d1, d0
- fmrrd r0, r1, d0
- ldmfd sp!, {r4, r5, r7, pc}
-
-Ignore the prologue and epilogue stuff for a second. Note
- mov r4, r0
- mov r5, r1
-the copies to callee-save registers and the fact that they are only used by the
-fmdrr instruction. It would have been better had the fmdrr been scheduled
-before the call, placing the result in a callee-save DPR register. The two
-mov ops would not have been necessary.
-
-//===---------------------------------------------------------------------===//
-
-Calling convention related stuff:
-
-* gcc's parameter passing implementation is terrible and we suffer as a result:
-
-e.g.
-struct s {
- double d1;
- int s1;
-};
-
-void foo(struct s S) {
- printf("%g, %d\n", S.d1, S.s1);
-}
-
-'S' is passed via registers r0, r1, r2. But gcc stores them to the stack, and
-then reloads them to r1, r2, and r3 before issuing the call (r0 contains the
-address of the format string):
-
- stmfd sp!, {r7, lr}
- add r7, sp, #0
- sub sp, sp, #12
- stmia sp, {r0, r1, r2}
- ldmia sp, {r1-r2}
- ldr r0, L5
- ldr r3, [sp, #8]
-L2:
- add r0, pc, r0
- bl L_printf$stub
-
-Instead of a stmia, ldmia, and a ldr, wouldn't it be better to do three moves?
-
-* Returning an aggregate type is even worse:
-
-e.g.
-struct s foo(void) {
- struct s S = {1.1, 2};
- return S;
-}
-
- mov ip, r0
- ldr r0, L5
- sub sp, sp, #12
-L2:
- add r0, pc, r0
- @ lr needed for prologue
- ldmia r0, {r0, r1, r2}
- stmia sp, {r0, r1, r2}
- stmia ip, {r0, r1, r2}
- mov r0, ip
- add sp, sp, #12
- bx lr
-
-r0 (and later ip) is the hidden parameter from the caller in which to store the value. The
-first ldmia loads the constants into r0, r1, r2. The last stmia stores r0, r1,
-r2 into the address passed in. However, there is one additional stmia that
-stores r0, r1, and r2 to some stack location. The store is dead.
-
-The llvm-gcc generated code looks like this:
-
-csretcc void %foo(%struct.s* %agg.result) {
-entry:
- %S = alloca %struct.s, align 4 ; <%struct.s*> [#uses=1]
- %memtmp = alloca %struct.s ; <%struct.s*> [#uses=1]
- cast %struct.s* %S to sbyte* ; <sbyte*>:0 [#uses=2]
- call void %llvm.memcpy.i32( sbyte* %0, sbyte* cast ({ double, int }* %C.0.904 to sbyte*), uint 12, uint 4 )
- cast %struct.s* %agg.result to sbyte* ; <sbyte*>:1 [#uses=2]
- call void %llvm.memcpy.i32( sbyte* %1, sbyte* %0, uint 12, uint 0 )
- cast %struct.s* %memtmp to sbyte* ; <sbyte*>:2 [#uses=1]
- call void %llvm.memcpy.i32( sbyte* %2, sbyte* %1, uint 12, uint 0 )
- ret void
-}
-
-llc ends up issuing two memcpy's (the first memcpy becomes 3 loads from
-constantpool). Perhaps we should 1) fix llvm-gcc so the memcpy is translated
-into a number of loads and stores, or 2) custom lower memcpy (of small size) to
-be ldmia / stmia. I think option 2 is better but the current register
-allocator cannot allocate a chunk of registers at a time.
-
-A feasible temporary solution is to use specific physical registers at
-lowering time for small (<= 4 words?) transfer sizes.
-
-* ARM CSRet calling convention requires the hidden argument to be returned by
-the callee.
-
-//===---------------------------------------------------------------------===//
-
-We can definitely do a better job on BB placement to eliminate some branches.
-It's very common to see llvm generated assembly code that looks like this:
-
-LBB3:
- ...
-LBB4:
-...
- beq LBB3
- b LBB2
-
-If BB4 is the only predecessor of BB3, then we can emit BB3 after BB4. We can
-then eliminate the beq and turn the unconditional branch to LBB2 into a bne.
-
-See McCat/18-imp/ComputeBoundingBoxes for an example.
-
-//===---------------------------------------------------------------------===//
-
-Pre-/post- indexed load / stores:
-
-1) We should not make the pre/post- indexed load/store transform if the base ptr
-is guaranteed to be live beyond the load/store. This can happen if the base
-ptr is live out of the block in which we are performing the optimization. e.g.
-
-mov r1, r2
-ldr r3, [r1], #4
-...
-
-vs.
-
-ldr r3, [r2]
-add r1, r2, #4
-...
-
-In most cases, this is just a wasted optimization. However, sometimes it can
-negatively impact performance because two-address code is more restrictive
-when it comes to scheduling.
-
-Unfortunately, liveout information is currently unavailable during DAG combine
-time.
-
-2) Consider splitting an indexed load / store into a pair of add/sub + load/store
- to solve #1 (in TwoAddressInstructionPass.cpp).
-
-3) Enhance LSR to generate more opportunities for indexed ops.
-
-4) Once we have added support for multiple result patterns, write indexed load
-   patterns instead of C++ instruction selection code.
-
-5) Use VLDM / VSTM to emulate indexed FP load / store.
-
-//===---------------------------------------------------------------------===//
-
-Implement support for some more tricky ways to materialize immediates. For
-example, to get 0xffff8000, we can use:
-
-mov r9, #&3f8000
-sub r9, r9, #&400000
-
-//===---------------------------------------------------------------------===//
-
-We sometimes generate multiple add / sub instructions to update sp in the
-prologue and epilogue if the inc / dec value is too large to fit in a single
-immediate operand. In some cases, it might be better to load the value from a
-constantpool instead.
-
-//===---------------------------------------------------------------------===//
-
-GCC generates significantly better code for this function.
-
-int foo(int StackPtr, unsigned char *Line, unsigned char *Stack, int LineLen) {
- int i = 0;
-
- if (StackPtr != 0) {
- while (StackPtr != 0 && i < (((LineLen) < (32768))? (LineLen) : (32768)))
- Line[i++] = Stack[--StackPtr];
- if (LineLen > 32768)
- {
- while (StackPtr != 0 && i < LineLen)
- {
- i++;
- --StackPtr;
- }
- }
- }
- return StackPtr;
-}
-
-//===---------------------------------------------------------------------===//
-
-This should compile to the mlas instruction:
-int mlas(int x, int y, int z) { return ((x * y + z) < 0) ? 7 : 13; }
-
-//===---------------------------------------------------------------------===//
-
-At some point, we should triage these to see if they still apply to us:
-
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19598
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=18560
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=27016
-
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11831
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11826
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11825
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11824
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11823
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11820
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=10982
-
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=10242
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9831
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9760
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9759
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9703
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9702
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9663
-
-http://www.inf.u-szeged.hu/gcc-arm/
-http://citeseer.ist.psu.edu/debus04linktime.html
-
-//===---------------------------------------------------------------------===//
-
-gcc generates smaller code for this function at -O2 or -Os:
-
-void foo(signed char* p) {
- if (*p == 3)
- bar();
- else if (*p == 4)
- baz();
- else if (*p == 5)
- quux();
-}
-
-llvm decides it's a good idea to turn the repeated if...else into a
-binary tree, as if it were a switch; the resulting code requires one fewer
-compare-and-branch when *p<=2 or *p==5, the same number if *p==4
-or *p>6, and one more if *p==3. So it should be a speed win
-(on balance). However, the revised code is larger, with 4 conditional
-branches instead of 3.
-
-More seriously, there is a byte->word extend before
-each comparison, where there should be only one, and the condition codes
-are not remembered when the same two values are compared twice.
-
-//===---------------------------------------------------------------------===//
-
-More LSR enhancements possible:
-
-1. Teach LSR about pre- and post- indexed ops to allow the iv increment to be
-   merged into a load / store (see the sketch after this list).
-2. Allow iv reuse even when a type conversion is required. For example, i8
- and i32 load / store addressing modes are identical.
-
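-A sketch (hypothetical C) of the kind of loop item 1 targets:
-
-  int sum(const int *p, int n) {
-    int s = 0;
-    for (int i = 0; i < n; ++i)
-      s += p[i];          /* iv increment should fold into the load */
-    return s;
-  }
-
-Ideally the pointer advance folds into a post-indexed load such as
-ldr r3, [r0], #4 instead of a separate add.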
-
-//===---------------------------------------------------------------------===//
-
-This:
-
-int foo(int a, int b, int c, int d) {
- long long acc = (long long)a * (long long)b;
- acc += (long long)c * (long long)d;
- return (int)(acc >> 32);
-}
-
-Should compile to use SMLAL (Signed Multiply Accumulate Long) which multiplies
-two signed 32-bit values to produce a 64-bit value, and accumulates this with
-a 64-bit value.
-
-We currently get this with both v4 and v6:
-
-_foo:
- smull r1, r0, r1, r0
- smull r3, r2, r3, r2
- adds r3, r3, r1
- adc r0, r2, r0
- bx lr
-
-//===---------------------------------------------------------------------===//
-
-This:
- #include <utility>
- std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
- { return std::make_pair(a + b, a + b < a); }
- bool no_overflow(unsigned a, unsigned b)
- { return !full_add(a, b).second; }
-
-Should compile to:
-
-_Z8full_addjj:
- adds r2, r1, r2
- movcc r1, #0
- movcs r1, #1
- str r2, [r0, #0]
- strb r1, [r0, #4]
- mov pc, lr
-
-_Z11no_overflowjj:
- cmn r0, r1
- movcs r0, #0
- movcc r0, #1
- mov pc, lr
-
-not:
-
-__Z8full_addjj:
- add r3, r2, r1
- str r3, [r0]
- mov r2, #1
- mov r12, #0
- cmp r3, r1
- movlo r12, r2
- str r12, [r0, #+4]
- bx lr
-__Z11no_overflowjj:
- add r3, r1, r0
- mov r2, #1
- mov r1, #0
- cmp r3, r0
- movhs r1, r2
- mov r0, r1
- bx lr
-
-//===---------------------------------------------------------------------===//
-
-Some of the NEON intrinsics may be appropriate for more general use, either
-as target-independent intrinsics or perhaps elsewhere in the ARM backend.
-Some of them may also be lowered to target-independent SDNodes, and perhaps
-some new SDNodes could be added.
-
-For example, maximum, minimum, and absolute value operations are well-defined
-and standard operations, both for vector and scalar types.
-
-The current NEON-specific intrinsics for count leading zeros and count one
-bits could perhaps be replaced by the target-independent ctlz and ctpop
-intrinsics. It may also make sense to add a target-independent "ctls"
-intrinsic for "count leading sign bits". Likewise, the backend could use
-the target-independent SDNodes for these operations.
-
-ARMv6 has scalar saturating and halving adds and subtracts. The same
-intrinsics could possibly be used for both NEON's vector implementations of
-those operations and the ARMv6 scalar versions.
-
-//===---------------------------------------------------------------------===//
-
-ARM::MOVCCr is commutable (by flipping the condition). But we need to implement
-ARMInstrInfo::commuteInstruction() to support it.
-
-//===---------------------------------------------------------------------===//
-
-Split out LDR (literal) from the normal ARM LDR instruction. Also consider
-splitting LDR into imm12 and so_reg forms. This allows us to clean up some
-code. e.g. ARMLoadStoreOptimizer does not need to look at LDR (literal) and
-LDR (so_reg) while ARMConstantIslandPass only needs to worry about
-LDR (literal).
-
-//===---------------------------------------------------------------------===//
-
-Constant island pass should make use of full range SoImm values for LEApcrel.
-Be careful though as the last attempt caused infinite looping on lencod.
-
-//===---------------------------------------------------------------------===//
-
-Predication issue. This function:
-
-extern unsigned array[ 128 ];
-int foo( int x ) {
- int y;
- y = array[ x & 127 ];
- if ( x & 128 )
- y = 123456789 & ( y >> 2 );
- else
- y = 123456789 & y;
- return y;
-}
-
-compiles to:
-
-_foo:
- and r1, r0, #127
- ldr r2, LCPI1_0
- ldr r2, [r2]
- ldr r1, [r2, +r1, lsl #2]
- mov r2, r1, lsr #2
- tst r0, #128
- moveq r2, r1
- ldr r0, LCPI1_1
- and r0, r2, r0
- bx lr
-
-It would be better to do something like this, to fold the shift into the
-conditional move:
-
- and r1, r0, #127
- ldr r2, LCPI1_0
- ldr r2, [r2]
- ldr r1, [r2, +r1, lsl #2]
- tst r0, #128
- movne r1, r1, lsr #2
- ldr r0, LCPI1_1
- and r0, r1, r0
- bx lr
-
-This saves an instruction and a register.
-
-//===---------------------------------------------------------------------===//
-
-It might be profitable to cse MOVi16 if there are lots of 32-bit immediates
-with the same bottom half.
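-
-A hypothetical case where the bottom halves match:
-
-  unsigned pick(int c) {
-    return c ? 0x00011234u : 0x00051234u;
-  }
-
-Both constants share the low half 0x1234, so a single movw could be CSE'd and
-only the movt would differ.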
-
-//===---------------------------------------------------------------------===//
-
-Robert Muth started working on an alternate jump table implementation that
-does not put the tables in-line in the text. This is more like the llvm
-default jump table implementation. This might be useful sometime. Several
-revisions of patches are on the mailing list, beginning at:
-http://lists.cs.uiuc.edu/pipermail/llvmdev/2009-June/022763.html
-
-//===---------------------------------------------------------------------===//
-
-Make use of the "rbit" instruction.
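-
-For reference, a portable C bit-reverse (a sketch; recognizing this pattern is
-hard, so an intrinsic or builtin is the more likely route to rbit):
-
-  unsigned reverse(unsigned x) {
-    x = ((x & 0x55555555u) << 1) | ((x >> 1) & 0x55555555u);
-    x = ((x & 0x33333333u) << 2) | ((x >> 2) & 0x33333333u);
-    x = ((x & 0x0f0f0f0fu) << 4) | ((x >> 4) & 0x0f0f0f0fu);
-    x = ((x & 0x00ff00ffu) << 8) | ((x >> 8) & 0x00ff00ffu);
-    return (x << 16) | (x >> 16);
-  }
-
-With rbit (ARMv6T2 and later) the whole function is a single instruction.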
-
-//===---------------------------------------------------------------------===//
-
-Take a look at test/CodeGen/Thumb2/machine-licm.ll. ARM should be taught how
-to licm and cse the unnecessary load from cp#1.
-
-//===---------------------------------------------------------------------===//
-
-The CMN instruction sets the flags like an ADD instruction, while CMP sets
-them like a subtract. Therefore to be able to use CMN for comparisons other
-than the Z bit, we'll need additional logic to reverse the conditionals
-associated with the comparison. Perhaps a pseudo-instruction for the comparison,
-with a post-codegen pass to clean up and handle the condition codes?
-See PR5694 for testcase.
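-
-A small hypothetical example of a non-Z-bit use of CMN:
-
-  int f(int a, int b) { return a < -b; }
-
-Here cmn a, b sets flags for a + b, and (modulo the INT_MIN corner case)
-a < -b is a + b < 0, so the branch condition has to be picked against
-add-style flags rather than the usual subtract-style ones.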
-
-//===---------------------------------------------------------------------===//
-
-Given the following on armv5:
-int test1(int A, int B) {
- return (A&-8388481)|(B&8388480);
-}
-
-We currently generate:
- ldr r2, .LCPI0_0
- and r0, r0, r2
- ldr r2, .LCPI0_1
- and r1, r1, r2
- orr r0, r1, r0
- bx lr
-
-We should be able to replace the second ldr+and with a bic (i.e. reuse the
-constant which was already loaded). Not sure what's necessary to do that.
-
-//===---------------------------------------------------------------------===//
-
-The code generated for bswap on armv4/5 (CPUs without rev) is less than ideal:
-
-int a(int x) { return __builtin_bswap32(x); }
-
-a:
- mov r1, #255, 24
- mov r2, #255, 16
- and r1, r1, r0, lsr #8
- and r2, r2, r0, lsl #8
- orr r1, r1, r0, lsr #24
- orr r0, r2, r0, lsl #24
- orr r0, r0, r1
- bx lr
-
-Something like the following would be better (fewer instructions/registers):
- eor r1, r0, r0, ror #16
- bic r1, r1, #0xff0000
- mov r1, r1, lsr #8
- eor r0, r1, r0, ror #8
- bx lr
-
-A custom Thumb version would also be a slight improvement over the generic
-version.
-
-//===---------------------------------------------------------------------===//
-
-Consider the following simple C code:
-
-void foo(unsigned char *a, unsigned char *b, int *c) {
- if ((*a | *b) == 0) *c = 0;
-}
-
-currently llvm-gcc generates something like this (nice branchless code I'd say):
-
- ldrb r0, [r0]
- ldrb r1, [r1]
- orr r0, r1, r0
- tst r0, #255
- moveq r0, #0
- streq r0, [r2]
- bx lr
-
-Note that both "tst" and "moveq" are redundant.
-
-//===---------------------------------------------------------------------===//
-
diff --git a/contrib/llvm/lib/Target/ARM/TargetInfo/CMakeLists.txt b/contrib/llvm/lib/Target/ARM/TargetInfo/CMakeLists.txt
deleted file mode 100644
index 3910bb0..0000000
--- a/contrib/llvm/lib/Target/ARM/TargetInfo/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
-add_llvm_library(LLVMARMInfo
- ARMTargetInfo.cpp
- )
-
-add_dependencies(LLVMARMInfo ARMCodeGenTable_gen)
diff --git a/contrib/llvm/lib/Target/ARM/TargetInfo/Makefile b/contrib/llvm/lib/Target/ARM/TargetInfo/Makefile
deleted file mode 100644
index 6292ab1..0000000
--- a/contrib/llvm/lib/Target/ARM/TargetInfo/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/ARM/TargetInfo/Makefile ------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMARMInfo
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common