summaryrefslogtreecommitdiffstats
path: root/contrib/llvm/lib/Target/PowerPC
diff options
context:
space:
mode:
authordim <dim@FreeBSD.org>2010-10-11 17:22:16 +0000
committerdim <dim@FreeBSD.org>2010-10-11 17:22:16 +0000
commit1fc65a65fe54635d0e564559ba5a7b8a8a42d4d6 (patch)
treede75a464c5dac7eceb2dbbad8b4d4e1479d79e08 /contrib/llvm/lib/Target/PowerPC
parentf4f7191cd223adebacee3fad260ed60935be9cb9 (diff)
downloadFreeBSD-src-1fc65a65fe54635d0e564559ba5a7b8a8a42d4d6.zip
FreeBSD-src-1fc65a65fe54635d0e564559ba5a7b8a8a42d4d6.tar.gz
Remove more unneeded files and directories from contrib/llvm. This
still allows us to build tblgen and clang, and further reduces the footprint in the tree. Approved by: rpaulo (mentor)
Diffstat (limited to 'contrib/llvm/lib/Target/PowerPC')
-rw-r--r--contrib/llvm/lib/Target/PowerPC/AsmPrinter/CMakeLists.txt6
-rw-r--r--contrib/llvm/lib/Target/PowerPC/AsmPrinter/Makefile15
-rw-r--r--contrib/llvm/lib/Target/PowerPC/CMakeLists.txt30
-rw-r--r--contrib/llvm/lib/Target/PowerPC/Makefile23
-rw-r--r--contrib/llvm/lib/Target/PowerPC/README.txt914
-rw-r--r--contrib/llvm/lib/Target/PowerPC/README_ALTIVEC.txt211
-rw-r--r--contrib/llvm/lib/Target/PowerPC/TargetInfo/CMakeLists.txt7
-rw-r--r--contrib/llvm/lib/Target/PowerPC/TargetInfo/Makefile15
8 files changed, 0 insertions, 1221 deletions
diff --git a/contrib/llvm/lib/Target/PowerPC/AsmPrinter/CMakeLists.txt b/contrib/llvm/lib/Target/PowerPC/AsmPrinter/CMakeLists.txt
deleted file mode 100644
index 42cd486..0000000
--- a/contrib/llvm/lib/Target/PowerPC/AsmPrinter/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
-add_llvm_library(LLVMPowerPCAsmPrinter
- PPCAsmPrinter.cpp
- )
-add_dependencies(LLVMPowerPCAsmPrinter PowerPCCodeGenTable_gen)
diff --git a/contrib/llvm/lib/Target/PowerPC/AsmPrinter/Makefile b/contrib/llvm/lib/Target/PowerPC/AsmPrinter/Makefile
deleted file mode 100644
index bd5dce1..0000000
--- a/contrib/llvm/lib/Target/PowerPC/AsmPrinter/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/PowerPC/AsmPrinter/Makefile --------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMPowerPCAsmPrinter
-
-# Hack: we need to include 'main' PowerPC target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/contrib/llvm/lib/Target/PowerPC/CMakeLists.txt b/contrib/llvm/lib/Target/PowerPC/CMakeLists.txt
deleted file mode 100644
index 7ffc5eb..0000000
--- a/contrib/llvm/lib/Target/PowerPC/CMakeLists.txt
+++ /dev/null
@@ -1,30 +0,0 @@
-set(LLVM_TARGET_DEFINITIONS PPC.td)
-
-tablegen(PPCGenInstrNames.inc -gen-instr-enums)
-tablegen(PPCGenRegisterNames.inc -gen-register-enums)
-tablegen(PPCGenAsmWriter.inc -gen-asm-writer)
-tablegen(PPCGenCodeEmitter.inc -gen-emitter)
-tablegen(PPCGenRegisterInfo.h.inc -gen-register-desc-header)
-tablegen(PPCGenRegisterInfo.inc -gen-register-desc)
-tablegen(PPCGenInstrInfo.inc -gen-instr-desc)
-tablegen(PPCGenDAGISel.inc -gen-dag-isel)
-tablegen(PPCGenCallingConv.inc -gen-callingconv)
-tablegen(PPCGenSubtarget.inc -gen-subtarget)
-
-add_llvm_target(PowerPCCodeGen
- PPCBranchSelector.cpp
- PPCCodeEmitter.cpp
- PPCHazardRecognizers.cpp
- PPCInstrInfo.cpp
- PPCISelDAGToDAG.cpp
- PPCISelLowering.cpp
- PPCJITInfo.cpp
- PPCMCAsmInfo.cpp
- PPCPredicates.cpp
- PPCRegisterInfo.cpp
- PPCSubtarget.cpp
- PPCTargetMachine.cpp
- PPCSelectionDAGInfo.cpp
- )
-
-target_link_libraries (LLVMPowerPCCodeGen LLVMSelectionDAG)
diff --git a/contrib/llvm/lib/Target/PowerPC/Makefile b/contrib/llvm/lib/Target/PowerPC/Makefile
deleted file mode 100644
index 1265f1d..0000000
--- a/contrib/llvm/lib/Target/PowerPC/Makefile
+++ /dev/null
@@ -1,23 +0,0 @@
-##===- lib/Target/PowerPC/Makefile -------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMPowerPCCodeGen
-TARGET = PPC
-
-# Make sure that tblgen is run, first thing.
-BUILT_SOURCES = PPCGenInstrNames.inc PPCGenRegisterNames.inc \
- PPCGenAsmWriter.inc PPCGenCodeEmitter.inc \
- PPCGenRegisterInfo.h.inc PPCGenRegisterInfo.inc \
- PPCGenInstrInfo.inc PPCGenDAGISel.inc \
- PPCGenSubtarget.inc PPCGenCallingConv.inc
-
-DIRS = AsmPrinter TargetInfo
-
-include $(LEVEL)/Makefile.common
diff --git a/contrib/llvm/lib/Target/PowerPC/README.txt b/contrib/llvm/lib/Target/PowerPC/README.txt
deleted file mode 100644
index 3465779..0000000
--- a/contrib/llvm/lib/Target/PowerPC/README.txt
+++ /dev/null
@@ -1,914 +0,0 @@
-//===- README.txt - Notes for improving PowerPC-specific code gen ---------===//
-
-TODO:
-* gpr0 allocation
-* implement do-loop -> bdnz transform
-* lmw/stmw pass a la arm load store optimizer for prolog/epilog
-
-===-------------------------------------------------------------------------===
-
-On PPC64, this:
-
-long f2 (long x) { return 0xfffffff000000000UL; }
-long f3 (long x) { return 0x1ffffffffUL; }
-
-could compile into:
-
-_f2:
- li r3,-1
- rldicr r3,r3,0,27
- blr
-_f3:
- li r3,-1
- rldicl r3,r3,0,31
- blr
-
-we produce:
-
-_f2:
- lis r2, 4095
- ori r2, r2, 65535
- sldi r3, r2, 36
- blr
-_f3:
- li r2, 1
- sldi r2, r2, 32
- oris r2, r2, 65535
- ori r3, r2, 65535
- blr
-
-
-===-------------------------------------------------------------------------===
-
-Support 'update' load/store instructions. These are cracked on the G5, but are
-still a codesize win.
-
-With preinc enabled, this:
-
-long *%test4(long *%X, long *%dest) {
- %Y = getelementptr long* %X, int 4
- %A = load long* %Y
- store long %A, long* %dest
- ret long* %Y
-}
-
-compiles to:
-
-_test4:
- mr r2, r3
- lwzu r5, 32(r2)
- lwz r3, 36(r3)
- stw r5, 0(r4)
- stw r3, 4(r4)
- mr r3, r2
- blr
-
-with -sched=list-burr, I get:
-
-_test4:
- lwz r2, 36(r3)
- lwzu r5, 32(r3)
- stw r2, 4(r4)
- stw r5, 0(r4)
- blr
-
-===-------------------------------------------------------------------------===
-
-We compile the hottest inner loop of viterbi to:
-
- li r6, 0
- b LBB1_84 ;bb432.i
-LBB1_83: ;bb420.i
- lbzx r8, r5, r7
- addi r6, r7, 1
- stbx r8, r4, r7
-LBB1_84: ;bb432.i
- mr r7, r6
- cmplwi cr0, r7, 143
- bne cr0, LBB1_83 ;bb420.i
-
-The CBE manages to produce:
-
- li r0, 143
- mtctr r0
-loop:
- lbzx r2, r2, r11
- stbx r0, r2, r9
- addi r2, r2, 1
- bdz later
- b loop
-
-This could be much better (bdnz instead of bdz) but it still beats us. If we
-produced this with bdnz, the loop would be a single dispatch group.
-
-===-------------------------------------------------------------------------===
-
-Compile:
-
-void foo(int *P) {
- if (P) *P = 0;
-}
-
-into:
-
-_foo:
- cmpwi cr0,r3,0
- beqlr cr0
- li r0,0
- stw r0,0(r3)
- blr
-
-This is effectively a simple form of predication.
-
-===-------------------------------------------------------------------------===
-
-Lump the constant pool for each function into ONE pic object, and reference
-pieces of it as offsets from the start. For functions like this (contrived
-to have lots of constants obviously):
-
-double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
-
-We generate:
-
-_X:
- lis r2, ha16(.CPI_X_0)
- lfd f0, lo16(.CPI_X_0)(r2)
- lis r2, ha16(.CPI_X_1)
- lfd f2, lo16(.CPI_X_1)(r2)
- fmadd f0, f1, f0, f2
- lis r2, ha16(.CPI_X_2)
- lfd f1, lo16(.CPI_X_2)(r2)
- lis r2, ha16(.CPI_X_3)
- lfd f2, lo16(.CPI_X_3)(r2)
- fmadd f1, f0, f1, f2
- blr
-
-It would be better to materialize .CPI_X into a register, then use immediates
-off of the register to avoid the lis's. This is even more important in PIC
-mode.
-
-Note that this (and the static variable version) is discussed here for GCC:
-http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
-
-Here's another example (the sgn function):
-double testf(double a) {
- return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
-}
-
-it produces a BB like this:
-LBB1_1: ; cond_true
- lis r2, ha16(LCPI1_0)
- lfs f0, lo16(LCPI1_0)(r2)
- lis r2, ha16(LCPI1_1)
- lis r3, ha16(LCPI1_2)
- lfs f2, lo16(LCPI1_2)(r3)
- lfs f3, lo16(LCPI1_1)(r2)
- fsub f0, f0, f1
- fsel f1, f0, f2, f3
- blr
-
-===-------------------------------------------------------------------------===
-
-PIC Code Gen IPO optimization:
-
-Squish small scalar globals together into a single global struct, allowing the
-address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
-of the GOT on targets with one).
-
-Note that this is discussed here for GCC:
-http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
-
-===-------------------------------------------------------------------------===
-
-Implement Newton-Rhapson method for improving estimate instructions to the
-correct accuracy, and implementing divide as multiply by reciprocal when it has
-more than one use. Itanium would want this too.
-
-===-------------------------------------------------------------------------===
-
-Compile offsets from allocas:
-
-int *%test() {
- %X = alloca { int, int }
- %Y = getelementptr {int,int}* %X, int 0, uint 1
- ret int* %Y
-}
-
-into a single add, not two:
-
-_test:
- addi r2, r1, -8
- addi r3, r2, 4
- blr
-
---> important for C++.
-
-===-------------------------------------------------------------------------===
-
-No loads or stores of the constants should be needed:
-
-struct foo { double X, Y; };
-void xxx(struct foo F);
-void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
-
-===-------------------------------------------------------------------------===
-
-Darwin Stub removal:
-
-We still generate calls to foo$stub, and stubs, on Darwin. This is not
-necessary when building with the Leopard (10.5) or later linker, as stubs are
-generated by ld when necessary. Parameterizing this based on the deployment
-target (-mmacosx-version-min) is probably enough. x86-32 does this right, see
-its logic.
-
-===-------------------------------------------------------------------------===
-
-Darwin Stub LICM optimization:
-
-Loops like this:
-
- for (...) bar();
-
-Have to go through an indirect stub if bar is external or linkonce. It would
-be better to compile it as:
-
- fp = &bar;
- for (...) fp();
-
-which only computes the address of bar once (instead of each time through the
-stub). This is Darwin specific and would have to be done in the code generator.
-Probably not a win on x86.
-
-===-------------------------------------------------------------------------===
-
-Simple IPO for argument passing, change:
- void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
-
-the Darwin ABI specifies that any integer arguments in the first 32 bytes worth
-of arguments get assigned to r3 through r10. That is, if you have a function
-foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
-argument bytes for r4 and r5. The trick then would be to shuffle the argument
-order for functions we can internalize so that the maximum number of
-integers/pointers get passed in regs before you see any of the fp arguments.
-
-Instead of implementing this, it would actually probably be easier to just
-implement a PPC fastcc, where we could do whatever we wanted to the CC,
-including having this work sanely.
-
-===-------------------------------------------------------------------------===
-
-Fix Darwin FP-In-Integer Registers ABI
-
-Darwin passes doubles in structures in integer registers, which is very very
-bad. Add something like a BIT_CONVERT to LLVM, then do an i-p transformation
-that percolates these things out of functions.
-
-Check out how horrible this is:
-http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
-
-This is an extension of "interprocedural CC unmunging" that can't be done with
-just fastcc.
-
-===-------------------------------------------------------------------------===
-
-Compile this:
-
-int foo(int a) {
- int b = (a < 8);
- if (b) {
- return b * 3; // ignore the fact that this is always 3.
- } else {
- return 2;
- }
-}
-
-into something not this:
-
-_foo:
-1) cmpwi cr7, r3, 8
- mfcr r2, 1
- rlwinm r2, r2, 29, 31, 31
-1) cmpwi cr0, r3, 7
- bgt cr0, LBB1_2 ; UnifiedReturnBlock
-LBB1_1: ; then
- rlwinm r2, r2, 0, 31, 31
- mulli r3, r2, 3
- blr
-LBB1_2: ; UnifiedReturnBlock
- li r3, 2
- blr
-
-In particular, the two compares (marked 1) could be shared by reversing one.
-This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
-same operands (but backwards) exists. In this case, this wouldn't save us
-anything though, because the compares still wouldn't be shared.
-
-===-------------------------------------------------------------------------===
-
-We should custom expand setcc instead of pretending that we have it. That
-would allow us to expose the access of the crbit after the mfcr, allowing
-that access to be trivially folded into other ops. A simple example:
-
-int foo(int a, int b) { return (a < b) << 4; }
-
-compiles into:
-
-_foo:
- cmpw cr7, r3, r4
- mfcr r2, 1
- rlwinm r2, r2, 29, 31, 31
- slwi r3, r2, 4
- blr
-
-===-------------------------------------------------------------------------===
-
-Fold add and sub with constant into non-extern, non-weak addresses so this:
-
-static int a;
-void bar(int b) { a = b; }
-void foo(unsigned char *c) {
- *c = a;
-}
-
-So that
-
-_foo:
- lis r2, ha16(_a)
- la r2, lo16(_a)(r2)
- lbz r2, 3(r2)
- stb r2, 0(r3)
- blr
-
-Becomes
-
-_foo:
- lis r2, ha16(_a+3)
- lbz r2, lo16(_a+3)(r2)
- stb r2, 0(r3)
- blr
-
-===-------------------------------------------------------------------------===
-
-We generate really bad code for this:
-
-int f(signed char *a, _Bool b, _Bool c) {
- signed char t = 0;
- if (b) t = *a;
- if (c) *a = t;
-}
-
-===-------------------------------------------------------------------------===
-
-This:
-int test(unsigned *P) { return *P >> 24; }
-
-Should compile to:
-
-_test:
- lbz r3,0(r3)
- blr
-
-not:
-
-_test:
- lwz r2, 0(r3)
- srwi r3, r2, 24
- blr
-
-===-------------------------------------------------------------------------===
-
-On the G5, logical CR operations are more expensive in their three
-address form: ops that read/write the same register are half as expensive as
-those that read from two registers that are different from their destination.
-
-We should model this with two separate instructions. The isel should generate
-the "two address" form of the instructions. When the register allocator
-detects that it needs to insert a copy due to the two-addresness of the CR
-logical op, it will invoke PPCInstrInfo::convertToThreeAddress. At this point
-we can convert to the "three address" instruction, to save code space.
-
-This only matters when we start generating cr logical ops.
-
-===-------------------------------------------------------------------------===
-
-We should compile these two functions to the same thing:
-
-#include <stdlib.h>
-void f(int a, int b, int *P) {
- *P = (a-b)>=0?(a-b):(b-a);
-}
-void g(int a, int b, int *P) {
- *P = abs(a-b);
-}
-
-Further, they should compile to something better than:
-
-_g:
- subf r2, r4, r3
- subfic r3, r2, 0
- cmpwi cr0, r2, -1
- bgt cr0, LBB2_2 ; entry
-LBB2_1: ; entry
- mr r2, r3
-LBB2_2: ; entry
- stw r2, 0(r5)
- blr
-
-GCC produces:
-
-_g:
- subf r4,r4,r3
- srawi r2,r4,31
- xor r0,r2,r4
- subf r0,r2,r0
- stw r0,0(r5)
- blr
-
-... which is much nicer.
-
-This theoretically may help improve twolf slightly (used in dimbox.c:142?).
-
-===-------------------------------------------------------------------------===
-
-PR5945: This:
-define i32 @clamp0g(i32 %a) {
-entry:
- %cmp = icmp slt i32 %a, 0
- %sel = select i1 %cmp, i32 0, i32 %a
- ret i32 %sel
-}
-
-Is compile to this with the PowerPC (32-bit) backend:
-
-_clamp0g:
- cmpwi cr0, r3, 0
- li r2, 0
- blt cr0, LBB1_2
-; BB#1: ; %entry
- mr r2, r3
-LBB1_2: ; %entry
- mr r3, r2
- blr
-
-This could be reduced to the much simpler:
-
-_clamp0g:
- srawi r2, r3, 31
- andc r3, r3, r2
- blr
-
-===-------------------------------------------------------------------------===
-
-int foo(int N, int ***W, int **TK, int X) {
- int t, i;
-
- for (t = 0; t < N; ++t)
- for (i = 0; i < 4; ++i)
- W[t / X][i][t % X] = TK[i][t];
-
- return 5;
-}
-
-We generate relatively atrocious code for this loop compared to gcc.
-
-We could also strength reduce the rem and the div:
-http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
-
-===-------------------------------------------------------------------------===
-
-float foo(float X) { return (int)(X); }
-
-Currently produces:
-
-_foo:
- fctiwz f0, f1
- stfd f0, -8(r1)
- lwz r2, -4(r1)
- extsw r2, r2
- std r2, -16(r1)
- lfd f0, -16(r1)
- fcfid f0, f0
- frsp f1, f0
- blr
-
-We could use a target dag combine to turn the lwz/extsw into an lwa when the
-lwz has a single use. Since LWA is cracked anyway, this would be a codesize
-win only.
-
-===-------------------------------------------------------------------------===
-
-We generate ugly code for this:
-
-void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
- unsigned code = 0;
- if(dx < -dw) code |= 1;
- if(dx > dw) code |= 2;
- if(dy < -dw) code |= 4;
- if(dy > dw) code |= 8;
- if(dz < -dw) code |= 16;
- if(dz > dw) code |= 32;
- *ret = code;
-}
-
-===-------------------------------------------------------------------------===
-
-Complete the signed i32 to FP conversion code using 64-bit registers
-transformation, good for PI. See PPCISelLowering.cpp, this comment:
-
- // FIXME: disable this lowered code. This generates 64-bit register values,
- // and we don't model the fact that the top part is clobbered by calls. We
- // need to flag these together so that the value isn't live across a call.
- //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
-
-Also, if the registers are spilled to the stack, we have to ensure that all
-64-bits of them are save/restored, otherwise we will miscompile the code. It
-sounds like we need to get the 64-bit register classes going.
-
-===-------------------------------------------------------------------------===
-
-%struct.B = type { i8, [3 x i8] }
-
-define void @bar(%struct.B* %b) {
-entry:
- %tmp = bitcast %struct.B* %b to i32* ; <uint*> [#uses=1]
- %tmp = load i32* %tmp ; <uint> [#uses=1]
- %tmp3 = bitcast %struct.B* %b to i32* ; <uint*> [#uses=1]
- %tmp4 = load i32* %tmp3 ; <uint> [#uses=1]
- %tmp8 = bitcast %struct.B* %b to i32* ; <uint*> [#uses=2]
- %tmp9 = load i32* %tmp8 ; <uint> [#uses=1]
- %tmp4.mask17 = shl i32 %tmp4, i8 1 ; <uint> [#uses=1]
- %tmp1415 = and i32 %tmp4.mask17, 2147483648 ; <uint> [#uses=1]
- %tmp.masked = and i32 %tmp, 2147483648 ; <uint> [#uses=1]
- %tmp11 = or i32 %tmp1415, %tmp.masked ; <uint> [#uses=1]
- %tmp12 = and i32 %tmp9, 2147483647 ; <uint> [#uses=1]
- %tmp13 = or i32 %tmp12, %tmp11 ; <uint> [#uses=1]
- store i32 %tmp13, i32* %tmp8
- ret void
-}
-
-We emit:
-
-_foo:
- lwz r2, 0(r3)
- slwi r4, r2, 1
- or r4, r4, r2
- rlwimi r2, r4, 0, 0, 0
- stw r2, 0(r3)
- blr
-
-We could collapse a bunch of those ORs and ANDs and generate the following
-equivalent code:
-
-_foo:
- lwz r2, 0(r3)
- rlwinm r4, r2, 1, 0, 0
- or r2, r2, r4
- stw r2, 0(r3)
- blr
-
-===-------------------------------------------------------------------------===
-
-We compile:
-
-unsigned test6(unsigned x) {
- return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
-}
-
-into:
-
-_test6:
- lis r2, 255
- rlwinm r3, r3, 16, 0, 31
- ori r2, r2, 255
- and r3, r3, r2
- blr
-
-GCC gets it down to:
-
-_test6:
- rlwinm r0,r3,16,8,15
- rlwinm r3,r3,16,24,31
- or r3,r3,r0
- blr
-
-
-===-------------------------------------------------------------------------===
-
-Consider a function like this:
-
-float foo(float X) { return X + 1234.4123f; }
-
-The FP constant ends up in the constant pool, so we need to get the LR register.
- This ends up producing code like this:
-
-_foo:
-.LBB_foo_0: ; entry
- mflr r11
-*** stw r11, 8(r1)
- bl "L00000$pb"
-"L00000$pb":
- mflr r2
- addis r2, r2, ha16(.CPI_foo_0-"L00000$pb")
- lfs f0, lo16(.CPI_foo_0-"L00000$pb")(r2)
- fadds f1, f1, f0
-*** lwz r11, 8(r1)
- mtlr r11
- blr
-
-This is functional, but there is no reason to spill the LR register all the way
-to the stack (the two marked instrs): spilling it to a GPR is quite enough.
-
-Implementing this will require some codegen improvements. Nate writes:
-
-"So basically what we need to support the "no stack frame save and restore" is a
-generalization of the LR optimization to "callee-save regs".
-
-Currently, we have LR marked as a callee-save reg. The register allocator sees
-that it's callee save, and spills it directly to the stack.
-
-Ideally, something like this would happen:
-
-LR would be in a separate register class from the GPRs. The class of LR would be
-marked "unspillable". When the register allocator came across an unspillable
-reg, it would ask "what is the best class to copy this into that I *can* spill"
-If it gets a class back, which it will in this case (the gprs), it grabs a free
-register of that class. If it is then later necessary to spill that reg, so be
-it.
-
-===-------------------------------------------------------------------------===
-
-We compile this:
-int test(_Bool X) {
- return X ? 524288 : 0;
-}
-
-to:
-_test:
- cmplwi cr0, r3, 0
- lis r2, 8
- li r3, 0
- beq cr0, LBB1_2 ;entry
-LBB1_1: ;entry
- mr r3, r2
-LBB1_2: ;entry
- blr
-
-instead of:
-_test:
- addic r2,r3,-1
- subfe r0,r2,r3
- slwi r3,r0,19
- blr
-
-This sort of thing occurs a lot due to globalopt.
-
-===-------------------------------------------------------------------------===
-
-We compile:
-
-define i32 @bar(i32 %x) nounwind readnone ssp {
-entry:
- %0 = icmp eq i32 %x, 0 ; <i1> [#uses=1]
- %neg = sext i1 %0 to i32 ; <i32> [#uses=1]
- ret i32 %neg
-}
-
-to:
-
-_bar:
- cntlzw r2, r3
- slwi r2, r2, 26
- srawi r3, r2, 31
- blr
-
-it would be better to produce:
-
-_bar:
- addic r3,r3,-1
- subfe r3,r3,r3
- blr
-
-===-------------------------------------------------------------------------===
-
-We currently compile 32-bit bswap:
-
-declare i32 @llvm.bswap.i32(i32 %A)
-define i32 @test(i32 %A) {
- %B = call i32 @llvm.bswap.i32(i32 %A)
- ret i32 %B
-}
-
-to:
-
-_test:
- rlwinm r2, r3, 24, 16, 23
- slwi r4, r3, 24
- rlwimi r2, r3, 8, 24, 31
- rlwimi r4, r3, 8, 8, 15
- rlwimi r4, r2, 0, 16, 31
- mr r3, r4
- blr
-
-it would be more efficient to produce:
-
-_foo: mr r0,r3
- rlwinm r3,r3,8,0xffffffff
- rlwimi r3,r0,24,0,7
- rlwimi r3,r0,24,16,23
- blr
-
-===-------------------------------------------------------------------------===
-
-test/CodeGen/PowerPC/2007-03-24-cntlzd.ll compiles to:
-
-__ZNK4llvm5APInt17countLeadingZerosEv:
- ld r2, 0(r3)
- cntlzd r2, r2
- or r2, r2, r2 <<-- silly.
- addi r3, r2, -64
- blr
-
-The dead or is a 'truncate' from 64- to 32-bits.
-
-===-------------------------------------------------------------------------===
-
-We generate horrible ppc code for this:
-
-#define N 2000000
-double a[N],c[N];
-void simpleloop() {
- int j;
- for (j=0; j<N; j++)
- c[j] = a[j];
-}
-
-LBB1_1: ;bb
- lfdx f0, r3, r4
- addi r5, r5, 1 ;; Extra IV for the exit value compare.
- stfdx f0, r2, r4
- addi r4, r4, 8
-
- xoris r6, r5, 30 ;; This is due to a large immediate.
- cmplwi cr0, r6, 33920
- bne cr0, LBB1_1
-
-//===---------------------------------------------------------------------===//
-
-This:
- #include <algorithm>
- inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
- { return std::make_pair(a + b, a + b < a); }
- bool no_overflow(unsigned a, unsigned b)
- { return !full_add(a, b).second; }
-
-Should compile to:
-
-__Z11no_overflowjj:
- add r4,r3,r4
- subfc r3,r3,r4
- li r3,0
- adde r3,r3,r3
- blr
-
-(or better) not:
-
-__Z11no_overflowjj:
- add r2, r4, r3
- cmplw cr7, r2, r3
- mfcr r2
- rlwinm r2, r2, 29, 31, 31
- xori r3, r2, 1
- blr
-
-//===---------------------------------------------------------------------===//
-
-We compile some FP comparisons into an mfcr with two rlwinms and an or. For
-example:
-#include <math.h>
-int test(double x, double y) { return islessequal(x, y);}
-int test2(double x, double y) { return islessgreater(x, y);}
-int test3(double x, double y) { return !islessequal(x, y);}
-
-Compiles into (all three are similar, but the bits differ):
-
-_test:
- fcmpu cr7, f1, f2
- mfcr r2
- rlwinm r3, r2, 29, 31, 31
- rlwinm r2, r2, 31, 31, 31
- or r3, r2, r3
- blr
-
-GCC compiles this into:
-
- _test:
- fcmpu cr7,f1,f2
- cror 30,28,30
- mfcr r3
- rlwinm r3,r3,31,1
- blr
-
-which is more efficient and can use mfocr. See PR642 for some more context.
-
-//===---------------------------------------------------------------------===//
-
-void foo(float *data, float d) {
- long i;
- for (i = 0; i < 8000; i++)
- data[i] = d;
-}
-void foo2(float *data, float d) {
- long i;
- data--;
- for (i = 0; i < 8000; i++) {
- data[1] = d;
- data++;
- }
-}
-
-These compile to:
-
-_foo:
- li r2, 0
-LBB1_1: ; bb
- addi r4, r2, 4
- stfsx f1, r3, r2
- cmplwi cr0, r4, 32000
- mr r2, r4
- bne cr0, LBB1_1 ; bb
- blr
-_foo2:
- li r2, 0
-LBB2_1: ; bb
- addi r4, r2, 4
- stfsx f1, r3, r2
- cmplwi cr0, r4, 32000
- mr r2, r4
- bne cr0, LBB2_1 ; bb
- blr
-
-The 'mr' could be eliminated to folding the add into the cmp better.
-
-//===---------------------------------------------------------------------===//
-Codegen for the following (low-probability) case deteriorated considerably
-when the correctness fixes for unordered comparisons went in (PR 642, 58871).
-It should be possible to recover the code quality described in the comments.
-
-; RUN: llvm-as < %s | llc -march=ppc32 | grep or | count 3
-; This should produce one 'or' or 'cror' instruction per function.
-
-; RUN: llvm-as < %s | llc -march=ppc32 | grep mfcr | count 3
-; PR2964
-
-define i32 @test(double %x, double %y) nounwind {
-entry:
- %tmp3 = fcmp ole double %x, %y ; <i1> [#uses=1]
- %tmp345 = zext i1 %tmp3 to i32 ; <i32> [#uses=1]
- ret i32 %tmp345
-}
-
-define i32 @test2(double %x, double %y) nounwind {
-entry:
- %tmp3 = fcmp one double %x, %y ; <i1> [#uses=1]
- %tmp345 = zext i1 %tmp3 to i32 ; <i32> [#uses=1]
- ret i32 %tmp345
-}
-
-define i32 @test3(double %x, double %y) nounwind {
-entry:
- %tmp3 = fcmp ugt double %x, %y ; <i1> [#uses=1]
- %tmp34 = zext i1 %tmp3 to i32 ; <i32> [#uses=1]
- ret i32 %tmp34
-}
-//===----------------------------------------------------------------------===//
-; RUN: llvm-as < %s | llc -march=ppc32 | not grep fneg
-
-; This could generate FSEL with appropriate flags (FSEL is not IEEE-safe, and
-; should not be generated except with -enable-finite-only-fp-math or the like).
-; With the correctness fixes for PR642 (58871) LowerSELECT_CC would need to
-; recognize a more elaborate tree than a simple SETxx.
-
-define double @test_FNEG_sel(double %A, double %B, double %C) {
- %D = fsub double -0.000000e+00, %A ; <double> [#uses=1]
- %Cond = fcmp ugt double %D, -0.000000e+00 ; <i1> [#uses=1]
- %E = select i1 %Cond, double %B, double %C ; <double> [#uses=1]
- ret double %E
-}
-
-//===----------------------------------------------------------------------===//
-The save/restore sequence for CR in prolog/epilog is terrible:
-- Each CR subreg is saved individually, rather than doing one save as a unit.
-- On Darwin, the save is done after the decrement of SP, which means the offset
-from SP of the save slot can be too big for a store instruction, which means we
-need an additional register (currently hacked in 96015+96020; the solution there
-is correct, but poor).
-- On SVR4 the same thing can happen, and I don't think saving before the SP
-decrement is safe on that target, as there is no red zone. This is currently
-broken AFAIK, although it's not a target I can exercise.
-The following demonstrates the problem:
-extern void bar(char *p);
-void foo() {
- char x[100000];
- bar(x);
- __asm__("" ::: "cr2");
-}
diff --git a/contrib/llvm/lib/Target/PowerPC/README_ALTIVEC.txt b/contrib/llvm/lib/Target/PowerPC/README_ALTIVEC.txt
deleted file mode 100644
index 1e4c6fb..0000000
--- a/contrib/llvm/lib/Target/PowerPC/README_ALTIVEC.txt
+++ /dev/null
@@ -1,211 +0,0 @@
-//===- README_ALTIVEC.txt - Notes for improving Altivec code gen ----------===//
-
-Implement PPCInstrInfo::isLoadFromStackSlot/isStoreToStackSlot for vector
-registers, to generate better spill code.
-
-//===----------------------------------------------------------------------===//
-
-The first should be a single lvx from the constant pool, the second should be
-a xor/stvx:
-
-void foo(void) {
- int x[8] __attribute__((aligned(128))) = { 1, 1, 1, 17, 1, 1, 1, 1 };
- bar (x);
-}
-
-#include <string.h>
-void foo(void) {
- int x[8] __attribute__((aligned(128)));
- memset (x, 0, sizeof (x));
- bar (x);
-}
-
-//===----------------------------------------------------------------------===//
-
-Altivec: Codegen'ing MUL with vector FMADD should add -0.0, not 0.0:
-http://gcc.gnu.org/bugzilla/show_bug.cgi?id=8763
-
-When -ffast-math is on, we can use 0.0.
-
-//===----------------------------------------------------------------------===//
-
- Consider this:
- v4f32 Vector;
- v4f32 Vector2 = { Vector.X, Vector.X, Vector.X, Vector.X };
-
-Since we know that "Vector" is 16-byte aligned and we know the element offset
-of ".X", we should change the load into a lve*x instruction, instead of doing
-a load/store/lve*x sequence.
-
-//===----------------------------------------------------------------------===//
-
-For functions that use altivec AND have calls, we are VRSAVE'ing all call
-clobbered regs.
-
-//===----------------------------------------------------------------------===//
-
-Implement passing vectors by value into calls and receiving them as arguments.
-
-//===----------------------------------------------------------------------===//
-
-GCC apparently tries to codegen { C1, C2, Variable, C3 } as a constant pool load
-of C1/C2/C3, then a load and vperm of Variable.
-
-//===----------------------------------------------------------------------===//
-
-We need a way to teach tblgen that some operands of an intrinsic are required to
-be constants. The verifier should enforce this constraint.
-
-//===----------------------------------------------------------------------===//
-
-We currently codegen SCALAR_TO_VECTOR as a store of the scalar to a 16-byte
-aligned stack slot, followed by a load/vperm. We should probably just store it
-to a scalar stack slot, then use lvsl/vperm to load it. If the value is already
-in memory this is a big win.
-
-//===----------------------------------------------------------------------===//
-
-extract_vector_elt of an arbitrary constant vector can be done with the
-following instructions:
-
-vTemp = vec_splat(v0,2); // 2 is the element the src is in.
-vec_ste(&destloc,0,vTemp);
-
-We can do an arbitrary non-constant value by using lvsr/perm/ste.
-
-//===----------------------------------------------------------------------===//
-
-If we want to tie instruction selection into the scheduler, we can do some
-constant formation with different instructions. For example, we can generate
-"vsplti -1" with "vcmpequw R,R" and 1,1,1,1 with "vsubcuw R,R", and 0,0,0,0 with
-"vsplti 0" or "vxor", each of which use different execution units, thus could
-help scheduling.
-
-This is probably only reasonable for a post-pass scheduler.
-
-//===----------------------------------------------------------------------===//
-
-For this function:
-
-void test(vector float *A, vector float *B) {
- vector float C = (vector float)vec_cmpeq(*A, *B);
- if (!vec_any_eq(*A, *B))
- *B = (vector float){0,0,0,0};
- *A = C;
-}
-
-we get the following basic block:
-
- ...
- lvx v2, 0, r4
- lvx v3, 0, r3
- vcmpeqfp v4, v3, v2
- vcmpeqfp. v2, v3, v2
- bne cr6, LBB1_2 ; cond_next
-
-The vcmpeqfp/vcmpeqfp. instructions currently cannot be merged when the
-vcmpeqfp. result is used by a branch. This can be improved.
-
-//===----------------------------------------------------------------------===//
-
-The code generated for this is truly aweful:
-
-vector float test(float a, float b) {
- return (vector float){ 0.0, a, 0.0, 0.0};
-}
-
-LCPI1_0: ; float
- .space 4
- .text
- .globl _test
- .align 4
-_test:
- mfspr r2, 256
- oris r3, r2, 4096
- mtspr 256, r3
- lis r3, ha16(LCPI1_0)
- addi r4, r1, -32
- stfs f1, -16(r1)
- addi r5, r1, -16
- lfs f0, lo16(LCPI1_0)(r3)
- stfs f0, -32(r1)
- lvx v2, 0, r4
- lvx v3, 0, r5
- vmrghw v3, v3, v2
- vspltw v2, v2, 0
- vmrghw v2, v2, v3
- mtspr 256, r2
- blr
-
-//===----------------------------------------------------------------------===//
-
-int foo(vector float *x, vector float *y) {
- if (vec_all_eq(*x,*y)) return 3245;
- else return 12;
-}
-
-A predicate compare being used in a select_cc should have the same peephole
-applied to it as a predicate compare used by a br_cc. There should be no
-mfcr here:
-
-_foo:
- mfspr r2, 256
- oris r5, r2, 12288
- mtspr 256, r5
- li r5, 12
- li r6, 3245
- lvx v2, 0, r4
- lvx v3, 0, r3
- vcmpeqfp. v2, v3, v2
- mfcr r3, 2
- rlwinm r3, r3, 25, 31, 31
- cmpwi cr0, r3, 0
- bne cr0, LBB1_2 ; entry
-LBB1_1: ; entry
- mr r6, r5
-LBB1_2: ; entry
- mr r3, r6
- mtspr 256, r2
- blr
-
-//===----------------------------------------------------------------------===//
-
-CodeGen/PowerPC/vec_constants.ll has an and operation that should be
-codegen'd to andc. The issue is that the 'all ones' build vector is
-SelectNodeTo'd a VSPLTISB instruction node before the and/xor is selected
-which prevents the vnot pattern from matching.
-
-
-//===----------------------------------------------------------------------===//
-
-An alternative to the store/store/load approach for illegal insert element
-lowering would be:
-
-1. store element to any ol' slot
-2. lvx the slot
-3. lvsl 0; splat index; vcmpeq to generate a select mask
-4. lvsl slot + x; vperm to rotate result into correct slot
-5. vsel result together.
-
-//===----------------------------------------------------------------------===//
-
-Should codegen branches on vec_any/vec_all to avoid mfcr. Two examples:
-
-#include <altivec.h>
- int f(vector float a, vector float b)
- {
- int aa = 0;
- if (vec_all_ge(a, b))
- aa |= 0x1;
- if (vec_any_ge(a,b))
- aa |= 0x2;
- return aa;
-}
-
-vector float f(vector float a, vector float b) {
- if (vec_any_eq(a, b))
- return a;
- else
- return b;
-}
-
diff --git a/contrib/llvm/lib/Target/PowerPC/TargetInfo/CMakeLists.txt b/contrib/llvm/lib/Target/PowerPC/TargetInfo/CMakeLists.txt
deleted file mode 100644
index 058d599..0000000
--- a/contrib/llvm/lib/Target/PowerPC/TargetInfo/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
-add_llvm_library(LLVMPowerPCInfo
- PowerPCTargetInfo.cpp
- )
-
-add_dependencies(LLVMPowerPCInfo PowerPCCodeGenTable_gen)
diff --git a/contrib/llvm/lib/Target/PowerPC/TargetInfo/Makefile b/contrib/llvm/lib/Target/PowerPC/TargetInfo/Makefile
deleted file mode 100644
index a101aa4..0000000
--- a/contrib/llvm/lib/Target/PowerPC/TargetInfo/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/PowerPC/TargetInfo/Makefile --------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMPowerPCInfo
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
OpenPOWER on IntegriCloud