Diffstat (limited to 'contrib/llvm/lib/Target/X86/README-X86-64.txt')
-rw-r--r-- | contrib/llvm/lib/Target/X86/README-X86-64.txt | 273
1 file changed, 0 insertions, 273 deletions
diff --git a/contrib/llvm/lib/Target/X86/README-X86-64.txt b/contrib/llvm/lib/Target/X86/README-X86-64.txt
deleted file mode 100644
index 78c4dc0..0000000
--- a/contrib/llvm/lib/Target/X86/README-X86-64.txt
+++ /dev/null
@@ -1,273 +0,0 @@
//===- README_X86_64.txt - Notes for X86-64 code gen ----------------------===//

AMD64 Optimization Manual 8.2 has some nice information about optimizing
integer multiplication by a constant. How much of it applies to Intel's X86-64
implementation? There are definite trade-offs to consider: latency vs. register
pressure vs. code size.

//===---------------------------------------------------------------------===//

Are we better off using branches instead of cmov to implement FP to
unsigned i64 conversion?

_conv:
	ucomiss	LC0(%rip), %xmm0
	cvttss2siq	%xmm0, %rdx
	jb	L3
	subss	LC0(%rip), %xmm0
	movabsq	$-9223372036854775808, %rax
	cvttss2siq	%xmm0, %rdx
	xorq	%rax, %rdx
L3:
	movq	%rdx, %rax
	ret

instead of

_conv:
	movss	LCPI1_0(%rip), %xmm1
	cvttss2siq	%xmm0, %rcx
	movaps	%xmm0, %xmm2
	subss	%xmm1, %xmm2
	cvttss2siq	%xmm2, %rax
	movabsq	$-9223372036854775808, %rdx
	xorq	%rdx, %rax
	ucomiss	%xmm1, %xmm0
	cmovb	%rcx, %rax
	ret

It seems like the jb branch has a high likelihood of being taken; when it is,
the branching version executes a few fewer instructions.

//===---------------------------------------------------------------------===//

Poor codegen:

#include <string.h>

int X[2];
int b;
void test(void) {
	memset(X, b, 2*sizeof(X[0]));
}

llc:
	movq	_b@GOTPCREL(%rip), %rax
	movzbq	(%rax), %rax
	movq	%rax, %rcx
	shlq	$8, %rcx
	orq	%rax, %rcx
	movq	%rcx, %rax
	shlq	$16, %rax
	orq	%rcx, %rax
	movq	%rax, %rcx
	shlq	$32, %rcx
	movq	_X@GOTPCREL(%rip), %rdx
	orq	%rax, %rcx
	movq	%rcx, (%rdx)
	ret

gcc:
	movq	_b@GOTPCREL(%rip), %rax
	movabsq	$72340172838076673, %rdx
	movzbq	(%rax), %rax
	imulq	%rdx, %rax
	movq	_X@GOTPCREL(%rip), %rdx
	movq	%rax, (%rdx)
	ret

And the codegen is even worse for the following
(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33103):

void fill1(char *s, int a)
{
	__builtin_memset(s, a, 15);
}

For this version, we duplicate the computation of the constant to store.
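The trick in gcc's version: 72340172838076673 is 0x0101010101010101, so a
single imulq splats the byte into every byte lane at once. A minimal C sketch
of the same idea (splat_byte is an illustrative name, not from the code above):

unsigned long splat_byte(unsigned char b) {
	/* Multiplying by 0x0101010101010101 copies b into each of the eight
	   byte lanes, replacing the shift/or chain in the llc output. */
	return (unsigned long)b * 0x0101010101010101UL;
}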
//===---------------------------------------------------------------------===//

It's not possible to reference the AH, BH, CH, and DH registers in an
instruction requiring a REX prefix. However, divb and mulb both produce results
in AH. If isel emits a CopyFromReg from AH which gets turned into a movb, the
destination may be allocated to r8b - r15b, which requires a REX prefix and
therefore cannot encode AH as the source.

To get around this, isel emits a CopyFromReg from AX and then right-shifts it
down by 8 and truncates it. It's not pretty, but it works. We need some
register allocation magic to make the hack go away (e.g. putting additional
constraints on the result of the movb).

//===---------------------------------------------------------------------===//

The x86-64 ABI for hidden-argument struct returns requires that the
incoming value of %rdi be copied into %rax by the callee upon return.

The idea is that it saves callers from having to remember this value,
which would often require a callee-saved register. Callees usually
need to keep this value live for most of their body anyway, so it
doesn't add a significant burden on them.

We currently implement this in codegen; however, this is suboptimal
because it makes the optimization quite awkward to implement for
callers.

A better implementation would be to relax the LLVM IR rules for sret
arguments to allow a function with an sret argument to have a non-void
return type, and to have the front-end set up the sret argument value
as the return value of the function. The front-end could more easily
emit uses of the returned struct value in terms of the function's
lowered return value, and it would free non-C frontends from a
complication only required by a C-based ABI.

//===---------------------------------------------------------------------===//

We get a redundant zero extension for code like this:

int mask[1000];
int foo(unsigned x) {
	if (x < 10)
		x = x * 45;
	else
		x = x * 78;
	return mask[x];
}

_foo:
LBB1_0:	## entry
	cmpl	$9, %edi
	jbe	LBB1_3	## bb
LBB1_1:	## bb1
	imull	$78, %edi, %eax
LBB1_2:	## bb2
	movl	%eax, %eax		<----
	movq	_mask@GOTPCREL(%rip), %rcx
	movl	(%rcx,%rax,4), %eax
	ret
LBB1_3:	## bb
	imull	$45, %edi, %eax
	jmp	LBB1_2	## bb2

Before regalloc, we have:

	%reg1025<def> = IMUL32rri8 %reg1024, 45, %EFLAGS<imp-def>
	JMP mbb<bb2,0x203afb0>
	    Successors according to CFG: 0x203afb0 (#3)

bb1: 0x203af60, LLVM BB @0x1e02310, ID#2:
	    Predecessors according to CFG: 0x203aec0 (#0)
	%reg1026<def> = IMUL32rri8 %reg1024, 78, %EFLAGS<imp-def>
	    Successors according to CFG: 0x203afb0 (#3)

bb2: 0x203afb0, LLVM BB @0x1e02340, ID#3:
	    Predecessors according to CFG: 0x203af10 (#1) 0x203af60 (#2)
	%reg1027<def> = PHI %reg1025, mbb<bb,0x203af10>,
	                    %reg1026, mbb<bb1,0x203af60>
	%reg1029<def> = MOVZX64rr32 %reg1027

so we'd have to know that IMUL32rri8 leaves the high word zero-extended and be
able to recognize the zero extend. This could also presumably be implemented if
we had whole-function selection DAGs.

//===---------------------------------------------------------------------===//

Take the following C code
(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43640):

struct u1
{
	float x;
	float y;
};

float foo(struct u1 u)
{
	return u.x + u.y;
}

It optimizes to the following IR:

define float @foo(double %u.0) nounwind readnone {
entry:
	%tmp8 = bitcast double %u.0 to i64	; <i64> [#uses=2]
	%tmp6 = trunc i64 %tmp8 to i32		; <i32> [#uses=1]
	%tmp7 = bitcast i32 %tmp6 to float	; <float> [#uses=1]
	%tmp2 = lshr i64 %tmp8, 32		; <i64> [#uses=1]
	%tmp3 = trunc i64 %tmp2 to i32		; <i32> [#uses=1]
	%tmp4 = bitcast i32 %tmp3 to float	; <float> [#uses=1]
	%0 = fadd float %tmp7, %tmp4		; <float> [#uses=1]
	ret float %0
}

And the current llvm-gcc/clang output is:

	movd	%xmm0, %rax
	movd	%eax, %xmm1
	shrq	$32, %rax
	movd	%eax, %xmm0
	addss	%xmm1, %xmm0
	ret

We really shouldn't move the floats to RAX, only to immediately move them
straight back to the XMM registers.

There really isn't any good way to handle this purely in IR optimizers; it
could possibly be handled by changing the output of the front-end, though. It
would also be feasible to add an x86-specific DAGCombine to optimize the
bitcast+trunc+(lshr+)bitcast combination.
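For reference, a minimal C sketch of what the IR above computes: per the ABI,
both fields of struct u1 arrive packed into the low 64 bits of %xmm0 as a
double, and memcpy stands in for the bitcasts (foo_equiv is an illustrative
name, not part of the testcase):

#include <string.h>

float foo_equiv(double u0) {
	unsigned long long bits;
	unsigned int lo, hi;
	float x, y;
	memcpy(&bits, &u0, sizeof bits);	/* bitcast double to i64 */
	lo = (unsigned int)bits;		/* trunc i64 to i32 */
	hi = (unsigned int)(bits >> 32);	/* lshr 32, then trunc */
	memcpy(&x, &lo, sizeof x);		/* bitcast i32 to float */
	memcpy(&y, &hi, sizeof y);
	return x + y;				/* fadd */
}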
//===---------------------------------------------------------------------===//

Take the following code
(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34653):

extern unsigned long table[];
unsigned long foo(unsigned char *p) {
	unsigned long tag = *p;
	return table[tag >> 4] + table[tag & 0xf];
}

Current code generated:
	movzbl	(%rdi), %eax
	movq	%rax, %rcx
	andq	$240, %rcx
	shrq	%rcx
	andq	$15, %rax
	movq	table(,%rax,8), %rax
	addq	table(%rcx), %rax
	ret

Issues:
1. The first movq should be movl; that saves a byte.
2. Both andq's should be andl; that saves another two bytes. I think this was
   implemented at one point, but subsequently regressed.
3. shrq should be shrl; that saves another byte.
4. The first andq can be completely eliminated by using a slightly more
   expensive addressing mode.

//===---------------------------------------------------------------------===//

Consider the following (a contrived testcase, but it contains common factors):

#include <stdarg.h>
int test(int x, ...) {
	int sum = 0, i;
	va_list l;
	va_start(l, x);
	for (i = 0; i < x; i++)
		sum += va_arg(l, int);
	va_end(l);
	return sum;
}

The testcase is given in C because fixing it will likely involve changing the
IR generated for it. The primary issue with the result is that it doesn't do
any of the optimizations which are possible if we know the address of a va_list
in the current function is never taken:
1. We shouldn't spill the XMM registers because we only call va_arg with "int".
2. It would be nice if we could scalarrepl the va_list.
3. Probably overkill, but it'd be cool if we could peel off the first five
   iterations of the loop.

Other optimizations involving functions which use va_arg on floats and don't
have the address of a va_list taken (a sketch of such a function follows this
list):
1. Conversely to the above, we shouldn't spill general registers if we only
   call va_arg on "double".
2. If we know nothing more than 64 bits wide is read from the XMM registers,
   we can change the spilling code to reduce the amount of stack used by half.
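A minimal companion to the "double" case above, analogous to test() (test_fp
is an illustrative name, not from the original testcase):

#include <stdarg.h>
double test_fp(int x, ...) {
	/* Only reads doubles, so only the XMM part of the va_list register
	   save area is live; the general-register spills are the wasted work
	   described in point 1 above. */
	double sum = 0.0;
	int i;
	va_list l;
	va_start(l, x);
	for (i = 0; i < x; i++)
		sum += va_arg(l, double);
	va_end(l);
	return sum;
}

//===---------------------------------------------------------------------===//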