summaryrefslogtreecommitdiffstats
path: root/contrib/llvm/lib/Target/X86/README-X86-64.txt
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/llvm/lib/Target/X86/README-X86-64.txt')
-rw-r--r--contrib/llvm/lib/Target/X86/README-X86-64.txt273
1 files changed, 273 insertions, 0 deletions
diff --git a/contrib/llvm/lib/Target/X86/README-X86-64.txt b/contrib/llvm/lib/Target/X86/README-X86-64.txt
new file mode 100644
index 0000000..78c4dc0
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/README-X86-64.txt
@@ -0,0 +1,273 @@
+//===- README_X86_64.txt - Notes for X86-64 code gen ----------------------===//
+
+AMD64 Optimization Manual 8.2 has some nice information about optimizing integer
+multiplication by a constant. How much of it applies to Intel's X86-64
+implementation? There are definite trade-offs to consider: latency vs. register
+pressure vs. code size.
+
+//===---------------------------------------------------------------------===//
+
+Are we better off using branches instead of cmove to implement FP to
+unsigned i64?
+
+_conv:
+ ucomiss LC0(%rip), %xmm0
+ cvttss2siq %xmm0, %rdx
+ jb L3
+ subss LC0(%rip), %xmm0
+ movabsq $-9223372036854775808, %rax
+ cvttss2siq %xmm0, %rdx
+ xorq %rax, %rdx
+L3:
+ movq %rdx, %rax
+ ret
+
+instead of
+
+_conv:
+ movss LCPI1_0(%rip), %xmm1
+ cvttss2siq %xmm0, %rcx
+ movaps %xmm0, %xmm2
+ subss %xmm1, %xmm2
+ cvttss2siq %xmm2, %rax
+ movabsq $-9223372036854775808, %rdx
+ xorq %rdx, %rax
+ ucomiss %xmm1, %xmm0
+ cmovb %rcx, %rax
+ ret
+
+Seems like the jb branch has high likelyhood of being taken. It would have
+saved a few instructions.
+
+//===---------------------------------------------------------------------===//
+
+Poor codegen:
+
+int X[2];
+int b;
+void test(void) {
+ memset(X, b, 2*sizeof(X[0]));
+}
+
+llc:
+ movq _b@GOTPCREL(%rip), %rax
+ movzbq (%rax), %rax
+ movq %rax, %rcx
+ shlq $8, %rcx
+ orq %rax, %rcx
+ movq %rcx, %rax
+ shlq $16, %rax
+ orq %rcx, %rax
+ movq %rax, %rcx
+ shlq $32, %rcx
+ movq _X@GOTPCREL(%rip), %rdx
+ orq %rax, %rcx
+ movq %rcx, (%rdx)
+ ret
+
+gcc:
+ movq _b@GOTPCREL(%rip), %rax
+ movabsq $72340172838076673, %rdx
+ movzbq (%rax), %rax
+ imulq %rdx, %rax
+ movq _X@GOTPCREL(%rip), %rdx
+ movq %rax, (%rdx)
+ ret
+
+And the codegen is even worse for the following
+(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33103):
+ void fill1(char *s, int a)
+ {
+ __builtin_memset(s, a, 15);
+ }
+
+For this version, we duplicate the computation of the constant to store.
+
+//===---------------------------------------------------------------------===//
+
+It's not possible to reference AH, BH, CH, and DH registers in an instruction
+requiring REX prefix. However, divb and mulb both produce results in AH. If isel
+emits a CopyFromReg which gets turned into a movb and that can be allocated a
+r8b - r15b.
+
+To get around this, isel emits a CopyFromReg from AX and then right shift it
+down by 8 and truncate it. It's not pretty but it works. We need some register
+allocation magic to make the hack go away (e.g. putting additional constraints
+on the result of the movb).
+
+//===---------------------------------------------------------------------===//
+
+The x86-64 ABI for hidden-argument struct returns requires that the
+incoming value of %rdi be copied into %rax by the callee upon return.
+
+The idea is that it saves callers from having to remember this value,
+which would often require a callee-saved register. Callees usually
+need to keep this value live for most of their body anyway, so it
+doesn't add a significant burden on them.
+
+We currently implement this in codegen, however this is suboptimal
+because it means that it would be quite awkward to implement the
+optimization for callers.
+
+A better implementation would be to relax the LLVM IR rules for sret
+arguments to allow a function with an sret argument to have a non-void
+return type, and to have the front-end to set up the sret argument value
+as the return value of the function. The front-end could more easily
+emit uses of the returned struct value to be in terms of the function's
+lowered return value, and it would free non-C frontends from a
+complication only required by a C-based ABI.
+
+//===---------------------------------------------------------------------===//
+
+We get a redundant zero extension for code like this:
+
+int mask[1000];
+int foo(unsigned x) {
+ if (x < 10)
+ x = x * 45;
+ else
+ x = x * 78;
+ return mask[x];
+}
+
+_foo:
+LBB1_0: ## entry
+ cmpl $9, %edi
+ jbe LBB1_3 ## bb
+LBB1_1: ## bb1
+ imull $78, %edi, %eax
+LBB1_2: ## bb2
+ movl %eax, %eax <----
+ movq _mask@GOTPCREL(%rip), %rcx
+ movl (%rcx,%rax,4), %eax
+ ret
+LBB1_3: ## bb
+ imull $45, %edi, %eax
+ jmp LBB1_2 ## bb2
+
+Before regalloc, we have:
+
+ %reg1025<def> = IMUL32rri8 %reg1024, 45, %EFLAGS<imp-def>
+ JMP mbb<bb2,0x203afb0>
+ Successors according to CFG: 0x203afb0 (#3)
+
+bb1: 0x203af60, LLVM BB @0x1e02310, ID#2:
+ Predecessors according to CFG: 0x203aec0 (#0)
+ %reg1026<def> = IMUL32rri8 %reg1024, 78, %EFLAGS<imp-def>
+ Successors according to CFG: 0x203afb0 (#3)
+
+bb2: 0x203afb0, LLVM BB @0x1e02340, ID#3:
+ Predecessors according to CFG: 0x203af10 (#1) 0x203af60 (#2)
+ %reg1027<def> = PHI %reg1025, mbb<bb,0x203af10>,
+ %reg1026, mbb<bb1,0x203af60>
+ %reg1029<def> = MOVZX64rr32 %reg1027
+
+so we'd have to know that IMUL32rri8 leaves the high word zero extended and to
+be able to recognize the zero extend. This could also presumably be implemented
+if we have whole-function selectiondags.
+
+//===---------------------------------------------------------------------===//
+
+Take the following C code
+(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43640):
+
+struct u1
+{
+ float x;
+ float y;
+};
+
+float foo(struct u1 u)
+{
+ return u.x + u.y;
+}
+
+Optimizes to the following IR:
+define float @foo(double %u.0) nounwind readnone {
+entry:
+ %tmp8 = bitcast double %u.0 to i64 ; <i64> [#uses=2]
+ %tmp6 = trunc i64 %tmp8 to i32 ; <i32> [#uses=1]
+ %tmp7 = bitcast i32 %tmp6 to float ; <float> [#uses=1]
+ %tmp2 = lshr i64 %tmp8, 32 ; <i64> [#uses=1]
+ %tmp3 = trunc i64 %tmp2 to i32 ; <i32> [#uses=1]
+ %tmp4 = bitcast i32 %tmp3 to float ; <float> [#uses=1]
+ %0 = fadd float %tmp7, %tmp4 ; <float> [#uses=1]
+ ret float %0
+}
+
+And current llvm-gcc/clang output:
+ movd %xmm0, %rax
+ movd %eax, %xmm1
+ shrq $32, %rax
+ movd %eax, %xmm0
+ addss %xmm1, %xmm0
+ ret
+
+We really shouldn't move the floats to RAX, only to immediately move them
+straight back to the XMM registers.
+
+There really isn't any good way to handle this purely in IR optimizers; it
+could possibly be handled by changing the output of the fronted, though. It
+would also be feasible to add a x86-specific DAGCombine to optimize the
+bitcast+trunc+(lshr+)bitcast combination.
+
+//===---------------------------------------------------------------------===//
+
+Take the following code
+(from http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34653):
+extern unsigned long table[];
+unsigned long foo(unsigned char *p) {
+ unsigned long tag = *p;
+ return table[tag >> 4] + table[tag & 0xf];
+}
+
+Current code generated:
+ movzbl (%rdi), %eax
+ movq %rax, %rcx
+ andq $240, %rcx
+ shrq %rcx
+ andq $15, %rax
+ movq table(,%rax,8), %rax
+ addq table(%rcx), %rax
+ ret
+
+Issues:
+1. First movq should be movl; saves a byte.
+2. Both andq's should be andl; saves another two bytes. I think this was
+ implemented at one point, but subsequently regressed.
+3. shrq should be shrl; saves another byte.
+4. The first andq can be completely eliminated by using a slightly more
+ expensive addressing mode.
+
+//===---------------------------------------------------------------------===//
+
+Consider the following (contrived testcase, but contains common factors):
+
+#include <stdarg.h>
+int test(int x, ...) {
+ int sum, i;
+ va_list l;
+ va_start(l, x);
+ for (i = 0; i < x; i++)
+ sum += va_arg(l, int);
+ va_end(l);
+ return sum;
+}
+
+Testcase given in C because fixing it will likely involve changing the IR
+generated for it. The primary issue with the result is that it doesn't do any
+of the optimizations which are possible if we know the address of a va_list
+in the current function is never taken:
+1. We shouldn't spill the XMM registers because we only call va_arg with "int".
+2. It would be nice if we could scalarrepl the va_list.
+3. Probably overkill, but it'd be cool if we could peel off the first five
+iterations of the loop.
+
+Other optimizations involving functions which use va_arg on floats which don't
+have the address of a va_list taken:
+1. Conversely to the above, we shouldn't spill general registers if we only
+ call va_arg on "double".
+2. If we know nothing more than 64 bits wide is read from the XMM registers,
+ we can change the spilling code to reduce the amount of stack used by half.
+
+//===---------------------------------------------------------------------===//
OpenPOWER on IntegriCloud