summaryrefslogtreecommitdiffstats
path: root/contrib/llvm/lib/Target/X86/README.txt
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/llvm/lib/Target/X86/README.txt')
-rw-r--r--contrib/llvm/lib/Target/X86/README.txt104
1 files changed, 97 insertions, 7 deletions
diff --git a/contrib/llvm/lib/Target/X86/README.txt b/contrib/llvm/lib/Target/X86/README.txt
index efc0cd8..a305ae6 100644
--- a/contrib/llvm/lib/Target/X86/README.txt
+++ b/contrib/llvm/lib/Target/X86/README.txt
@@ -1135,13 +1135,6 @@ void test(double *P) {
//===---------------------------------------------------------------------===//
-handling llvm.memory.barrier on pre SSE2 cpus
-
-should generate:
-lock ; mov %esp, %esp
-
-//===---------------------------------------------------------------------===//
-
The generated code on x86 for checking for signed overflow on a multiply the
obvious way is much longer than it needs to be.
@@ -1870,3 +1863,100 @@ The code produced by gcc is 3 bytes shorter. This sort of construct often
shows up with bitfields.
//===---------------------------------------------------------------------===//
+
+Take the following C code:
+int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }
+
+We generate the following IR with clang:
+define i32 @f(i32 %a, i32 %b) nounwind readnone {
+entry:
+ %tmp = xor i32 %b, %a ; <i32> [#uses=1]
+ %tmp6 = and i32 %tmp, 255 ; <i32> [#uses=1]
+ %cmp = icmp eq i32 %tmp6, 0 ; <i1> [#uses=1]
+ %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1]
+ ret i32 %conv5
+}
+
+And the following x86 code:
+ xorl %esi, %edi
+ testb $-1, %dil
+ sete %al
+ movzbl %al, %eax
+ ret
+
+A cmpb instead of the xorl+testb would be one instruction shorter.
+
+//===---------------------------------------------------------------------===//
+
+Given the following C code:
+int f(int a, int b) { return (signed char)a == (signed char)b; }
+
+We generate the following IR with clang:
+define i32 @f(i32 %a, i32 %b) nounwind readnone {
+entry:
+ %sext = shl i32 %a, 24 ; <i32> [#uses=1]
+ %conv1 = ashr i32 %sext, 24 ; <i32> [#uses=1]
+ %sext6 = shl i32 %b, 24 ; <i32> [#uses=1]
+ %conv4 = ashr i32 %sext6, 24 ; <i32> [#uses=1]
+ %cmp = icmp eq i32 %conv1, %conv4 ; <i1> [#uses=1]
+ %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1]
+ ret i32 %conv5
+}
+
+And the following x86 code:
+ movsbl %sil, %eax
+ movsbl %dil, %ecx
+ cmpl %eax, %ecx
+ sete %al
+ movzbl %al, %eax
+ ret
+
+
+It should be possible to eliminate the sign extensions.
+
+//===---------------------------------------------------------------------===//
+
+LLVM misses a load+store narrowing opportunity in this code:
+
+%struct.bf = type { i64, i16, i16, i32 }
+
+@bfi = external global %struct.bf* ; <%struct.bf**> [#uses=2]
+
+define void @t1() nounwind ssp {
+entry:
+ %0 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1]
+ %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1]
+ %2 = bitcast i16* %1 to i32* ; <i32*> [#uses=2]
+ %3 = load i32* %2, align 1 ; <i32> [#uses=1]
+ %4 = and i32 %3, -65537 ; <i32> [#uses=1]
+ store i32 %4, i32* %2, align 1
+ %5 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1]
+ %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1]
+ %7 = bitcast i16* %6 to i32* ; <i32*> [#uses=2]
+ %8 = load i32* %7, align 1 ; <i32> [#uses=1]
+ %9 = and i32 %8, -131073 ; <i32> [#uses=1]
+ store i32 %9, i32* %7, align 1
+ ret void
+}
+
+LLVM currently emits this:
+
+ movq bfi(%rip), %rax
+ andl $-65537, 8(%rax)
+ movq bfi(%rip), %rax
+ andl $-131073, 8(%rax)
+ ret
+
+It could narrow the loads and stores to emit this:
+
+ movq bfi(%rip), %rax
+ andb $-2, 10(%rax)
+ movq bfi(%rip), %rax
+ andb $-3, 10(%rax)
+ ret
+
+The trouble is that there is a TokenFactor between the store and the
+load, making it non-trivial to determine if there's anything between
+the load and the store which would prohibit narrowing.
+
+//===---------------------------------------------------------------------===//
OpenPOWER on IntegriCloud